]>
Commit | Line | Data |
---|---|---|
1c79356b A |
1 | /* |
2 | * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
de355530 A |
6 | * The contents of this file constitute Original Code as defined in and |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
1c79356b | 11 | * |
de355530 A |
12 | * This Original Code and all software distributed under the License are |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
1c79356b A |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
de355530 A |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
1c79356b A |
19 | * |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
9bccf70c A |
22 | * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa |
23 | * Portions Copyright (c) 2000 Akamba Corp. | |
24 | * All rights reserved | |
1c79356b | 25 | * |
9bccf70c A |
26 | * Redistribution and use in source and binary forms, with or without |
27 | * modification, are permitted provided that the following conditions | |
28 | * are met: | |
29 | * 1. Redistributions of source code must retain the above copyright | |
30 | * notice, this list of conditions and the following disclaimer. | |
31 | * 2. Redistributions in binary form must reproduce the above copyright | |
32 | * notice, this list of conditions and the following disclaimer in the | |
33 | * documentation and/or other materials provided with the distribution. | |
1c79356b | 34 | * |
9bccf70c A |
35 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
36 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
38 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
39 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
40 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
41 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
42 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
43 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
44 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
45 | * SUCH DAMAGE. | |
1c79356b | 46 | * |
9bccf70c | 47 | * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.24.2.11 2001/02/09 23:18:08 luigi Exp $ |
1c79356b A |
48 | */ |
49 | ||
9bccf70c A |
50 | #define DEB(x) |
51 | #define DDB(x) x | |
52 | ||
1c79356b A |
53 | /* |
54 | * This module implements IP dummynet, a bandwidth limiter/delay emulator | |
55 | * used in conjunction with the ipfw package. | |
9bccf70c A |
56 | * Description of the data structures used is in ip_dummynet.h |
57 | * Here you mainly find the following blocks of code: | |
58 | * + variable declarations; | |
59 | * + heap management functions; | |
60 | * + scheduler and dummynet functions; | |
61 | * + configuration and initialization. | |
62 | * | |
63 | * NOTA BENE: critical sections are protected by splimp()/splx() | |
64 | * pairs. One would think that splnet() is enough as for most of | |
65 | * the netinet code, but it is not so because when used with | |
66 | * bridging, dummynet is invoked at splimp(). | |
1c79356b | 67 | * |
9bccf70c | 68 | * Most important Changes: |
1c79356b | 69 | * |
9bccf70c A |
70 | * 010124: Fixed WF2Q behaviour |
71 | * 010122: Fixed spl protection. | |
72 | * 000601: WF2Q support | |
73 | * 000106: large rewrite, use heaps to handle very many pipes. | |
1c79356b | 74 | * 980513: initial release |
9bccf70c A |
75 | * |
76 | * include files marked with XXX are probably not needed | |
1c79356b A |
77 | */ |
78 | ||
1c79356b A |
79 | #include <sys/param.h> |
80 | #include <sys/systm.h> | |
81 | #include <sys/malloc.h> | |
82 | #include <sys/mbuf.h> | |
83 | #include <sys/queue.h> /* XXX */ | |
84 | #include <sys/kernel.h> | |
85 | #include <sys/socket.h> | |
86 | #include <sys/socketvar.h> | |
87 | #include <sys/time.h> | |
88 | #include <sys/sysctl.h> | |
89 | #include <net/if.h> | |
90 | #include <net/route.h> | |
91 | #include <netinet/in.h> | |
92 | #include <netinet/in_systm.h> | |
93 | #include <netinet/in_var.h> | |
94 | #include <netinet/ip.h> | |
95 | #include <netinet/ip_fw.h> | |
96 | #include <netinet/ip_dummynet.h> | |
97 | #include <netinet/ip_var.h> | |
98 | ||
99 | #if BRIDGE | |
100 | #include <netinet/if_ether.h> /* for struct arpcom */ | |
101 | #include <net/bridge.h> | |
102 | #endif | |
103 | ||
9bccf70c A |
104 | /* |
105 | * We keep a private variable for the simulation time, but we could | |
106 | * probably use an existing one ("softticks" in sys/kern/kern_timer.c) | |
107 | */ | |
108 | static dn_key curr_time = 0 ; /* current simulation time */ | |
109 | ||
110 | static int dn_hash_size = 64 ; /* default hash size */ | |
111 | ||
112 | /* statistics on number of queue searches and search steps */ | |
113 | static int searches, search_steps ; | |
114 | static int pipe_expire = 1 ; /* expire queue if empty */ | |
115 | static int dn_max_ratio = 16 ; /* max queues/buckets ratio */ | |
116 | ||
117 | static int red_lookup_depth = 256; /* RED - default lookup table depth */ | |
118 | static int red_avg_pkt_size = 512; /* RED - default medium packet size */ | |
119 | static int red_max_pkt_size = 1500; /* RED - default max packet size */ | |
120 | ||
121 | /* | |
122 | * Three heaps contain queues and pipes that the scheduler handles: | |
123 | * | |
124 | * ready_heap contains all dn_flow_queue related to fixed-rate pipes. | |
125 | * | |
126 | * wfq_ready_heap contains the pipes associated with WF2Q flows | |
127 | * | |
128 | * extract_heap contains pipes associated with delay lines. | |
129 | * | |
130 | */ | |
131 | static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ; | |
132 | ||
133 | static int heap_init(struct dn_heap *h, int size) ; | |
134 | static int heap_insert (struct dn_heap *h, dn_key key1, void *p); | |
135 | static void heap_extract(struct dn_heap *h, void *obj); | |
136 | ||
137 | static void transmit_event(struct dn_pipe *pipe); | |
138 | static void ready_event(struct dn_flow_queue *q); | |
139 | ||
1c79356b | 140 | static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ |
9bccf70c | 141 | static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */ |
1c79356b | 142 | |
9bccf70c A |
143 | #if SYSCTL_NODE |
144 | SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, | |
145 | CTLFLAG_RW, 0, "Dummynet"); | |
146 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, | |
147 | CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size"); | |
148 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time, | |
149 | CTLFLAG_RD, &curr_time, 0, "Current tick"); | |
150 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, | |
151 | CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); | |
152 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, | |
153 | CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); | |
154 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, | |
155 | CTLFLAG_RD, &searches, 0, "Number of queue searches"); | |
156 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, | |
157 | CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); | |
158 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire, | |
159 | CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty"); | |
160 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len, | |
161 | CTLFLAG_RW, &dn_max_ratio, 0, | |
162 | "Max ratio between dynamic queues and buckets"); | |
163 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth, | |
164 | CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table"); | |
165 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size, | |
166 | CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size"); | |
167 | SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size, | |
168 | CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size"); | |
1c79356b A |
169 | #endif |
170 | ||
9bccf70c | 171 | static int config_pipe(struct dn_pipe *p); |
1c79356b A |
172 | static int ip_dn_ctl(struct sockopt *sopt); |
173 | ||
174 | static void rt_unref(struct rtentry *); | |
175 | static void dummynet(void *); | |
1c79356b | 176 | static void dummynet_flush(void); |
9bccf70c A |
177 | void dummynet_drain(void); |
178 | int if_tx_rdy(struct ifnet *ifp); | |
1c79356b A |
179 | |
180 | /* | |
9bccf70c | 181 | * ip_fw_chain is used when deleting a pipe, because ipfw rules can |
1c79356b A |
182 | * hold references to the pipe. |
183 | */ | |
9bccf70c | 184 | extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain_head; |
1c79356b A |
185 | |
186 | static void | |
187 | rt_unref(struct rtentry *rt) | |
188 | { | |
189 | if (rt == NULL) | |
190 | return ; | |
191 | if (rt->rt_refcnt <= 0) | |
9bccf70c A |
192 | printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); |
193 | rtfree(rt); | |
194 | } | |
195 | ||
196 | /* | |
197 | * Heap management functions. | |
198 | * | |
199 | * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. | |
200 | * Some macros help finding parent/children so we can optimize them. | |
201 | * | |
202 | * heap_init() is called to expand the heap when needed. | |
203 | * Increment size in blocks of 16 entries. | |
204 | * XXX failure to allocate a new element is a pretty bad failure | |
205 | * as we basically stall a whole queue forever!! | |
206 | * Returns 1 on error, 0 on success | |
207 | */ | |
208 | #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) | |
209 | #define HEAP_LEFT(x) ( 2*(x) + 1 ) | |
210 | #define HEAP_IS_LEFT(x) ( (x) & 1 ) | |
211 | #define HEAP_RIGHT(x) ( 2*(x) + 2 ) | |
212 | #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } | |
213 | #define HEAP_INCREMENT 15 | |
214 | ||
215 | static int | |
216 | heap_init(struct dn_heap *h, int new_size) | |
217 | { | |
218 | struct dn_heap_entry *p; | |
219 | ||
220 | if (h->size >= new_size ) { | |
221 | printf("heap_init, Bogus call, have %d want %d\n", | |
222 | h->size, new_size); | |
223 | return 0 ; | |
224 | } | |
225 | new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; | |
226 | p = _MALLOC(new_size * sizeof(*p), M_IPFW, M_DONTWAIT ); | |
227 | if (p == NULL) { | |
228 | printf(" heap_init, resize %d failed\n", new_size ); | |
229 | return 1 ; /* error */ | |
230 | } | |
231 | if (h->size > 0) { | |
232 | bcopy(h->p, p, h->size * sizeof(*p) ); | |
233 | FREE(h->p, M_IPFW); | |
234 | } | |
235 | h->p = p ; | |
236 | h->size = new_size ; | |
237 | return 0 ; | |
238 | } | |
239 | ||
240 | /* | |
241 | * Insert element in heap. Normally, p != NULL, we insert p in | |
242 | * a new position and bubble up. If p == NULL, then the element is | |
243 | * already in place, and key is the position where to start the | |
244 | * bubble-up. | |
245 | * Returns 1 on failure (cannot allocate new heap entry) | |
246 | * | |
247 | * If offset > 0 the position (index, int) of the element in the heap is | |
248 | * also stored in the element itself at the given offset in bytes. | |
249 | */ | |
250 | #define SET_OFFSET(heap, node) \ | |
251 | if (heap->offset > 0) \ | |
252 | *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ; | |
253 | /* | |
254 | * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value. | |
255 | */ | |
256 | #define RESET_OFFSET(heap, node) \ | |
257 | if (heap->offset > 0) \ | |
258 | *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ; | |
259 | static int | |
260 | heap_insert(struct dn_heap *h, dn_key key1, void *p) | |
261 | { | |
262 | int son = h->elements ; | |
263 | ||
264 | if (p == NULL) /* data already there, set starting point */ | |
265 | son = key1 ; | |
266 | else { /* insert new element at the end, possibly resize */ | |
267 | son = h->elements ; | |
268 | if (son == h->size) /* need resize... */ | |
269 | if (heap_init(h, h->elements+1) ) | |
270 | return 1 ; /* failure... */ | |
271 | h->p[son].object = p ; | |
272 | h->p[son].key = key1 ; | |
273 | h->elements++ ; | |
274 | } | |
275 | while (son > 0) { /* bubble up */ | |
276 | int father = HEAP_FATHER(son) ; | |
277 | struct dn_heap_entry tmp ; | |
278 | ||
279 | if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) | |
280 | break ; /* found right position */ | |
281 | /* son smaller than father, swap and repeat */ | |
282 | HEAP_SWAP(h->p[son], h->p[father], tmp) ; | |
283 | SET_OFFSET(h, son); | |
284 | son = father ; | |
285 | } | |
286 | SET_OFFSET(h, son); | |
287 | return 0 ; | |
1c79356b A |
288 | } |
289 | ||
290 | /* | |
9bccf70c | 291 | * remove top element from heap, or obj if obj != NULL |
1c79356b A |
292 | */ |
293 | static void | |
9bccf70c A |
294 | heap_extract(struct dn_heap *h, void *obj) |
295 | { | |
296 | int child, father, max = h->elements - 1 ; | |
297 | ||
298 | if (max < 0) { | |
299 | printf("warning, extract from empty heap 0x%p\n", h); | |
300 | return ; | |
301 | } | |
302 | father = 0 ; /* default: move up smallest child */ | |
303 | if (obj != NULL) { /* extract specific element, index is at offset */ | |
304 | if (h->offset <= 0) | |
305 | panic("*** heap_extract from middle not supported on this heap!!!\n"); | |
306 | father = *((int *)((char *)obj + h->offset)) ; | |
307 | if (father < 0 || father >= h->elements) { | |
308 | printf("dummynet: heap_extract, father %d out of bound 0..%d\n", | |
309 | father, h->elements); | |
310 | panic("heap_extract"); | |
311 | } | |
312 | } | |
313 | RESET_OFFSET(h, father); | |
314 | child = HEAP_LEFT(father) ; /* left child */ | |
315 | while (child <= max) { /* valid entry */ | |
316 | if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) | |
317 | child = child+1 ; /* take right child, otherwise left */ | |
318 | h->p[father] = h->p[child] ; | |
319 | SET_OFFSET(h, father); | |
320 | father = child ; | |
321 | child = HEAP_LEFT(child) ; /* left child for next loop */ | |
322 | } | |
323 | h->elements-- ; | |
324 | if (father != max) { | |
1c79356b | 325 | /* |
9bccf70c | 326 | * Fill hole with last entry and bubble up, reusing the insert code |
1c79356b | 327 | */ |
9bccf70c A |
328 | h->p[father] = h->p[max] ; |
329 | heap_insert(h, father, NULL); /* this one cannot fail */ | |
330 | } | |
331 | } | |
1c79356b | 332 | |
9bccf70c A |
333 | #if 0 |
334 | /* | |
335 | * change object position and update references | |
336 | * XXX this one is never used! | |
337 | */ | |
338 | static void | |
339 | heap_move(struct dn_heap *h, dn_key new_key, void *object) | |
340 | { | |
341 | int temp; | |
342 | int i ; | |
343 | int max = h->elements-1 ; | |
344 | struct dn_heap_entry buf ; | |
1c79356b | 345 | |
9bccf70c A |
346 | if (h->offset <= 0) |
347 | panic("cannot move items on this heap"); | |
348 | ||
349 | i = *((int *)((char *)object + h->offset)); | |
350 | if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */ | |
351 | h->p[i].key = new_key ; | |
352 | for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ; | |
353 | i = temp ) { /* bubble up */ | |
354 | HEAP_SWAP(h->p[i], h->p[temp], buf) ; | |
355 | SET_OFFSET(h, i); | |
356 | } | |
357 | } else { /* must move down */ | |
358 | h->p[i].key = new_key ; | |
359 | while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ | |
360 | if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key)) | |
361 | temp++ ; /* select child with min key */ | |
362 | if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */ | |
363 | HEAP_SWAP(h->p[i], h->p[temp], buf) ; | |
364 | SET_OFFSET(h, i); | |
365 | } else | |
366 | break ; | |
367 | i = temp ; | |
1c79356b | 368 | } |
1c79356b | 369 | } |
9bccf70c A |
370 | SET_OFFSET(h, i); |
371 | } | |
372 | #endif /* heap_move, unused */ | |
373 | ||
374 | /* | |
375 | * heapify() will reorganize data inside an array to maintain the | |
376 | * heap property. It is needed when we delete a bunch of entries. | |
377 | */ | |
378 | static void | |
379 | heapify(struct dn_heap *h) | |
380 | { | |
381 | int i ; | |
382 | ||
383 | for (i = 0 ; i < h->elements ; i++ ) | |
384 | heap_insert(h, i , NULL) ; | |
385 | } | |
386 | ||
387 | /* | |
388 | * cleanup the heap and free data structure | |
389 | */ | |
390 | static void | |
391 | heap_free(struct dn_heap *h) | |
392 | { | |
393 | if (h->size >0 ) | |
394 | FREE(h->p, M_IPFW); | |
395 | bzero(h, sizeof(*h) ); | |
396 | } | |
397 | ||
398 | /* | |
399 | * --- end of heap management functions --- | |
400 | */ | |
401 | ||
402 | /* | |
403 | * Scheduler functions: | |
404 | * | |
405 | * transmit_event() is called when the delay-line needs to enter | |
406 | * the scheduler, either because of existing pkts getting ready, | |
407 | * or new packets entering the queue. The event handled is the delivery | |
408 | * time of the packet. | |
409 | * | |
410 | * ready_event() does something similar with fixed-rate queues, and the | |
411 | * event handled is the finish time of the head pkt. | |
412 | * | |
413 | * wfq_ready_event() does something similar with WF2Q queues, and the | |
414 | * event handled is the start time of the head pkt. | |
415 | * | |
416 | * In all cases, we make sure that the data structures are consistent | |
417 | * before passing pkts out, because this might trigger recursive | |
418 | * invocations of the procedures. | |
419 | */ | |
420 | static void | |
421 | transmit_event(struct dn_pipe *pipe) | |
422 | { | |
423 | struct dn_pkt *pkt ; | |
1c79356b | 424 | |
9bccf70c | 425 | while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { |
1c79356b | 426 | /* |
9bccf70c A |
427 | * first unlink, then call procedures, since ip_input() can invoke |
428 | * ip_output() and viceversa, thus causing nested calls | |
1c79356b | 429 | */ |
9bccf70c | 430 | pipe->head = DN_NEXT(pkt) ; |
1c79356b A |
431 | |
432 | /* | |
9bccf70c A |
433 | * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf |
434 | * (NOT A REAL one, just a small block of malloc'ed memory) with | |
435 | * m_type = MT_DUMMYNET | |
436 | * m_next = actual mbuf to be processed by ip_input/output | |
437 | * m_data = the matching rule | |
438 | * and some other fields. | |
439 | * The block IS FREED HERE because it contains parameters passed | |
440 | * to the called routine. | |
1c79356b A |
441 | */ |
442 | switch (pkt->dn_dir) { | |
9bccf70c A |
443 | case DN_TO_IP_OUT: |
444 | (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); | |
445 | rt_unref (pkt->ro.ro_rt) ; | |
1c79356b | 446 | break ; |
9bccf70c | 447 | |
1c79356b A |
448 | case DN_TO_IP_IN : |
449 | ip_input((struct mbuf *)pkt) ; | |
450 | break ; | |
9bccf70c | 451 | |
1c79356b | 452 | #if BRIDGE |
9bccf70c A |
453 | case DN_TO_BDG_FWD : { |
454 | struct mbuf *m = (struct mbuf *)pkt ; | |
455 | struct ether_header *eh; | |
456 | ||
457 | if (pkt->dn_m->m_len < ETHER_HDR_LEN | |
458 | && (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) { | |
459 | printf("dummynet/bridge: pullup fail, dropping pkt\n"); | |
460 | break; | |
461 | } | |
462 | /* | |
463 | * same as ether_input, make eh be a pointer into the mbuf | |
464 | */ | |
465 | eh = mtod(pkt->dn_m, struct ether_header *); | |
466 | m_adj(pkt->dn_m, ETHER_HDR_LEN); | |
467 | /* | |
468 | * bdg_forward() wants a pointer to the pseudo-mbuf-header, but | |
469 | * on return it will supply the pointer to the actual packet | |
470 | * (originally pkt->dn_m, but could be something else now) if | |
471 | * it has not consumed it. | |
472 | */ | |
473 | m = bdg_forward(m, eh, pkt->ifp); | |
474 | if (m) | |
475 | m_freem(m); | |
476 | } | |
1c79356b A |
477 | break ; |
478 | #endif | |
9bccf70c | 479 | |
1c79356b A |
480 | default: |
481 | printf("dummynet: bad switch %d!\n", pkt->dn_dir); | |
482 | m_freem(pkt->dn_m); | |
1c79356b A |
483 | break ; |
484 | } | |
9bccf70c | 485 | FREE(pkt, M_IPFW); |
1c79356b | 486 | } |
9bccf70c A |
487 | /* if there are leftover packets, put into the heap for next event */ |
488 | if ( (pkt = pipe->head) ) | |
489 | heap_insert(&extract_heap, pkt->output_time, pipe ) ; | |
490 | /* XXX should check errors on heap_insert, by draining the | |
491 | * whole pipe p and hoping in the future we are more successful | |
492 | */ | |
1c79356b | 493 | } |
9bccf70c | 494 | |
1c79356b | 495 | /* |
9bccf70c A |
496 | * the following macro computes how many ticks we have to wait |
497 | * before being able to transmit a packet. The credit is taken from | |
498 | * either a pipe (WF2Q) or a flow_queue (per-flow queueing) | |
1c79356b | 499 | */ |
9bccf70c A |
500 | #define SET_TICKS(pkt, q, p) \ |
501 | (pkt->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + p->bandwidth - 1 ) / \ | |
502 | p->bandwidth ; | |
503 | ||
504 | /* | |
505 | * extract pkt from queue, compute output time (could be now) | |
506 | * and put into delay line (p_queue) | |
507 | */ | |
508 | static void | |
509 | move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, | |
510 | struct dn_pipe *p, int len) | |
1c79356b | 511 | { |
9bccf70c A |
512 | q->head = DN_NEXT(pkt) ; |
513 | q->len-- ; | |
514 | q->len_bytes -= len ; | |
1c79356b | 515 | |
9bccf70c | 516 | pkt->output_time = curr_time + p->delay ; |
1c79356b | 517 | |
9bccf70c A |
518 | if (p->head == NULL) |
519 | p->head = pkt; | |
520 | else | |
521 | DN_NEXT(p->tail) = pkt; | |
522 | p->tail = pkt; | |
523 | DN_NEXT(p->tail) = NULL; | |
1c79356b A |
524 | } |
525 | ||
526 | /* | |
9bccf70c A |
527 | * ready_event() is invoked every time the queue must enter the |
528 | * scheduler, either because the first packet arrives, or because | |
529 | * a previously scheduled event fired. | |
530 | * On invokation, drain as many pkts as possible (could be 0) and then | |
531 | * if there are leftover packets reinsert the pkt in the scheduler. | |
1c79356b | 532 | */ |
9bccf70c A |
533 | static void |
534 | ready_event(struct dn_flow_queue *q) | |
1c79356b A |
535 | { |
536 | struct dn_pkt *pkt; | |
9bccf70c A |
537 | struct dn_pipe *p = q->fs->pipe ; |
538 | int p_was_empty ; | |
1c79356b | 539 | |
9bccf70c A |
540 | if (p == NULL) { |
541 | printf("ready_event- pipe is gone\n"); | |
542 | return ; | |
543 | } | |
544 | p_was_empty = (p->head == NULL) ; | |
1c79356b | 545 | |
1c79356b | 546 | /* |
9bccf70c A |
547 | * schedule fixed-rate queues linked to this pipe: |
548 | * Account for the bw accumulated since last scheduling, then | |
549 | * drain as many pkts as allowed by q->numbytes and move to | |
550 | * the delay line (in p) computing output time. | |
551 | * bandwidth==0 (no limit) means we can drain the whole queue, | |
552 | * setting len_scaled = 0 does the job. | |
553 | */ | |
554 | q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth; | |
555 | while ( (pkt = q->head) != NULL ) { | |
556 | int len = pkt->dn_m->m_pkthdr.len; | |
557 | int len_scaled = p->bandwidth ? len*8*hz : 0 ; | |
558 | if (len_scaled > q->numbytes ) | |
559 | break ; | |
560 | q->numbytes -= len_scaled ; | |
561 | move_pkt(pkt, q, p, len); | |
562 | } | |
563 | /* | |
564 | * If we have more packets queued, schedule next ready event | |
565 | * (can only occur when bandwidth != 0, otherwise we would have | |
566 | * flushed the whole queue in the previous loop). | |
567 | * To this purpose we record the current time and compute how many | |
568 | * ticks to go for the finish time of the packet. | |
569 | */ | |
570 | if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */ | |
571 | dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */ | |
572 | q->sched_time = curr_time ; | |
573 | heap_insert(&ready_heap, curr_time + t, (void *)q ); | |
574 | /* XXX should check errors on heap_insert, and drain the whole | |
575 | * queue on error hoping next time we are luckier. | |
576 | */ | |
577 | } else /* RED needs to know when the queue becomes empty */ | |
578 | q->q_time = curr_time; | |
579 | /* | |
580 | * If the delay line was empty call transmit_event(p) now. | |
581 | * Otherwise, the scheduler will take care of it. | |
1c79356b | 582 | */ |
9bccf70c A |
583 | if (p_was_empty) |
584 | transmit_event(p); | |
585 | } | |
1c79356b | 586 | |
9bccf70c A |
587 | /* |
588 | * Called when we can transmit packets on WF2Q queues. Take pkts out of | |
589 | * the queues at their start time, and enqueue into the delay line. | |
590 | * Packets are drained until p->numbytes < 0. As long as | |
591 | * len_scaled >= p->numbytes, the packet goes into the delay line | |
592 | * with a deadline p->delay. For the last packet, if p->numbytes<0, | |
593 | * there is an additional delay. | |
594 | */ | |
595 | static void | |
596 | ready_event_wfq(struct dn_pipe *p) | |
597 | { | |
598 | int p_was_empty = (p->head == NULL) ; | |
599 | struct dn_heap *sch = &(p->scheduler_heap); | |
600 | struct dn_heap *neh = &(p->not_eligible_heap) ; | |
601 | ||
602 | if (p->if_name[0] == 0) /* tx clock is simulated */ | |
603 | p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; | |
604 | else { /* tx clock is for real, the ifq must be empty or this is a NOP */ | |
605 | if (p->ifp && p->ifp->if_snd.ifq_head != NULL) | |
606 | return ; | |
607 | else { | |
608 | DEB(printf("pipe %d ready from %s --\n", | |
609 | p->pipe_nr, p->if_name);) | |
610 | } | |
1c79356b | 611 | } |
9bccf70c | 612 | |
1c79356b | 613 | /* |
9bccf70c A |
614 | * While we have backlogged traffic AND credit, we need to do |
615 | * something on the queue. | |
1c79356b | 616 | */ |
9bccf70c A |
617 | while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { |
618 | if (sch->elements > 0) { /* have some eligible pkts to send out */ | |
619 | struct dn_flow_queue *q = sch->p[0].object ; | |
620 | struct dn_pkt *pkt = q->head; | |
621 | struct dn_flow_set *fs = q->fs; | |
622 | u_int64_t len = pkt->dn_m->m_pkthdr.len; | |
623 | int len_scaled = p->bandwidth ? len*8*hz : 0 ; | |
1c79356b | 624 | |
9bccf70c A |
625 | heap_extract(sch, NULL); /* remove queue from heap */ |
626 | p->numbytes -= len_scaled ; | |
627 | move_pkt(pkt, q, p, len); | |
628 | ||
629 | p->V += (len<<MY_M) / p->sum ; /* update V */ | |
630 | q->S = q->F ; /* update start time */ | |
631 | if (q->len == 0) { /* Flow not backlogged any more */ | |
632 | fs->backlogged-- ; | |
633 | heap_insert(&(p->idle_heap), q->F, q); | |
634 | } else { /* still backlogged */ | |
635 | /* | |
636 | * update F and position in backlogged queue, then | |
637 | * put flow in not_eligible_heap (we will fix this later). | |
638 | */ | |
639 | len = (q->head)->dn_m->m_pkthdr.len; | |
640 | q->F += (len<<MY_M)/(u_int64_t) fs->weight ; | |
641 | if (DN_KEY_LEQ(q->S, p->V)) | |
642 | heap_insert(neh, q->S, q); | |
643 | else | |
644 | heap_insert(sch, q->F, q); | |
645 | } | |
646 | } | |
647 | /* | |
648 | * now compute V = max(V, min(S_i)). Remember that all elements in sch | |
649 | * have by definition S_i <= V so if sch is not empty, V is surely | |
650 | * the max and we must not update it. Conversely, if sch is empty | |
651 | * we only need to look at neh. | |
652 | */ | |
653 | if (sch->elements == 0 && neh->elements > 0) | |
654 | p->V = MAX64 ( p->V, neh->p[0].key ); | |
655 | /* move from neh to sch any packets that have become eligible */ | |
656 | while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { | |
657 | struct dn_flow_queue *q = neh->p[0].object ; | |
658 | heap_extract(neh, NULL); | |
659 | heap_insert(sch, q->F, q); | |
660 | } | |
661 | ||
662 | if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ | |
663 | p->numbytes = -1 ; /* mark not ready for I/O */ | |
664 | break ; | |
665 | } | |
1c79356b | 666 | } |
9bccf70c A |
667 | if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0 |
668 | && p->idle_heap.elements > 0) { | |
669 | /* | |
670 | * no traffic and no events scheduled. We can get rid of idle-heap. | |
671 | */ | |
672 | int i ; | |
673 | ||
674 | for (i = 0 ; i < p->idle_heap.elements ; i++) { | |
675 | struct dn_flow_queue *q = p->idle_heap.p[i].object ; | |
1c79356b | 676 | |
9bccf70c A |
677 | q->F = 0 ; |
678 | q->S = q->F + 1 ; | |
679 | } | |
680 | p->sum = 0 ; | |
681 | p->V = 0 ; | |
682 | p->idle_heap.elements = 0 ; | |
683 | } | |
684 | /* | |
685 | * If we are getting clocks from dummynet (not a real interface) and | |
686 | * If we are under credit, schedule the next ready event. | |
687 | * Also fix the delivery time of the last packet. | |
1c79356b | 688 | */ |
9bccf70c A |
689 | if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ |
690 | dn_key t=0 ; /* number of ticks i have to wait */ | |
1c79356b | 691 | |
9bccf70c A |
692 | if (p->bandwidth > 0) |
693 | t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; | |
694 | p->tail->output_time += t ; | |
695 | p->sched_time = curr_time ; | |
696 | heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); | |
697 | /* XXX should check errors on heap_insert, and drain the whole | |
698 | * queue on error hoping next time we are luckier. | |
699 | */ | |
1c79356b | 700 | } |
9bccf70c A |
701 | /* |
702 | * If the delay line was empty call transmit_event(p) now. | |
703 | * Otherwise, the scheduler will take care of it. | |
704 | */ | |
705 | if (p_was_empty) | |
706 | transmit_event(p); | |
1c79356b A |
707 | } |
708 | ||
709 | /* | |
9bccf70c A |
710 | * This is called once per tick, or HZ times per second. It is used to |
711 | * increment the current tick counter and schedule expired events. | |
1c79356b A |
712 | */ |
713 | static void | |
9bccf70c | 714 | dummynet(void * __unused unused) |
1c79356b | 715 | { |
9bccf70c A |
716 | void *p ; /* generic parameter to handler */ |
717 | struct dn_heap *h ; | |
718 | int s ; | |
719 | struct dn_heap *heaps[3]; | |
720 | int i; | |
721 | struct dn_pipe *pe ; | |
1c79356b | 722 | |
9bccf70c A |
723 | heaps[0] = &ready_heap ; /* fixed-rate queues */ |
724 | heaps[1] = &wfq_ready_heap ; /* wfq queues */ | |
725 | heaps[2] = &extract_heap ; /* delay line */ | |
726 | s = splimp(); /* see note on top, splnet() is not enough */ | |
727 | curr_time++ ; | |
728 | for (i=0; i < 3 ; i++) { | |
729 | h = heaps[i]; | |
730 | while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { | |
731 | DDB(if (h->p[0].key > curr_time) | |
732 | printf("-- dummynet: warning, heap %d is %d ticks late\n", | |
733 | i, (int)(curr_time - h->p[0].key));) | |
734 | p = h->p[0].object ; /* store a copy before heap_extract */ | |
735 | heap_extract(h, NULL); /* need to extract before processing */ | |
736 | if (i == 0) | |
737 | ready_event(p) ; | |
738 | else if (i == 1) { | |
739 | struct dn_pipe *pipe = p; | |
740 | if (pipe->if_name[0] != '\0') | |
741 | printf("*** bad ready_event_wfq for pipe %s\n", | |
742 | pipe->if_name); | |
743 | else | |
744 | ready_event_wfq(p) ; | |
745 | } else | |
746 | transmit_event(p); | |
747 | } | |
1c79356b | 748 | } |
9bccf70c A |
749 | /* sweep pipes trying to expire idle flow_queues */ |
750 | for (pe = all_pipes; pe ; pe = pe->next ) | |
751 | if (pe->idle_heap.elements > 0 && | |
752 | DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) { | |
753 | struct dn_flow_queue *q = pe->idle_heap.p[0].object ; | |
1c79356b | 754 | |
9bccf70c A |
755 | heap_extract(&(pe->idle_heap), NULL); |
756 | q->S = q->F + 1 ; /* mark timestamp as invalid */ | |
757 | pe->sum -= q->fs->weight ; | |
758 | } | |
759 | splx(s); | |
760 | timeout(dummynet, NULL, 1); | |
761 | } | |
762 | ||
1c79356b | 763 | /* |
9bccf70c | 764 | * called by an interface when tx_rdy occurs. |
1c79356b | 765 | */ |
9bccf70c A |
766 | int |
767 | if_tx_rdy(struct ifnet *ifp) | |
1c79356b | 768 | { |
9bccf70c | 769 | struct dn_pipe *p; |
1c79356b | 770 | |
9bccf70c A |
771 | for (p = all_pipes; p ; p = p->next ) |
772 | if (p->ifp == ifp) | |
773 | break ; | |
774 | if (p == NULL) { | |
775 | char buf[32]; | |
776 | sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); | |
777 | for (p = all_pipes; p ; p = p->next ) | |
778 | if (!strcmp(p->if_name, buf) ) { | |
779 | p->ifp = ifp ; | |
780 | DEB(printf("++ tx rdy from %s (now found)\n", buf);) | |
781 | break ; | |
782 | } | |
783 | } | |
784 | if (p != NULL) { | |
785 | DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, | |
786 | ifp->if_unit, ifp->if_snd.ifq_len);) | |
787 | p->numbytes = 0 ; /* mark ready for I/O */ | |
788 | ready_event_wfq(p); | |
1c79356b | 789 | } |
9bccf70c | 790 | return 0; |
1c79356b A |
791 | } |
792 | ||
1c79356b | 793 | /* |
9bccf70c A |
794 | * Unconditionally expire empty queues in case of shortage. |
795 | * Returns the number of queues freed. | |
1c79356b | 796 | */ |
9bccf70c A |
797 | static int |
798 | expire_queues(struct dn_flow_set *fs) | |
1c79356b | 799 | { |
9bccf70c A |
800 | struct dn_flow_queue *q, *prev ; |
801 | int i, initial_elements = fs->rq_elements ; | |
1c79356b | 802 | |
9bccf70c A |
803 | if (fs->last_expired == time_second) |
804 | return 0 ; | |
805 | fs->last_expired = time_second ; | |
806 | for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */ | |
807 | for (prev=NULL, q = fs->rq[i] ; q != NULL ; ) | |
808 | if (q->head != NULL || q->S != q->F+1) { | |
809 | prev = q ; | |
810 | q = q->next ; | |
811 | } else { /* entry is idle, expire it */ | |
812 | struct dn_flow_queue *old_q = q ; | |
813 | ||
814 | if (prev != NULL) | |
815 | prev->next = q = q->next ; | |
816 | else | |
817 | fs->rq[i] = q = q->next ; | |
818 | fs->rq_elements-- ; | |
819 | FREE(old_q, M_IPFW); | |
1c79356b | 820 | } |
9bccf70c | 821 | return initial_elements - fs->rq_elements ; |
1c79356b A |
822 | } |
823 | ||
824 | /* | |
9bccf70c A |
825 | * If room, create a new queue and put at head of slot i; |
826 | * otherwise, create or use the default queue. | |
1c79356b | 827 | */ |
9bccf70c A |
828 | static struct dn_flow_queue * |
829 | create_queue(struct dn_flow_set *fs, int i) | |
1c79356b | 830 | { |
9bccf70c | 831 | struct dn_flow_queue *q ; |
1c79356b | 832 | |
9bccf70c A |
833 | if (fs->rq_elements > fs->rq_size * dn_max_ratio && |
834 | expire_queues(fs) == 0) { | |
835 | /* | |
836 | * No way to get room, use or create overflow queue. | |
837 | */ | |
838 | i = fs->rq_size ; | |
839 | if ( fs->rq[i] != NULL ) | |
840 | return fs->rq[i] ; | |
841 | } | |
842 | q = _MALLOC(sizeof(*q), M_IPFW, M_DONTWAIT) ; | |
843 | if (q == NULL) { | |
844 | printf("sorry, cannot allocate queue for new flow\n"); | |
845 | return NULL ; | |
846 | } | |
847 | bzero(q, sizeof(*q) ); /* needed */ | |
848 | q->fs = fs ; | |
849 | q->hash_slot = i ; | |
850 | q->next = fs->rq[i] ; | |
851 | q->S = q->F + 1; /* hack - mark timestamp as invalid */ | |
852 | fs->rq[i] = q ; | |
853 | fs->rq_elements++ ; | |
854 | return q ; | |
855 | } | |
1c79356b | 856 | |
9bccf70c A |
857 | /* |
858 | * Given a flow_set and a pkt in last_pkt, find a matching queue | |
859 | * after appropriate masking. The queue is moved to front | |
860 | * so that further searches take less time. | |
861 | */ | |
862 | static struct dn_flow_queue * | |
863 | find_queue(struct dn_flow_set *fs) | |
864 | { | |
865 | int i = 0 ; /* we need i and q for new allocations */ | |
866 | struct dn_flow_queue *q, *prev; | |
1c79356b | 867 | |
9bccf70c A |
868 | if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) ) |
869 | q = fs->rq[0] ; | |
870 | else { | |
871 | /* first, do the masking */ | |
872 | last_pkt.dst_ip &= fs->flow_mask.dst_ip ; | |
873 | last_pkt.src_ip &= fs->flow_mask.src_ip ; | |
874 | last_pkt.dst_port &= fs->flow_mask.dst_port ; | |
875 | last_pkt.src_port &= fs->flow_mask.src_port ; | |
876 | last_pkt.proto &= fs->flow_mask.proto ; | |
877 | last_pkt.flags = 0 ; /* we don't care about this one */ | |
878 | /* then, hash function */ | |
879 | i = ( (last_pkt.dst_ip) & 0xffff ) ^ | |
880 | ( (last_pkt.dst_ip >> 15) & 0xffff ) ^ | |
881 | ( (last_pkt.src_ip << 1) & 0xffff ) ^ | |
882 | ( (last_pkt.src_ip >> 16 ) & 0xffff ) ^ | |
883 | (last_pkt.dst_port << 1) ^ (last_pkt.src_port) ^ | |
884 | (last_pkt.proto ); | |
885 | i = i % fs->rq_size ; | |
886 | /* finally, scan the current list for a match */ | |
887 | searches++ ; | |
888 | for (prev=NULL, q = fs->rq[i] ; q ; ) { | |
889 | search_steps++; | |
890 | if (bcmp(&last_pkt, &(q->id), sizeof(q->id) ) == 0) | |
891 | break ; /* found */ | |
892 | else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) { | |
893 | /* entry is idle and not in any heap, expire it */ | |
894 | struct dn_flow_queue *old_q = q ; | |
1c79356b | 895 | |
9bccf70c A |
896 | if (prev != NULL) |
897 | prev->next = q = q->next ; | |
898 | else | |
899 | fs->rq[i] = q = q->next ; | |
900 | fs->rq_elements-- ; | |
901 | FREE(old_q, M_IPFW); | |
902 | continue ; | |
903 | } | |
904 | prev = q ; | |
905 | q = q->next ; | |
1c79356b | 906 | } |
9bccf70c A |
907 | if (q && prev != NULL) { /* found and not in front */ |
908 | prev->next = q->next ; | |
909 | q->next = fs->rq[i] ; | |
910 | fs->rq[i] = q ; | |
1c79356b | 911 | } |
9bccf70c A |
912 | } |
913 | if (q == NULL) { /* no match, need to allocate a new entry */ | |
914 | q = create_queue(fs, i); | |
915 | if (q != NULL) | |
916 | q->id = last_pkt ; | |
917 | } | |
918 | return q ; | |
919 | } | |
920 | ||
921 | static int | |
922 | red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len) | |
923 | { | |
924 | /* | |
925 | * RED algorithm | |
926 | * | |
927 | * RED calculates the average queue size (avg) using a low-pass filter | |
928 | * with an exponential weighted (w_q) moving average: | |
929 | * avg <- (1-w_q) * avg + w_q * q_size | |
930 | * where q_size is the queue length (measured in bytes or * packets). | |
931 | * | |
932 | * If q_size == 0, we compute the idle time for the link, and set | |
933 | * avg = (1 - w_q)^(idle/s) | |
934 | * where s is the time needed for transmitting a medium-sized packet. | |
935 | * | |
936 | * Now, if avg < min_th the packet is enqueued. | |
937 | * If avg > max_th the packet is dropped. Otherwise, the packet is | |
938 | * dropped with probability P function of avg. | |
939 | * | |
940 | */ | |
941 | ||
942 | int64_t p_b = 0; | |
943 | /* queue in bytes or packets ? */ | |
944 | u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; | |
945 | ||
946 | DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);) | |
947 | ||
948 | /* average queue size estimation */ | |
949 | if (q_size != 0) { | |
950 | /* | |
951 | * queue is not empty, avg <- avg + (q_size - avg) * w_q | |
952 | */ | |
953 | int diff = SCALE(q_size) - q->avg; | |
954 | int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); | |
955 | ||
956 | q->avg += (int) v; | |
957 | } else { | |
958 | /* | |
959 | * queue is empty, find for how long the queue has been | |
960 | * empty and use a lookup table for computing | |
961 | * (1 - * w_q)^(idle_time/s) where s is the time to send a | |
962 | * (small) packet. | |
963 | * XXX check wraps... | |
964 | */ | |
965 | if (q->avg) { | |
966 | u_int t = (curr_time - q->q_time) / fs->lookup_step; | |
967 | ||
968 | q->avg = (t < fs->lookup_depth) ? | |
969 | SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; | |
970 | } | |
971 | } | |
972 | DEB(printf("avg: %u ", SCALE_VAL(q->avg));) | |
973 | ||
974 | /* should i drop ? */ | |
975 | ||
976 | if (q->avg < fs->min_th) { | |
977 | q->count = -1; | |
978 | return 0; /* accept packet ; */ | |
979 | } | |
980 | if (q->avg >= fs->max_th) { /* average queue >= max threshold */ | |
981 | if (fs->flags_fs & DN_IS_GENTLE_RED) { | |
1c79356b | 982 | /* |
9bccf70c A |
983 | * According to Gentle-RED, if avg is greater than max_th the |
984 | * packet is dropped with a probability | |
985 | * p_b = c_3 * avg - c_4 | |
986 | * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p | |
1c79356b | 987 | */ |
9bccf70c A |
988 | p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; |
989 | } else { | |
990 | q->count = -1; | |
991 | printf("- drop"); | |
992 | return 1 ; | |
993 | } | |
994 | } else if (q->avg > fs->min_th) { | |
995 | /* | |
996 | * we compute p_b using the linear dropping function p_b = c_1 * | |
997 | * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = | |
998 | * max_p * min_th / (max_th - min_th) | |
999 | */ | |
1000 | p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; | |
1001 | } | |
1002 | if (fs->flags_fs & DN_QSIZE_IS_BYTES) | |
1003 | p_b = (p_b * len) / fs->max_pkt_size; | |
1004 | if (++q->count == 0) | |
1005 | q->random = random() & 0xffff; | |
1006 | else { | |
1007 | /* | |
1008 | * q->count counts packets arrived since last drop, so a greater | |
1009 | * value of q->count means a greater packet drop probability. | |
1010 | */ | |
1011 | if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { | |
1012 | q->count = 0; | |
1013 | DEB(printf("- red drop");) | |
1014 | /* after a drop we calculate a new random value */ | |
1015 | q->random = random() & 0xffff; | |
1016 | return 1; /* drop */ | |
1017 | } | |
1018 | } | |
1019 | /* end of RED algorithm */ | |
1020 | return 0 ; /* accept */ | |
1021 | } | |
1022 | ||
1023 | static __inline | |
1024 | struct dn_flow_set * | |
1025 | locate_flowset(int pipe_nr, struct ip_fw_chain *rule) | |
1026 | { | |
1027 | struct dn_flow_set *fs = NULL ; | |
1028 | ||
1029 | if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE ) | |
1030 | for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) | |
1031 | ; | |
1032 | else { | |
1033 | struct dn_pipe *p1; | |
1034 | for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) | |
1035 | ; | |
1036 | if (p1 != NULL) | |
1037 | fs = &(p1->fs) ; | |
1038 | } | |
1039 | if (fs != NULL) | |
1040 | rule->rule->pipe_ptr = fs ; /* record for the future */ | |
1041 | return fs ; | |
1042 | } | |
1043 | ||
1044 | /* | |
1045 | * dummynet hook for packets. Below 'pipe' is a pipe or a queue | |
1046 | * depending on whether WF2Q or fixed bw is used. | |
1047 | */ | |
1048 | int | |
1049 | dummynet_io(int pipe_nr, int dir, /* pipe_nr can also be a fs_nr */ | |
1050 | struct mbuf *m, struct ifnet *ifp, struct route *ro, | |
1051 | struct sockaddr_in *dst, | |
1052 | struct ip_fw_chain *rule, int flags) | |
1053 | { | |
1054 | struct dn_pkt *pkt; | |
1055 | struct dn_flow_set *fs; | |
1056 | struct dn_pipe *pipe ; | |
1057 | u_int64_t len = m->m_pkthdr.len ; | |
1058 | struct dn_flow_queue *q = NULL ; | |
1059 | int s ; | |
1060 | ||
1061 | s = splimp(); | |
1062 | ||
1063 | pipe_nr &= 0xffff ; | |
1064 | ||
1065 | if ( (fs = rule->rule->pipe_ptr) == NULL ) { | |
1066 | fs = locate_flowset(pipe_nr, rule); | |
1067 | if (fs == NULL) | |
1068 | goto dropit ; /* this queue/pipe does not exist! */ | |
1069 | } | |
1070 | pipe = fs->pipe ; | |
1071 | if (pipe == NULL) { /* must be a queue, try find a matching pipe */ | |
1072 | for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; | |
1073 | pipe = pipe->next) | |
1074 | ; | |
1075 | if (pipe != NULL) | |
1076 | fs->pipe = pipe ; | |
1077 | else { | |
1078 | printf("No pipe %d for queue %d, drop pkt\n", | |
1079 | fs->parent_nr, fs->fs_nr); | |
1080 | goto dropit ; | |
1081 | } | |
1082 | } | |
1083 | q = find_queue(fs); | |
1084 | if ( q == NULL ) | |
1085 | goto dropit ; /* cannot allocate queue */ | |
1086 | /* | |
1087 | * update statistics, then check reasons to drop pkt | |
1088 | */ | |
1089 | q->tot_bytes += len ; | |
1090 | q->tot_pkts++ ; | |
1091 | if ( fs->plr && random() < fs->plr ) | |
1092 | goto dropit ; /* random pkt drop */ | |
1093 | if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { | |
1094 | if (q->len_bytes > fs->qsize) | |
1095 | goto dropit ; /* queue size overflow */ | |
1096 | } else { | |
1097 | if (q->len >= fs->qsize) | |
1098 | goto dropit ; /* queue count overflow */ | |
1099 | } | |
1100 | if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) | |
1101 | goto dropit ; | |
1102 | ||
1103 | pkt = (struct dn_pkt *)_MALLOC(sizeof (*pkt), M_IPFW, M_NOWAIT) ; | |
1104 | if ( pkt == NULL ) | |
1105 | goto dropit ; /* cannot allocate packet header */ | |
1106 | /* ok, i can handle the pkt now... */ | |
1107 | bzero(pkt, sizeof(*pkt) ); /* XXX expensive, see if we can remove it*/ | |
1108 | /* build and enqueue packet + parameters */ | |
1109 | pkt->hdr.mh_type = MT_DUMMYNET ; | |
1110 | (struct ip_fw_chain *)pkt->hdr.mh_data = rule ; | |
1111 | DN_NEXT(pkt) = NULL; | |
1112 | pkt->dn_m = m; | |
1113 | pkt->dn_dir = dir ; | |
1114 | ||
1115 | pkt->ifp = ifp; | |
1116 | if (dir == DN_TO_IP_OUT) { | |
1117 | /* | |
1118 | * We need to copy *ro because for ICMP pkts (and maybe others) | |
1119 | * the caller passed a pointer into the stack; dst might also be | |
1120 | * a pointer into *ro so it needs to be updated. | |
1121 | */ | |
1122 | pkt->ro = *ro; | |
1123 | if (ro->ro_rt) | |
1124 | rtref(ro->ro_rt); | |
1125 | if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */ | |
1126 | dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; | |
1127 | ||
1128 | pkt->dn_dst = dst; | |
1129 | pkt->flags = flags ; | |
1130 | } | |
1131 | if (q->head == NULL) | |
1132 | q->head = pkt; | |
1133 | else | |
1134 | DN_NEXT(q->tail) = pkt; | |
1135 | q->tail = pkt; | |
1136 | q->len++; | |
1137 | q->len_bytes += len ; | |
1138 | ||
1139 | if ( q->head != pkt ) /* flow was not idle, we are done */ | |
1140 | goto done; | |
1141 | /* | |
1142 | * If we reach this point the flow was previously idle, so we need | |
1143 | * to schedule it. This involves different actions for fixed-rate or | |
1144 | * WF2Q queues. | |
1145 | */ | |
1146 | if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) { | |
1147 | /* | |
1148 | * Fixed-rate queue: just insert into the ready_heap. | |
1149 | */ | |
1150 | dn_key t = 0 ; | |
1151 | if (pipe->bandwidth) | |
1152 | t = SET_TICKS(pkt, q, pipe); | |
1153 | q->sched_time = curr_time ; | |
1154 | if (t == 0) /* must process it now */ | |
1155 | ready_event( q ); | |
1156 | else | |
1157 | heap_insert(&ready_heap, curr_time + t , q ); | |
1158 | } else { | |
1159 | /* | |
1160 | * WF2Q. First, compute start time S: if the flow was idle (S=F+1) | |
1161 | * set S to the virtual time V for the controlling pipe, and update | |
1162 | * the sum of weights for the pipe; otherwise, remove flow from | |
1163 | * idle_heap and set S to max(F,V). | |
1164 | * Second, compute finish time F = S + len/weight. | |
1165 | * Third, if pipe was idle, update V=max(S, V). | |
1166 | * Fourth, count one more backlogged flow. | |
1167 | */ | |
1168 | if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */ | |
1169 | q->S = pipe->V ; | |
1170 | pipe->sum += fs->weight ; /* add weight of new queue */ | |
1171 | } else { | |
1172 | heap_extract(&(pipe->idle_heap), q); | |
1173 | q->S = MAX64(q->F, pipe->V ) ; | |
1174 | } | |
1175 | q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight; | |
1176 | ||
1177 | if (pipe->not_eligible_heap.elements == 0 && | |
1178 | pipe->scheduler_heap.elements == 0) | |
1179 | pipe->V = MAX64 ( q->S, pipe->V ); | |
1180 | fs->backlogged++ ; | |
1181 | /* | |
1182 | * Look at eligibility. A flow is not eligibile if S>V (when | |
1183 | * this happens, it means that there is some other flow already | |
1184 | * scheduled for the same pipe, so the scheduler_heap cannot be | |
1185 | * empty). If the flow is not eligible we just store it in the | |
1186 | * not_eligible_heap. Otherwise, we store in the scheduler_heap | |
1187 | * and possibly invoke ready_event_wfq() right now if there is | |
1188 | * leftover credit. | |
1189 | * Note that for all flows in scheduler_heap (SCH), S_i <= V, | |
1190 | * and for all flows in not_eligible_heap (NEH), S_i > V . | |
1191 | * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, | |
1192 | * we only need to look into NEH. | |
1193 | */ | |
1194 | if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ | |
1195 | if (pipe->scheduler_heap.elements == 0) | |
1196 | printf("++ ouch! not eligible but empty scheduler!\n"); | |
1197 | heap_insert(&(pipe->not_eligible_heap), q->S, q); | |
1198 | } else { | |
1199 | heap_insert(&(pipe->scheduler_heap), q->F, q); | |
1200 | if (pipe->numbytes >= 0) { /* pipe is idle */ | |
1201 | if (pipe->scheduler_heap.elements != 1) | |
1202 | printf("*** OUCH! pipe should have been idle!\n"); | |
1203 | DEB(printf("Waking up pipe %d at %d\n", | |
1204 | pipe->pipe_nr, (int)(q->F >> MY_M)); ) | |
1205 | pipe->sched_time = curr_time ; | |
1206 | ready_event_wfq(pipe); | |
1c79356b | 1207 | } |
9bccf70c A |
1208 | } |
1209 | } | |
1210 | done: | |
1211 | splx(s); | |
1212 | return 0; | |
1213 | ||
1214 | dropit: | |
1215 | splx(s); | |
1216 | if (q) | |
1217 | q->drops++ ; | |
1218 | m_freem(m); | |
1219 | return ENOBUFS ; | |
1220 | } | |
1221 | ||
1222 | /* | |
1223 | * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) | |
1224 | * Doing this would probably save us the initial bzero of dn_pkt | |
1225 | */ | |
1226 | #define DN_FREE_PKT(pkt) { \ | |
1227 | struct dn_pkt *n = pkt ; \ | |
1228 | rt_unref ( n->ro.ro_rt ) ; \ | |
1229 | m_freem(n->dn_m); \ | |
1230 | pkt = DN_NEXT(n) ; \ | |
1231 | FREE(n, M_IPFW) ; } | |
1232 | ||
1233 | /* | |
1234 | * Dispose all packets and flow_queues on a flow_set. | |
1235 | * If all=1, also remove red lookup table and other storage, | |
1236 | * including the descriptor itself. | |
1237 | * For the one in dn_pipe MUST also cleanup ready_heap... | |
1238 | */ | |
1239 | static void | |
1240 | purge_flow_set(struct dn_flow_set *fs, int all) | |
1241 | { | |
1242 | struct dn_pkt *pkt ; | |
1243 | struct dn_flow_queue *q, *qn ; | |
1244 | int i ; | |
1245 | ||
1246 | for (i = 0 ; i <= fs->rq_size ; i++ ) { | |
1247 | for (q = fs->rq[i] ; q ; q = qn ) { | |
1248 | for (pkt = q->head ; pkt ; ) | |
1249 | DN_FREE_PKT(pkt) ; | |
1250 | qn = q->next ; | |
1251 | FREE(q, M_IPFW); | |
1252 | } | |
1253 | fs->rq[i] = NULL ; | |
1254 | } | |
1255 | fs->rq_elements = 0 ; | |
1256 | if (all) { | |
1257 | /* RED - free lookup table */ | |
1258 | if (fs->w_q_lookup) | |
1259 | FREE(fs->w_q_lookup, M_IPFW); | |
1260 | if (fs->rq) | |
1261 | FREE(fs->rq, M_IPFW); | |
1262 | /* if this fs is not part of a pipe, free it */ | |
1263 | if (fs->pipe && fs != &(fs->pipe->fs) ) | |
1264 | FREE(fs, M_IPFW); | |
1265 | } | |
1266 | } | |
1267 | ||
1268 | /* | |
1269 | * Dispose all packets queued on a pipe (not a flow_set). | |
1270 | * Also free all resources associated to a pipe, which is about | |
1271 | * to be deleted. | |
1272 | */ | |
1273 | static void | |
1274 | purge_pipe(struct dn_pipe *pipe) | |
1275 | { | |
1276 | struct dn_pkt *pkt ; | |
1277 | ||
1278 | purge_flow_set( &(pipe->fs), 1 ); | |
1279 | ||
1280 | for (pkt = pipe->head ; pkt ; ) | |
1281 | DN_FREE_PKT(pkt) ; | |
1282 | ||
1283 | heap_free( &(pipe->scheduler_heap) ); | |
1284 | heap_free( &(pipe->not_eligible_heap) ); | |
1285 | heap_free( &(pipe->idle_heap) ); | |
1286 | } | |
1287 | ||
1288 | /* | |
1289 | * Delete all pipes and heaps returning memory. Must also | |
1290 | * remove references from all ipfw rules to all pipes. | |
1291 | */ | |
1292 | static void | |
1293 | dummynet_flush() | |
1294 | { | |
1295 | struct dn_pipe *curr_p, *p ; | |
1296 | struct ip_fw_chain *chain ; | |
1297 | struct dn_flow_set *fs, *curr_fs; | |
1298 | int s ; | |
1299 | ||
1300 | s = splimp() ; | |
1301 | ||
1302 | /* remove all references to pipes ...*/ | |
1303 | LIST_FOREACH(chain, &ip_fw_chain_head, next) | |
1304 | chain->rule->pipe_ptr = NULL ; | |
1305 | /* prevent future matches... */ | |
1306 | p = all_pipes ; | |
1307 | all_pipes = NULL ; | |
1308 | fs = all_flow_sets ; | |
1309 | all_flow_sets = NULL ; | |
1310 | /* and free heaps so we don't have unwanted events */ | |
1311 | heap_free(&ready_heap); | |
1312 | heap_free(&wfq_ready_heap); | |
1313 | heap_free(&extract_heap); | |
1314 | splx(s) ; | |
1315 | /* | |
1316 | * Now purge all queued pkts and delete all pipes | |
1317 | */ | |
1318 | /* scan and purge all flow_sets. */ | |
1319 | for ( ; fs ; ) { | |
1320 | curr_fs = fs ; | |
1321 | fs = fs->next ; | |
1322 | purge_flow_set(curr_fs, 1); | |
1323 | } | |
1324 | for ( ; p ; ) { | |
1325 | purge_pipe(p); | |
1326 | curr_p = p ; | |
1327 | p = p->next ; | |
1328 | FREE(q, M_IPFW); | |
1329 | } | |
1330 | } | |
1331 | ||
1332 | ||
1333 | extern struct ip_fw_chain *ip_fw_default_rule ; | |
1334 | static void | |
1335 | dn_rule_delete_fs(struct dn_flow_set *fs, void *r) | |
1336 | { | |
1337 | int i ; | |
1338 | struct dn_flow_queue *q ; | |
1339 | struct dn_pkt *pkt ; | |
1340 | ||
1341 | for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */ | |
1342 | for (q = fs->rq[i] ; q ; q = q->next ) | |
1343 | for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) ) | |
1344 | if (pkt->hdr.mh_data == r) | |
1345 | pkt->hdr.mh_data = (void *)ip_fw_default_rule ; | |
1346 | } | |
1347 | /* | |
1348 | * when a firewall rule is deleted, scan all queues and remove the flow-id | |
1349 | * from packets matching this rule. | |
1350 | */ | |
1351 | void | |
1352 | dn_rule_delete(void *r) | |
1353 | { | |
1354 | struct dn_pipe *p ; | |
1355 | struct dn_pkt *pkt ; | |
1356 | struct dn_flow_set *fs ; | |
1357 | ||
1358 | /* | |
1359 | * If the rule references a queue (dn_flow_set), then scan | |
1360 | * the flow set, otherwise scan pipes. Should do either, but doing | |
1361 | * both does not harm. | |
1362 | */ | |
1363 | for ( fs = all_flow_sets ; fs ; fs = fs->next ) | |
1364 | dn_rule_delete_fs(fs, r); | |
1365 | for ( p = all_pipes ; p ; p = p->next ) { | |
1366 | fs = &(p->fs) ; | |
1367 | dn_rule_delete_fs(fs, r); | |
1368 | for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) ) | |
1369 | if (pkt->hdr.mh_data == r) | |
1370 | pkt->hdr.mh_data = (void *)ip_fw_default_rule ; | |
1371 | } | |
1372 | } | |
1373 | ||
1374 | /* | |
1375 | * setup RED parameters | |
1376 | */ | |
1377 | static int | |
1378 | config_red(struct dn_flow_set *p, struct dn_flow_set * x) | |
1379 | { | |
1380 | int i; | |
1381 | ||
1382 | x->w_q = p->w_q; | |
1383 | x->min_th = SCALE(p->min_th); | |
1384 | x->max_th = SCALE(p->max_th); | |
1385 | x->max_p = p->max_p; | |
1386 | ||
1387 | x->c_1 = p->max_p / (p->max_th - p->min_th); | |
1388 | x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); | |
1389 | if (x->flags_fs & DN_IS_GENTLE_RED) { | |
1390 | x->c_3 = (SCALE(1) - p->max_p) / p->max_th; | |
1391 | x->c_4 = (SCALE(1) - 2 * p->max_p); | |
1392 | } | |
1393 | ||
1394 | /* if the lookup table already exist, free and create it again */ | |
1395 | if (x->w_q_lookup) | |
1396 | FREE(x->w_q_lookup, M_IPFW); | |
1397 | if (red_lookup_depth == 0) { | |
1398 | printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); | |
1399 | FREE(x, M_IPFW); | |
1400 | return EINVAL; | |
1401 | } | |
1402 | x->lookup_depth = red_lookup_depth; | |
1403 | x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int), | |
1404 | M_IPFW, M_DONTWAIT); | |
1405 | if (x->w_q_lookup == NULL) { | |
1406 | printf("sorry, cannot allocate red lookup table\n"); | |
1407 | FREE(x, M_IPFW); | |
1408 | return ENOSPC; | |
1409 | } | |
1410 | ||
1411 | /* fill the lookup table with (1 - w_q)^x */ | |
1412 | x->lookup_step = p->lookup_step ; | |
1413 | x->lookup_weight = p->lookup_weight ; | |
1414 | x->w_q_lookup[0] = SCALE(1) - x->w_q; | |
1415 | for (i = 1; i < x->lookup_depth; i++) | |
1416 | x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); | |
1417 | if (red_avg_pkt_size < 1) | |
1418 | red_avg_pkt_size = 512 ; | |
1419 | x->avg_pkt_size = red_avg_pkt_size ; | |
1420 | if (red_max_pkt_size < 1) | |
1421 | red_max_pkt_size = 1500 ; | |
1422 | x->max_pkt_size = red_max_pkt_size ; | |
1423 | return 0 ; | |
1424 | } | |
1425 | ||
1426 | static int | |
1427 | alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) | |
1428 | { | |
1429 | if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ | |
1430 | int l = pfs->rq_size; | |
1431 | ||
1432 | if (l == 0) | |
1433 | l = dn_hash_size; | |
1434 | if (l < 4) | |
1435 | l = 4; | |
1436 | else if (l > 1024) | |
1437 | l = 1024; | |
1438 | x->rq_size = l; | |
1439 | } else /* one is enough for null mask */ | |
1440 | x->rq_size = 1; | |
1441 | x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *), | |
1442 | M_IPFW, M_DONTWAIT); | |
1443 | if (x->rq == NULL) { | |
1444 | printf("sorry, cannot allocate queue\n"); | |
1445 | return ENOSPC; | |
1446 | } | |
1447 | bzero(x->rq, (1+x->rq_size) * sizeof(struct dn_flow_queue *)); | |
1448 | x->rq_elements = 0; | |
1449 | return 0 ; | |
1450 | } | |
1451 | ||
1452 | static void | |
1453 | set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) | |
1454 | { | |
1455 | x->flags_fs = src->flags_fs; | |
1456 | x->qsize = src->qsize; | |
1457 | x->plr = src->plr; | |
1458 | x->flow_mask = src->flow_mask; | |
1459 | if (x->flags_fs & DN_QSIZE_IS_BYTES) { | |
1460 | if (x->qsize > 1024*1024) | |
1461 | x->qsize = 1024*1024 ; | |
1462 | } else { | |
1463 | if (x->qsize == 0) | |
1464 | x->qsize = 50 ; | |
1465 | if (x->qsize > 100) | |
1466 | x->qsize = 50 ; | |
1467 | } | |
1468 | /* configuring RED */ | |
1469 | if ( x->flags_fs & DN_IS_RED ) | |
1470 | config_red(src, x) ; /* XXX should check errors */ | |
1471 | } | |
1472 | ||
1473 | /* | |
1474 | * setup pipe or queue parameters. | |
1475 | */ | |
1476 | ||
1477 | static int | |
1478 | config_pipe(struct dn_pipe *p) | |
1479 | { | |
1480 | int s ; | |
1481 | struct dn_flow_set *pfs = &(p->fs); | |
1482 | ||
1483 | /* | |
1484 | * The config program passes parameters as follows: | |
1485 | * bw = bits/second (0 means no limits), | |
1486 | * delay = ms, must be translated into ticks. | |
1487 | * qsize = slots/bytes | |
1488 | */ | |
1489 | p->delay = ( p->delay * hz ) / 1000 ; | |
1490 | /* We need either a pipe number or a flow_set number */ | |
1491 | if (p->pipe_nr == 0 && pfs->fs_nr == 0) | |
1492 | return EINVAL ; | |
1493 | if (p->pipe_nr != 0 && pfs->fs_nr != 0) | |
1494 | return EINVAL ; | |
1495 | if (p->pipe_nr != 0) { /* this is a pipe */ | |
1496 | struct dn_pipe *x, *a, *b; | |
1497 | /* locate pipe */ | |
1498 | for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; | |
1c79356b | 1499 | a = b , b = b->next) ; |
9bccf70c A |
1500 | |
1501 | if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */ | |
1502 | x = _MALLOC(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; | |
1503 | if (x == NULL) { | |
1504 | printf("ip_dummynet.c: no memory for new pipe\n"); | |
1505 | return ENOSPC; | |
1c79356b | 1506 | } |
9bccf70c A |
1507 | bzero(x, sizeof(struct dn_pipe)); |
1508 | x->pipe_nr = p->pipe_nr; | |
1509 | x->fs.pipe = x ; | |
1510 | /* idle_heap is the only one from which we extract from the middle. | |
1511 | */ | |
1512 | x->idle_heap.size = x->idle_heap.elements = 0 ; | |
1513 | x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos); | |
1514 | } else | |
1515 | x = b; | |
1516 | ||
1517 | x->bandwidth = p->bandwidth ; | |
1518 | x->numbytes = 0; /* just in case... */ | |
1519 | bcopy(p->if_name, x->if_name, sizeof(p->if_name) ); | |
1520 | x->ifp = NULL ; /* reset interface ptr */ | |
1521 | x->delay = p->delay ; | |
1522 | set_fs_parms(&(x->fs), pfs); | |
1c79356b | 1523 | |
1c79356b | 1524 | |
9bccf70c A |
1525 | if ( x->fs.rq == NULL ) { /* a new pipe */ |
1526 | s = alloc_hash(&(x->fs), pfs) ; | |
1527 | if (s) { | |
1528 | FREE(x, M_IPFW); | |
1529 | return s ; | |
1530 | } | |
1531 | s = splimp() ; | |
1532 | x->next = b ; | |
1533 | if (a == NULL) | |
1534 | all_pipes = x ; | |
1535 | else | |
1536 | a->next = x ; | |
1537 | splx(s); | |
1538 | } | |
1539 | } else { /* config queue */ | |
1540 | struct dn_flow_set *x, *a, *b ; | |
1541 | ||
1542 | /* locate flow_set */ | |
1543 | for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ; | |
1c79356b | 1544 | a = b , b = b->next) ; |
1c79356b | 1545 | |
9bccf70c A |
1546 | if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */ |
1547 | if (pfs->parent_nr == 0) /* need link to a pipe */ | |
1548 | return EINVAL ; | |
1549 | x = _MALLOC(sizeof(struct dn_flow_set), M_IPFW, M_DONTWAIT); | |
1550 | if (x == NULL) { | |
1551 | printf("ip_dummynet.c: no memory for new flow_set\n"); | |
1552 | return ENOSPC; | |
1c79356b | 1553 | } |
9bccf70c A |
1554 | bzero(x, sizeof(struct dn_flow_set)); |
1555 | x->fs_nr = pfs->fs_nr; | |
1556 | x->parent_nr = pfs->parent_nr; | |
1557 | x->weight = pfs->weight ; | |
1558 | if (x->weight == 0) | |
1559 | x->weight = 1 ; | |
1560 | else if (x->weight > 100) | |
1561 | x->weight = 100 ; | |
1562 | } else { | |
1563 | /* Change parent pipe not allowed; must delete and recreate */ | |
1564 | if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) | |
1565 | return EINVAL ; | |
1566 | x = b; | |
1c79356b | 1567 | } |
9bccf70c A |
1568 | set_fs_parms(x, pfs); |
1569 | ||
1570 | if ( x->rq == NULL ) { /* a new flow_set */ | |
1571 | s = alloc_hash(x, pfs) ; | |
1572 | if (s) { | |
1573 | FREE(x, M_IPFW); | |
1574 | return s ; | |
1575 | } | |
1576 | s = splimp() ; | |
1577 | x->next = b; | |
1578 | if (a == NULL) | |
1579 | all_flow_sets = x; | |
1580 | else | |
1581 | a->next = x; | |
1582 | splx(s); | |
1583 | } | |
1584 | } | |
1585 | return 0 ; | |
1586 | } | |
1587 | ||
1588 | /* | |
1589 | * Helper function to remove from a heap queues which are linked to | |
1590 | * a flow_set about to be deleted. | |
1591 | */ | |
1592 | static void | |
1593 | fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) | |
1594 | { | |
1595 | int i = 0, found = 0 ; | |
1596 | for (; i < h->elements ;) | |
1597 | if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { | |
1598 | h->elements-- ; | |
1599 | h->p[i] = h->p[h->elements] ; | |
1600 | found++ ; | |
1601 | } else | |
1602 | i++ ; | |
1603 | if (found) | |
1604 | heapify(h); | |
1c79356b A |
1605 | } |
1606 | ||
9bccf70c A |
1607 | /* |
1608 | * helper function to remove a pipe from a heap (can be there at most once) | |
1609 | */ | |
1610 | static void | |
1611 | pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) | |
1612 | { | |
1613 | if (h->elements > 0) { | |
1614 | int i = 0 ; | |
1615 | for (i=0; i < h->elements ; i++ ) { | |
1616 | if (h->p[i].object == p) { /* found it */ | |
1617 | h->elements-- ; | |
1618 | h->p[i] = h->p[h->elements] ; | |
1619 | heapify(h); | |
1620 | break ; | |
1621 | } | |
1622 | } | |
1623 | } | |
1624 | } | |
1625 | ||
1626 | /* | |
1627 | * drain all queues. Called in case of severe mbuf shortage. | |
1628 | */ | |
1c79356b | 1629 | void |
9bccf70c | 1630 | dummynet_drain() |
1c79356b | 1631 | { |
9bccf70c A |
1632 | struct dn_flow_set *fs; |
1633 | struct dn_pipe *p; | |
1634 | struct dn_pkt *pkt; | |
1635 | ||
1636 | heap_free(&ready_heap); | |
1637 | heap_free(&wfq_ready_heap); | |
1638 | heap_free(&extract_heap); | |
1639 | /* remove all references to this pipe from flow_sets */ | |
1640 | for (fs = all_flow_sets; fs; fs= fs->next ) | |
1641 | purge_flow_set(fs, 0); | |
1642 | ||
1643 | for (p = all_pipes; p; p= p->next ) { | |
1644 | purge_flow_set(&(p->fs), 0); | |
1645 | for (pkt = p->head ; pkt ; ) | |
1646 | DN_FREE_PKT(pkt) ; | |
1647 | p->head = p->tail = NULL ; | |
1648 | } | |
1c79356b A |
1649 | } |
1650 | ||
9bccf70c A |
1651 | /* |
1652 | * Fully delete a pipe or a queue, cleaning up associated info. | |
1653 | */ | |
1654 | static int | |
1655 | delete_pipe(struct dn_pipe *p) | |
1656 | { | |
1657 | int s ; | |
1658 | struct ip_fw_chain *chain ; | |
1659 | ||
1660 | if (p->pipe_nr == 0 && p->fs.fs_nr == 0) | |
1661 | return EINVAL ; | |
1662 | if (p->pipe_nr != 0 && p->fs.fs_nr != 0) | |
1663 | return EINVAL ; | |
1664 | if (p->pipe_nr != 0) { /* this is an old-style pipe */ | |
1665 | struct dn_pipe *a, *b; | |
1666 | struct dn_flow_set *fs; | |
1c79356b | 1667 | |
9bccf70c A |
1668 | /* locate pipe */ |
1669 | for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; | |
1670 | a = b , b = b->next) ; | |
1671 | if (b == NULL || (b->pipe_nr != p->pipe_nr) ) | |
1672 | return EINVAL ; /* not found */ | |
1c79356b | 1673 | |
9bccf70c | 1674 | s = splimp() ; |
1c79356b | 1675 | |
9bccf70c A |
1676 | /* unlink from list of pipes */ |
1677 | if (a == NULL) | |
1678 | all_pipes = b->next ; | |
1679 | else | |
1680 | a->next = b->next ; | |
1681 | /* remove references to this pipe from the ip_fw rules. */ | |
1682 | LIST_FOREACH(chain, &ip_fw_chain_head, next) | |
1683 | if (chain->rule->pipe_ptr == &(b->fs)) | |
1684 | chain->rule->pipe_ptr = NULL ; | |
1685 | ||
1686 | /* remove all references to this pipe from flow_sets */ | |
1687 | for (fs = all_flow_sets; fs; fs= fs->next ) | |
1688 | if (fs->pipe == b) { | |
1689 | printf("++ ref to pipe %d from fs %d\n", | |
1690 | p->pipe_nr, fs->fs_nr); | |
1691 | fs->pipe = NULL ; | |
1692 | purge_flow_set(fs, 0); | |
1693 | } | |
1694 | fs_remove_from_heap(&ready_heap, &(b->fs)); | |
1695 | purge_pipe(b); /* remove all data associated to this pipe */ | |
1696 | /* remove reference to here from extract_heap and wfq_ready_heap */ | |
1697 | pipe_remove_from_heap(&extract_heap, b); | |
1698 | pipe_remove_from_heap(&wfq_ready_heap, b); | |
1699 | splx(s); | |
1700 | FREE(b, M_IPFW); | |
1701 | } else { /* this is a WF2Q queue (dn_flow_set) */ | |
1702 | struct dn_flow_set *a, *b; | |
1703 | ||
1704 | /* locate set */ | |
1705 | for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ; | |
1706 | a = b , b = b->next) ; | |
1707 | if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) | |
1708 | return EINVAL ; /* not found */ | |
1709 | ||
1710 | s = splimp() ; | |
1711 | if (a == NULL) | |
1712 | all_flow_sets = b->next ; | |
1713 | else | |
1714 | a->next = b->next ; | |
1715 | /* remove references to this flow_set from the ip_fw rules. */ | |
1716 | LIST_FOREACH(chain, &ip_fw_chain_head, next) | |
1717 | if (chain->rule->pipe_ptr == b) | |
1718 | chain->rule->pipe_ptr = NULL ; | |
1719 | ||
1720 | if (b->pipe != NULL) { | |
1721 | /* Update total weight on parent pipe and cleanup parent heaps */ | |
1722 | b->pipe->sum -= b->weight * b->backlogged ; | |
1723 | fs_remove_from_heap(&(b->pipe->not_eligible_heap), b); | |
1724 | fs_remove_from_heap(&(b->pipe->scheduler_heap), b); | |
1725 | #if 1 /* XXX should i remove from idle_heap as well ? */ | |
1726 | fs_remove_from_heap(&(b->pipe->idle_heap), b); | |
1727 | #endif | |
1728 | } | |
1729 | purge_flow_set(b, 1); | |
1730 | splx(s); | |
1731 | } | |
1732 | return 0 ; | |
1733 | } | |
1734 | ||
1735 | /* | |
1736 | * helper function used to copy data from kernel in DUMMYNET_GET | |
1737 | */ | |
1738 | static char * | |
1739 | dn_copy_set(struct dn_flow_set *set, char *bp) | |
1740 | { | |
1741 | int i, copied = 0 ; | |
1742 | struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp; | |
1743 | ||
1744 | for (i = 0 ; i <= set->rq_size ; i++) | |
1745 | for (q = set->rq[i] ; q ; q = q->next, qp++ ) { | |
1746 | if (q->hash_slot != i) | |
1747 | printf("++ at %d: wrong slot (have %d, " | |
1748 | "should be %d)\n", copied, q->hash_slot, i); | |
1749 | if (q->fs != set) | |
1750 | printf("++ at %d: wrong fs ptr (have %p, should be %p)\n", | |
1751 | i, q->fs, set); | |
1752 | copied++ ; | |
1753 | bcopy(q, qp, sizeof( *q ) ); | |
1754 | /* cleanup pointers */ | |
1755 | qp->next = NULL ; | |
1756 | qp->head = qp->tail = NULL ; | |
1757 | qp->fs = NULL ; | |
1758 | } | |
1759 | if (copied != set->rq_elements) | |
1760 | printf("++ wrong count, have %d should be %d\n", | |
1761 | copied, set->rq_elements); | |
1762 | return (char *)qp ; | |
1763 | } | |
1c79356b A |
1764 | |
1765 | static int | |
9bccf70c | 1766 | dummynet_get(struct sockopt *sopt) |
1c79356b | 1767 | { |
9bccf70c A |
1768 | char *buf, *bp ; /* bp is the "copy-pointer" */ |
1769 | size_t size ; | |
1770 | struct dn_flow_set *set ; | |
1771 | struct dn_pipe *p ; | |
1772 | int s, error=0 ; | |
1773 | ||
1774 | s = splimp(); | |
1775 | /* | |
1776 | * compute size of data structures: list of pipes and flow_sets. | |
1777 | */ | |
1778 | for (p = all_pipes, size = 0 ; p ; p = p->next ) | |
1779 | size += sizeof( *p ) + | |
1780 | p->fs.rq_elements * sizeof(struct dn_flow_queue); | |
1781 | for (set = all_flow_sets ; set ; set = set->next ) | |
1782 | size += sizeof ( *set ) + | |
1783 | set->rq_elements * sizeof(struct dn_flow_queue); | |
1784 | buf = _MALLOC(size, M_TEMP, M_DONTWAIT); | |
1785 | if (buf == 0) { | |
1c79356b | 1786 | splx(s); |
9bccf70c A |
1787 | return ENOBUFS ; |
1788 | } | |
1789 | for (p = all_pipes, bp = buf ; p ; p = p->next ) { | |
1790 | struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; | |
1791 | ||
1792 | /* | |
1793 | * copy pipe descriptor into *bp, convert delay back to ms, | |
1794 | * then copy the flow_set descriptor(s) one at a time. | |
1795 | * After each flow_set, copy the queue descriptor it owns. | |
1796 | */ | |
1797 | bcopy(p, bp, sizeof( *p ) ); | |
1798 | pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; | |
1799 | /* | |
1800 | * XXX the following is a hack based on ->next being the | |
1801 | * first field in dn_pipe and dn_flow_set. The correct | |
1802 | * solution would be to move the dn_flow_set to the beginning | |
1803 | * of struct dn_pipe. | |
1804 | */ | |
1805 | pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ; | |
1806 | /* clean pointers */ | |
1807 | pipe_bp->head = pipe_bp->tail = NULL ; | |
1808 | pipe_bp->fs.next = NULL ; | |
1809 | pipe_bp->fs.pipe = NULL ; | |
1810 | pipe_bp->fs.rq = NULL ; | |
1811 | ||
1812 | bp += sizeof( *p ) ; | |
1813 | bp = dn_copy_set( &(p->fs), bp ); | |
1814 | } | |
1815 | for (set = all_flow_sets ; set ; set = set->next ) { | |
1816 | struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ; | |
1817 | bcopy(set, bp, sizeof( *set ) ); | |
1818 | /* XXX same hack as above */ | |
1819 | fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ; | |
1820 | fs_bp->pipe = NULL ; | |
1821 | fs_bp->rq = NULL ; | |
1822 | bp += sizeof( *set ) ; | |
1823 | bp = dn_copy_set( set, bp ); | |
1824 | } | |
1825 | splx(s); | |
1826 | error = sooptcopyout(sopt, buf, size); | |
1827 | FREE(buf, M_TEMP); | |
1828 | return error ; | |
1c79356b A |
1829 | } |
1830 | ||
9bccf70c A |
1831 | /* |
1832 | * Handler for the various dummynet socket options (get, flush, config, del) | |
1833 | */ | |
1c79356b | 1834 | static int |
9bccf70c | 1835 | ip_dn_ctl(struct sockopt *sopt) |
1c79356b | 1836 | { |
9bccf70c A |
1837 | int error = 0 ; |
1838 | struct dn_pipe *p, tmp_pipe; | |
1839 | ||
1840 | /* Disallow sets in really-really secure mode. */ | |
1841 | if (sopt->sopt_dir == SOPT_SET && securelevel >= 3) | |
1842 | return (EPERM); | |
1843 | ||
1844 | switch (sopt->sopt_name) { | |
1845 | default : | |
1846 | printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); | |
1847 | return EINVAL ; | |
1848 | ||
1849 | case IP_DUMMYNET_GET : | |
1850 | error = dummynet_get(sopt); | |
1851 | break ; | |
1852 | ||
1853 | case IP_DUMMYNET_FLUSH : | |
1854 | dummynet_flush() ; | |
1855 | break ; | |
1856 | case IP_DUMMYNET_CONFIGURE : | |
1857 | p = &tmp_pipe ; | |
1858 | error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); | |
1859 | if (error) | |
1860 | break ; | |
1861 | error = config_pipe(p); | |
1862 | break ; | |
1863 | ||
1864 | case IP_DUMMYNET_DEL : /* remove a pipe or queue */ | |
1865 | p = &tmp_pipe ; | |
1866 | error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); | |
1867 | if (error) | |
1868 | break ; | |
1869 | ||
1870 | error = delete_pipe(p); | |
1871 | break ; | |
1872 | } | |
1873 | return error ; | |
1c79356b A |
1874 | } |
1875 | ||
9bccf70c A |
1876 | static void |
1877 | ip_dn_init(void) | |
1c79356b | 1878 | { |
9bccf70c A |
1879 | printf("DUMMYNET initialized (010124)\n"); |
1880 | all_pipes = NULL ; | |
1881 | all_flow_sets = NULL ; | |
1882 | ready_heap.size = ready_heap.elements = 0 ; | |
1883 | ready_heap.offset = 0 ; | |
1884 | ||
1885 | wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; | |
1886 | wfq_ready_heap.offset = 0 ; | |
1887 | ||
1888 | extract_heap.size = extract_heap.elements = 0 ; | |
1889 | extract_heap.offset = 0 ; | |
1890 | ip_dn_ctl_ptr = ip_dn_ctl; | |
1891 | timeout(dummynet, NULL, 1); | |
1c79356b | 1892 | } |
9bccf70c A |
1893 | |
1894 | static ip_dn_ctl_t *old_dn_ctl_ptr ; | |
1895 | ||
1896 | static int | |
1897 | dummynet_modevent(module_t mod, int type, void *data) | |
1898 | { | |
1899 | int s ; | |
1900 | switch (type) { | |
1901 | case MOD_LOAD: | |
1902 | s = splimp(); | |
1903 | old_dn_ctl_ptr = ip_dn_ctl_ptr; | |
1904 | ip_dn_init(); | |
1905 | splx(s); | |
1906 | break; | |
1907 | case MOD_UNLOAD: | |
1908 | s = splimp(); | |
1909 | ip_dn_ctl_ptr = old_dn_ctl_ptr; | |
1910 | splx(s); | |
1911 | dummynet_flush(); | |
1912 | break ; | |
1913 | default: | |
1914 | break ; | |
1915 | } | |
1916 | return 0 ; | |
1917 | } | |
1918 | ||
1919 | static moduledata_t dummynet_mod = { | |
1920 | "dummynet", | |
1921 | dummynet_modevent, | |
1922 | NULL | |
1923 | } ; | |
1924 | DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); |