]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * The contents of this file constitute Original Code as defined in and | |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
11 | * | |
12 | * This Original Code and all software distributed under the License are | |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the | |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
19 | * | |
20 | * @APPLE_LICENSE_HEADER_END@ | |
21 | */ | |
/*
 * Copyright (c) 1998-2001 Luigi Rizzo, Universita` di Pisa
23 | * Portions Copyright (c) 2000 Akamba Corp. | |
24 | * All rights reserved | |
25 | * | |
26 | * Redistribution and use in source and binary forms, with or without | |
27 | * modification, are permitted provided that the following conditions | |
28 | * are met: | |
29 | * 1. Redistributions of source code must retain the above copyright | |
30 | * notice, this list of conditions and the following disclaimer. | |
31 | * 2. Redistributions in binary form must reproduce the above copyright | |
32 | * notice, this list of conditions and the following disclaimer in the | |
33 | * documentation and/or other materials provided with the distribution. | |
34 | * | |
35 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
36 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
37 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
38 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
39 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
40 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
41 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
42 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
43 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
44 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
45 | * SUCH DAMAGE. | |
46 | * | |
47 | * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.24.2.11 2001/02/09 23:18:08 luigi Exp $ | |
48 | */ | |
49 | ||
#define DEB(x)          /* debug printouts: expands to nothing (disabled) */
#define DDB(x)  x       /* sanity-check code: expands to its argument (enabled) */
52 | ||
53 | /* | |
54 | * This module implements IP dummynet, a bandwidth limiter/delay emulator | |
55 | * used in conjunction with the ipfw package. | |
56 | * Description of the data structures used is in ip_dummynet.h | |
57 | * Here you mainly find the following blocks of code: | |
58 | * + variable declarations; | |
59 | * + heap management functions; | |
60 | * + scheduler and dummynet functions; | |
61 | * + configuration and initialization. | |
62 | * | |
63 | * NOTA BENE: critical sections are protected by splimp()/splx() | |
64 | * pairs. One would think that splnet() is enough as for most of | |
65 | * the netinet code, but it is not so because when used with | |
66 | * bridging, dummynet is invoked at splimp(). | |
67 | * | |
68 | * Most important Changes: | |
69 | * | |
70 | * 010124: Fixed WF2Q behaviour | |
71 | * 010122: Fixed spl protection. | |
72 | * 000601: WF2Q support | |
73 | * 000106: large rewrite, use heaps to handle very many pipes. | |
74 | * 980513: initial release | |
75 | * | |
76 | * include files marked with XXX are probably not needed | |
77 | */ | |
78 | ||
79 | #include <sys/param.h> | |
80 | #include <sys/systm.h> | |
81 | #include <sys/malloc.h> | |
82 | #include <sys/mbuf.h> | |
83 | #include <sys/queue.h> /* XXX */ | |
84 | #include <sys/kernel.h> | |
85 | #include <sys/socket.h> | |
86 | #include <sys/socketvar.h> | |
87 | #include <sys/time.h> | |
88 | #include <sys/sysctl.h> | |
89 | #include <net/if.h> | |
90 | #include <net/route.h> | |
91 | #include <netinet/in.h> | |
92 | #include <netinet/in_systm.h> | |
93 | #include <netinet/in_var.h> | |
94 | #include <netinet/ip.h> | |
95 | #include <netinet/ip_fw.h> | |
96 | #include <netinet/ip_dummynet.h> | |
97 | #include <netinet/ip_var.h> | |
98 | ||
99 | #if BRIDGE | |
100 | #include <netinet/if_ether.h> /* for struct arpcom */ | |
101 | #include <net/bridge.h> | |
102 | #endif | |
103 | ||
/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
 */
static dn_key curr_time = 0 ; /* current simulation time, in ticks */

static int dn_hash_size = 64 ;	/* default hash size for dynamic queues */

/* statistics on number of queue searches and search steps */
static int searches, search_steps ;
static int pipe_expire = 1 ;   /* expire queue if empty */
static int dn_max_ratio = 16 ; /* max queues/buckets ratio */

static int red_lookup_depth = 256;	/* RED - default lookup table depth */
static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
static int red_max_pkt_size = 1500;     /* RED - default max packet size */

/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows
 *
 * extract_heap contains pipes associated with delay lines.
 *
 */
static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;

/* forward declarations for the heap primitives defined below */
static int heap_init(struct dn_heap *h, int size) ;
static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

/* forward declarations for the scheduler event handlers */
static void transmit_event(struct dn_pipe *pipe);
static void ready_event(struct dn_flow_queue *q);

static struct dn_pipe *all_pipes = NULL ;	/* list of all pipes */
static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */

#if SYSCTL_NODE
/* net.inet.ip.dummynet.* knobs exposing the tunables/counters above */
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
		CTLFLAG_RW, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
	    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
	    CTLFLAG_RD, &curr_time, 0, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
	    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
	    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
	    CTLFLAG_RD, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
	    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
	    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
	    CTLFLAG_RW, &dn_max_ratio, 0,
	"Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
	CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
	CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
	CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
#endif

static int config_pipe(struct dn_pipe *p);
static int ip_dn_ctl(struct sockopt *sopt);

static void rt_unref(struct rtentry *);
static void dummynet(void *);
static void dummynet_flush(void);
void dummynet_drain(void);
int if_tx_rdy(struct ifnet *ifp);

/*
 * ip_fw_chain is used when deleting a pipe, because ipfw rules can
 * hold references to the pipe.
 */
extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain_head;
185 | ||
186 | static void | |
187 | rt_unref(struct rtentry *rt) | |
188 | { | |
189 | if (rt == NULL) | |
190 | return ; | |
191 | if (rt->rt_refcnt <= 0) | |
192 | printf("-- warning, refcnt now %ld, decreasing\n", rt->rt_refcnt); | |
193 | rtfree(rt); | |
194 | } | |
195 | ||
196 | /* | |
197 | * Heap management functions. | |
198 | * | |
199 | * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. | |
200 | * Some macros help finding parent/children so we can optimize them. | |
201 | * | |
202 | * heap_init() is called to expand the heap when needed. | |
203 | * Increment size in blocks of 16 entries. | |
204 | * XXX failure to allocate a new element is a pretty bad failure | |
205 | * as we basically stall a whole queue forever!! | |
206 | * Returns 1 on error, 0 on success | |
207 | */ | |
/*
 * Node 0 is the root; children of node x live at 2x+1 and 2x+2,
 * hence the parent of x is (x-1)/2.
 */
#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
#define HEAP_LEFT(x) ( 2*(x) + 1 )
#define HEAP_IS_LEFT(x) ( (x) & 1 )	/* left children have odd index */
#define HEAP_RIGHT(x) ( 2*(x) + 2 )
/*
 * Three-way swap through a caller-supplied temporary. The do/while(0)
 * wrapper makes the expansion a single statement, so it is safe in
 * unbraced if/else bodies (the old bare-brace form was not).
 */
#define HEAP_SWAP(a, b, buffer) \
	do { (buffer) = (a) ; (a) = (b) ; (b) = (buffer) ; } while (0)
/* heap storage grows in chunks of HEAP_INCREMENT+1 == 16 entries */
#define HEAP_INCREMENT	15
214 | ||
215 | static int | |
216 | heap_init(struct dn_heap *h, int new_size) | |
217 | { | |
218 | struct dn_heap_entry *p; | |
219 | ||
220 | if (h->size >= new_size ) { | |
221 | printf("heap_init, Bogus call, have %d want %d\n", | |
222 | h->size, new_size); | |
223 | return 0 ; | |
224 | } | |
225 | new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; | |
226 | p = _MALLOC(new_size * sizeof(*p), M_IPFW, M_DONTWAIT ); | |
227 | if (p == NULL) { | |
228 | printf(" heap_init, resize %d failed\n", new_size ); | |
229 | return 1 ; /* error */ | |
230 | } | |
231 | if (h->size > 0) { | |
232 | bcopy(h->p, p, h->size * sizeof(*p) ); | |
233 | FREE(h->p, M_IPFW); | |
234 | } | |
235 | h->p = p ; | |
236 | h->size = new_size ; | |
237 | return 0 ; | |
238 | } | |
239 | ||
240 | /* | |
241 | * Insert element in heap. Normally, p != NULL, we insert p in | |
242 | * a new position and bubble up. If p == NULL, then the element is | |
243 | * already in place, and key is the position where to start the | |
244 | * bubble-up. | |
245 | * Returns 1 on failure (cannot allocate new heap entry) | |
246 | * | |
247 | * If offset > 0 the position (index, int) of the element in the heap is | |
248 | * also stored in the element itself at the given offset in bytes. | |
249 | */ | |
/*
 * Record the element's current heap index inside the element itself,
 * at byte offset heap->offset (only when the heap keeps back-pointers,
 * i.e. offset > 0). Wrapped in do/while(0) so the bare `if` cannot
 * capture an `else` at the call site.
 */
#define SET_OFFSET(heap, node) \
    do { \
	if ((heap)->offset > 0) \
	    *((int *)((char *)((heap)->p[(node)].object) + (heap)->offset)) = \
		(node) ; \
    } while (0)
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) \
    do { \
	if ((heap)->offset > 0) \
	    *((int *)((char *)((heap)->p[(node)].object) + (heap)->offset)) = \
		-1 ; \
    } while (0)
/*
 * Insert p with key key1 and bubble it up; with p == NULL, key1 is
 * instead the index of an element already stored at the wrong place,
 * and only the bubble-up is performed (used by heap_extract/heapify).
 * Returns 1 if the heap could not be grown, 0 on success.
 */
static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
    int son = h->elements ;

    if (p == NULL)	/* data already there, set starting point */
	son = key1 ;
    else {		/* insert new element at the end, possibly resize */
	son = h->elements ;
	if (son == h->size) /* need resize... */
	    if (heap_init(h, h->elements+1) )
		return 1 ; /* failure... */
	h->p[son].object = p ;
	h->p[son].key = key1 ;
	h->elements++ ;
    }
    while (son > 0) {				/* bubble up */
	int father = HEAP_FATHER(son) ;
	struct dn_heap_entry tmp ;

	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
	    break ; /* found right position */
	/* son smaller than father, swap and repeat */
	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
	/* the entry displaced into slot `son` gets its index refreshed */
	SET_OFFSET(h, son);
	son = father ;
    }
    /* final resting slot of the inserted/moved element */
    SET_OFFSET(h, son);
    return 0 ;
}
289 | ||
/*
 * remove top element from heap, or obj if obj != NULL
 *
 * The victim's slot becomes a hole that is sifted down by repeatedly
 * pulling up the smaller child; the last array entry then fills the
 * final hole and is bubbled up by calling heap_insert() in its
 * "already in place" mode (p == NULL).
 * Extracting an arbitrary obj requires the heap to keep back-pointers
 * (h->offset > 0), since that is how the victim's index is found.
 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
    int child, father, max = h->elements - 1 ;

    if (max < 0) {	/* nothing to extract */
	printf("warning, extract from empty heap 0x%p\n", h);
	return ;
    }
    father = 0 ; /* default: move up smallest child */
    if (obj != NULL) { /* extract specific element, index is at offset */
	if (h->offset <= 0)
	    panic("*** heap_extract from middle not supported on this heap!!!\n");
	father = *((int *)((char *)obj + h->offset)) ;
	if (father < 0 || father >= h->elements) {
	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
		father, h->elements);
	    panic("heap_extract");
	}
    }
    RESET_OFFSET(h, father);	/* invalidate the victim's stored index */
    child = HEAP_LEFT(father) ;		/* left child */
    while (child <= max) {		/* valid entry */
	/* pick the smaller of the two children (right is child+1) */
	if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
	    child = child+1 ;		/* take right child, otherwise left */
	h->p[father] = h->p[child] ;
	SET_OFFSET(h, father);
	father = child ;
	child = HEAP_LEFT(child) ;	/* left child for next loop */
    }
    h->elements-- ;
    if (father != max) {
	/*
	 * Fill hole with last entry and bubble up, reusing the insert code
	 */
	h->p[father] = h->p[max] ;
	heap_insert(h, father, NULL); /* this one cannot fail */
    }
}
332 | ||
#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 *
 * Re-keys an element already in the heap: if the new key is smaller
 * it bubbles up toward the root, otherwise it sinks toward the leaves,
 * updating the stored back-pointer index at each swap. Requires a heap
 * with back-pointers (h->offset > 0). Kept under #if 0 as dead code.
 */
static void
heap_move(struct dn_heap *h, dn_key new_key, void *object)
{
    int temp;
    int i ;
    int max = h->elements-1 ;
    struct dn_heap_entry buf ;

    if (h->offset <= 0)
	panic("cannot move items on this heap");

    /* current position of the object, read from its back-pointer */
    i = *((int *)((char *)object + h->offset));
    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
	h->p[i].key = new_key ;
	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
		 i = temp ) { /* bubble up */
	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
	    SET_OFFSET(h, i);
	}
    } else {		/* must move down */
	h->p[i].key = new_key ;
	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
		temp++ ; /* select child with min key */
	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
		SET_OFFSET(h, i);
	    } else
		break ;
	    i = temp ;
	}
    }
    SET_OFFSET(h, i);
}
#endif /* heap_move, unused */
373 | ||
374 | /* | |
375 | * heapify() will reorganize data inside an array to maintain the | |
376 | * heap property. It is needed when we delete a bunch of entries. | |
377 | */ | |
378 | static void | |
379 | heapify(struct dn_heap *h) | |
380 | { | |
381 | int i ; | |
382 | ||
383 | for (i = 0 ; i < h->elements ; i++ ) | |
384 | heap_insert(h, i , NULL) ; | |
385 | } | |
386 | ||
387 | /* | |
388 | * cleanup the heap and free data structure | |
389 | */ | |
390 | static void | |
391 | heap_free(struct dn_heap *h) | |
392 | { | |
393 | if (h->size >0 ) | |
394 | FREE(h->p, M_IPFW); | |
395 | bzero(h, sizeof(*h) ); | |
396 | } | |
397 | ||
398 | /* | |
399 | * --- end of heap management functions --- | |
400 | */ | |
401 | ||
402 | /* | |
403 | * Scheduler functions: | |
404 | * | |
405 | * transmit_event() is called when the delay-line needs to enter | |
406 | * the scheduler, either because of existing pkts getting ready, | |
407 | * or new packets entering the queue. The event handled is the delivery | |
408 | * time of the packet. | |
409 | * | |
410 | * ready_event() does something similar with fixed-rate queues, and the | |
411 | * event handled is the finish time of the head pkt. | |
412 | * | |
413 | * wfq_ready_event() does something similar with WF2Q queues, and the | |
414 | * event handled is the start time of the head pkt. | |
415 | * | |
416 | * In all cases, we make sure that the data structures are consistent | |
417 | * before passing pkts out, because this might trigger recursive | |
418 | * invocations of the procedures. | |
419 | */ | |
/*
 * Deliver every packet in the pipe's delay line whose output_time has
 * been reached, dispatching each to the layer recorded in dn_dir
 * (ip_output, ip_input, or the bridge). If packets remain, re-arm
 * the extract_heap with the next delivery time.
 */
static void
transmit_event(struct dn_pipe *pipe)
{
    struct dn_pkt *pkt ;

    while ( (pkt = pipe->head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) {
	/*
	 * first unlink, then call procedures, since ip_input() can invoke
	 * ip_output() and viceversa, thus causing nested calls
	 */
	pipe->head = DN_NEXT(pkt) ;

	/*
	 * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf
	 * (NOT A REAL one, just a small block of malloc'ed memory) with
	 *     m_type = MT_DUMMYNET
	 *     m_next = actual mbuf to be processed by ip_input/output
	 *     m_data = the matching rule
	 * and some other fields.
	 * The block IS FREED HERE because it contains parameters passed
	 * to the called routine.
	 */
	switch (pkt->dn_dir) {
	case DN_TO_IP_OUT:
	    (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL);
	    rt_unref (pkt->ro.ro_rt) ;	/* drop the route ref held while queued */
	    break ;

	case DN_TO_IP_IN :
	    ip_input((struct mbuf *)pkt) ;
	    break ;

#if BRIDGE
	case DN_TO_BDG_FWD : {
	    struct mbuf *m = (struct mbuf *)pkt ;
	    struct ether_header *eh;

	    /* need a contiguous ethernet header before forwarding */
	    if (pkt->dn_m->m_len < ETHER_HDR_LEN
		&& (pkt->dn_m = m_pullup(pkt->dn_m, ETHER_HDR_LEN)) == NULL) {
		printf("dummynet/bridge: pullup fail, dropping pkt\n");
		break;
	    }
	    /*
	     * same as ether_input, make eh be a pointer into the mbuf
	     */
	    eh = mtod(pkt->dn_m, struct ether_header *);
	    m_adj(pkt->dn_m, ETHER_HDR_LEN);
	    /*
	     * bdg_forward() wants a pointer to the pseudo-mbuf-header, but
	     * on return it will supply the pointer to the actual packet
	     * (originally pkt->dn_m, but could be something else now) if
	     * it has not consumed it.
	     */
	    m = bdg_forward(m, eh, pkt->ifp);
	    if (m)
		m_freem(m);
	    }
	    break ;
#endif

	default:
	    printf("dummynet: bad switch %d!\n", pkt->dn_dir);
	    m_freem(pkt->dn_m);
	    break ;
	}
	FREE(pkt, M_IPFW);	/* the dn_pkt wrapper, not the mbuf */
    }
    /* if there are leftover packets, put into the heap for next event */
    if ( (pkt = pipe->head) )
	heap_insert(&extract_heap, pkt->output_time, pipe ) ;
    /* XXX should check errors on heap_insert, by draining the
     * whole pipe p and hoping in the future we are more successful
     */
}
494 | ||
/*
 * the following macro computes how many ticks we have to wait
 * before being able to transmit a packet. The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing).
 * Rounds up: (len_scaled - credit + bandwidth - 1) / bandwidth.
 * All arguments are parenthesized and the expansion is a plain
 * expression (the old form ended in a stray ';' and could not be
 * used inside a larger expression).
 */
#define SET_TICKS(pkt, q, p) \
    (((pkt)->dn_m->m_pkthdr.len*8*hz - (q)->numbytes + (p)->bandwidth - 1) / \
	(p)->bandwidth)
503 | ||
504 | /* | |
505 | * extract pkt from queue, compute output time (could be now) | |
506 | * and put into delay line (p_queue) | |
507 | */ | |
508 | static void | |
509 | move_pkt(struct dn_pkt *pkt, struct dn_flow_queue *q, | |
510 | struct dn_pipe *p, int len) | |
511 | { | |
512 | q->head = DN_NEXT(pkt) ; | |
513 | q->len-- ; | |
514 | q->len_bytes -= len ; | |
515 | ||
516 | pkt->output_time = curr_time + p->delay ; | |
517 | ||
518 | if (p->head == NULL) | |
519 | p->head = pkt; | |
520 | else | |
521 | DN_NEXT(p->tail) = pkt; | |
522 | p->tail = pkt; | |
523 | DN_NEXT(p->tail) = NULL; | |
524 | } | |
525 | ||
/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invokation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q)
{
    struct dn_pkt *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    if (p == NULL) {
	/* pipe was deleted while this event was pending */
	printf("ready_event- pipe is gone\n");
	return ;
    }
    p_was_empty = (p->head == NULL) ;

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->dn_m->m_pkthdr.len;
	int len_scaled = p->bandwidth ? len*8*hz : 0 ;
	if (len_scaled > q->numbytes )
	    break ;	/* not enough credit for the head packet yet */
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else /* RED needs to know when the queue becomes empty */
	q->q_time = curr_time;
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
586 | ||
/*
 * Called when we can transmit packets on WF2Q queues. Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0. As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay. For the last packet, if p->numbytes<0,
 * there is an additional delay.
 *
 * Per-flow virtual times: q->S is the flow's start time, q->F its
 * finish time, p->V the pipe's virtual clock. Flows sit in
 * scheduler_heap (eligible, keyed by F) or not_eligible_heap
 * (S still ahead of V, keyed by S); idle flows park in idle_heap.
 */
static void
ready_event_wfq(struct dn_pipe *p)
{
    int p_was_empty = (p->head == NULL) ;
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap) ;

    if (p->if_name[0] == 0) /* tx clock is simulated */
	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
	    return ;
	else {
	    DEB(printf("pipe %d ready from %s --\n",
		p->pipe_nr, p->if_name);)
	}
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
	if (sch->elements > 0) { /* have some eligible pkts to send out */
	    struct dn_flow_queue *q = sch->p[0].object ;
	    struct dn_pkt *pkt = q->head;
	    struct dn_flow_set *fs = q->fs;
	    u_int64_t len = pkt->dn_m->m_pkthdr.len;
	    int len_scaled = p->bandwidth ? len*8*hz : 0 ;

	    heap_extract(sch, NULL); /* remove queue from heap */
	    p->numbytes -= len_scaled ;
	    move_pkt(pkt, q, p, len);

	    p->V += (len<<MY_M) / p->sum ; /* update V */
	    q->S = q->F ; /* update start time */
	    if (q->len == 0) { /* Flow not backlogged any more */
		fs->backlogged-- ;
		heap_insert(&(p->idle_heap), q->F, q);
	    } else { /* still backlogged */
		/*
		 * update F and position in backlogged queue, then
		 * put flow in not_eligible_heap (we will fix this later).
		 */
		len = (q->head)->dn_m->m_pkthdr.len;
		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
		if (DN_KEY_LEQ(q->S, p->V))
		    heap_insert(neh, q->S, q);
		else
		    heap_insert(sch, q->F, q);
	    }
	}
	/*
	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
	 * have by definition S_i <= V so if sch is not empty, V is surely
	 * the max and we must not update it. Conversely, if sch is empty
	 * we only need to look at neh.
	 */
	if (sch->elements == 0 && neh->elements > 0)
	    p->V = MAX64 ( p->V, neh->p[0].key );
	/* move from neh to sch any packets that have become eligible */
	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
	    struct dn_flow_queue *q = neh->p[0].object ;
	    heap_extract(neh, NULL);
	    heap_insert(sch, q->F, q);
	}

	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
	    p->numbytes = -1 ; /* mark not ready for I/O */
	    break ;
	}
    }
    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
	    && p->idle_heap.elements > 0) {
	/*
	 * no traffic and no events scheduled. We can get rid of idle-heap.
	 */
	int i ;

	for (i = 0 ; i < p->idle_heap.elements ; i++) {
	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;

	    q->F = 0 ;
	    q->S = q->F + 1 ;	/* S > F marks the timestamps invalid */
	}
	p->sum = 0 ;
	p->V = 0 ;
	p->idle_heap.elements = 0 ;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * If we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
	dn_key t=0 ; /* number of ticks i have to wait */

	if (p->bandwidth > 0)
	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
	p->tail->output_time += t ;	/* last pkt pays the overdraft */
	p->sched_time = curr_time ;
	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}
708 | ||
/*
 * This is called once per tick, or HZ times per second. It is used to
 * increment the current tick counter and schedule expired events.
 * It drains the three event heaps (fixed-rate, WF2Q, delay line),
 * sweeps pipes for expirable idle flows, then re-arms itself via
 * timeout() for the next tick. Runs at splimp() -- see file header.
 */
static void
dummynet(void * __unused unused)
{
    void *p ; /* generic parameter to handler */
    struct dn_heap *h ;
    int s ;
    struct dn_heap *heaps[3];
    int i;
    struct dn_pipe *pe ;

    heaps[0] = &ready_heap ;		/* fixed-rate queues */
    heaps[1] = &wfq_ready_heap ;	/* wfq queues */
    heaps[2] = &extract_heap ;		/* delay line */
    s = splimp(); /* see note on top, splnet() is not enough */
    curr_time++ ;
    for (i=0; i < 3 ; i++) {
	h = heaps[i];
	while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
	    DDB(if (h->p[0].key > curr_time)
		printf("-- dummynet: warning, heap %d is %d ticks late\n",
		    i, (int)(curr_time - h->p[0].key));)
	    p = h->p[0].object ; /* store a copy before heap_extract */
	    heap_extract(h, NULL); /* need to extract before processing */
	    if (i == 0)
		ready_event(p) ;
	    else if (i == 1) {
		/* WF2Q events only apply to simulated-clock pipes */
		struct dn_pipe *pipe = p;
		if (pipe->if_name[0] != '\0')
		    printf("*** bad ready_event_wfq for pipe %s\n",
			pipe->if_name);
		else
		    ready_event_wfq(p) ;
	    } else
		transmit_event(p);
	}
    }
    /* sweep pipes trying to expire idle flow_queues */
    for (pe = all_pipes; pe ; pe = pe->next )
	if (pe->idle_heap.elements > 0 &&
		DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
	    struct dn_flow_queue *q = pe->idle_heap.p[0].object ;

	    heap_extract(&(pe->idle_heap), NULL);
	    q->S = q->F + 1 ; /* mark timestamp as invalid */
	    pe->sum -= q->fs->weight ;
	}
    splx(s);
    timeout(dummynet, NULL, 1);	/* re-arm for the next tick */
}
762 | ||
763 | /* | |
764 | * called by an interface when tx_rdy occurs. | |
765 | */ | |
766 | int | |
767 | if_tx_rdy(struct ifnet *ifp) | |
768 | { | |
769 | struct dn_pipe *p; | |
770 | ||
771 | for (p = all_pipes; p ; p = p->next ) | |
772 | if (p->ifp == ifp) | |
773 | break ; | |
774 | if (p == NULL) { | |
775 | char buf[32]; | |
776 | sprintf(buf, "%s%d",ifp->if_name, ifp->if_unit); | |
777 | for (p = all_pipes; p ; p = p->next ) | |
778 | if (!strcmp(p->if_name, buf) ) { | |
779 | p->ifp = ifp ; | |
780 | DEB(printf("++ tx rdy from %s (now found)\n", buf);) | |
781 | break ; | |
782 | } | |
783 | } | |
784 | if (p != NULL) { | |
785 | DEB(printf("++ tx rdy from %s%d - qlen %d\n", ifp->if_name, | |
786 | ifp->if_unit, ifp->if_snd.ifq_len);) | |
787 | p->numbytes = 0 ; /* mark ready for I/O */ | |
788 | ready_event_wfq(p); | |
789 | } | |
790 | return 0; | |
791 | } | |
792 | ||
/*
 * Unconditionally expire empty queues in case of shortage.
 * Returns the number of queues freed.
 * A queue is expirable when it holds no packets and its timestamps
 * are invalid (S == F+1). Runs at most once per second per flow_set
 * (gated by last_expired).
 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;

    if (fs->last_expired == time_second)
	return 0 ;	/* already swept this second */
    fs->last_expired = time_second ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    if (q->head != NULL || q->S != q->F+1) {
		/* busy or timestamps valid: keep it, advance */
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		/* unlink from the bucket's singly-linked chain */
		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_IPFW);
	    }
    return initial_elements - fs->rq_elements ;
}
823 | ||
/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 * Returns the queue, or NULL on allocation failure.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
    struct dn_flow_queue *q ;

    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
	    expire_queues(fs) == 0) {
	/*
	 * No way to get room, use or create overflow queue.
	 * Slot rq_size (one past the hash range) is the shared
	 * overflow queue.
	 */
	i = fs->rq_size ;
	if ( fs->rq[i] != NULL )
	    return fs->rq[i] ;
    }
    q = _MALLOC(sizeof(*q), M_IPFW, M_DONTWAIT) ;
    if (q == NULL) {
	printf("sorry, cannot allocate queue for new flow\n");
	return NULL ;
    }
    bzero(q, sizeof(*q) );	/* needed */
    q->fs = fs ;
    q->hash_slot = i ;
    q->next = fs->rq[i] ;	/* insert at head of slot i */
    q->S = q->F + 1;   /* hack - mark timestamp as invalid */
    fs->rq[i] = q ;
    fs->rq_elements++ ;
    return q ;
}
856 | ||
/*
 * Given a flow_set and a pkt in last_pkt, find a matching queue
 * after appropriate masking. The queue is moved to front
 * so that further searches take less time.
 * Note: this mutates the global last_pkt (applies the flow mask) and
 * may opportunistically expire idle queues while scanning the chain.
 * Returns the queue (possibly newly created) or NULL on allocation
 * failure.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs)
{
    int i = 0 ; /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;

    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
	q = fs->rq[0] ;	/* no mask: all traffic shares one queue */
    else {
	/* first, do the masking */
	last_pkt.dst_ip &= fs->flow_mask.dst_ip ;
	last_pkt.src_ip &= fs->flow_mask.src_ip ;
	last_pkt.dst_port &= fs->flow_mask.dst_port ;
	last_pkt.src_port &= fs->flow_mask.src_port ;
	last_pkt.proto &= fs->flow_mask.proto ;
	last_pkt.flags = 0 ; /* we don't care about this one */
	/* then, hash function: fold addresses/ports/proto into 16 bits */
	i = ( (last_pkt.dst_ip) & 0xffff ) ^
	    ( (last_pkt.dst_ip >> 15) & 0xffff ) ^
	    ( (last_pkt.src_ip << 1) & 0xffff ) ^
	    ( (last_pkt.src_ip >> 16 ) & 0xffff ) ^
	    (last_pkt.dst_port << 1) ^ (last_pkt.src_port) ^
	    (last_pkt.proto );
	i = i % fs->rq_size ;
	/* finally, scan the current list for a match */
	searches++ ;	/* statistics */
	for (prev=NULL, q = fs->rq[i] ; q ; ) {
	    search_steps++;
	    if (bcmp(&last_pkt, &(q->id), sizeof(q->id) ) == 0)
		break ; /* found */
	    else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
		/* entry is idle and not in any heap, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_IPFW);
		continue ;
	    }
	    prev = q ;
	    q = q->next ;
	}
	if (q && prev != NULL) { /* found and not in front */
	    /* move-to-front so repeated lookups of this flow are cheap */
	    prev->next = q->next ;
	    q->next = fs->rq[i] ;
	    fs->rq[i] = q ;
	}
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
	q = create_queue(fs, i);
	if (q != NULL)
	    q->id = last_pkt ;	/* remember the (masked) flow id */
    }
    return q ;
}
920 | ||
static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
     * RED algorithm
     *
     * RED calculates the average queue size (avg) using a low-pass filter
     * with an exponential weighted (w_q) moving average:
     *	avg  <-  (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or * packets).
     *
     * If q_size == 0, we compute the idle time for the link, and set
     *	avg = (1 - w_q)^(idle/s)
     * where s is the time needed for transmitting a medium-sized packet.
     *
     * Now, if avg < min_th the packet is enqueued.
     * If avg > max_th the packet is dropped. Otherwise, the packet is
     * dropped with probability P function of avg.
     *
     * Returns 1 to drop the packet, 0 to accept it.
     * All arithmetic is fixed-point via the SCALE/SCALE_MUL macros.
     */

    int64_t p_b = 0;
    /* queue in bytes or packets ? */
    u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;

    DEB(printf("\n%d q: %2u ", (int) curr_time, q_size);)

    /* average queue size estimation */
    if (q_size != 0) {
	/*
	 * queue is not empty, avg <- avg + (q_size - avg) * w_q
	 */
	int diff = SCALE(q_size) - q->avg;
	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);

	q->avg += (int) v;
    } else {
	/*
	 * queue is empty, find for how long the queue has been
	 * empty and use a lookup table for computing
	 * (1 - * w_q)^(idle_time/s) where s is the time to send a
	 * (small) packet.
	 * XXX check wraps...
	 */
	if (q->avg) {
	    u_int t = (curr_time - q->q_time) / fs->lookup_step;

	    /* beyond the table's range the average has decayed to 0 */
	    q->avg = (t < fs->lookup_depth) ?
		    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
	}
    }
    DEB(printf("avg: %u ", SCALE_VAL(q->avg));)

    /* should i drop ? */

    if (q->avg < fs->min_th) {
	q->count = -1;	/* below min_th: accept and reset run counter */
	return 0; /* accept packet ; */
    }
    if (q->avg >= fs->max_th) { /* average queue >=  max threshold */
	if (fs->flags_fs & DN_IS_GENTLE_RED) {
	    /*
	     * According to Gentle-RED, if avg is greater than max_th the
	     * packet is dropped with a probability
	     *	p_b = c_3 * avg - c_4
	     * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
	     */
	    p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
	} else {
	    /* plain RED: unconditional drop above max_th */
	    q->count = -1;
	    printf("- drop");
	    return 1 ;
	}
    } else if (q->avg > fs->min_th) {
	/*
	 * we compute p_b using the linear dropping function p_b = c_1 *
	 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
	 * max_p * min_th / (max_th - min_th)
	 */
	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
    }
    /* for byte-based queues, scale drop probability by packet size */
    if (fs->flags_fs & DN_QSIZE_IS_BYTES)
	p_b = (p_b * len) / fs->max_pkt_size;
    if (++q->count == 0)
	q->random = random() & 0xffff;	/* new random drop threshold */
    else {
	/*
	 * q->count counts packets arrived since last drop, so a greater
	 * value of q->count means a greater packet drop probability.
	 */
	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
	    q->count = 0;
	    DEB(printf("- red drop");)
	    /* after a drop we calculate a new random value */
	    q->random = random() & 0xffff;
	    return 1;    /* drop */
	}
    }
    /* end of RED algorithm */
    return 0 ; /* accept */
}
1022 | ||
/*
 * Map a pipe/queue number referenced by an ipfw rule to its
 * dn_flow_set: IP_FW_F_QUEUE rules reference a flow_set directly,
 * any other command references a pipe (whose embedded fs is used).
 * The result is cached in rule->pipe_ptr so later packets skip the
 * linear search. Returns NULL when no matching pipe/queue exists.
 */
static __inline
struct dn_flow_set *
locate_flowset(int pipe_nr, struct ip_fw_chain *rule)
{
    struct dn_flow_set *fs = NULL ;

    if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_QUEUE )
	for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
	    ;
    else {
	struct dn_pipe *p1;
	for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
	    ;
	if (p1 != NULL)
	    fs = &(p1->fs) ;
    }
    if (fs != NULL)
	rule->rule->pipe_ptr = fs ; /* record for the future */
    return fs ;
}
1043 | ||
/*
 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
 * depending on whether WF2Q or fixed bw is used.
 * Returns 0 when the packet was queued for later delivery, or ENOBUFS
 * when it was dropped (the mbuf is freed here in that case).
 */
int
dummynet_io(int pipe_nr, int dir,	/* pipe_nr can also be a fs_nr */
	struct mbuf *m, struct ifnet *ifp, struct route *ro,
	struct sockaddr_in *dst,
	struct ip_fw_chain *rule, int flags)
{
    struct dn_pkt *pkt;
    struct dn_flow_set *fs;
    struct dn_pipe *pipe ;
    u_int64_t len = m->m_pkthdr.len ;
    struct dn_flow_queue *q = NULL ;
    int s ;

    s = splimp();

    pipe_nr &= 0xffff ;

    /* use the rule's cached flow_set if present, else look it up */
    if ( (fs = rule->rule->pipe_ptr) == NULL ) {
	fs = locate_flowset(pipe_nr, rule);
	if (fs == NULL)
	    goto dropit ;	/* this queue/pipe does not exist! */
    }
    pipe = fs->pipe ;
    if (pipe == NULL) { /* must be a queue, try find a matching pipe */
	for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
		 pipe = pipe->next)
	    ;
	if (pipe != NULL)
	    fs->pipe = pipe ;	/* cache the parent pipe */
	else {
	    printf("No pipe %d for queue %d, drop pkt\n",
		fs->parent_nr, fs->fs_nr);
	    goto dropit ;
	}
    }
    q = find_queue(fs);
    if ( q == NULL )
	goto dropit ;		/* cannot allocate queue		*/
    /*
     * update statistics, then check reasons to drop pkt
     */
    q->tot_bytes += len ;
    q->tot_pkts++ ;
    if ( fs->plr && random() < fs->plr )
	goto dropit ;		/* random pkt drop			*/
    if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
	if (q->len_bytes > fs->qsize)
	    goto dropit ;	/* queue size overflow			*/
    } else {
	if (q->len >= fs->qsize)
	    goto dropit ;	/* queue count overflow			*/
    }
    /* RED drop decision; '&' binds tighter than '&&' so this is
     * (flags_fs & DN_IS_RED) && red_drops(...) */
    if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
	goto dropit ;

    pkt = (struct dn_pkt *)_MALLOC(sizeof (*pkt), M_IPFW, M_NOWAIT) ;
    if ( pkt == NULL )
	goto dropit ;		/* cannot allocate packet header	*/
    /* ok, i can handle the pkt now... */
    bzero(pkt, sizeof(*pkt) ); /* XXX expensive, see if we can remove it*/
    /* build and enqueue packet + parameters */
    pkt->hdr.mh_type = MT_DUMMYNET ;
    /* stash the rule in the fake mbuf header; cast-as-lvalue is a
     * gcc extension */
    (struct ip_fw_chain *)pkt->hdr.mh_data = rule ;
    DN_NEXT(pkt) = NULL;
    pkt->dn_m = m;
    pkt->dn_dir = dir ;

    pkt->ifp = ifp;
    if (dir == DN_TO_IP_OUT) {
	/*
	 * We need to copy *ro because for ICMP pkts (and maybe others)
	 * the caller passed a pointer into the stack; dst might also be
	 * a pointer into *ro so it needs to be updated.
	 */
	pkt->ro = *ro;
	if (ro->ro_rt)
	    rtref(ro->ro_rt);	/* we keep our own route reference */
	if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */
	    dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;

	pkt->dn_dst = dst;
	pkt->flags = flags ;
    }
    /* append to the tail of the queue's packet list */
    if (q->head == NULL)
	q->head = pkt;
    else
	DN_NEXT(q->tail) = pkt;
    q->tail = pkt;
    q->len++;
    q->len_bytes += len ;

    if ( q->head != pkt )	/* flow was not idle, we are done */
	goto done;
    /*
     * If we reach this point the flow was previously idle, so we need
     * to schedule it. This involves different actions for fixed-rate or
     * WF2Q queues.
     */
    if ( (rule->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE ) {
	/*
	 * Fixed-rate queue: just insert into the ready_heap.
	 */
	dn_key t = 0 ;
	if (pipe->bandwidth)
	    t = SET_TICKS(pkt, q, pipe);
	q->sched_time = curr_time ;
	if (t == 0)	/* must process it now */
	    ready_event( q );
	else
	    heap_insert(&ready_heap, curr_time + t , q );
    } else {
	/*
	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
	 * set S to the virtual time V for the controlling pipe, and update
	 * the sum of weights for the pipe; otherwise, remove flow from
	 * idle_heap and set S to max(F,V).
	 * Second, compute finish time F = S + len/weight.
	 * Third, if pipe was idle, update V=max(S, V).
	 * Fourth, count one more backlogged flow.
	 */
	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
	    q->S = pipe->V ;
	    pipe->sum += fs->weight ; /* add weight of new queue */
	} else {
	    heap_extract(&(pipe->idle_heap), q);
	    q->S = MAX64(q->F, pipe->V ) ;
	}
	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;

	if (pipe->not_eligible_heap.elements == 0 &&
		pipe->scheduler_heap.elements == 0)
	    pipe->V = MAX64 ( q->S, pipe->V );
	fs->backlogged++ ;
	/*
	 * Look at eligibility. A flow is not eligibile if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the scheduler_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
	 * and possibly invoke ready_event_wfq() right now if there is
	 * leftover credit.
	 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
	 * and for all flows in not_eligible_heap (NEH), S_i > V .
	 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
	 * we only need to look into NEH.
	 */
	if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
	    if (pipe->scheduler_heap.elements == 0)
		printf("++ ouch! not eligible but empty scheduler!\n");
	    heap_insert(&(pipe->not_eligible_heap), q->S, q);
	} else {
	    heap_insert(&(pipe->scheduler_heap), q->F, q);
	    if (pipe->numbytes >= 0) { /* pipe is idle */
		if (pipe->scheduler_heap.elements != 1)
		    printf("*** OUCH! pipe should have been idle!\n");
		DEB(printf("Waking up pipe %d at %d\n",
			pipe->pipe_nr, (int)(q->F >> MY_M)); )
		pipe->sched_time = curr_time ;
		ready_event_wfq(pipe);
	    }
	}
    }
done:
    splx(s);
    return 0;

dropit:
    /* drop path: count the drop, free the mbuf, report ENOBUFS */
    splx(s);
    if (q)
	q->drops++ ;
    m_freem(m);
    return ENOBUFS ;
}
1221 | ||
/*
 * Free one dn_pkt: drop our route reference, free the mbuf chain,
 * advance the caller's cursor to the next packet in the list, then
 * free the header itself. 'pkt' is read AND reassigned, so it must
 * be an lvalue.
 * Below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
 * Doing this would probably save us the initial bzero of dn_pkt
 */
#define DN_FREE_PKT(pkt)	{	\
	struct dn_pkt *n = pkt ;	\
	rt_unref ( n->ro.ro_rt ) ;	\
	m_freem(n->dn_m);		\
	pkt = DN_NEXT(n) ;		\
	FREE(n, M_IPFW) ;	}
1232 | ||
/*
 * Dispose all packets and flow_queues on a flow_set.
 * If all=1, also remove red lookup table and other storage,
 * including the descriptor itself.
 * For the one in dn_pipe MUST also cleanup ready_heap...
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
    struct dn_pkt *pkt ;
    struct dn_flow_queue *q, *qn ;
    int i ;

    /* slot rq_size is the overflow queue, hence '<=' */
    for (i = 0 ; i <= fs->rq_size ; i++ ) {
	for (q = fs->rq[i] ; q ; q = qn ) {
	    /* DN_FREE_PKT advances pkt to the next packet */
	    for (pkt = q->head ; pkt ; )
		DN_FREE_PKT(pkt) ;
	    qn = q->next ;
	    FREE(q, M_IPFW);
	}
	fs->rq[i] = NULL ;
    }
    fs->rq_elements = 0 ;
    if (all) {
	/* RED - free lookup table */
	if (fs->w_q_lookup)
	    FREE(fs->w_q_lookup, M_IPFW);
	if (fs->rq)
	    FREE(fs->rq, M_IPFW);
	/* if this fs is not part of a pipe, free it */
	if (fs->pipe && fs != &(fs->pipe->fs) )
	    FREE(fs, M_IPFW);
    }
}
1267 | ||
/*
 * Dispose all packets queued on a pipe (not a flow_set).
 * Also free all resources associated to a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
    struct dn_pkt *pkt ;

    /* all=1: also free the embedded flow_set's hash and RED table
     * (but not the descriptor, which lives inside the pipe) */
    purge_flow_set( &(pipe->fs), 1 );

    /* packets sitting on the pipe's own delay line; DN_FREE_PKT
     * advances pkt */
    for (pkt = pipe->head ; pkt ; )
	DN_FREE_PKT(pkt) ;

    heap_free( &(pipe->scheduler_heap) );
    heap_free( &(pipe->not_eligible_heap) );
    heap_free( &(pipe->idle_heap) );
}
1287 | ||
1288 | /* | |
1289 | * Delete all pipes and heaps returning memory. Must also | |
1290 | * remove references from all ipfw rules to all pipes. | |
1291 | */ | |
1292 | static void | |
1293 | dummynet_flush() | |
1294 | { | |
1295 | struct dn_pipe *curr_p, *p ; | |
1296 | struct ip_fw_chain *chain ; | |
1297 | struct dn_flow_set *fs, *curr_fs; | |
1298 | int s ; | |
1299 | ||
1300 | s = splimp() ; | |
1301 | ||
1302 | /* remove all references to pipes ...*/ | |
1303 | LIST_FOREACH(chain, &ip_fw_chain_head, next) | |
1304 | chain->rule->pipe_ptr = NULL ; | |
1305 | /* prevent future matches... */ | |
1306 | p = all_pipes ; | |
1307 | all_pipes = NULL ; | |
1308 | fs = all_flow_sets ; | |
1309 | all_flow_sets = NULL ; | |
1310 | /* and free heaps so we don't have unwanted events */ | |
1311 | heap_free(&ready_heap); | |
1312 | heap_free(&wfq_ready_heap); | |
1313 | heap_free(&extract_heap); | |
1314 | splx(s) ; | |
1315 | /* | |
1316 | * Now purge all queued pkts and delete all pipes | |
1317 | */ | |
1318 | /* scan and purge all flow_sets. */ | |
1319 | for ( ; fs ; ) { | |
1320 | curr_fs = fs ; | |
1321 | fs = fs->next ; | |
1322 | purge_flow_set(curr_fs, 1); | |
1323 | } | |
1324 | for ( ; p ; ) { | |
1325 | purge_pipe(p); | |
1326 | curr_p = p ; | |
1327 | p = p->next ; | |
1328 | FREE(q, M_IPFW); | |
1329 | } | |
1330 | } | |
1331 | ||
1332 | ||
1333 | extern struct ip_fw_chain *ip_fw_default_rule ; | |
/*
 * Walk every queue of flow_set 'fs' and re-point any queued packet
 * whose stashed rule pointer equals 'r' at the default rule, so the
 * rule can be deleted while packets still reference it.
 */
static void
dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
{
    int i ;
    struct dn_flow_queue *q ;
    struct dn_pkt *pkt ;

    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
	for (q = fs->rq[i] ; q ; q = q->next )
	    for (pkt = q->head ; pkt ; pkt = DN_NEXT(pkt) )
		if (pkt->hdr.mh_data == r)
		    pkt->hdr.mh_data = (void *)ip_fw_default_rule ;
}
/*
 * when a firewall rule is deleted, scan all queues and remove the flow-id
 * from packets matching this rule.
 */
void
dn_rule_delete(void *r)
{
    struct dn_pipe *p ;
    struct dn_pkt *pkt ;
    struct dn_flow_set *fs ;

    /*
     * If the rule references a queue (dn_flow_set), then scan
     * the flow set, otherwise scan pipes. Should do either, but doing
     * both does not harm.
     */
    for ( fs = all_flow_sets ; fs ; fs = fs->next )
	dn_rule_delete_fs(fs, r);
    for ( p = all_pipes ; p ; p = p->next ) {
	fs = &(p->fs) ;
	dn_rule_delete_fs(fs, r);
	/* also fix packets sitting on the pipe's own delay line */
	for (pkt = p->head ; pkt ; pkt = DN_NEXT(pkt) )
	    if (pkt->hdr.mh_data == r)
		pkt->hdr.mh_data = (void *)ip_fw_default_rule ;
    }
}
1373 | ||
1374 | /* | |
1375 | * setup RED parameters | |
1376 | */ | |
1377 | static int | |
1378 | config_red(struct dn_flow_set *p, struct dn_flow_set * x) | |
1379 | { | |
1380 | int i; | |
1381 | ||
1382 | x->w_q = p->w_q; | |
1383 | x->min_th = SCALE(p->min_th); | |
1384 | x->max_th = SCALE(p->max_th); | |
1385 | x->max_p = p->max_p; | |
1386 | ||
1387 | x->c_1 = p->max_p / (p->max_th - p->min_th); | |
1388 | x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th)); | |
1389 | if (x->flags_fs & DN_IS_GENTLE_RED) { | |
1390 | x->c_3 = (SCALE(1) - p->max_p) / p->max_th; | |
1391 | x->c_4 = (SCALE(1) - 2 * p->max_p); | |
1392 | } | |
1393 | ||
1394 | /* if the lookup table already exist, free and create it again */ | |
1395 | if (x->w_q_lookup) | |
1396 | FREE(x->w_q_lookup, M_IPFW); | |
1397 | if (red_lookup_depth == 0) { | |
1398 | printf("\nnet.inet.ip.dummynet.red_lookup_depth must be > 0"); | |
1399 | FREE(x, M_IPFW); | |
1400 | return EINVAL; | |
1401 | } | |
1402 | x->lookup_depth = red_lookup_depth; | |
1403 | x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int), | |
1404 | M_IPFW, M_DONTWAIT); | |
1405 | if (x->w_q_lookup == NULL) { | |
1406 | printf("sorry, cannot allocate red lookup table\n"); | |
1407 | FREE(x, M_IPFW); | |
1408 | return ENOSPC; | |
1409 | } | |
1410 | ||
1411 | /* fill the lookup table with (1 - w_q)^x */ | |
1412 | x->lookup_step = p->lookup_step ; | |
1413 | x->lookup_weight = p->lookup_weight ; | |
1414 | x->w_q_lookup[0] = SCALE(1) - x->w_q; | |
1415 | for (i = 1; i < x->lookup_depth; i++) | |
1416 | x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight); | |
1417 | if (red_avg_pkt_size < 1) | |
1418 | red_avg_pkt_size = 512 ; | |
1419 | x->avg_pkt_size = red_avg_pkt_size ; | |
1420 | if (red_max_pkt_size < 1) | |
1421 | red_max_pkt_size = 1500 ; | |
1422 | x->max_pkt_size = red_max_pkt_size ; | |
1423 | return 0 ; | |
1424 | } | |
1425 | ||
1426 | static int | |
1427 | alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs) | |
1428 | { | |
1429 | if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */ | |
1430 | int l = pfs->rq_size; | |
1431 | ||
1432 | if (l == 0) | |
1433 | l = dn_hash_size; | |
1434 | if (l < 4) | |
1435 | l = 4; | |
1436 | else if (l > 1024) | |
1437 | l = 1024; | |
1438 | x->rq_size = l; | |
1439 | } else /* one is enough for null mask */ | |
1440 | x->rq_size = 1; | |
1441 | x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *), | |
1442 | M_IPFW, M_DONTWAIT); | |
1443 | if (x->rq == NULL) { | |
1444 | printf("sorry, cannot allocate queue\n"); | |
1445 | return ENOSPC; | |
1446 | } | |
1447 | bzero(x->rq, (1+x->rq_size) * sizeof(struct dn_flow_queue *)); | |
1448 | x->rq_elements = 0; | |
1449 | return 0 ; | |
1450 | } | |
1451 | ||
1452 | static void | |
1453 | set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src) | |
1454 | { | |
1455 | x->flags_fs = src->flags_fs; | |
1456 | x->qsize = src->qsize; | |
1457 | x->plr = src->plr; | |
1458 | x->flow_mask = src->flow_mask; | |
1459 | if (x->flags_fs & DN_QSIZE_IS_BYTES) { | |
1460 | if (x->qsize > 1024*1024) | |
1461 | x->qsize = 1024*1024 ; | |
1462 | } else { | |
1463 | if (x->qsize == 0) | |
1464 | x->qsize = 50 ; | |
1465 | if (x->qsize > 100) | |
1466 | x->qsize = 50 ; | |
1467 | } | |
1468 | /* configuring RED */ | |
1469 | if ( x->flags_fs & DN_IS_RED ) | |
1470 | config_red(src, x) ; /* XXX should check errors */ | |
1471 | } | |
1472 | ||
/*
 * setup pipe or queue parameters.
 * Exactly one of p->pipe_nr (a pipe) or p->fs.fs_nr (a WF2Q queue)
 * must be non-zero; creates or updates the matching entry in the
 * corresponding sorted global list. Returns 0 or an errno.
 */

static int
config_pipe(struct dn_pipe *p)
{
    int s ;
    struct dn_flow_set *pfs = &(p->fs);

    /*
     * The config program passes parameters as follows:
     * bw = bits/second (0 means no limits),
     * delay = ms, must be translated into ticks.
     * qsize = slots/bytes
     */
    p->delay = ( p->delay * hz ) / 1000 ;
    /* We need either a pipe number or a flow_set number */
    if (p->pipe_nr == 0 && pfs->fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && pfs->fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is a pipe */
	struct dn_pipe *x, *a, *b;
	/* locate pipe: list sorted by pipe_nr, 'a' trails 'b' */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
	    x = _MALLOC(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ;
	    if (x == NULL) {
		printf("ip_dummynet.c: no memory for new pipe\n");
		return ENOSPC;
	    }
	    bzero(x, sizeof(struct dn_pipe));
	    x->pipe_nr = p->pipe_nr;
	    x->fs.pipe = x ;
	    /* idle_heap is the only one from which we extract from the middle.
	     */
	    x->idle_heap.size = x->idle_heap.elements = 0 ;
	    x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
	} else
	    x = b;	/* existing pipe: update it in place */

	x->bandwidth = p->bandwidth ;
	x->numbytes = 0; /* just in case... */
	bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
	x->ifp = NULL ; /* reset interface ptr */
	x->delay = p->delay ;
	set_fs_parms(&(x->fs), pfs);


	if ( x->fs.rq == NULL ) { /* a new pipe */
	    s = alloc_hash(&(x->fs), pfs) ;
	    if (s) {
		FREE(x, M_IPFW);
		return s ;
	    }
	    /* link into the sorted list, protected by splimp */
	    s = splimp() ;
	    x->next = b ;
	    if (a == NULL)
		all_pipes = x ;
	    else
		a->next = x ;
	    splx(s);
	}
    } else { /* config queue */
	struct dn_flow_set *x, *a, *b ;

	/* locate flow_set: list sorted by fs_nr */
	for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
	    if (pfs->parent_nr == 0)	/* need link to a pipe */
		return EINVAL ;
	    x = _MALLOC(sizeof(struct dn_flow_set), M_IPFW, M_DONTWAIT);
	    if (x == NULL) {
		printf("ip_dummynet.c: no memory for new flow_set\n");
		return ENOSPC;
	    }
	    bzero(x, sizeof(struct dn_flow_set));
	    x->fs_nr = pfs->fs_nr;
	    x->parent_nr = pfs->parent_nr;
	    /* clamp WF2Q weight to [1, 100] */
	    x->weight = pfs->weight ;
	    if (x->weight == 0)
		x->weight = 1 ;
	    else if (x->weight > 100)
		x->weight = 100 ;
	} else {
	    /* Change parent pipe not allowed; must delete and recreate */
	    if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr)
		return EINVAL ;
	    x = b;
	}
	set_fs_parms(x, pfs);

	if ( x->rq == NULL ) { /* a new flow_set */
	    s = alloc_hash(x, pfs) ;
	    if (s) {
		FREE(x, M_IPFW);
		return s ;
	    }
	    /* link into the sorted list, protected by splimp */
	    s = splimp() ;
	    x->next = b;
	    if (a == NULL)
		all_flow_sets = x;
	    else
		a->next = x;
	    splx(s);
	}
    }
    return 0 ;
}
1587 | ||
1588 | /* | |
1589 | * Helper function to remove from a heap queues which are linked to | |
1590 | * a flow_set about to be deleted. | |
1591 | */ | |
1592 | static void | |
1593 | fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs) | |
1594 | { | |
1595 | int i = 0, found = 0 ; | |
1596 | for (; i < h->elements ;) | |
1597 | if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) { | |
1598 | h->elements-- ; | |
1599 | h->p[i] = h->p[h->elements] ; | |
1600 | found++ ; | |
1601 | } else | |
1602 | i++ ; | |
1603 | if (found) | |
1604 | heapify(h); | |
1605 | } | |
1606 | ||
1607 | /* | |
1608 | * helper function to remove a pipe from a heap (can be there at most once) | |
1609 | */ | |
1610 | static void | |
1611 | pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p) | |
1612 | { | |
1613 | if (h->elements > 0) { | |
1614 | int i = 0 ; | |
1615 | for (i=0; i < h->elements ; i++ ) { | |
1616 | if (h->p[i].object == p) { /* found it */ | |
1617 | h->elements-- ; | |
1618 | h->p[i] = h->p[h->elements] ; | |
1619 | heapify(h); | |
1620 | break ; | |
1621 | } | |
1622 | } | |
1623 | } | |
1624 | } | |
1625 | ||
/*
 * drain all queues. Called in case of severe mbuf shortage.
 * Frees queued packets but keeps pipes and flow_sets configured.
 */
void
dummynet_drain()
{
    struct dn_flow_set *fs;
    struct dn_pipe *p;
    struct dn_pkt *pkt;

    /* drop the scheduling heaps first so no stale events remain */
    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);
    /* remove all references to this pipe from flow_sets */
    for (fs = all_flow_sets; fs; fs= fs->next )
	purge_flow_set(fs, 0);	/* all=0: keep the descriptors */

    for (p = all_pipes; p; p= p->next ) {
	purge_flow_set(&(p->fs), 0);
	/* free packets on the pipe's delay line (DN_FREE_PKT advances
	 * pkt) */
	for (pkt = p->head ; pkt ; )
	    DN_FREE_PKT(pkt) ;
	p->head = p->tail = NULL ;
    }
}
1650 | ||
/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 * 'p' only carries the identifier: exactly one of p->pipe_nr or
 * p->fs.fs_nr must be non-zero. Returns 0 or EINVAL.
 */
static int
delete_pipe(struct dn_pipe *p)
{
    int s ;
    struct ip_fw_chain *chain ;

    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is an old-style pipe */
	struct dn_pipe *a, *b;
	struct dn_flow_set *fs;

	/* locate pipe (sorted list, 'a' trails 'b') */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->pipe_nr != p->pipe_nr) )
	    return EINVAL ; /* not found */

	s = splimp() ;

	/* unlink from list of pipes */
	if (a == NULL)
	    all_pipes = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this pipe from the ip_fw rules. */
	LIST_FOREACH(chain, &ip_fw_chain_head, next)
	    if (chain->rule->pipe_ptr == &(b->fs))
		chain->rule->pipe_ptr = NULL ;

	/* remove all references to this pipe from flow_sets */
	for (fs = all_flow_sets; fs; fs= fs->next )
	    if (fs->pipe == b) {
		printf("++ ref to pipe %d from fs %d\n",
			p->pipe_nr, fs->fs_nr);
		fs->pipe = NULL ;
		purge_flow_set(fs, 0);	/* keep descriptor, drop pkts */
	    }
	fs_remove_from_heap(&ready_heap, &(b->fs));
	purge_pipe(b);	/* remove all data associated to this pipe */
	/* remove reference to here from extract_heap and wfq_ready_heap */
	pipe_remove_from_heap(&extract_heap, b);
	pipe_remove_from_heap(&wfq_ready_heap, b);
	splx(s);
	FREE(b, M_IPFW);
    } else { /* this is a WF2Q queue (dn_flow_set) */
	struct dn_flow_set *a, *b;

	/* locate set (sorted list, 'a' trails 'b') */
	for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->fs_nr != p->fs.fs_nr) )
	    return EINVAL ; /* not found */

	s = splimp() ;
	/* unlink from list of flow_sets */
	if (a == NULL)
	    all_flow_sets = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this flow_set from the ip_fw rules. */
	LIST_FOREACH(chain, &ip_fw_chain_head, next)
	    if (chain->rule->pipe_ptr == b)
		chain->rule->pipe_ptr = NULL ;

	if (b->pipe != NULL) {
	    /* Update total weight on parent pipe and cleanup parent heaps */
	    b->pipe->sum -= b->weight * b->backlogged ;
	    fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
	    fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
#if 1 /* XXX should i remove from idle_heap as well ? */
	    fs_remove_from_heap(&(b->pipe->idle_heap), b);
#endif
	}
	purge_flow_set(b, 1);	/* all=1 also frees 'b' itself */
	splx(s);
    }
    return 0 ;
}
1734 | ||
1735 | /* | |
1736 | * helper function used to copy data from kernel in DUMMYNET_GET | |
1737 | */ | |
static char *
dn_copy_set(struct dn_flow_set *set, char *bp)
{
    /*
     * Flatten every queue of flow_set 'set' into the export buffer at 'bp'
     * (one struct dn_flow_queue per queue), sanity-checking the hash chains
     * as we go.  Returns the advanced copy pointer, i.e. the first byte
     * after the last queue written.  Caller guarantees 'bp' has room for
     * set->rq_elements queue descriptors (see size pass in dummynet_get()).
     */
    int i, copied = 0 ;
    /* qp aliases bp: it walks the destination buffer one queue at a time */
    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;

    /* <= on purpose: rq[] has rq_size+1 buckets (last one is the overflow/
     * fixed-queue slot) -- TODO confirm against the rq allocation site */
    for (i = 0 ; i <= set->rq_size ; i++)
	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
	    /* diagnostics only: a queue chained into the wrong bucket */
	    if (q->hash_slot != i)
		printf("++ at %d: wrong slot (have %d, "
		    "should be %d)\n", copied, q->hash_slot, i);
	    /* diagnostics only: a queue pointing at a foreign flow_set */
	    if (q->fs != set)
		printf("++ at %d: wrong fs ptr (have %p, should be %p)\n",
		    i, q->fs, set);
	    copied++ ;
	    bcopy(q, qp, sizeof( *q ) );
	    /* cleanup pointers: kernel addresses are meaningless (and
	     * an information leak) once exported to userland */
	    qp->next = NULL ;
	    qp->head = qp->tail = NULL ;
	    qp->fs = NULL ;
	}
    /* cross-check the walk against the set's cached element count */
    if (copied != set->rq_elements)
	printf("++ wrong count, have %d should be %d\n",
	    copied, set->rq_elements);
    return (char *)qp ;
}
1764 | ||
static int
dummynet_get(struct sockopt *sopt)
{
    /*
     * IP_DUMMYNET_GET handler: serialize every pipe and flow_set (plus
     * their queues) into one contiguous buffer and copy it out to
     * userland.  Returns 0 or ENOBUFS if the temporary buffer cannot
     * be allocated.
     */
    char *buf, *bp ; /* bp is the "copy-pointer" */
    size_t size ;
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    int s, error=0 ;

    /* stay at splimp() for both the size pass and the copy pass so the
     * lists cannot change between them */
    s = splimp();
    /*
     * compute size of data structures: list of pipes and flow_sets.
     * NOTE(review): no overflow check on the summed size -- presumably
     * bounded by what config_pipe admits; verify.
     */
    for (p = all_pipes, size = 0 ; p ; p = p->next )
	size += sizeof( *p ) +
	    p->fs.rq_elements * sizeof(struct dn_flow_queue);
    for (set = all_flow_sets ; set ; set = set->next )
	size += sizeof ( *set ) +
	    set->rq_elements * sizeof(struct dn_flow_queue);
    /* M_DONTWAIT: cannot sleep while at splimp */
    buf = _MALLOC(size, M_TEMP, M_DONTWAIT);
    if (buf == 0) {
	splx(s);
	return ENOBUFS ;
    }
    for (p = all_pipes, bp = buf ; p ; p = p->next ) {
	struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;

	/*
	 * copy pipe descriptor into *bp, convert delay back to ms,
	 * then copy the flow_set descriptor(s) one at a time.
	 * After each flow_set, copy the queue descriptor it owns.
	 */
	bcopy(p, bp, sizeof( *p ) );
	/* delay is stored internally in ticks; userland expects ms */
	pipe_bp->delay = (pipe_bp->delay * 1000) / hz ;
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set.  The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 * The exported ->next field doubles as a record-type tag so
	 * userland can tell pipes and queues apart while walking the
	 * buffer.
	 */
	pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
	/* clean pointers: kernel addresses must not reach userland */
	pipe_bp->head = pipe_bp->tail = NULL ;
	pipe_bp->fs.next = NULL ;
	pipe_bp->fs.pipe = NULL ;
	pipe_bp->fs.rq = NULL ;

	bp += sizeof( *p ) ;
	/* append the pipe's queues right after the descriptor */
	bp = dn_copy_set( &(p->fs), bp );
    }
    /* second section of the buffer: the stand-alone (WF2Q) flow_sets */
    for (set = all_flow_sets ; set ; set = set->next ) {
	struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
	bcopy(set, bp, sizeof( *set ) );
	/* XXX same hack as above */
	fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
	fs_bp->pipe = NULL ;
	fs_bp->rq = NULL ;
	bp += sizeof( *set ) ;
	bp = dn_copy_set( set, bp );
    }
    splx(s);
    /* copyout may sleep, so it happens after dropping splimp */
    error = sooptcopyout(sopt, buf, size);
    FREE(buf, M_TEMP);
    return error ;
}
1830 | ||
1831 | /* | |
1832 | * Handler for the various dummynet socket options (get, flush, config, del) | |
1833 | */ | |
1834 | static int | |
1835 | ip_dn_ctl(struct sockopt *sopt) | |
1836 | { | |
1837 | int error = 0 ; | |
1838 | struct dn_pipe *p, tmp_pipe; | |
1839 | ||
1840 | /* Disallow sets in really-really secure mode. */ | |
1841 | if (sopt->sopt_dir == SOPT_SET && securelevel >= 3) | |
1842 | return (EPERM); | |
1843 | ||
1844 | switch (sopt->sopt_name) { | |
1845 | default : | |
1846 | printf("ip_dn_ctl -- unknown option %d", sopt->sopt_name); | |
1847 | return EINVAL ; | |
1848 | ||
1849 | case IP_DUMMYNET_GET : | |
1850 | error = dummynet_get(sopt); | |
1851 | break ; | |
1852 | ||
1853 | case IP_DUMMYNET_FLUSH : | |
1854 | dummynet_flush() ; | |
1855 | break ; | |
1856 | case IP_DUMMYNET_CONFIGURE : | |
1857 | p = &tmp_pipe ; | |
1858 | error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); | |
1859 | if (error) | |
1860 | break ; | |
1861 | error = config_pipe(p); | |
1862 | break ; | |
1863 | ||
1864 | case IP_DUMMYNET_DEL : /* remove a pipe or queue */ | |
1865 | p = &tmp_pipe ; | |
1866 | error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); | |
1867 | if (error) | |
1868 | break ; | |
1869 | ||
1870 | error = delete_pipe(p); | |
1871 | break ; | |
1872 | } | |
1873 | return error ; | |
1874 | } | |
1875 | ||
1876 | static void | |
1877 | ip_dn_init(void) | |
1878 | { | |
1879 | printf("DUMMYNET initialized (010124)\n"); | |
1880 | all_pipes = NULL ; | |
1881 | all_flow_sets = NULL ; | |
1882 | ready_heap.size = ready_heap.elements = 0 ; | |
1883 | ready_heap.offset = 0 ; | |
1884 | ||
1885 | wfq_ready_heap.size = wfq_ready_heap.elements = 0 ; | |
1886 | wfq_ready_heap.offset = 0 ; | |
1887 | ||
1888 | extract_heap.size = extract_heap.elements = 0 ; | |
1889 | extract_heap.offset = 0 ; | |
1890 | ip_dn_ctl_ptr = ip_dn_ctl; | |
1891 | timeout(dummynet, NULL, 1); | |
1892 | } | |
1893 | ||
1894 | static ip_dn_ctl_t *old_dn_ctl_ptr ; | |
1895 | ||
1896 | static int | |
1897 | dummynet_modevent(module_t mod, int type, void *data) | |
1898 | { | |
1899 | int s ; | |
1900 | switch (type) { | |
1901 | case MOD_LOAD: | |
1902 | s = splimp(); | |
1903 | old_dn_ctl_ptr = ip_dn_ctl_ptr; | |
1904 | ip_dn_init(); | |
1905 | splx(s); | |
1906 | break; | |
1907 | case MOD_UNLOAD: | |
1908 | s = splimp(); | |
1909 | ip_dn_ctl_ptr = old_dn_ctl_ptr; | |
1910 | splx(s); | |
1911 | dummynet_flush(); | |
1912 | break ; | |
1913 | default: | |
1914 | break ; | |
1915 | } | |
1916 | return 0 ; | |
1917 | } | |
1918 | ||
/*
 * Register dummynet as a kernel module at pseudo-device init time;
 * dummynet_modevent() services the MOD_LOAD/MOD_UNLOAD events.
 */
static moduledata_t dummynet_mod = {
	"dummynet",
	dummynet_modevent,
	NULL
} ;
DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);