bsd/netinet/ip_dummynet.c

   1 /*
   2  * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /*
  29  * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
  30  * Portions Copyright (c) 2000 Akamba Corp.
  31  * All rights reserved
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  *
  42  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  43  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  44  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  45  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  46  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  47  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  48  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  49  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  50  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  51  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  52  * SUCH DAMAGE.
  53  *
  54  * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
  55  */
  56
  57 #define DUMMYNET_DEBUG
  58
  59 /*
  60  * This module implements IP dummynet, a bandwidth limiter/delay emulator
  61  * Description of the data structures used is in ip_dummynet.h
  62  * Here you mainly find the following blocks of code:
  63  *  + variable declarations;
  64  *  + heap management functions;
  65  *  + scheduler and dummynet functions;
  66  *  + configuration and initialization.
  67  *
  68  * NOTA BENE: critical sections are protected by the "dummynet lock".
  69  *
  70  * Most important Changes:
  71  *
  72  * 010124: Fixed WF2Q behaviour
  73  * 010122: Fixed spl protection.
  74  * 000601: WF2Q support
  75  * 000106: large rewrite, use heaps to handle very many pipes.
  76  * 980513:      initial release
  77  *
  78  * include files marked with XXX are probably not needed
  79  */
  80
  81 #include <sys/param.h>
  82 #include <sys/systm.h>
  83 #include <sys/malloc.h>
  84 #include <sys/mbuf.h>
  85 #include <sys/queue.h>                  /* XXX */
  86 #include <sys/kernel.h>
  87 #include <sys/random.h>
  88 #include <sys/socket.h>
  89 #include <sys/socketvar.h>
  90 #include <sys/time.h>
  91 #include <sys/sysctl.h>
  92 #include <net/if.h>
  93 #include <net/route.h>
  94 #include <net/kpi_protocol.h>
  95 #if DUMMYNET
  96 #include <net/kpi_protocol.h>
  97 #endif /* DUMMYNET */
  98 #include <net/nwk_wq.h>
  99 #include <net/pfvar.h>
 100 #include <netinet/in.h>
 101 #include <netinet/in_systm.h>
 102 #include <netinet/in_var.h>
 103 #include <netinet/ip.h>
 104 #include <netinet/ip_dummynet.h>
 105 #include <netinet/ip_var.h>
 106
 107 #include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
 108 #include <netinet6/ip6_var.h>
 109
 110 /*
 111  * We keep a private variable for the simulation time, but we could
 112  * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
 113  */
 114 static dn_key curr_time = 0;  /* current simulation time */
 115
 116 /* this is for the timer that fires to call dummynet() - we only enable the timer when
 117  *       there are packets to process, otherwise it's disabled */
 118 static int timer_enabled = 0;
 119
 120 static int dn_hash_size = 64;   /* default hash size */
 121
 122 /* statistics on number of queue searches and search steps */
 123 static int searches, search_steps;
 124 static int pipe_expire = 1;    /* expire queue if empty */
 125 static int dn_max_ratio = 16;  /* max queues/buckets ratio */
 126
 127 static int red_lookup_depth = 256;      /* RED - default lookup table depth */
 128 static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
 129 static int red_max_pkt_size = 1500;     /* RED - default max packet size */
 130
 131 static int serialize = 0;
 132
 133 /*
 134  * Three heaps contain queues and pipes that the scheduler handles:
 135  *
 136  * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 137  *
 138  * wfq_ready_heap contains the pipes associated with WF2Q flows
 139  *
 140  * extract_heap contains pipes associated with delay lines.
 141  *
 142  */
 143 static struct dn_heap ready_heap, extract_heap, wfq_ready_heap;
 144
 145 static int heap_init(struct dn_heap *h, int size);
 146 static int heap_insert(struct dn_heap *h, dn_key key1, void *p);
 147 static void heap_extract(struct dn_heap *h, void *obj);
 148
 149
 150 static void     transmit_event(struct dn_pipe *pipe, struct mbuf **head,
 151     struct mbuf **tail);
 152 static void     ready_event(struct dn_flow_queue *q, struct mbuf **head,
 153     struct mbuf **tail);
 154 static void     ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
 155     struct mbuf **tail);
 156
 157 /*
 158  * Packets are retrieved from queues in Dummynet in chains instead of
 159  * packet-by-packet.  The entire list of packets is first dequeued and
 160  * sent out by the following function.
 161  */
 162 static void dummynet_send(struct mbuf *m);
 163
 164 #define HASHSIZE        16
 165 #define HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
 166 static struct dn_pipe_head      pipehash[HASHSIZE];     /* all pipes */
 167 static struct dn_flow_set_head  flowsethash[HASHSIZE];  /* all flowsets */
 168
 169 #ifdef SYSCTL_NODE
 170 SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
 171     CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet");
 172 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
 173     CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size");
 174 SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time,
 175     CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick");
 176 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
 177     CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap");
 178 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
 179     CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap");
 180 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
 181     CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue searches");
 182 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
 183     CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps");
 184 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
 185     CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty");
 186 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
 187     CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0,
 188     "Max ratio between dynamic queues and buckets");
 189 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
 190     CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table");
 191 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
 192     CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size");
 193 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
 194     CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size");
 195 #endif
 196
 197 #ifdef DUMMYNET_DEBUG
 198 int     dummynet_debug = 0;
 199 #ifdef SYSCTL_NODE
 200 SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &dummynet_debug,
 201     0, "control debugging printfs");
 202 #endif
 203 #define DPRINTF(X)      if (dummynet_debug) printf X
 204 #else
 205 #define DPRINTF(X)
 206 #endif
 207
 208 /* dummynet lock */
 209 static lck_grp_t         *dn_mutex_grp;
 210 static lck_grp_attr_t    *dn_mutex_grp_attr;
 211 static lck_attr_t        *dn_mutex_attr;
 212 decl_lck_mtx_data(static, dn_mutex_data);
 213 static lck_mtx_t         *dn_mutex = &dn_mutex_data;
 214
 215 static int config_pipe(struct dn_pipe *p);
 216 static int ip_dn_ctl(struct sockopt *sopt);
 217
 218 static void dummynet(void *);
 219 static void dummynet_flush(void);
 220 void dummynet_drain(void);
 221 static ip_dn_io_t dummynet_io;
 222
 223 static void cp_flow_set_to_64_user(struct dn_flow_set *set, struct dn_flow_set_64 *fs_bp);
 224 static void cp_queue_to_64_user( struct dn_flow_queue *q, struct dn_flow_queue_64 *qp);
 225 static char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp);
 226 static char* dn_copy_set_64(struct dn_flow_set *set, char *bp);
 227 static int cp_pipe_from_user_64( struct sockopt *sopt, struct dn_pipe *p );
 228
 229 static void cp_flow_set_to_32_user(struct dn_flow_set *set, struct dn_flow_set_32 *fs_bp);
 230 static void cp_queue_to_32_user( struct dn_flow_queue *q, struct dn_flow_queue_32 *qp);
 231 static char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp);
 232 static char* dn_copy_set_32(struct dn_flow_set *set, char *bp);
 233 static int cp_pipe_from_user_32( struct sockopt *sopt, struct dn_pipe *p );
 234
 235 struct eventhandler_lists_ctxt dummynet_evhdlr_ctxt;
 236
 237 uint32_t
 238 my_random(void)
 239 {
 240         uint32_t val;
 241         read_frandom(&val, sizeof(val));
 242         val &= 0x7FFFFFFF;
 243
 244         return val;
 245 }
 246
 247 /*
 248  * Heap management functions.
 249  *
 250  * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
 251  * Some macros help finding parent/children so we can optimize them.
 252  *
 253  * heap_init() is called to expand the heap when needed.
 254  * Increment size in blocks of 16 entries.
 255  * XXX failure to allocate a new element is a pretty bad failure
 256  * as we basically stall a whole queue forever!!
 257  * Returns 1 on error, 0 on success
 258  */
 259 #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
 260 #define HEAP_LEFT(x) ( 2*(x) + 1 )
 261 #define HEAP_IS_LEFT(x) ( (x) & 1 )
 262 #define HEAP_RIGHT(x) ( 2*(x) + 2 )
 263 #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
 264 #define HEAP_INCREMENT  15
 265
 266
 267 int
 268 cp_pipe_from_user_32( struct sockopt *sopt, struct dn_pipe *p )
 269 {
 270         struct dn_pipe_32 user_pipe_32;
 271         int error = 0;
 272
 273         error = sooptcopyin(sopt, &user_pipe_32, sizeof(struct dn_pipe_32), sizeof(struct dn_pipe_32));
 274         if (!error) {
 275                 p->pipe_nr = user_pipe_32.pipe_nr;
 276                 p->bandwidth = user_pipe_32.bandwidth;
 277                 p->delay = user_pipe_32.delay;
 278                 p->V = user_pipe_32.V;
 279                 p->sum = user_pipe_32.sum;
 280                 p->numbytes = user_pipe_32.numbytes;
 281                 p->sched_time = user_pipe_32.sched_time;
 282                 bcopy( user_pipe_32.if_name, p->if_name, IFNAMSIZ);
 283                 p->ready = user_pipe_32.ready;
 284
 285                 p->fs.fs_nr = user_pipe_32.fs.fs_nr;
 286                 p->fs.flags_fs = user_pipe_32.fs.flags_fs;
 287                 p->fs.parent_nr = user_pipe_32.fs.parent_nr;
 288                 p->fs.weight = user_pipe_32.fs.weight;
 289                 p->fs.qsize = user_pipe_32.fs.qsize;
 290                 p->fs.plr = user_pipe_32.fs.plr;
 291                 p->fs.flow_mask = user_pipe_32.fs.flow_mask;
 292                 p->fs.rq_size = user_pipe_32.fs.rq_size;
 293                 p->fs.rq_elements = user_pipe_32.fs.rq_elements;
 294                 p->fs.last_expired = user_pipe_32.fs.last_expired;
 295                 p->fs.backlogged = user_pipe_32.fs.backlogged;
 296                 p->fs.w_q = user_pipe_32.fs.w_q;
 297                 p->fs.max_th = user_pipe_32.fs.max_th;
 298                 p->fs.min_th = user_pipe_32.fs.min_th;
 299                 p->fs.max_p = user_pipe_32.fs.max_p;
 300                 p->fs.c_1 = user_pipe_32.fs.c_1;
 301                 p->fs.c_2 = user_pipe_32.fs.c_2;
 302                 p->fs.c_3 = user_pipe_32.fs.c_3;
 303                 p->fs.c_4 = user_pipe_32.fs.c_4;
 304                 p->fs.lookup_depth = user_pipe_32.fs.lookup_depth;
 305                 p->fs.lookup_step = user_pipe_32.fs.lookup_step;
 306                 p->fs.lookup_weight = user_pipe_32.fs.lookup_weight;
 307                 p->fs.avg_pkt_size = user_pipe_32.fs.avg_pkt_size;
 308                 p->fs.max_pkt_size = user_pipe_32.fs.max_pkt_size;
 309         }
 310         return error;
 311 }
 312
 313
 314 int
 315 cp_pipe_from_user_64( struct sockopt *sopt, struct dn_pipe *p )
 316 {
 317         struct dn_pipe_64 user_pipe_64;
 318         int error = 0;
 319
 320         error = sooptcopyin(sopt, &user_pipe_64, sizeof(struct dn_pipe_64), sizeof(struct dn_pipe_64));
 321         if (!error) {
 322                 p->pipe_nr = user_pipe_64.pipe_nr;
 323                 p->bandwidth = user_pipe_64.bandwidth;
 324                 p->delay = user_pipe_64.delay;
 325                 p->V = user_pipe_64.V;
 326                 p->sum = user_pipe_64.sum;
 327                 p->numbytes = user_pipe_64.numbytes;
 328                 p->sched_time = user_pipe_64.sched_time;
 329                 bcopy( user_pipe_64.if_name, p->if_name, IFNAMSIZ);
 330                 p->ready = user_pipe_64.ready;
 331
 332                 p->fs.fs_nr = user_pipe_64.fs.fs_nr;
 333                 p->fs.flags_fs = user_pipe_64.fs.flags_fs;
 334                 p->fs.parent_nr = user_pipe_64.fs.parent_nr;
 335                 p->fs.weight = user_pipe_64.fs.weight;
 336                 p->fs.qsize = user_pipe_64.fs.qsize;
 337                 p->fs.plr = user_pipe_64.fs.plr;
 338                 p->fs.flow_mask = user_pipe_64.fs.flow_mask;
 339                 p->fs.rq_size = user_pipe_64.fs.rq_size;
 340                 p->fs.rq_elements = user_pipe_64.fs.rq_elements;
 341                 p->fs.last_expired = user_pipe_64.fs.last_expired;
 342                 p->fs.backlogged = user_pipe_64.fs.backlogged;
 343                 p->fs.w_q = user_pipe_64.fs.w_q;
 344                 p->fs.max_th = user_pipe_64.fs.max_th;
 345                 p->fs.min_th = user_pipe_64.fs.min_th;
 346                 p->fs.max_p = user_pipe_64.fs.max_p;
 347                 p->fs.c_1 = user_pipe_64.fs.c_1;
 348                 p->fs.c_2 = user_pipe_64.fs.c_2;
 349                 p->fs.c_3 = user_pipe_64.fs.c_3;
 350                 p->fs.c_4 = user_pipe_64.fs.c_4;
 351                 p->fs.lookup_depth = user_pipe_64.fs.lookup_depth;
 352                 p->fs.lookup_step = user_pipe_64.fs.lookup_step;
 353                 p->fs.lookup_weight = user_pipe_64.fs.lookup_weight;
 354                 p->fs.avg_pkt_size = user_pipe_64.fs.avg_pkt_size;
 355                 p->fs.max_pkt_size = user_pipe_64.fs.max_pkt_size;
 356         }
 357         return error;
 358 }
 359
 360 static void
 361 cp_flow_set_to_32_user(struct dn_flow_set *set, struct dn_flow_set_32 *fs_bp)
 362 {
 363         fs_bp->fs_nr = set->fs_nr;
 364         fs_bp->flags_fs = set->flags_fs;
 365         fs_bp->parent_nr = set->parent_nr;
 366         fs_bp->weight = set->weight;
 367         fs_bp->qsize = set->qsize;
 368         fs_bp->plr = set->plr;
 369         fs_bp->flow_mask = set->flow_mask;
 370         fs_bp->rq_size = set->rq_size;
 371         fs_bp->rq_elements = set->rq_elements;
 372         fs_bp->last_expired = set->last_expired;
 373         fs_bp->backlogged = set->backlogged;
 374         fs_bp->w_q = set->w_q;
 375         fs_bp->max_th = set->max_th;
 376         fs_bp->min_th = set->min_th;
 377         fs_bp->max_p = set->max_p;
 378         fs_bp->c_1 = set->c_1;
 379         fs_bp->c_2 = set->c_2;
 380         fs_bp->c_3 = set->c_3;
 381         fs_bp->c_4 = set->c_4;
 382         fs_bp->w_q_lookup = CAST_DOWN_EXPLICIT(user32_addr_t, set->w_q_lookup);
 383         fs_bp->lookup_depth = set->lookup_depth;
 384         fs_bp->lookup_step = set->lookup_step;
 385         fs_bp->lookup_weight = set->lookup_weight;
 386         fs_bp->avg_pkt_size = set->avg_pkt_size;
 387         fs_bp->max_pkt_size = set->max_pkt_size;
 388 }
 389
 390 static void
 391 cp_flow_set_to_64_user(struct dn_flow_set *set, struct dn_flow_set_64 *fs_bp)
 392 {
 393         fs_bp->fs_nr = set->fs_nr;
 394         fs_bp->flags_fs = set->flags_fs;
 395         fs_bp->parent_nr = set->parent_nr;
 396         fs_bp->weight = set->weight;
 397         fs_bp->qsize = set->qsize;
 398         fs_bp->plr = set->plr;
 399         fs_bp->flow_mask = set->flow_mask;
 400         fs_bp->rq_size = set->rq_size;
 401         fs_bp->rq_elements = set->rq_elements;
 402         fs_bp->last_expired = set->last_expired;
 403         fs_bp->backlogged = set->backlogged;
 404         fs_bp->w_q = set->w_q;
 405         fs_bp->max_th = set->max_th;
 406         fs_bp->min_th = set->min_th;
 407         fs_bp->max_p = set->max_p;
 408         fs_bp->c_1 = set->c_1;
 409         fs_bp->c_2 = set->c_2;
 410         fs_bp->c_3 = set->c_3;
 411         fs_bp->c_4 = set->c_4;
 412         fs_bp->w_q_lookup = CAST_DOWN(user64_addr_t, set->w_q_lookup);
 413         fs_bp->lookup_depth = set->lookup_depth;
 414         fs_bp->lookup_step = set->lookup_step;
 415         fs_bp->lookup_weight = set->lookup_weight;
 416         fs_bp->avg_pkt_size = set->avg_pkt_size;
 417         fs_bp->max_pkt_size = set->max_pkt_size;
 418 }
 419
 420 static
 421 void
 422 cp_queue_to_32_user( struct dn_flow_queue *q, struct dn_flow_queue_32 *qp)
 423 {
 424         qp->id = q->id;
 425         qp->len = q->len;
 426         qp->len_bytes = q->len_bytes;
 427         qp->numbytes = q->numbytes;
 428         qp->tot_pkts = q->tot_pkts;
 429         qp->tot_bytes = q->tot_bytes;
 430         qp->drops = q->drops;
 431         qp->hash_slot = q->hash_slot;
 432         qp->avg = q->avg;
 433         qp->count = q->count;
 434         qp->random = q->random;
 435         qp->q_time = q->q_time;
 436         qp->heap_pos = q->heap_pos;
 437         qp->sched_time = q->sched_time;
 438         qp->S = q->S;
 439         qp->F = q->F;
 440 }
 441
 442 static
 443 void
 444 cp_queue_to_64_user( struct dn_flow_queue *q, struct dn_flow_queue_64 *qp)
 445 {
 446         qp->id = q->id;
 447         qp->len = q->len;
 448         qp->len_bytes = q->len_bytes;
 449         qp->numbytes = q->numbytes;
 450         qp->tot_pkts = q->tot_pkts;
 451         qp->tot_bytes = q->tot_bytes;
 452         qp->drops = q->drops;
 453         qp->hash_slot = q->hash_slot;
 454         qp->avg = q->avg;
 455         qp->count = q->count;
 456         qp->random = q->random;
 457         qp->q_time = q->q_time;
 458         qp->heap_pos = q->heap_pos;
 459         qp->sched_time = q->sched_time;
 460         qp->S = q->S;
 461         qp->F = q->F;
 462 }
 463
 464 static
 465 char *
 466 cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp)
 467 {
 468         char    *bp;
 469
 470         pipe_bp->pipe_nr = p->pipe_nr;
 471         pipe_bp->bandwidth = p->bandwidth;
 472         pipe_bp->delay = p->delay;
 473         bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_32));
 474         pipe_bp->scheduler_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->scheduler_heap.p);
 475         bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_32));
 476         pipe_bp->not_eligible_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->not_eligible_heap.p);
 477         bcopy( &(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_32));
 478         pipe_bp->idle_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->idle_heap.p);
 479         pipe_bp->V = p->V;
 480         pipe_bp->sum = p->sum;
 481         pipe_bp->numbytes = p->numbytes;
 482         pipe_bp->sched_time = p->sched_time;
 483         bcopy( p->if_name, pipe_bp->if_name, IFNAMSIZ);
 484         pipe_bp->ifp = CAST_DOWN_EXPLICIT(user32_addr_t, p->ifp);
 485         pipe_bp->ready = p->ready;
 486
 487         cp_flow_set_to_32_user( &(p->fs), &(pipe_bp->fs));
 488
 489         pipe_bp->delay = (pipe_bp->delay * 1000) / (hz * 10);
 490         /*
 491          * XXX the following is a hack based on ->next being the
 492          * first field in dn_pipe and dn_flow_set. The correct
 493          * solution would be to move the dn_flow_set to the beginning
 494          * of struct dn_pipe.
 495          */
 496         pipe_bp->next = CAST_DOWN_EXPLICIT( user32_addr_t, DN_IS_PIPE );
 497         /* clean pointers */
 498         pipe_bp->head = pipe_bp->tail = (user32_addr_t) 0;
 499         pipe_bp->fs.next = (user32_addr_t)0;
 500         pipe_bp->fs.pipe = (user32_addr_t)0;
 501         pipe_bp->fs.rq = (user32_addr_t)0;
 502         bp = ((char *)pipe_bp) + sizeof(struct dn_pipe_32);
 503         return dn_copy_set_32( &(p->fs), bp);
 504 }
 505
 506 static
 507 char *
 508 cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp)
 509 {
 510         char    *bp;
 511
 512         pipe_bp->pipe_nr = p->pipe_nr;
 513         pipe_bp->bandwidth = p->bandwidth;
 514         pipe_bp->delay = p->delay;
 515         bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_64));
 516         pipe_bp->scheduler_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->scheduler_heap.p);
 517         bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_64));
 518         pipe_bp->not_eligible_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->not_eligible_heap.p);
 519         bcopy( &(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_64));
 520         pipe_bp->idle_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->idle_heap.p);
 521         pipe_bp->V = p->V;
 522         pipe_bp->sum = p->sum;
 523         pipe_bp->numbytes = p->numbytes;
 524         pipe_bp->sched_time = p->sched_time;
 525         bcopy( p->if_name, pipe_bp->if_name, IFNAMSIZ);
 526         pipe_bp->ifp = CAST_DOWN(user64_addr_t, p->ifp);
 527         pipe_bp->ready = p->ready;
 528
 529         cp_flow_set_to_64_user( &(p->fs), &(pipe_bp->fs));
 530
 531         pipe_bp->delay = (pipe_bp->delay * 1000) / (hz * 10);
 532         /*
 533          * XXX the following is a hack based on ->next being the
 534          * first field in dn_pipe and dn_flow_set. The correct
 535          * solution would be to move the dn_flow_set to the beginning
 536          * of struct dn_pipe.
 537          */
 538         pipe_bp->next = CAST_DOWN( user64_addr_t, DN_IS_PIPE );
 539         /* clean pointers */
 540         pipe_bp->head = pipe_bp->tail = USER_ADDR_NULL;
 541         pipe_bp->fs.next = USER_ADDR_NULL;
 542         pipe_bp->fs.pipe = USER_ADDR_NULL;
 543         pipe_bp->fs.rq = USER_ADDR_NULL;
 544         bp = ((char *)pipe_bp) + sizeof(struct dn_pipe_64);
 545         return dn_copy_set_64( &(p->fs), bp);
 546 }
 547
 548 static int
 549 heap_init(struct dn_heap *h, int new_size)
 550 {
 551         struct dn_heap_entry *p;
 552
 553         if (h->size >= new_size) {
 554                 printf("dummynet: heap_init, Bogus call, have %d want %d\n",
 555                     h->size, new_size);
 556                 return 0;
 557         }
 558         new_size = (new_size + HEAP_INCREMENT) & ~HEAP_INCREMENT;
 559         p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
 560         if (p == NULL) {
 561                 printf("dummynet: heap_init, resize %d failed\n", new_size );
 562                 return 1; /* error */
 563         }
 564         if (h->size > 0) {
 565                 bcopy(h->p, p, h->size * sizeof(*p));
 566                 FREE(h->p, M_DUMMYNET);
 567         }
 568         h->p = p;
 569         h->size = new_size;
 570         return 0;
 571 }
 572
 573 /*
 574  * Insert element in heap. Normally, p != NULL, we insert p in
 575  * a new position and bubble up. If p == NULL, then the element is
 576  * already in place, and key is the position where to start the
 577  * bubble-up.
 578  * Returns 1 on failure (cannot allocate new heap entry)
 579  *
 580  * If offset > 0 the position (index, int) of the element in the heap is
 581  * also stored in the element itself at the given offset in bytes.
 582  */
 583 #define SET_OFFSET(heap, node) \
 584     if (heap->offset > 0) \
 585             *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
 586 /*
 587  * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 588  */
 589 #define RESET_OFFSET(heap, node) \
 590     if (heap->offset > 0) \
 591             *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
 592 static int
 593 heap_insert(struct dn_heap *h, dn_key key1, void *p)
 594 {
 595         int son = h->elements;
 596
 597         if (p == NULL) { /* data already there, set starting point */
 598                 son = key1;
 599         } else {        /* insert new element at the end, possibly resize */
 600                 son = h->elements;
 601                 if (son == h->size) { /* need resize... */
 602                         if (heap_init(h, h->elements + 1)) {
 603                                 return 1; /* failure... */
 604                         }
 605                 }
 606                 h->p[son].object = p;
 607                 h->p[son].key = key1;
 608                 h->elements++;
 609         }
 610         while (son > 0) {                       /* bubble up */
 611                 int father = HEAP_FATHER(son);
 612                 struct dn_heap_entry tmp;
 613
 614                 if (DN_KEY_LT( h->p[father].key, h->p[son].key )) {
 615                         break; /* found right position */
 616                 }
 617                 /* son smaller than father, swap and repeat */
 618                 HEAP_SWAP(h->p[son], h->p[father], tmp);
 619                 SET_OFFSET(h, son);
 620                 son = father;
 621         }
 622         SET_OFFSET(h, son);
 623         return 0;
 624 }
 625
 626 /*
 627  * remove top element from heap, or obj if obj != NULL
 628  */
 629 static void
 630 heap_extract(struct dn_heap *h, void *obj)
 631 {
 632         int child, father, maxelt = h->elements - 1;
 633
 634         if (maxelt < 0) {
 635                 printf("dummynet: warning, extract from empty heap 0x%llx\n",
 636                     (uint64_t)VM_KERNEL_ADDRPERM(h));
 637                 return;
 638         }
 639         father = 0; /* default: move up smallest child */
 640         if (obj != NULL) { /* extract specific element, index is at offset */
 641                 if (h->offset <= 0) {
 642                         panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
 643                 }
 644                 father = *((int *)((char *)obj + h->offset));
 645                 if (father < 0 || father >= h->elements) {
 646                         printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
 647                             father, h->elements);
 648                         panic("dummynet: heap_extract");
 649                 }
 650         }
 651         RESET_OFFSET(h, father);
 652         child = HEAP_LEFT(father);      /* left child */
 653         while (child <= maxelt) {       /* valid entry */
 654                 if (child != maxelt && DN_KEY_LT(h->p[child + 1].key, h->p[child].key)) {
 655                         child = child + 1; /* take right child, otherwise left */
 656                 }
 657                 h->p[father] = h->p[child];
 658                 SET_OFFSET(h, father);
 659                 father = child;
 660                 child = HEAP_LEFT(child); /* left child for next loop */
 661         }
 662         h->elements--;
 663         if (father != maxelt) {
 664                 /*
 665                  * Fill hole with last entry and bubble up, reusing the insert code
 666                  */
 667                 h->p[father] = h->p[maxelt];
 668                 heap_insert(h, father, NULL); /* this one cannot fail */
 669         }
 670 }
 671
 672 /*
 673  * heapify() will reorganize data inside an array to maintain the
 674  * heap property. It is needed when we delete a bunch of entries.
 675  */
 676 static void
 677 heapify(struct dn_heap *h)
 678 {
 679         int i;
 680
 681         for (i = 0; i < h->elements; i++) {
 682                 heap_insert(h, i, NULL);
 683         }
 684 }
 685
 686 /*
 687  * cleanup the heap and free data structure
 688  */
 689 static void
 690 heap_free(struct dn_heap *h)
 691 {
 692         if (h->size > 0) {
 693                 FREE(h->p, M_DUMMYNET);
 694         }
 695         bzero(h, sizeof(*h));
 696 }
 697
 698 /*
 699  * --- end of heap management functions ---
 700  */
 701
 702 /*
 703  * Return the mbuf tag holding the dummynet state.  As an optimization
 704  * this is assumed to be the first tag on the list.  If this turns out
 705  * wrong we'll need to search the list.
 706  */
 707 static struct dn_pkt_tag *
 708 dn_tag_get(struct mbuf *m)
 709 {
 710         struct m_tag *mtag = m_tag_first(m);
 711
 712         if (!(mtag != NULL &&
 713             mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
 714             mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET)) {
 715                 panic("packet on dummynet queue w/o dummynet tag: 0x%llx",
 716                     (uint64_t)VM_KERNEL_ADDRPERM(m));
 717         }
 718
 719         return (struct dn_pkt_tag *)(mtag + 1);
 720 }
 721
 722 /*
 723  * Scheduler functions:
 724  *
 725  * transmit_event() is called when the delay-line needs to enter
 726  * the scheduler, either because of existing pkts getting ready,
 727  * or new packets entering the queue. The event handled is the delivery
 728  * time of the packet.
 729  *
 730  * ready_event() does something similar with fixed-rate queues, and the
 731  * event handled is the finish time of the head pkt.
 732  *
 733  * wfq_ready_event() does something similar with WF2Q queues, and the
 734  * event handled is the start time of the head pkt.
 735  *
 736  * In all cases, we make sure that the data structures are consistent
 737  * before passing pkts out, because this might trigger recursive
 738  * invocations of the procedures.
 739  */
 740 static void
 741 transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
 742 {
 743         struct mbuf *m;
 744         struct dn_pkt_tag *pkt = NULL;
 745         u_int64_t schedule_time;
 746
 747         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
 748         ASSERT(serialize >= 0);
 749         if (serialize == 0) {
 750                 while ((m = pipe->head) != NULL) {
 751                         pkt = dn_tag_get(m);
 752                         if (!DN_KEY_LEQ(pkt->dn_output_time, curr_time)) {
 753                                 break;
 754                         }
 755
 756                         pipe->head = m->m_nextpkt;
 757                         if (*tail != NULL) {
 758                                 (*tail)->m_nextpkt = m;
 759                         } else {
 760                                 *head = m;
 761                         }
 762                         *tail = m;
 763                 }
 764
 765                 if (*tail != NULL) {
 766                         (*tail)->m_nextpkt = NULL;
 767                 }
 768         }
 769
 770         schedule_time = pkt == NULL || DN_KEY_LEQ(pkt->dn_output_time, curr_time) ?
 771             curr_time + 1 : pkt->dn_output_time;
 772
 773         /* if there are leftover packets, put the pipe into the heap for next ready event */
 774         if ((m = pipe->head) != NULL) {
 775                 pkt = dn_tag_get(m);
 776                 /* XXX should check errors on heap_insert, by draining the
 777                  * whole pipe p and hoping in the future we are more successful
 778                  */
 779                 heap_insert(&extract_heap, schedule_time, pipe);
 780         }
 781 }
 782
 783 /*
 784  * the following macro computes how many ticks we have to wait
 785  * before being able to transmit a packet. The credit is taken from
 786  * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
 787  */
 788
 789 /* hz is 100, which gives a granularity of 10ms in the old timer.
 790  * The timer has been changed to fire every 1ms, so the use of
 791  * hz has been modified here. All instances of hz have been left
 792  * in place but adjusted by a factor of 10 so that hz is functionally
 793  * equal to 1000.
 794  */
 795 #define SET_TICKS(_m, q, p)     \
 796     ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
 797             p->bandwidth ;
 798
 799 /*
 800  * extract pkt from queue, compute output time (could be now)
 801  * and put into delay line (p_queue)
 802  */
 803 static void
 804 move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
 805     struct dn_pipe *p, int len)
 806 {
 807         struct dn_pkt_tag *dt = dn_tag_get(pkt);
 808
 809         q->head = pkt->m_nextpkt;
 810         q->len--;
 811         q->len_bytes -= len;
 812
 813         dt->dn_output_time = curr_time + p->delay;
 814
 815         if (p->head == NULL) {
 816                 p->head = pkt;
 817         } else {
 818                 p->tail->m_nextpkt = pkt;
 819         }
 820         p->tail = pkt;
 821         p->tail->m_nextpkt = NULL;
 822 }
 823
 824 /*
 825  * ready_event() is invoked every time the queue must enter the
 826  * scheduler, either because the first packet arrives, or because
 827  * a previously scheduled event fired.
 828  * On invokation, drain as many pkts as possible (could be 0) and then
 829  * if there are leftover packets reinsert the pkt in the scheduler.
 830  */
 831 static void
 832 ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
 833 {
 834         struct mbuf *pkt;
 835         struct dn_pipe *p = q->fs->pipe;
 836         int p_was_empty;
 837
 838         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
 839
 840         if (p == NULL) {
 841                 printf("dummynet: ready_event pipe is gone\n");
 842                 return;
 843         }
 844         p_was_empty = (p->head == NULL);
 845
 846         /*
 847          * schedule fixed-rate queues linked to this pipe:
 848          * Account for the bw accumulated since last scheduling, then
 849          * drain as many pkts as allowed by q->numbytes and move to
 850          * the delay line (in p) computing output time.
 851          * bandwidth==0 (no limit) means we can drain the whole queue,
 852          * setting len_scaled = 0 does the job.
 853          */
 854         q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
 855         while ((pkt = q->head) != NULL) {
 856                 int len = pkt->m_pkthdr.len;
 857                 int len_scaled = p->bandwidth ? len * 8 * (hz * 10) : 0;
 858                 if (len_scaled > q->numbytes) {
 859                         break;
 860                 }
 861                 q->numbytes -= len_scaled;
 862                 move_pkt(pkt, q, p, len);
 863         }
 864         /*
 865          * If we have more packets queued, schedule next ready event
 866          * (can only occur when bandwidth != 0, otherwise we would have
 867          * flushed the whole queue in the previous loop).
 868          * To this purpose we record the current time and compute how many
 869          * ticks to go for the finish time of the packet.
 870          */
 871         if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */
 872                 dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
 873                 q->sched_time = curr_time;
 874                 heap_insert(&ready_heap, curr_time + t, (void *)q );
 875                 /* XXX should check errors on heap_insert, and drain the whole
 876                  * queue on error hoping next time we are luckier.
 877                  */
 878         } else { /* RED needs to know when the queue becomes empty */
 879                 q->q_time = curr_time;
 880                 q->numbytes = 0;
 881         }
 882         /*
 883          * If the delay line was empty call transmit_event(p) now.
 884          * Otherwise, the scheduler will take care of it.
 885          */
 886         if (p_was_empty) {
 887                 transmit_event(p, head, tail);
 888         }
 889 }
 890
 891 /*
 892  * Called when we can transmit packets on WF2Q queues. Take pkts out of
 893  * the queues at their start time, and enqueue into the delay line.
 894  * Packets are drained until p->numbytes < 0. As long as
 895  * len_scaled >= p->numbytes, the packet goes into the delay line
 896  * with a deadline p->delay. For the last packet, if p->numbytes<0,
 897  * there is an additional delay.
 898  */
 899 static void
 900 ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
 901 {
 902         int p_was_empty = (p->head == NULL);
 903         struct dn_heap *sch = &(p->scheduler_heap);
 904         struct dn_heap *neh = &(p->not_eligible_heap);
 905         int64_t p_numbytes = p->numbytes;
 906
 907         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
 908
 909         if (p->if_name[0] == 0) { /* tx clock is simulated */
 910                 p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
 911         } else { /* tx clock is for real, the ifq must be empty or this is a NOP */
 912                 if (p->ifp && !IFCQ_IS_EMPTY(&p->ifp->if_snd)) {
 913                         return;
 914                 } else {
 915                         DPRINTF(("dummynet: pipe %d ready from %s --\n",
 916                             p->pipe_nr, p->if_name));
 917                 }
 918         }
 919
 920         /*
 921          * While we have backlogged traffic AND credit, we need to do
 922          * something on the queue.
 923          */
 924         while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
 925                 if (sch->elements > 0) { /* have some eligible pkts to send out */
 926                         struct dn_flow_queue *q = sch->p[0].object;
 927                         struct mbuf *pkt = q->head;
 928                         struct dn_flow_set *fs = q->fs;
 929                         u_int64_t len = pkt->m_pkthdr.len;
 930                         int len_scaled = p->bandwidth ? len * 8 * (hz * 10) : 0;
 931
 932                         heap_extract(sch, NULL); /* remove queue from heap */
 933                         p_numbytes -= len_scaled;
 934                         move_pkt(pkt, q, p, len);
 935
 936                         p->V += (len << MY_M) / p->sum; /* update V */
 937                         q->S = q->F; /* update start time */
 938                         if (q->len == 0) { /* Flow not backlogged any more */
 939                                 fs->backlogged--;
 940                                 heap_insert(&(p->idle_heap), q->F, q);
 941                         } else { /* still backlogged */
 942                                 /*
 943                                  * update F and position in backlogged queue, then
 944                                  * put flow in not_eligible_heap (we will fix this later).
 945                                  */
 946                                 len = (q->head)->m_pkthdr.len;
 947                                 q->F += (len << MY_M) / (u_int64_t) fs->weight;
 948                                 if (DN_KEY_LEQ(q->S, p->V)) {
 949                                         heap_insert(neh, q->S, q);
 950                                 } else {
 951                                         heap_insert(sch, q->F, q);
 952                                 }
 953                         }
 954                 }
 955                 /*
 956                  * now compute V = max(V, min(S_i)). Remember that all elements in sch
 957                  * have by definition S_i <= V so if sch is not empty, V is surely
 958                  * the max and we must not update it. Conversely, if sch is empty
 959                  * we only need to look at neh.
 960                  */
 961                 if (sch->elements == 0 && neh->elements > 0) {
 962                         p->V = MAX64( p->V, neh->p[0].key );
 963                 }
 964                 /* move from neh to sch any packets that have become eligible */
 965                 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
 966                         struct dn_flow_queue *q = neh->p[0].object;
 967                         heap_extract(neh, NULL);
 968                         heap_insert(sch, q->F, q);
 969                 }
 970
 971                 if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
 972                         p_numbytes = -1; /* mark not ready for I/O */
 973                         break;
 974                 }
 975         }
 976         if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0
 977             && p->idle_heap.elements > 0) {
 978                 /*
 979                  * no traffic and no events scheduled. We can get rid of idle-heap.
 980                  */
 981                 int i;
 982
 983                 for (i = 0; i < p->idle_heap.elements; i++) {
 984                         struct dn_flow_queue *q = p->idle_heap.p[i].object;
 985
 986                         q->F = 0;
 987                         q->S = q->F + 1;
 988                 }
 989                 p->sum = 0;
 990                 p->V = 0;
 991                 p->idle_heap.elements = 0;
 992         }
 993         /*
 994          * If we are getting clocks from dummynet (not a real interface) and
 995          * If we are under credit, schedule the next ready event.
 996          * Also fix the delivery time of the last packet.
 997          */
 998         if (p->if_name[0] == 0 && p_numbytes < 0) { /* this implies bandwidth >0 */
 999                 dn_key t = 0; /* number of ticks i have to wait */
1000
1001                 if (p->bandwidth > 0) {
1002                         t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
1003                 }
1004                 dn_tag_get(p->tail)->dn_output_time += t;
1005                 p->sched_time = curr_time;
1006                 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
1007                 /* XXX should check errors on heap_insert, and drain the whole
1008                  * queue on error hoping next time we are luckier.
1009                  */
1010         }
1011
1012         /* Fit (adjust if necessary) 64bit result into 32bit variable. */
1013         if (p_numbytes > INT_MAX) {
1014                 p->numbytes = INT_MAX;
1015         } else if (p_numbytes < INT_MIN) {
1016                 p->numbytes = INT_MIN;
1017         } else {
1018                 p->numbytes = p_numbytes;
1019         }
1020
1021         /*
1022          * If the delay line was empty call transmit_event(p) now.
1023          * Otherwise, the scheduler will take care of it.
1024          */
1025         if (p_was_empty) {
1026                 transmit_event(p, head, tail);
1027         }
1028 }
1029
1030 /*
1031  * This is called every 1ms. It is used to
1032  * increment the current tick counter and schedule expired events.
1033  */
1034 static void
1035 dummynet(__unused void * unused)
1036 {
1037         void *p; /* generic parameter to handler */
1038         struct dn_heap *h;
1039         struct dn_heap *heaps[3];
1040         struct mbuf *head = NULL, *tail = NULL;
1041         int i;
1042         struct dn_pipe *pe;
1043         struct timespec ts;
1044         struct timeval      tv;
1045
1046         heaps[0] = &ready_heap;         /* fixed-rate queues */
1047         heaps[1] = &wfq_ready_heap;     /* wfq queues */
1048         heaps[2] = &extract_heap;       /* delay line */
1049
1050         lck_mtx_lock(dn_mutex);
1051
1052         /* make all time measurements in milliseconds (ms) -
1053          * here we convert secs and usecs to msecs (just divide the
1054          * usecs and take the closest whole number).
1055          */
1056         microuptime(&tv);
1057         curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
1058
1059         for (i = 0; i < 3; i++) {
1060                 h = heaps[i];
1061                 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
1062                         if (h->p[0].key > curr_time) {
1063                                 printf("dummynet: warning, heap %d is %d ticks late\n",
1064                                     i, (int)(curr_time - h->p[0].key));
1065                         }
1066                         p = h->p[0].object; /* store a copy before heap_extract */
1067                         heap_extract(h, NULL); /* need to extract before processing */
1068                         if (i == 0) {
1069                                 ready_event(p, &head, &tail);
1070                         } else if (i == 1) {
1071                                 struct dn_pipe *pipe = p;
1072                                 if (pipe->if_name[0] != '\0') {
1073                                         printf("dummynet: bad ready_event_wfq for pipe %s\n",
1074                                             pipe->if_name);
1075                                 } else {
1076                                         ready_event_wfq(p, &head, &tail);
1077                                 }
1078                         } else {
1079                                 transmit_event(p, &head, &tail);
1080                         }
1081                 }
1082         }
1083         /* sweep pipes trying to expire idle flow_queues */
1084         for (i = 0; i < HASHSIZE; i++) {
1085                 SLIST_FOREACH(pe, &pipehash[i], next) {
1086                         if (pe->idle_heap.elements > 0 &&
1087                             DN_KEY_LT(pe->idle_heap.p[0].key, pe->V)) {
1088                                 struct dn_flow_queue *q = pe->idle_heap.p[0].object;
1089
1090                                 heap_extract(&(pe->idle_heap), NULL);
1091                                 q->S = q->F + 1; /* mark timestamp as invalid */
1092                                 pe->sum -= q->fs->weight;
1093                         }
1094                 }
1095         }
1096
1097         /* check the heaps to see if there's still stuff in there, and
1098          * only set the timer if there are packets to process
1099          */
1100         timer_enabled = 0;
1101         for (i = 0; i < 3; i++) {
1102                 h = heaps[i];
1103                 if (h->elements > 0) { // set the timer
1104                         ts.tv_sec = 0;
1105                         ts.tv_nsec = 1 * 1000000;       // 1ms
1106                         timer_enabled = 1;
1107                         bsd_timeout(dummynet, NULL, &ts);
1108                         break;
1109                 }
1110         }
1111
1112         if (head != NULL) {
1113                 serialize++;
1114         }
1115
1116         lck_mtx_unlock(dn_mutex);
1117
1118         /* Send out the de-queued list of ready-to-send packets */
1119         if (head != NULL) {
1120                 dummynet_send(head);
1121                 lck_mtx_lock(dn_mutex);
1122                 serialize--;
1123                 lck_mtx_unlock(dn_mutex);
1124         }
1125 }
1126
1127
1128 static void
1129 dummynet_send(struct mbuf *m)
1130 {
1131         struct dn_pkt_tag *pkt;
1132         struct mbuf *n;
1133
1134         for (; m != NULL; m = n) {
1135                 n = m->m_nextpkt;
1136                 m->m_nextpkt = NULL;
1137                 pkt = dn_tag_get(m);
1138
1139                 DPRINTF(("dummynet_send m: 0x%llx dn_dir: %d dn_flags: 0x%x\n",
1140                     (uint64_t)VM_KERNEL_ADDRPERM(m), pkt->dn_dir,
1141                     pkt->dn_flags));
1142
1143                 switch (pkt->dn_dir) {
1144                 case DN_TO_IP_OUT: {
1145                         struct route tmp_rt;
1146
1147                         /* route is already in the packet's dn_ro */
1148                         bzero(&tmp_rt, sizeof(tmp_rt));
1149
1150                         /* Force IP_RAWOUTPUT as the IP header is fully formed */
1151                         pkt->dn_flags |= IP_RAWOUTPUT | IP_FORWARDING;
1152                         (void)ip_output(m, NULL, &tmp_rt, pkt->dn_flags, NULL, NULL);
1153                         ROUTE_RELEASE(&tmp_rt);
1154                         break;
1155                 }
1156                 case DN_TO_IP_IN:
1157                         proto_inject(PF_INET, m);
1158                         break;
1159                 case DN_TO_IP6_OUT: {
1160                         /* routes already in the packet's dn_{ro6,pmtu} */
1161                         ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
1162                         break;
1163                 }
1164                 case DN_TO_IP6_IN:
1165                         proto_inject(PF_INET6, m);
1166                         break;
1167                 default:
1168                         printf("dummynet: bad switch %d!\n", pkt->dn_dir);
1169                         m_freem(m);
1170                         break;
1171                 }
1172         }
1173 }
1174
1175 /*
1176  * Unconditionally expire empty queues in case of shortage.
1177  * Returns the number of queues freed.
1178  */
1179 static int
1180 expire_queues(struct dn_flow_set *fs)
1181 {
1182         struct dn_flow_queue *q, *prev;
1183         int i, initial_elements = fs->rq_elements;
1184         struct timeval timenow;
1185
1186         /* reviewed for getmicrotime usage */
1187         getmicrotime(&timenow);
1188
1189         if (fs->last_expired == timenow.tv_sec) {
1190                 return 0;
1191         }
1192         fs->last_expired = timenow.tv_sec;
1193         for (i = 0; i <= fs->rq_size; i++) { /* last one is overflow */
1194                 for (prev = NULL, q = fs->rq[i]; q != NULL;) {
1195                         if (q->head != NULL || q->S != q->F + 1) {
1196                                 prev = q;
1197                                 q = q->next;
1198                         } else { /* entry is idle, expire it */
1199                                 struct dn_flow_queue *old_q = q;
1200
1201                                 if (prev != NULL) {
1202                                         prev->next = q = q->next;
1203                                 } else {
1204                                         fs->rq[i] = q = q->next;
1205                                 }
1206                                 fs->rq_elements--;
1207                                 FREE(old_q, M_DUMMYNET);
1208                         }
1209                 }
1210         }
1211         return initial_elements - fs->rq_elements;
1212 }
1213
1214 /*
1215  * If room, create a new queue and put at head of slot i;
1216  * otherwise, create or use the default queue.
1217  */
1218 static struct dn_flow_queue *
1219 create_queue(struct dn_flow_set *fs, int i)
1220 {
1221         struct dn_flow_queue *q;
1222
1223         if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
1224             expire_queues(fs) == 0) {
1225                 /*
1226                  * No way to get room, use or create overflow queue.
1227                  */
1228                 i = fs->rq_size;
1229                 if (fs->rq[i] != NULL) {
1230                         return fs->rq[i];
1231                 }
1232         }
1233         q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
1234         if (q == NULL) {
1235                 printf("dummynet: sorry, cannot allocate queue for new flow\n");
1236                 return NULL;
1237         }
1238         q->fs = fs;
1239         q->hash_slot = i;
1240         q->next = fs->rq[i];
1241         q->S = q->F + 1; /* hack - mark timestamp as invalid */
1242         fs->rq[i] = q;
1243         fs->rq_elements++;
1244         return q;
1245 }
1246
1247 /*
1248  * Given a flow_set and a pkt in last_pkt, find a matching queue
1249  * after appropriate masking. The queue is moved to front
1250  * so that further searches take less time.
1251  */
1252 static struct dn_flow_queue *
1253 find_queue(struct dn_flow_set *fs, struct ip_flow_id *id)
1254 {
1255         int i = 0; /* we need i and q for new allocations */
1256         struct dn_flow_queue *q, *prev;
1257         int is_v6 = IS_IP6_FLOW_ID(id);
1258
1259         if (!(fs->flags_fs & DN_HAVE_FLOW_MASK)) {
1260                 q = fs->rq[0];
1261         } else {
1262                 /* first, do the masking, then hash */
1263                 id->dst_port &= fs->flow_mask.dst_port;
1264                 id->src_port &= fs->flow_mask.src_port;
1265                 id->proto &= fs->flow_mask.proto;
1266                 id->flags = 0; /* we don't care about this one */
1267                 if (is_v6) {
1268                         APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
1269                         APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
1270                         id->flow_id6 &= fs->flow_mask.flow_id6;
1271
1272                         i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff) ^
1273                             ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff) ^
1274                             ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff) ^
1275                             ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff) ^
1276
1277                             ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff) ^
1278                             ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff) ^
1279                             ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff) ^
1280                             ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff) ^
1281
1282                             ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff) ^
1283                             ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff) ^
1284                             ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff) ^
1285                             ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff) ^
1286
1287                             ((id->src_ip6.__u6_addr.__u6_addr32[0] >> 16) & 0xffff) ^
1288                             ((id->src_ip6.__u6_addr.__u6_addr32[1] >> 16) & 0xffff) ^
1289                             ((id->src_ip6.__u6_addr.__u6_addr32[2] >> 16) & 0xffff) ^
1290                             ((id->src_ip6.__u6_addr.__u6_addr32[3] >> 16) & 0xffff) ^
1291
1292                             (id->dst_port << 1) ^ (id->src_port) ^
1293                             (id->proto) ^
1294                             (id->flow_id6);
1295                 } else {
1296                         id->dst_ip &= fs->flow_mask.dst_ip;
1297                         id->src_ip &= fs->flow_mask.src_ip;
1298
1299                         i = ((id->dst_ip) & 0xffff) ^
1300                             ((id->dst_ip >> 15) & 0xffff) ^
1301                             ((id->src_ip << 1) & 0xffff) ^
1302                             ((id->src_ip >> 16) & 0xffff) ^
1303                             (id->dst_port << 1) ^ (id->src_port) ^
1304                             (id->proto);
1305                 }
1306                 i = i % fs->rq_size;
1307                 /* finally, scan the current list for a match */
1308                 searches++;
1309                 for (prev = NULL, q = fs->rq[i]; q;) {
1310                         search_steps++;
1311                         if (is_v6 &&
1312                             IN6_ARE_ADDR_EQUAL(&id->dst_ip6, &q->id.dst_ip6) &&
1313                             IN6_ARE_ADDR_EQUAL(&id->src_ip6, &q->id.src_ip6) &&
1314                             id->dst_port == q->id.dst_port &&
1315                             id->src_port == q->id.src_port &&
1316                             id->proto == q->id.proto &&
1317                             id->flags == q->id.flags &&
1318                             id->flow_id6 == q->id.flow_id6) {
1319                                 break; /* found */
1320                         }
1321                         if (!is_v6 && id->dst_ip == q->id.dst_ip &&
1322                             id->src_ip == q->id.src_ip &&
1323                             id->dst_port == q->id.dst_port &&
1324                             id->src_port == q->id.src_port &&
1325                             id->proto == q->id.proto &&
1326                             id->flags == q->id.flags) {
1327                                 break; /* found */
1328                         }
1329                         /* No match. Check if we can expire the entry */
1330                         if (pipe_expire && q->head == NULL && q->S == q->F + 1) {
1331                                 /* entry is idle and not in any heap, expire it */
1332                                 struct dn_flow_queue *old_q = q;
1333
1334                                 if (prev != NULL) {
1335                                         prev->next = q = q->next;
1336                                 } else {
1337                                         fs->rq[i] = q = q->next;
1338                                 }
1339                                 fs->rq_elements--;
1340                                 FREE(old_q, M_DUMMYNET);
1341                                 continue;
1342                         }
1343                         prev = q;
1344                         q = q->next;
1345                 }
1346                 if (q && prev != NULL) { /* found and not in front */
1347                         prev->next = q->next;
1348                         q->next = fs->rq[i];
1349                         fs->rq[i] = q;
1350                 }
1351         }
1352         if (q == NULL) { /* no match, need to allocate a new entry */
1353                 q = create_queue(fs, i);
1354                 if (q != NULL) {
1355                         q->id = *id;
1356                 }
1357         }
1358         return q;
1359 }
1360
1361 static int
1362 red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
1363 {
1364         /*
1365          * RED algorithm
1366          *
1367          * RED calculates the average queue size (avg) using a low-pass filter
1368          * with an exponential weighted (w_q) moving average:
1369          *      avg  <-  (1-w_q) * avg + w_q * q_size
1370          * where q_size is the queue length (measured in bytes or * packets).
1371          *
1372          * If q_size == 0, we compute the idle time for the link, and set
1373          *      avg = (1 - w_q)^(idle/s)
1374          * where s is the time needed for transmitting a medium-sized packet.
1375          *
1376          * Now, if avg < min_th the packet is enqueued.
1377          * If avg > max_th the packet is dropped. Otherwise, the packet is
1378          * dropped with probability P function of avg.
1379          *
1380          */
1381
1382         int64_t p_b = 0;
1383         /* queue in bytes or packets ? */
1384         u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
1385
1386         DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
1387
1388         /* average queue size estimation */
1389         if (q_size != 0) {
1390                 /*
1391                  * queue is not empty, avg <- avg + (q_size - avg) * w_q
1392                  */
1393                 int diff = SCALE(q_size) - q->avg;
1394                 int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
1395
1396                 q->avg += (int) v;
1397         } else {
1398                 /*
1399                  * queue is empty, find for how long the queue has been
1400                  * empty and use a lookup table for computing
1401                  * (1 - * w_q)^(idle_time/s) where s is the time to send a
1402                  * (small) packet.
1403                  * XXX check wraps...
1404                  */
1405                 if (q->avg) {
1406                         u_int t = (curr_time - q->q_time) / fs->lookup_step;
1407
1408                         q->avg = (t < fs->lookup_depth) ?
1409                             SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
1410                 }
1411         }
1412         DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1413
1414         /* should i drop ? */
1415
1416         if (q->avg < fs->min_th) {
1417                 q->count = -1;
1418                 return 0; /* accept packet ; */
1419         }
1420         if (q->avg >= fs->max_th) { /* average queue >=  max threshold */
1421                 if (fs->flags_fs & DN_IS_GENTLE_RED) {
1422                         /*
1423                          * According to Gentle-RED, if avg is greater than max_th the
1424                          * packet is dropped with a probability
1425                          *      p_b = c_3 * avg - c_4
1426                          * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1427                          */
1428                         p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1429                 } else {
1430                         q->count = -1;
1431                         DPRINTF(("dummynet: - drop"));
1432                         return 1;
1433                 }
1434         } else if (q->avg > fs->min_th) {
1435                 /*
1436                  * we compute p_b using the linear dropping function p_b = c_1 *
1437                  * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1438                  * max_p * min_th / (max_th - min_th)
1439                  */
1440                 p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1441         }
1442         if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
1443                 p_b = (p_b * len) / fs->max_pkt_size;
1444         }
1445         if (++q->count == 0) {
1446                 q->random = (my_random() & 0xffff);
1447         } else {
1448                 /*
1449                  * q->count counts packets arrived since last drop, so a greater
1450                  * value of q->count means a greater packet drop probability.
1451                  */
1452                 if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1453                         q->count = 0;
1454                         DPRINTF(("dummynet: - red drop"));
1455                         /* after a drop we calculate a new random value */
1456                         q->random = (my_random() & 0xffff);
1457                         return 1; /* drop */
1458                 }
1459         }
1460         /* end of RED algorithm */
1461         return 0; /* accept */
1462 }
1463
1464 static __inline
1465 struct dn_flow_set *
1466 locate_flowset(int fs_nr)
1467 {
1468         struct dn_flow_set *fs;
1469         SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next) {
1470                 if (fs->fs_nr == fs_nr) {
1471                         return fs;
1472                 }
1473         }
1474
1475         return NULL;
1476 }
1477
1478 static __inline struct dn_pipe *
1479 locate_pipe(int pipe_nr)
1480 {
1481         struct dn_pipe *pipe;
1482
1483         SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next) {
1484                 if (pipe->pipe_nr == pipe_nr) {
1485                         return pipe;
1486                 }
1487         }
1488
1489         return NULL;
1490 }
1491
1492
1493
1494 /*
1495  * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1496  * depending on whether WF2Q or fixed bw is used.
1497  *
1498  * pipe_nr      pipe or queue the packet is destined for.
1499  * dir          where shall we send the packet after dummynet.
1500  * m            the mbuf with the packet
1501  * ifp          the 'ifp' parameter from the caller.
1502  *              NULL in ip_input, destination interface in ip_output,
1503  *              real_dst in bdg_forward
1504  * ro           route parameter (only used in ip_output, NULL otherwise)
1505  * dst          destination address, only used by ip_output
1506  * rule         matching rule, in case of multiple passes
1507  * flags        flags from the caller, only used in ip_output
1508  *
1509  */
1510 static int
1511 dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
1512 {
1513         struct mbuf *head = NULL, *tail = NULL;
1514         struct dn_pkt_tag *pkt;
1515         struct m_tag *mtag;
1516         struct dn_flow_set *fs = NULL;
1517         struct dn_pipe *pipe;
1518         u_int64_t len = m->m_pkthdr.len;
1519         struct dn_flow_queue *q = NULL;
1520         int is_pipe = 0;
1521         struct timespec ts;
1522         struct timeval      tv;
1523
1524         DPRINTF(("dummynet_io m: 0x%llx pipe: %d dir: %d\n",
1525             (uint64_t)VM_KERNEL_ADDRPERM(m), pipe_nr, dir));
1526
1527
1528 #if DUMMYNET
1529         is_pipe = fwa->fwa_flags == DN_IS_PIPE ? 1 : 0;
1530 #endif /* DUMMYNET */
1531
1532         pipe_nr &= 0xffff;
1533
1534         lck_mtx_lock(dn_mutex);
1535
1536         /* make all time measurements in milliseconds (ms) -
1537          * here we convert secs and usecs to msecs (just divide the
1538          * usecs and take the closest whole number).
1539          */
1540         microuptime(&tv);
1541         curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
1542
1543         /*
1544          * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1545          */
1546         if (is_pipe) {
1547                 pipe = locate_pipe(pipe_nr);
1548                 if (pipe != NULL) {
1549                         fs = &(pipe->fs);
1550                 }
1551         } else {
1552                 fs = locate_flowset(pipe_nr);
1553         }
1554
1555
1556         if (fs == NULL) {
1557                 goto dropit; /* this queue/pipe does not exist! */
1558         }
1559         pipe = fs->pipe;
1560         if (pipe == NULL) { /* must be a queue, try find a matching pipe */
1561                 pipe = locate_pipe(fs->parent_nr);
1562
1563                 if (pipe != NULL) {
1564                         fs->pipe = pipe;
1565                 } else {
1566                         printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1567                             fs->parent_nr, fs->fs_nr);
1568                         goto dropit;
1569                 }
1570         }
1571         q = find_queue(fs, &(fwa->fwa_id));
1572         if (q == NULL) {
1573                 goto dropit;    /* cannot allocate queue                */
1574         }
1575         /*
1576          * update statistics, then check reasons to drop pkt
1577          */
1578         q->tot_bytes += len;
1579         q->tot_pkts++;
1580         if (fs->plr && (my_random() < fs->plr)) {
1581                 goto dropit;    /* random pkt drop                      */
1582         }
1583         if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
1584                 if (q->len_bytes > fs->qsize) {
1585                         goto dropit; /* queue size overflow                     */
1586                 }
1587         } else {
1588                 if (q->len >= fs->qsize) {
1589                         goto dropit; /* queue count overflow                    */
1590                 }
1591         }
1592         if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len)) {
1593                 goto dropit;
1594         }
1595
1596         /* XXX expensive to zero, see if we can remove it*/
1597         mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1598             sizeof(struct dn_pkt_tag), M_NOWAIT, m);
1599         if (mtag == NULL) {
1600                 goto dropit;            /* cannot allocate packet header        */
1601         }
1602         m_tag_prepend(m, mtag); /* attach to mbuf chain */
1603
1604         pkt = (struct dn_pkt_tag *)(mtag + 1);
1605         bzero(pkt, sizeof(struct dn_pkt_tag));
1606         /* ok, i can handle the pkt now... */
1607         /* build and enqueue packet + parameters */
1608         pkt->dn_pf_rule = fwa->fwa_pf_rule;
1609         pkt->dn_dir = dir;
1610
1611         pkt->dn_ifp = fwa->fwa_oif;
1612         if (dir == DN_TO_IP_OUT) {
1613                 /*
1614                  * We need to copy *ro because for ICMP pkts (and maybe others)
1615                  * the caller passed a pointer into the stack; dst might also be
1616                  * a pointer into *ro so it needs to be updated.
1617                  */
1618                 if (fwa->fwa_ro) {
1619                         route_copyout(&pkt->dn_ro, fwa->fwa_ro, sizeof(pkt->dn_ro));
1620                 }
1621                 if (fwa->fwa_dst) {
1622                         if (fwa->fwa_dst == (struct sockaddr_in *)&fwa->fwa_ro->ro_dst) { /* dst points into ro */
1623                                 fwa->fwa_dst = (struct sockaddr_in *)&(pkt->dn_ro.ro_dst);
1624                         }
1625
1626                         bcopy(fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst));
1627                 }
1628         } else if (dir == DN_TO_IP6_OUT) {
1629                 if (fwa->fwa_ro6) {
1630                         route_copyout((struct route *)&pkt->dn_ro6,
1631                             (struct route *)fwa->fwa_ro6, sizeof(pkt->dn_ro6));
1632                 }
1633                 if (fwa->fwa_ro6_pmtu) {
1634                         route_copyout((struct route *)&pkt->dn_ro6_pmtu,
1635                             (struct route *)fwa->fwa_ro6_pmtu, sizeof(pkt->dn_ro6_pmtu));
1636                 }
1637                 if (fwa->fwa_dst6) {
1638                         if (fwa->fwa_dst6 == (struct sockaddr_in6 *)&fwa->fwa_ro6->ro_dst) { /* dst points into ro */
1639                                 fwa->fwa_dst6 = (struct sockaddr_in6 *)&(pkt->dn_ro6.ro_dst);
1640                         }
1641
1642                         bcopy(fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6));
1643                 }
1644                 pkt->dn_origifp = fwa->fwa_origifp;
1645                 pkt->dn_mtu = fwa->fwa_mtu;
1646                 pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen;
1647                 if (fwa->fwa_exthdrs) {
1648                         bcopy(fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs));
1649                         /*
1650                          * Need to zero out the source structure so the mbufs
1651                          * won't be freed by ip6_output()
1652                          */
1653                         bzero(fwa->fwa_exthdrs, sizeof(struct ip6_exthdrs));
1654                 }
1655         }
1656         if (dir == DN_TO_IP_OUT || dir == DN_TO_IP6_OUT) {
1657                 pkt->dn_flags = fwa->fwa_oflags;
1658                 if (fwa->fwa_ipoa != NULL) {
1659                         pkt->dn_ipoa = *(fwa->fwa_ipoa);
1660                 }
1661         }
1662         if (q->head == NULL) {
1663                 q->head = m;
1664         } else {
1665                 q->tail->m_nextpkt = m;
1666         }
1667         q->tail = m;
1668         q->len++;
1669         q->len_bytes += len;
1670
1671         if (q->head != m) {     /* flow was not idle, we are done */
1672                 goto done;
1673         }
1674         /*
1675          * If we reach this point the flow was previously idle, so we need
1676          * to schedule it. This involves different actions for fixed-rate or
1677          * WF2Q queues.
1678          */
1679         if (is_pipe) {
1680                 /*
1681                  * Fixed-rate queue: just insert into the ready_heap.
1682                  */
1683                 dn_key t = 0;
1684                 if (pipe->bandwidth) {
1685                         t = SET_TICKS(m, q, pipe);
1686                 }
1687                 q->sched_time = curr_time;
1688                 if (t == 0) { /* must process it now */
1689                         ready_event( q, &head, &tail );
1690                 } else {
1691                         heap_insert(&ready_heap, curr_time + t, q );
1692                 }
1693         } else {
1694                 /*
1695                  * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1696                  * set S to the virtual time V for the controlling pipe, and update
1697                  * the sum of weights for the pipe; otherwise, remove flow from
1698                  * idle_heap and set S to max(F,V).
1699                  * Second, compute finish time F = S + len/weight.
1700                  * Third, if pipe was idle, update V=max(S, V).
1701                  * Fourth, count one more backlogged flow.
1702                  */
1703                 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1704                         q->S = pipe->V;
1705                         pipe->sum += fs->weight; /* add weight of new queue */
1706                 } else {
1707                         heap_extract(&(pipe->idle_heap), q);
1708                         q->S = MAX64(q->F, pipe->V );
1709                 }
1710                 q->F = q->S + (len << MY_M) / (u_int64_t) fs->weight;
1711
1712                 if (pipe->not_eligible_heap.elements == 0 &&
1713                     pipe->scheduler_heap.elements == 0) {
1714                         pipe->V = MAX64( q->S, pipe->V );
1715                 }
1716                 fs->backlogged++;
1717                 /*
1718                  * Look at eligibility. A flow is not eligibile if S>V (when
1719                  * this happens, it means that there is some other flow already
1720                  * scheduled for the same pipe, so the scheduler_heap cannot be
1721                  * empty). If the flow is not eligible we just store it in the
1722                  * not_eligible_heap. Otherwise, we store in the scheduler_heap
1723                  * and possibly invoke ready_event_wfq() right now if there is
1724                  * leftover credit.
1725                  * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1726                  * and for all flows in not_eligible_heap (NEH), S_i > V .
1727                  * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1728                  * we only need to look into NEH.
1729                  */
1730                 if (DN_KEY_GT(q->S, pipe->V)) { /* not eligible */
1731                         if (pipe->scheduler_heap.elements == 0) {
1732                                 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1733                         }
1734                         heap_insert(&(pipe->not_eligible_heap), q->S, q);
1735                 } else {
1736                         heap_insert(&(pipe->scheduler_heap), q->F, q);
1737                         if (pipe->numbytes >= 0) { /* pipe is idle */
1738                                 if (pipe->scheduler_heap.elements != 1) {
1739                                         printf("dummynet: OUCH! pipe should have been idle!\n");
1740                                 }
1741                                 DPRINTF(("dummynet: waking up pipe %d at %d\n",
1742                                     pipe->pipe_nr, (int)(q->F >> MY_M)));
1743                                 pipe->sched_time = curr_time;
1744                                 ready_event_wfq(pipe, &head, &tail);
1745                         }
1746                 }
1747         }
1748 done:
1749         /* start the timer and set global if not already set */
1750         if (!timer_enabled) {
1751                 ts.tv_sec = 0;
1752                 ts.tv_nsec = 1 * 1000000;       // 1ms
1753                 timer_enabled = 1;
1754                 bsd_timeout(dummynet, NULL, &ts);
1755         }
1756
1757         lck_mtx_unlock(dn_mutex);
1758
1759         if (head != NULL) {
1760                 dummynet_send(head);
1761         }
1762
1763         return 0;
1764
1765 dropit:
1766         if (q) {
1767                 q->drops++;
1768         }
1769         lck_mtx_unlock(dn_mutex);
1770         m_freem(m);
1771         return (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS;
1772 }
1773
1774 /*
1775  * Below, the ROUTE_RELEASE is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1776  * Doing this would probably save us the initial bzero of dn_pkt
1777  */
1778 #define DN_FREE_PKT(_m) do {                                    \
1779         struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
1780         if (tag) {                                              \
1781                 struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1);    \
1782                 ROUTE_RELEASE(&n->dn_ro);                       \
1783         }                                                       \
1784         m_tag_delete(_m, tag);                                  \
1785         m_freem(_m);                                            \
1786 } while (0)
1787
1788 /*
1789  * Dispose all packets and flow_queues on a flow_set.
1790  * If all=1, also remove red lookup table and other storage,
1791  * including the descriptor itself.
1792  * For the one in dn_pipe MUST also cleanup ready_heap...
1793  */
1794 static void
1795 purge_flow_set(struct dn_flow_set *fs, int all)
1796 {
1797         struct dn_flow_queue *q, *qn;
1798         int i;
1799
1800         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
1801
1802         for (i = 0; i <= fs->rq_size; i++) {
1803                 for (q = fs->rq[i]; q; q = qn) {
1804                         struct mbuf *m, *mnext;
1805
1806                         mnext = q->head;
1807                         while ((m = mnext) != NULL) {
1808                                 mnext = m->m_nextpkt;
1809                                 DN_FREE_PKT(m);
1810                         }
1811                         qn = q->next;
1812                         FREE(q, M_DUMMYNET);
1813                 }
1814                 fs->rq[i] = NULL;
1815         }
1816         fs->rq_elements = 0;
1817         if (all) {
1818                 /* RED - free lookup table */
1819                 if (fs->w_q_lookup) {
1820                         FREE(fs->w_q_lookup, M_DUMMYNET);
1821                 }
1822                 if (fs->rq) {
1823                         FREE(fs->rq, M_DUMMYNET);
1824                 }
1825                 /* if this fs is not part of a pipe, free it */
1826                 if (fs->pipe && fs != &(fs->pipe->fs)) {
1827                         FREE(fs, M_DUMMYNET);
1828                 }
1829         }
1830 }
1831
1832 /*
1833  * Dispose all packets queued on a pipe (not a flow_set).
1834  * Also free all resources associated to a pipe, which is about
1835  * to be deleted.
1836  */
1837 static void
1838 purge_pipe(struct dn_pipe *pipe)
1839 {
1840         struct mbuf *m, *mnext;
1841
1842         purge_flow_set( &(pipe->fs), 1 );
1843
1844         mnext = pipe->head;
1845         while ((m = mnext) != NULL) {
1846                 mnext = m->m_nextpkt;
1847                 DN_FREE_PKT(m);
1848         }
1849
1850         heap_free( &(pipe->scheduler_heap));
1851         heap_free( &(pipe->not_eligible_heap));
1852         heap_free( &(pipe->idle_heap));
1853 }
1854
1855 /*
1856  * Delete all pipes and heaps returning memory.
1857  */
1858 static void
1859 dummynet_flush(void)
1860 {
1861         struct dn_pipe *pipe, *pipe1;
1862         struct dn_flow_set *fs, *fs1;
1863         int i;
1864
1865         lck_mtx_lock(dn_mutex);
1866
1867
1868         /* Free heaps so we don't have unwanted events. */
1869         heap_free(&ready_heap);
1870         heap_free(&wfq_ready_heap);
1871         heap_free(&extract_heap);
1872
1873         /*
1874          * Now purge all queued pkts and delete all pipes.
1875          *
1876          * XXXGL: can we merge the for(;;) cycles into one or not?
1877          */
1878         for (i = 0; i < HASHSIZE; i++) {
1879                 SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
1880                         SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
1881                         purge_flow_set(fs, 1);
1882                 }
1883         }
1884         for (i = 0; i < HASHSIZE; i++) {
1885                 SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
1886                         SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
1887                         purge_pipe(pipe);
1888                         FREE(pipe, M_DUMMYNET);
1889                 }
1890         }
1891         lck_mtx_unlock(dn_mutex);
1892 }
1893
1894 /*
1895  * setup RED parameters
1896  */
1897 static int
1898 config_red(struct dn_flow_set *p, struct dn_flow_set * x)
1899 {
1900         int i;
1901
1902         x->w_q = p->w_q;
1903         x->min_th = SCALE(p->min_th);
1904         x->max_th = SCALE(p->max_th);
1905         x->max_p = p->max_p;
1906
1907         x->c_1 = p->max_p / (p->max_th - p->min_th);
1908         x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1909         if (x->flags_fs & DN_IS_GENTLE_RED) {
1910                 x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1911                 x->c_4 = (SCALE(1) - 2 * p->max_p);
1912         }
1913
1914         /* if the lookup table already exist, free and create it again */
1915         if (x->w_q_lookup) {
1916                 FREE(x->w_q_lookup, M_DUMMYNET);
1917                 x->w_q_lookup = NULL;
1918         }
1919         if (red_lookup_depth == 0) {
1920                 printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1921                 FREE(x, M_DUMMYNET);
1922                 return EINVAL;
1923         }
1924         x->lookup_depth = red_lookup_depth;
1925         x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
1926             M_DUMMYNET, M_DONTWAIT);
1927         if (x->w_q_lookup == NULL) {
1928                 printf("dummynet: sorry, cannot allocate red lookup table\n");
1929                 FREE(x, M_DUMMYNET);
1930                 return ENOSPC;
1931         }
1932
1933         /* fill the lookup table with (1 - w_q)^x */
1934         x->lookup_step = p->lookup_step;
1935         x->lookup_weight = p->lookup_weight;
1936         x->w_q_lookup[0] = SCALE(1) - x->w_q;
1937         for (i = 1; i < x->lookup_depth; i++) {
1938                 x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1939         }
1940         if (red_avg_pkt_size < 1) {
1941                 red_avg_pkt_size = 512;
1942         }
1943         x->avg_pkt_size = red_avg_pkt_size;
1944         if (red_max_pkt_size < 1) {
1945                 red_max_pkt_size = 1500;
1946         }
1947         x->max_pkt_size = red_max_pkt_size;
1948         return 0;
1949 }
1950
1951 static int
1952 alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1953 {
1954         if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
1955                 int l = pfs->rq_size;
1956
1957                 if (l == 0) {
1958                         l = dn_hash_size;
1959                 }
1960                 if (l < 4) {
1961                         l = 4;
1962                 } else if (l > DN_MAX_HASH_SIZE) {
1963                         l = DN_MAX_HASH_SIZE;
1964                 }
1965                 x->rq_size = l;
1966         } else {            /* one is enough for null mask */
1967                 x->rq_size = 1;
1968         }
1969         x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1970             M_DUMMYNET, M_DONTWAIT | M_ZERO);
1971         if (x->rq == NULL) {
1972                 printf("dummynet: sorry, cannot allocate queue\n");
1973                 return ENOSPC;
1974         }
1975         x->rq_elements = 0;
1976         return 0;
1977 }
1978
1979 static void
1980 set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1981 {
1982         x->flags_fs = src->flags_fs;
1983         x->qsize = src->qsize;
1984         x->plr = src->plr;
1985         x->flow_mask = src->flow_mask;
1986         if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1987                 if (x->qsize > 1024 * 1024) {
1988                         x->qsize = 1024 * 1024;
1989                 }
1990         } else {
1991                 if (x->qsize == 0) {
1992                         x->qsize = 50;
1993                 }
1994                 if (x->qsize > 100) {
1995                         x->qsize = 50;
1996                 }
1997         }
1998         /* configuring RED */
1999         if (x->flags_fs & DN_IS_RED) {
2000                 config_red(src, x); /* XXX should check errors */
2001         }
2002 }
2003
2004 /*
2005  * setup pipe or queue parameters.
2006  */
2007 static int
2008 config_pipe(struct dn_pipe *p)
2009 {
2010         int i, r;
2011         struct dn_flow_set *pfs = &(p->fs);
2012         struct dn_flow_queue *q;
2013
2014         /*
2015          * The config program passes parameters as follows:
2016          * bw = bits/second (0 means no limits),
2017          * delay = ms, must be translated into ticks.
2018          * qsize = slots/bytes
2019          */
2020         p->delay = (p->delay * (hz * 10)) / 1000;
2021         /* We need either a pipe number or a flow_set number */
2022         if (p->pipe_nr == 0 && pfs->fs_nr == 0) {
2023                 return EINVAL;
2024         }
2025         if (p->pipe_nr != 0 && pfs->fs_nr != 0) {
2026                 return EINVAL;
2027         }
2028         if (p->pipe_nr != 0) { /* this is a pipe */
2029                 struct dn_pipe *x, *b;
2030                 struct dummynet_event dn_event;
2031                 lck_mtx_lock(dn_mutex);
2032
2033                 /* locate pipe */
2034                 b = locate_pipe(p->pipe_nr);
2035
2036                 if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
2037                         x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO);
2038                         if (x == NULL) {
2039                                 lck_mtx_unlock(dn_mutex);
2040                                 printf("dummynet: no memory for new pipe\n");
2041                                 return ENOSPC;
2042                         }
2043                         x->pipe_nr = p->pipe_nr;
2044                         x->fs.pipe = x;
2045                         /* idle_heap is the only one from which we extract from the middle.
2046                          */
2047                         x->idle_heap.size = x->idle_heap.elements = 0;
2048                         x->idle_heap.offset = offsetof(struct dn_flow_queue, heap_pos);
2049                 } else {
2050                         x = b;
2051                         /* Flush accumulated credit for all queues */
2052                         for (i = 0; i <= x->fs.rq_size; i++) {
2053                                 for (q = x->fs.rq[i]; q; q = q->next) {
2054                                         q->numbytes = 0;
2055                                 }
2056                         }
2057                 }
2058
2059                 x->bandwidth = p->bandwidth;
2060                 x->numbytes = 0; /* just in case... */
2061                 bcopy(p->if_name, x->if_name, sizeof(p->if_name));
2062                 x->ifp = NULL; /* reset interface ptr */
2063                 x->delay = p->delay;
2064                 set_fs_parms(&(x->fs), pfs);
2065
2066
2067                 if (x->fs.rq == NULL) { /* a new pipe */
2068                         r = alloc_hash(&(x->fs), pfs);
2069                         if (r) {
2070                                 lck_mtx_unlock(dn_mutex);
2071                                 FREE(x, M_DUMMYNET);
2072                                 return r;
2073                         }
2074                         SLIST_INSERT_HEAD(&pipehash[HASH(x->pipe_nr)],
2075                             x, next);
2076                 }
2077                 lck_mtx_unlock(dn_mutex);
2078
2079                 bzero(&dn_event, sizeof(dn_event));
2080                 dn_event.dn_event_code = DUMMYNET_PIPE_CONFIG;
2081                 dn_event.dn_event_pipe_config.bandwidth = p->bandwidth;
2082                 dn_event.dn_event_pipe_config.delay = p->delay;
2083                 dn_event.dn_event_pipe_config.plr = pfs->plr;
2084
2085                 dummynet_event_enqueue_nwk_wq_entry(&dn_event);
2086         } else { /* config queue */
2087                 struct dn_flow_set *x, *b;
2088
2089                 lck_mtx_lock(dn_mutex);
2090                 /* locate flow_set */
2091                 b = locate_flowset(pfs->fs_nr);
2092
2093                 if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new  */
2094                         if (pfs->parent_nr == 0) { /* need link to a pipe */
2095                                 lck_mtx_unlock(dn_mutex);
2096                                 return EINVAL;
2097                         }
2098                         x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
2099                         if (x == NULL) {
2100                                 lck_mtx_unlock(dn_mutex);
2101                                 printf("dummynet: no memory for new flow_set\n");
2102                                 return ENOSPC;
2103                         }
2104                         x->fs_nr = pfs->fs_nr;
2105                         x->parent_nr = pfs->parent_nr;
2106                         x->weight = pfs->weight;
2107                         if (x->weight == 0) {
2108                                 x->weight = 1;
2109                         } else if (x->weight > 100) {
2110                                 x->weight = 100;
2111                         }
2112                 } else {
2113                         /* Change parent pipe not allowed; must delete and recreate */
2114                         if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
2115                                 lck_mtx_unlock(dn_mutex);
2116                                 return EINVAL;
2117                         }
2118                         x = b;
2119                 }
2120                 set_fs_parms(x, pfs);
2121
2122                 if (x->rq == NULL) { /* a new flow_set */
2123                         r = alloc_hash(x, pfs);
2124                         if (r) {
2125                                 lck_mtx_unlock(dn_mutex);
2126                                 FREE(x, M_DUMMYNET);
2127                                 return r;
2128                         }
2129                         SLIST_INSERT_HEAD(&flowsethash[HASH(x->fs_nr)],
2130                             x, next);
2131                 }
2132                 lck_mtx_unlock(dn_mutex);
2133         }
2134         return 0;
2135 }
2136
2137 /*
2138  * Helper function to remove from a heap queues which are linked to
2139  * a flow_set about to be deleted.
2140  */
2141 static void
2142 fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
2143 {
2144         int i = 0, found = 0;
2145         for (; i < h->elements;) {
2146                 if (((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
2147                         h->elements--;
2148                         h->p[i] = h->p[h->elements];
2149                         found++;
2150                 } else {
2151                         i++;
2152                 }
2153         }
2154         if (found) {
2155                 heapify(h);
2156         }
2157 }
2158
2159 /*
2160  * helper function to remove a pipe from a heap (can be there at most once)
2161  */
2162 static void
2163 pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
2164 {
2165         if (h->elements > 0) {
2166                 int i = 0;
2167                 for (i = 0; i < h->elements; i++) {
2168                         if (h->p[i].object == p) { /* found it */
2169                                 h->elements--;
2170                                 h->p[i] = h->p[h->elements];
2171                                 heapify(h);
2172                                 break;
2173                         }
2174                 }
2175         }
2176 }
2177
2178 /*
2179  * drain all queues. Called in case of severe mbuf shortage.
2180  */
2181 void
2182 dummynet_drain(void)
2183 {
2184         struct dn_flow_set *fs;
2185         struct dn_pipe *p;
2186         struct mbuf *m, *mnext;
2187         int i;
2188
2189         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2190
2191         heap_free(&ready_heap);
2192         heap_free(&wfq_ready_heap);
2193         heap_free(&extract_heap);
2194         /* remove all references to this pipe from flow_sets */
2195         for (i = 0; i < HASHSIZE; i++) {
2196                 SLIST_FOREACH(fs, &flowsethash[i], next) {
2197                         purge_flow_set(fs, 0);
2198                 }
2199         }
2200
2201         for (i = 0; i < HASHSIZE; i++) {
2202                 SLIST_FOREACH(p, &pipehash[i], next) {
2203                         purge_flow_set(&(p->fs), 0);
2204
2205                         mnext = p->head;
2206                         while ((m = mnext) != NULL) {
2207                                 mnext = m->m_nextpkt;
2208                                 DN_FREE_PKT(m);
2209                         }
2210                         p->head = p->tail = NULL;
2211                 }
2212         }
2213 }
2214
2215 /*
2216  * Fully delete a pipe or a queue, cleaning up associated info.
2217  */
2218 static int
2219 delete_pipe(struct dn_pipe *p)
2220 {
2221         if (p->pipe_nr == 0 && p->fs.fs_nr == 0) {
2222                 return EINVAL;
2223         }
2224         if (p->pipe_nr != 0 && p->fs.fs_nr != 0) {
2225                 return EINVAL;
2226         }
2227         if (p->pipe_nr != 0) { /* this is an old-style pipe */
2228                 struct dn_pipe *b;
2229                 struct dn_flow_set *fs;
2230                 int i;
2231
2232                 lck_mtx_lock(dn_mutex);
2233                 /* locate pipe */
2234                 b = locate_pipe(p->pipe_nr);
2235                 if (b == NULL) {
2236                         lck_mtx_unlock(dn_mutex);
2237                         return EINVAL; /* not found */
2238                 }
2239
2240                 /* Unlink from list of pipes. */
2241                 SLIST_REMOVE(&pipehash[HASH(b->pipe_nr)], b, dn_pipe, next);
2242
2243
2244                 /* Remove all references to this pipe from flow_sets. */
2245                 for (i = 0; i < HASHSIZE; i++) {
2246                         SLIST_FOREACH(fs, &flowsethash[i], next) {
2247                                 if (fs->pipe == b) {
2248                                         printf("dummynet: ++ ref to pipe %d from fs %d\n",
2249                                             p->pipe_nr, fs->fs_nr);
2250                                         fs->pipe = NULL;
2251                                         purge_flow_set(fs, 0);
2252                                 }
2253                         }
2254                 }
2255                 fs_remove_from_heap(&ready_heap, &(b->fs));
2256
2257                 purge_pipe(b); /* remove all data associated to this pipe */
2258                 /* remove reference to here from extract_heap and wfq_ready_heap */
2259                 pipe_remove_from_heap(&extract_heap, b);
2260                 pipe_remove_from_heap(&wfq_ready_heap, b);
2261                 lck_mtx_unlock(dn_mutex);
2262
2263                 FREE(b, M_DUMMYNET);
2264         } else { /* this is a WF2Q queue (dn_flow_set) */
2265                 struct dn_flow_set *b;
2266
2267                 lck_mtx_lock(dn_mutex);
2268                 /* locate set */
2269                 b = locate_flowset(p->fs.fs_nr);
2270                 if (b == NULL) {
2271                         lck_mtx_unlock(dn_mutex);
2272                         return EINVAL; /* not found */
2273                 }
2274
2275
2276                 /* Unlink from list of flowsets. */
2277                 SLIST_REMOVE( &flowsethash[HASH(b->fs_nr)], b, dn_flow_set, next);
2278
2279                 if (b->pipe != NULL) {
2280                         /* Update total weight on parent pipe and cleanup parent heaps */
2281                         b->pipe->sum -= b->weight * b->backlogged;
2282                         fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
2283                         fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
2284 #if 1   /* XXX should i remove from idle_heap as well ? */
2285                         fs_remove_from_heap(&(b->pipe->idle_heap), b);
2286 #endif
2287                 }
2288                 purge_flow_set(b, 1);
2289                 lck_mtx_unlock(dn_mutex);
2290         }
2291         return 0;
2292 }
2293
2294 /*
2295  * helper function used to copy data from kernel in DUMMYNET_GET
2296  */
2297 static
2298 char*
2299 dn_copy_set_32(struct dn_flow_set *set, char *bp)
2300 {
2301         int i, copied = 0;
2302         struct dn_flow_queue *q;
2303         struct dn_flow_queue_32 *qp = (struct dn_flow_queue_32 *)bp;
2304
2305         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2306
2307         for (i = 0; i <= set->rq_size; i++) {
2308                 for (q = set->rq[i]; q; q = q->next, qp++) {
2309                         if (q->hash_slot != i) {
2310                                 printf("dummynet: ++ at %d: wrong slot (have %d, "
2311                                     "should be %d)\n", copied, q->hash_slot, i);
2312                         }
2313                         if (q->fs != set) {
2314                                 printf("dummynet: ++ at %d: wrong fs ptr "
2315                                     "(have 0x%llx, should be 0x%llx)\n", i,
2316                                     (uint64_t)VM_KERNEL_ADDRPERM(q->fs),
2317                                     (uint64_t)VM_KERNEL_ADDRPERM(set));
2318                         }
2319                         copied++;
2320                         cp_queue_to_32_user( q, qp );
2321                         /* cleanup pointers */
2322                         qp->next = (user32_addr_t)0;
2323                         qp->head = qp->tail = (user32_addr_t)0;
2324                         qp->fs = (user32_addr_t)0;
2325                 }
2326         }
2327         if (copied != set->rq_elements) {
2328                 printf("dummynet: ++ wrong count, have %d should be %d\n",
2329                     copied, set->rq_elements);
2330         }
2331         return (char *)qp;
2332 }
2333
2334 static
2335 char*
2336 dn_copy_set_64(struct dn_flow_set *set, char *bp)
2337 {
2338         int i, copied = 0;
2339         struct dn_flow_queue *q;
2340         struct dn_flow_queue_64 *qp = (struct dn_flow_queue_64 *)bp;
2341
2342         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2343
2344         for (i = 0; i <= set->rq_size; i++) {
2345                 for (q = set->rq[i]; q; q = q->next, qp++) {
2346                         if (q->hash_slot != i) {
2347                                 printf("dummynet: ++ at %d: wrong slot (have %d, "
2348                                     "should be %d)\n", copied, q->hash_slot, i);
2349                         }
2350                         if (q->fs != set) {
2351                                 printf("dummynet: ++ at %d: wrong fs ptr "
2352                                     "(have 0x%llx, should be 0x%llx)\n", i,
2353                                     (uint64_t)VM_KERNEL_ADDRPERM(q->fs),
2354                                     (uint64_t)VM_KERNEL_ADDRPERM(set));
2355                         }
2356                         copied++;
2357                         //bcopy(q, qp, sizeof(*q));
2358                         cp_queue_to_64_user( q, qp );
2359                         /* cleanup pointers */
2360                         qp->next = USER_ADDR_NULL;
2361                         qp->head = qp->tail = USER_ADDR_NULL;
2362                         qp->fs = USER_ADDR_NULL;
2363                 }
2364         }
2365         if (copied != set->rq_elements) {
2366                 printf("dummynet: ++ wrong count, have %d should be %d\n",
2367                     copied, set->rq_elements);
2368         }
2369         return (char *)qp;
2370 }
2371
2372 static size_t
2373 dn_calc_size(int is64user)
2374 {
2375         struct dn_flow_set *set;
2376         struct dn_pipe *p;
2377         size_t size = 0;
2378         size_t pipesize;
2379         size_t queuesize;
2380         size_t setsize;
2381         int i;
2382
2383         LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2384         if (is64user) {
2385                 pipesize = sizeof(struct dn_pipe_64);
2386                 queuesize = sizeof(struct dn_flow_queue_64);
2387                 setsize = sizeof(struct dn_flow_set_64);
2388         } else {
2389                 pipesize = sizeof(struct dn_pipe_32);
2390                 queuesize = sizeof(struct dn_flow_queue_32);
2391                 setsize = sizeof(struct dn_flow_set_32);
2392         }
2393         /*
2394          * compute size of data structures: list of pipes and flow_sets.
2395          */
2396         for (i = 0; i < HASHSIZE; i++) {
2397                 SLIST_FOREACH(p, &pipehash[i], next) {
2398                         size += sizeof(*p) +
2399                             p->fs.rq_elements * sizeof(struct dn_flow_queue);
2400                 }
2401                 SLIST_FOREACH(set, &flowsethash[i], next) {
2402                         size += sizeof(*set) +
2403                             set->rq_elements * sizeof(struct dn_flow_queue);
2404                 }
2405         }
2406         return size;
2407 }
2408
2409 static int
2410 dummynet_get(struct sockopt *sopt)
2411 {
2412         char *buf = NULL, *bp = NULL; /* bp is the "copy-pointer" */
2413         size_t size = 0;
2414         struct dn_flow_set *set;
2415         struct dn_pipe *p;
2416         int error = 0, i;
2417         int is64user = 0;
2418
2419         /* XXX lock held too long */
2420         lck_mtx_lock(dn_mutex);
2421         /*
2422          * XXX: Ugly, but we need to allocate memory with M_WAITOK flag
2423          * and we cannot use this flag while holding a mutex.
2424          */
2425         if (proc_is64bit(sopt->sopt_p)) {
2426                 is64user = 1;
2427         }
2428         for (i = 0; i < 10; i++) {
2429                 size = dn_calc_size(is64user);
2430                 lck_mtx_unlock(dn_mutex);
2431                 buf = _MALLOC(size, M_TEMP, M_WAITOK | M_ZERO);
2432                 if (buf == NULL) {
2433                         return ENOBUFS;
2434                 }
2435                 lck_mtx_lock(dn_mutex);
2436                 if (size == dn_calc_size(is64user)) {
2437                         break;
2438                 }
2439                 FREE(buf, M_TEMP);
2440                 buf = NULL;
2441         }
2442         if (buf == NULL) {
2443                 lck_mtx_unlock(dn_mutex);
2444                 return ENOBUFS;
2445         }
2446
2447         bp = buf;
2448         for (i = 0; i < HASHSIZE; i++) {
2449                 SLIST_FOREACH(p, &pipehash[i], next) {
2450                         /*
2451                          * copy pipe descriptor into *bp, convert delay
2452                          * back to ms, then copy the flow_set descriptor(s)
2453                          * one at a time. After each flow_set, copy the
2454                          * queue descriptor it owns.
2455                          */
2456                         if (is64user) {
2457                                 bp = cp_pipe_to_64_user(p,
2458                                     (struct dn_pipe_64 *)bp);
2459                         } else {
2460                                 bp = cp_pipe_to_32_user(p,
2461                                     (struct dn_pipe_32 *)bp);
2462                         }
2463                 }
2464         }
2465         for (i = 0; i < HASHSIZE; i++) {
2466                 SLIST_FOREACH(set, &flowsethash[i], next) {
2467                         struct dn_flow_set_64 *fs_bp =
2468                             (struct dn_flow_set_64 *)bp;
2469                         cp_flow_set_to_64_user(set, fs_bp);
2470                         /* XXX same hack as above */
2471                         fs_bp->next = CAST_DOWN(user64_addr_t,
2472                             DN_IS_QUEUE);
2473                         fs_bp->pipe = USER_ADDR_NULL;
2474                         fs_bp->rq = USER_ADDR_NULL;
2475                         bp += sizeof(struct dn_flow_set_64);
2476                         bp = dn_copy_set_64( set, bp );
2477                 }
2478         }
2479         lck_mtx_unlock(dn_mutex);
2480         error = sooptcopyout(sopt, buf, size);
2481         FREE(buf, M_TEMP);
2482         return error;
2483 }
2484
2485 /*
2486  * Handler for the various dummynet socket options (get, flush, config, del)
2487  */
2488 static int
2489 ip_dn_ctl(struct sockopt *sopt)
2490 {
2491         int error = 0;
2492         struct dn_pipe *p, tmp_pipe;
2493
2494         /* Disallow sets in really-really secure mode. */
2495         if (sopt->sopt_dir == SOPT_SET && securelevel >= 3) {
2496                 return EPERM;
2497         }
2498
2499         switch (sopt->sopt_name) {
2500         default:
2501                 printf("dummynet: -- unknown option %d", sopt->sopt_name);
2502                 return EINVAL;
2503
2504         case IP_DUMMYNET_GET:
2505                 error = dummynet_get(sopt);
2506                 break;
2507
2508         case IP_DUMMYNET_FLUSH:
2509                 dummynet_flush();
2510                 break;
2511
2512         case IP_DUMMYNET_CONFIGURE:
2513                 p = &tmp_pipe;
2514                 if (proc_is64bit(sopt->sopt_p)) {
2515                         error = cp_pipe_from_user_64( sopt, p );
2516                 } else {
2517                         error = cp_pipe_from_user_32( sopt, p );
2518                 }
2519
2520                 if (error) {
2521                         break;
2522                 }
2523                 error = config_pipe(p);
2524                 break;
2525
2526         case IP_DUMMYNET_DEL:   /* remove a pipe or queue */
2527                 p = &tmp_pipe;
2528                 if (proc_is64bit(sopt->sopt_p)) {
2529                         error = cp_pipe_from_user_64( sopt, p );
2530                 } else {
2531                         error = cp_pipe_from_user_32( sopt, p );
2532                 }
2533                 if (error) {
2534                         break;
2535                 }
2536
2537                 error = delete_pipe(p);
2538                 break;
2539         }
2540         return error;
2541 }
2542
2543 void
2544 dummynet_init(void)
2545 {
2546         eventhandler_lists_ctxt_init(&dummynet_evhdlr_ctxt);
2547 }
2548
2549 void
2550 ip_dn_init(void)
2551 {
2552         /* setup locks */
2553         dn_mutex_grp_attr = lck_grp_attr_alloc_init();
2554         dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
2555         dn_mutex_attr = lck_attr_alloc_init();
2556         lck_mtx_init(dn_mutex, dn_mutex_grp, dn_mutex_attr);
2557
2558         ready_heap.size = ready_heap.elements = 0;
2559         ready_heap.offset = 0;
2560
2561         wfq_ready_heap.size = wfq_ready_heap.elements = 0;
2562         wfq_ready_heap.offset = 0;
2563
2564         extract_heap.size = extract_heap.elements = 0;
2565         extract_heap.offset = 0;
2566         ip_dn_ctl_ptr = ip_dn_ctl;
2567         ip_dn_io_ptr = dummynet_io;
2568 }
2569
2570 struct dn_event_nwk_wq_entry {
2571         struct nwk_wq_entry nwk_wqe;
2572         struct dummynet_event dn_ev_arg;
2573 };
2574
2575 static void
2576 dummynet_event_callback(void *arg)
2577 {
2578         struct dummynet_event *p_dn_ev = (struct dummynet_event *)arg;
2579
2580         EVENTHANDLER_INVOKE(&dummynet_evhdlr_ctxt, dummynet_event, p_dn_ev);
2581         return;
2582 }
2583
2584 void
2585 dummynet_event_enqueue_nwk_wq_entry(struct dummynet_event *p_dn_event)
2586 {
2587         struct dn_event_nwk_wq_entry *p_dn_ev = NULL;
2588
2589         MALLOC(p_dn_ev, struct dn_event_nwk_wq_entry *,
2590             sizeof(struct dn_event_nwk_wq_entry),
2591             M_NWKWQ, M_WAITOK | M_ZERO);
2592
2593         p_dn_ev->nwk_wqe.func = dummynet_event_callback;
2594         p_dn_ev->nwk_wqe.is_arg_managed = TRUE;
2595         p_dn_ev->nwk_wqe.arg = &p_dn_ev->dn_ev_arg;
2596
2597         bcopy(p_dn_event, &(p_dn_ev->dn_ev_arg),
2598             sizeof(struct dummynet_event));
2599         nwk_wq_enqueue((struct nwk_wq_entry*)p_dn_ev);
2600 }