bsd/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/malloc.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/kernel.h>
  75 #include <sys/sysctl.h>
  76 #include <sys/syslog.h>
  77 #include <sys/protosw.h>
  78 #include <sys/domain.h>
  79 #include <sys/queue.h>
  80 #include <sys/proc.h>
  81
  82 #include <dev/random/randomdev.h>
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/simple_lock.h>
  86 #include <kern/queue.h>
  87 #include <kern/sched_prim.h>
  88 #include <kern/cpu_number.h>
  89 #include <kern/zalloc.h>
  90
  91 #include <libkern/OSAtomic.h>
  92 #include <libkern/OSDebug.h>
  93 #include <libkern/libkern.h>
  94
  95 #include <IOKit/IOMapper.h>
  96
  97 #include <machine/limits.h>
  98 #include <machine/machine_routines.h>
  99
 100 #if CONFIG_MACF_NET
 101 #include <security/mac_framework.h>
 102 #endif /* MAC_NET */
 103
 104 #include <sys/mcache.h>
 105 #include <net/ntstat.h>
 106
 107 /*
 108  * MBUF IMPLEMENTATION NOTES.
 109  *
 110  * There is a total of 5 per-CPU caches:
 111  *
 112  * MC_MBUF:
 113  *      This is a cache of rudimentary objects of MSIZE in size; each
 114  *      object represents an mbuf structure.  This cache preserves only
 115  *      the m_type field of the mbuf during its transactions.
 116  *
 117  * MC_CL:
 118  *      This is a cache of rudimentary objects of MCLBYTES in size; each
 119  *      object represents a mcluster structure.  This cache does not
 120  *      preserve the contents of the objects during its transactions.
 121  *
 122  * MC_BIGCL:
 123  *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 124  *      object represents a mbigcluster structure.  This cache does not
 125  *      preserve the contents of the objects during its transaction.
 126  *
 127  * MC_MBUF_CL:
 128  *      This is a cache of mbufs each having a cluster attached to it.
 129  *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 130  *      fields of the mbuf related to the external cluster are preserved
 131  *      during transactions.
 132  *
 133  * MC_MBUF_BIGCL:
 134  *      This is a cache of mbufs each having a big cluster attached to it.
 135  *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 136  *      fields of the mbuf related to the external cluster are preserved
 137  *      during transactions.
 138  *
 139  * OBJECT ALLOCATION:
 140  *
 141  * Allocation requests are handled first at the per-CPU (mcache) layer
 142  * before falling back to the slab layer.  Performance is optimal when
 143  * the request is satisfied at the CPU layer because global data/lock
 144  * never gets accessed.  When the slab layer is entered for allocation,
 145  * the slab freelist will be checked first for available objects before
 146  * the VM backing store is invoked.  Slab layer operations are serialized
 147  * for all of the caches as the mbuf global lock is held most of the time.
 148  * Allocation paths are different depending on the class of objects:
 149  *
 150  * a. Rudimentary object:
 151  *
 152  *      { m_get_common(), m_clattach(), m_mclget(),
 153  *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 154  *        composite object allocation }
 155  *                      |       ^
 156  *                      |       |
 157  *                      |       +-----------------------+
 158  *                      v                               |
 159  *         mcache_alloc/mcache_alloc_ext()      mbuf_slab_audit()
 160  *                      |                               ^
 161  *                      v                               |
 162  *                 [CPU cache] -------> (found?) -------+
 163  *                      |                               |
 164  *                      v                               |
 165  *               mbuf_slab_alloc()                      |
 166  *                      |                               |
 167  *                      v                               |
 168  *      +---------> [freelist] -------> (found?) -------+
 169  *      |               |
 170  *      |               v
 171  *      |           m_clalloc()
 172  *      |               |
 173  *      |               v
 174  *      +---<<---- kmem_mb_alloc()
 175  *
 176  * b. Composite object:
 177  *
 178  *      { m_getpackets_internal(), m_allocpacket_internal() }
 179  *                      |       ^
 180  *                      |       |
 181  *                      |       +------ (done) ---------+
 182  *                      v                               |
 183  *         mcache_alloc/mcache_alloc_ext()      mbuf_cslab_audit()
 184  *                      |                               ^
 185  *                      v                               |
 186  *                 [CPU cache] -------> (found?) -------+
 187  *                      |                               |
 188  *                      v                               |
 189  *               mbuf_cslab_alloc()                     |
 190  *                      |                               |
 191  *                      v                               |
 192  *                  [freelist] -------> (found?) -------+
 193  *                      |                               |
 194  *                      v                               |
 195  *              (rudimentary object)                    |
 196  *         mcache_alloc/mcache_alloc_ext() ------>>-----+
 197  *
 198  * Auditing notes: If auditing is enabled, buffers will be subjected to
 199  * integrity checks by the audit routine.  This is done by verifying their
 200  * contents against DEADBEEF (free) pattern before returning them to caller.
 201  * As part of this step, the routine will also record the transaction and
 202  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 203  * also restore any constructed data structure fields if necessary.
 204  *
 205  * OBJECT DEALLOCATION:
 206  *
 207  * Freeing an object simply involves placing it into the CPU cache; this
 208  * pollutes the cache to benefit subsequent allocations.  The slab layer
 209  * will only be entered if the object is to be purged out of the cache.
 210  * During normal operations, this happens only when the CPU layer resizes
 211  * its bucket while it's adjusting to the allocation load.  Deallocation
 212  * paths are different depending on the class of objects:
 213  *
 214  * a. Rudimentary object:
 215  *
 216  *      { m_free(), m_freem_list(), composite object deallocation }
 217  *                      |       ^
 218  *                      |       |
 219  *                      |       +------ (done) ---------+
 220  *                      v                               |
 221  *         mcache_free/mcache_free_ext()                |
 222  *                      |                               |
 223  *                      v                               |
 224  *              mbuf_slab_audit()                       |
 225  *                      |                               |
 226  *                      v                               |
 227  *                 [CPU cache] ---> (not purging?) -----+
 228  *                      |                               |
 229  *                      v                               |
 230  *               mbuf_slab_free()                       |
 231  *                      |                               |
 232  *                      v                               |
 233  *                  [freelist] ----------->>------------+
 234  *       (objects never get purged to VM)
 235  *
 236  * b. Composite object:
 237  *
 238  *      { m_free(), m_freem_list() }
 239  *                      |       ^
 240  *                      |       |
 241  *                      |       +------ (done) ---------+
 242  *                      v                               |
 243  *         mcache_free/mcache_free_ext()                |
 244  *                      |                               |
 245  *                      v                               |
 246  *              mbuf_cslab_audit()                      |
 247  *                      |                               |
 248  *                      v                               |
 249  *                 [CPU cache] ---> (not purging?) -----+
 250  *                      |                               |
 251  *                      v                               |
 252  *               mbuf_cslab_free()                      |
 253  *                      |                               |
 254  *                      v                               |
 255  *                  [freelist] ---> (not purging?) -----+
 256  *                      |                               |
 257  *                      v                               |
 258  *              (rudimentary object)                    |
 259  *         mcache_free/mcache_free_ext() ------->>------+
 260  *
 261  * Auditing notes: If auditing is enabled, the audit routine will save
 262  * any constructed data structure fields (if necessary) before filling the
 263  * contents of the buffers with DEADBEEF (free) pattern and recording the
 264  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 265  * expected to contain the free pattern.
 266  *
 267  * DEBUGGING:
 268  *
 269  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 270  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 271  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 272  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 273  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 274  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 275  *
 276  * Each object is associated with exactly one mcache_audit_t structure that
 277  * contains the information related to its last buffer transaction.  Given
 278  * an address of an object, the audit structure can be retrieved by finding
 279  * the position of the object relevant to the base address of the cluster:
 280  *
 281  *      +------------+                  +=============+
 282  *      | mbuf addr  |                  | mclaudit[i] |
 283  *      +------------+                  +=============+
 284  *            |                         | cl_audit[0] |
 285  *      i = MTOBG(addr)                 +-------------+
 286  *            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
 287  *      b = BGTOM(i)            |       +-------------+
 288  *            |                 |       |     ...     |
 289  *      x = MCLIDX(b, addr)     |       +-------------+
 290  *            |                 |       | cl_audit[7] |
 291  *            +-----------------+       +-------------+
 292  *               (e.g. x == 1)
 293  *
 294  * The mclaudit[] array is allocated at initialization time, but its contents
 295  * get populated when the corresponding cluster is created.  Because a page
 296  * can be turned into NMBPBG number of mbufs, we preserve enough space for the
 297  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 298  * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 299  * remaining entries unused.  For 16KB cluster, only one entry from the first
 300  * page is allocated and used for the entire object.
 301  */
 302
 303 /* TODO: should be in header file */
 304 /* kernel translater */
 305 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
 306 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 307 extern vm_map_t mb_map;         /* special map */
 308
 309 /* Global lock */
 310 decl_lck_mtx_data(static, mbuf_mlock_data);
 311 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
 312 static lck_attr_t *mbuf_mlock_attr;
 313 static lck_grp_t *mbuf_mlock_grp;
 314 static lck_grp_attr_t *mbuf_mlock_grp_attr;
 315
 316 /* Back-end (common) layer */
 317 static void *mbuf_worker_run;   /* wait channel for worker thread */
 318 static int mbuf_worker_ready;   /* worker thread is runnable */
 319 static int mbuf_expand_mcl;     /* number of cluster creation requets */
 320 static int mbuf_expand_big;     /* number of big cluster creation requests */
 321 static int mbuf_expand_16k;     /* number of 16KB cluster creation requests */
 322 static int ncpu;                /* number of CPUs */
 323 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
 324 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
 325 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
 326 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
 327 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
 328 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
 329 static unsigned int mb_normalized; /* number of packets "normalized" */
 330
 331 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
 332 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
 333
 334 typedef enum {
 335         MC_MBUF = 0,    /* Regular mbuf */
 336         MC_CL,          /* Cluster */
 337         MC_BIGCL,       /* Large (4KB) cluster */
 338         MC_16KCL,       /* Jumbo (16KB) cluster */
 339         MC_MBUF_CL,     /* mbuf + cluster */
 340         MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
 341         MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
 342 } mbuf_class_t;
 343
 344 #define MBUF_CLASS_MIN          MC_MBUF
 345 #define MBUF_CLASS_MAX          MC_MBUF_16KCL
 346 #define MBUF_CLASS_LAST         MC_16KCL
 347 #define MBUF_CLASS_VALID(c) \
 348         ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
 349 #define MBUF_CLASS_COMPOSITE(c) \
 350         ((int)(c) > MBUF_CLASS_LAST)
 351
 352
 353 /*
 354  * mbuf specific mcache allocation request flags.
 355  */
 356 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
 357
 358 /*
 359  * Per-cluster slab structure.
 360  *
 361  * A slab is a cluster control structure that contains one or more object
 362  * chunks; the available chunks are chained in the slab's freelist (sl_head).
 363  * Each time a chunk is taken out of the slab, the slab's reference count
 364  * gets incremented.  When all chunks have been taken out, the empty slab
 365  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 366  * returned to a slab causes the slab's reference count to be decremented;
 367  * it also causes the slab to be reinserted back to class's slab list, if
 368  * it's not already done.
 369  *
 370  * Compartmentalizing of the object chunks into slabs allows us to easily
 371  * merge one or more slabs together when the adjacent slabs are idle, as
 372  * well as to convert or move a slab from one class to another; e.g. the
 373  * mbuf cluster slab can be converted to a regular cluster slab when all
 374  * mbufs in the slab have been freed.
 375  *
 376  * A slab may also span across multiple clusters for chunks larger than
 377  * a cluster's size.  In this case, only the slab of the first cluster is
 378  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 379  * that they are part of the larger slab.
 380  *
 381  * Each slab controls a page of memory.
 382  */
 383 typedef struct mcl_slab {
 384         struct mcl_slab *sl_next;       /* neighboring slab */
 385         u_int8_t        sl_class;       /* controlling mbuf class */
 386         int8_t          sl_refcnt;      /* outstanding allocations */
 387         int8_t          sl_chunks;      /* chunks (bufs) in this slab */
 388         u_int16_t       sl_flags;       /* slab flags (see below) */
 389         u_int16_t       sl_len;         /* slab length */
 390         void            *sl_base;       /* base of allocated memory */
 391         void            *sl_head;       /* first free buffer */
 392         TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
 393 } mcl_slab_t;
 394
 395 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
 396 #define SLF_PARTIAL     0x0002          /* part of another slab */
 397 #define SLF_DETACHED    0x0004          /* not in slab freelist */
 398
 399 /*
 400  * The array of slabs are broken into groups of arrays per 1MB of kernel
 401  * memory to reduce the footprint.  Each group is allocated on demand
 402  * whenever a new piece of memory mapped in from the VM crosses the 1MB
 403  * boundary.
 404  */
 405 #define NSLABSPMB       ((1 << MBSHIFT) >> PGSHIFT)     /* 256 slabs/grp */
 406
 407 typedef struct mcl_slabg {
 408         mcl_slab_t      slg_slab[NSLABSPMB];    /* group of slabs */
 409 } mcl_slabg_t;
 410
 411 /*
 412  * Number of slabs needed to control a 16KB cluster object.
 413  */
 414 #define NSLABSP16KB     (M16KCLBYTES >> PGSHIFT)
 415
 416 /*
 417  * Per-cluster audit structure.
 418  */
 419 typedef struct {
 420         mcache_audit_t  *cl_audit[NMBPBG];      /* array of audits */
 421 } mcl_audit_t;
 422
 423 typedef struct {
 424         struct thread   *msa_thread;    /* thread doing transaction */
 425         struct thread   *msa_pthread;   /* previous transaction thread */
 426         uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
 427         uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
 428         uint16_t        msa_depth;      /* pc stack depth */
 429         uint16_t        msa_pdepth;     /* previous transaction pc stack */
 430         void            *msa_stack[MCACHE_STACK_DEPTH];
 431         void            *msa_pstack[MCACHE_STACK_DEPTH];
 432 } mcl_scratch_audit_t;
 433
 434 typedef struct {
 435         /*
 436          * Size of data from the beginning of an mbuf that covers m_hdr,
 437          * pkthdr and m_ext structures.  If auditing is enabled, we allocate
 438          * a shadow mbuf structure of this size inside each audit structure,
 439          * and the contents of the real mbuf gets copied into it when the mbuf
 440          * is freed.  This allows us to pattern-fill the mbuf for integrity
 441          * check, and to preserve any constructed mbuf fields (e.g. mbuf +
 442          * cluster cache case).  Note that we don't save the contents of
 443          * clusters when they are freed; we simply pattern-fill them.
 444          */
 445         u_int8_t                sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
 446         mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8)));
 447 } mcl_saved_contents_t;
 448
 449 #define AUDIT_CONTENTS_SIZE     (sizeof (mcl_saved_contents_t))
 450
 451 #define MCA_SAVED_MBUF_PTR(_mca)                                        \
 452         ((struct mbuf *)(void *)((mcl_saved_contents_t *)               \
 453         (_mca)->mca_contents)->sc_mbuf)
 454 #define MCA_SAVED_MBUF_SIZE                                             \
 455         (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
 456 #define MCA_SAVED_SCRATCH_PTR(_mca)                                     \
 457         (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
 458
 459 /*
 460  * mbuf specific mcache audit flags
 461  */
 462 #define MB_INUSE        0x01    /* object has not been returned to slab */
 463 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
 464 #define MB_SCVALID      0x04    /* object has valid saved contents */
 465
 466 /*
 467  * Each of the following two arrays hold up to nmbclusters elements.
 468  */
 469 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
 470 static unsigned int maxclaudit; /* max # of entries in audit table */
 471 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
 472 static unsigned int maxslabgrp; /* max # of entries in slabs table */
 473 static unsigned int slabgrp;    /* # of entries in slabs table */
 474
 475 /* Globals */
 476 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
 477 int njcl;                       /* # of clusters for jumbo sizes */
 478 int njclbytes;                  /* size of a jumbo cluster */
 479 union mbigcluster *mbutl;       /* first mapped cluster address */
 480 union mbigcluster *embutl;      /* ending virtual address of mclusters */
 481 int _max_linkhdr;               /* largest link-level header */
 482 int _max_protohdr;              /* largest protocol header */
 483 int max_hdr;                    /* largest link+protocol header */
 484 int max_datalen;                /* MHLEN - max_hdr */
 485
 486 static boolean_t mclverify;     /* debug: pattern-checking */
 487 static boolean_t mcltrace;      /* debug: stack tracing */
 488 static boolean_t mclfindleak;   /* debug: leak detection */
 489 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
 490
 491 static struct timeval mb_start; /* beginning of time */
 492
 493 /* mbuf leak detection variables */
 494 static struct mleak_table mleak_table;
 495 static mleak_stat_t *mleak_stat;
 496
 497 #define MLEAK_STAT_SIZE(n) \
 498         ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
 499
 500 struct mallocation {
 501         mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
 502         u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
 503         u_int32_t count;        /* How many objects were requested */
 504         u_int64_t hitcount;     /* for determining hash effectiveness */
 505 };
 506
 507 struct mtrace {
 508         u_int64_t       collisions;
 509         u_int64_t       hitcount;
 510         u_int64_t       allocs;
 511         u_int64_t       depth;
 512         uintptr_t       addr[MLEAK_STACK_DEPTH];
 513 };
 514
 515 /* Size must be a power of two for the zhash to be able to just mask off bits */
 516 #define MLEAK_ALLOCATION_MAP_NUM        512
 517 #define MLEAK_TRACE_MAP_NUM             256
 518
 519 /*
 520  * Sample factor for how often to record a trace.  This is overwritable
 521  * by the boot-arg mleak_sample_factor.
 522  */
 523 #define MLEAK_SAMPLE_FACTOR             500
 524
 525 /*
 526  * Number of top leakers recorded.
 527  */
 528 #define MLEAK_NUM_TRACES                5
 529
 530 #define MB_LEAK_SPACING_64 "                    "
 531 #define MB_LEAK_SPACING_32 "            "
 532
 533
 534 #define MB_LEAK_HDR_32  "\n\
 535     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
 536     ----------  ----------  ----------  ----------  ---------- \n\
 537 "
 538
 539 #define MB_LEAK_HDR_64  "\n\
 540     trace [1]           trace [2]           trace [3]       \
 541         trace [4]           trace [5]      \n\
 542     ------------------  ------------------  ------------------  \
 543     ------------------  ------------------ \n\
 544 "
 545
 546 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
 547 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
 548
 549 /* Hashmaps of allocations and their corresponding traces */
 550 static struct mallocation *mleak_allocations;
 551 static struct mtrace *mleak_traces;
 552 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 553
 554 /* Lock to protect mleak tables from concurrent modification */
 555 decl_lck_mtx_data(static, mleak_lock_data);
 556 static lck_mtx_t *mleak_lock = &mleak_lock_data;
 557 static lck_attr_t *mleak_lock_attr;
 558 static lck_grp_t *mleak_lock_grp;
 559 static lck_grp_attr_t *mleak_lock_grp_attr;
 560
 561 extern u_int32_t high_sb_max;
 562
 563 /* The minimum number of objects that are allocated, to start. */
 564 #define MINCL           32
 565 #define MINBIGCL        (MINCL >> 1)
 566 #define MIN16KCL        (MINCL >> 2)
 567
 568 /* Low watermarks (only map in pages once free counts go below) */
 569 #define MBIGCL_LOWAT    MINBIGCL
 570 #define M16KCL_LOWAT    MIN16KCL
 571
 572 typedef struct {
 573         mbuf_class_t    mtbl_class;     /* class type */
 574         mcache_t        *mtbl_cache;    /* mcache for this buffer class */
 575         TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
 576         mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
 577         mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
 578         u_int32_t       mtbl_maxsize;   /* maximum buffer size */
 579         int             mtbl_minlimit;  /* minimum allowed */
 580         int             mtbl_maxlimit;  /* maximum allowed */
 581         u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
 582         uint32_t        mtbl_avgtotal;  /* average total on iOS */
 583 } mbuf_table_t;
 584
 585 #define m_class(c)      mbuf_table[c].mtbl_class
 586 #define m_cache(c)      mbuf_table[c].mtbl_cache
 587 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
 588 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
 589 #define m_maxsize(c)    mbuf_table[c].mtbl_maxsize
 590 #define m_minlimit(c)   mbuf_table[c].mtbl_minlimit
 591 #define m_maxlimit(c)   mbuf_table[c].mtbl_maxlimit
 592 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
 593 #define m_avgtotal(c)   mbuf_table[c].mtbl_avgtotal
 594 #define m_cname(c)      mbuf_table[c].mtbl_stats->mbcl_cname
 595 #define m_size(c)       mbuf_table[c].mtbl_stats->mbcl_size
 596 #define m_total(c)      mbuf_table[c].mtbl_stats->mbcl_total
 597 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
 598 #define m_infree(c)     mbuf_table[c].mtbl_stats->mbcl_infree
 599 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
 600 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
 601 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
 602 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
 603 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
 604 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
 605 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
 606 #define m_peak(c)       mbuf_table[c].mtbl_stats->mbcl_peak_reported
 607 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
 608
 609 static mbuf_table_t mbuf_table[] = {
 610         /*
 611          * The caches for mbufs, regular clusters and big clusters.
 612          * The average total values were based on data gathered by actual
 613          * usage patterns on iOS.
 614          */
 615         { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
 616             NULL, NULL, 0, 0, 0, 0, 3000 },
 617         { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
 618             NULL, NULL, 0, 0, 0, 0, 2000 },
 619         { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
 620             NULL, NULL, 0, 0, 0, 0, 1000 },
 621         { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
 622             NULL, NULL, 0, 0, 0, 0, 1000 },
 623         /*
 624          * The following are special caches; they serve as intermediate
 625          * caches backed by the above rudimentary caches.  Each object
 626          * in the cache is an mbuf with a cluster attached to it.  Unlike
 627          * the above caches, these intermediate caches do not directly
 628          * deal with the slab structures; instead, the constructed
 629          * cached elements are simply stored in the freelists.
 630          */
 631         { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000 },
 632         { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
 633         { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
 634 };
 635
 636 #define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
 637
 638 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
 639 static int mb_waiters;                  /* number of waiters */
 640
 641 boolean_t mb_peak_newreport = FALSE;
 642 boolean_t mb_peak_firstreport = FALSE;
 643
 644 /* generate a report by default after 1 week of uptime */
 645 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD        604800
 646
 647 #define MB_WDT_MAXTIME  10              /* # of secs before watchdog panic */
 648 static struct timeval mb_wdtstart;      /* watchdog start timestamp */
 649 static char *mbuf_dump_buf;
 650
 651 #define MBUF_DUMP_BUF_SIZE      2048
 652
 653 /*
 654  * mbuf watchdog is enabled by default on embedded platforms.  It is
 655  * also toggeable via the kern.ipc.mb_watchdog sysctl.
 656  * Garbage collection is also enabled by default on embedded platforms.
 657  * mb_drain_maxint controls the amount of time to wait (in seconds) before
 658  * consecutive calls to m_drain().
 659  */
 660 static unsigned int mb_watchdog = 0;
 661 static unsigned int mb_drain_maxint = 0;
 662
 663 /* Red zone */
 664 static u_int32_t mb_redzone_cookie;
 665 static void m_redzone_init(struct mbuf *);
 666 static void m_redzone_verify(struct mbuf *m);
 667
 668 /* The following are used to serialize m_clalloc() */
 669 static boolean_t mb_clalloc_busy;
 670 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
 671 static int mb_clalloc_waiters;
 672
 673 static void mbuf_mtypes_sync(boolean_t);
 674 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
 675 static void mbuf_stat_sync(void);
 676 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
 677 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
 678 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
 679 static char *mbuf_dump(void);
 680 static void mbuf_table_init(void);
 681 static inline void m_incref(struct mbuf *);
 682 static inline u_int32_t m_decref(struct mbuf *);
 683 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
 684 static void mbuf_worker_thread_init(void);
 685 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
 686 static void slab_free(mbuf_class_t, mcache_obj_t *);
 687 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
 688     unsigned int, int);
 689 static void mbuf_slab_free(void *, mcache_obj_t *, int);
 690 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
 691 static void mbuf_slab_notify(void *, u_int32_t);
 692 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
 693     unsigned int);
 694 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
 695 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
 696     unsigned int, int);
 697 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
 698 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
 699 static int freelist_populate(mbuf_class_t, unsigned int, int);
 700 static void freelist_init(mbuf_class_t);
 701 static boolean_t mbuf_cached_above(mbuf_class_t, int);
 702 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
 703 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
 704 static int m_howmany(int, size_t);
 705 static void mbuf_worker_thread(void);
 706 static void mbuf_watchdog(void);
 707 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
 708
 709 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
 710     size_t, unsigned int);
 711 static void mcl_audit_free(void *, unsigned int);
 712 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
 713 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
 714 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
 715     boolean_t);
 716 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
 717 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
 718 static void mcl_audit_scratch(mcache_audit_t *);
 719 static void mcl_audit_mcheck_panic(struct mbuf *);
 720 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
 721
 722 static void mleak_activate(void);
 723 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
 724 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
 725 static void mleak_free(mcache_obj_t *);
 726 static void mleak_sort_traces(void);
 727 static void mleak_update_stats(void);
 728
 729 static mcl_slab_t *slab_get(void *);
 730 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
 731     void *, void *, unsigned int, int, int);
 732 static void slab_insert(mcl_slab_t *, mbuf_class_t);
 733 static void slab_remove(mcl_slab_t *, mbuf_class_t);
 734 static boolean_t slab_inrange(mcl_slab_t *, void *);
 735 static void slab_nextptr_panic(mcl_slab_t *, void *);
 736 static void slab_detach(mcl_slab_t *);
 737 static boolean_t slab_is_detached(mcl_slab_t *);
 738
 739 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
 740 static struct mbuf *m_split0(struct mbuf *, int, int, int);
 741 __private_extern__ void mbuf_report_peak_usage(void);
 742 static boolean_t mbuf_report_usage(mbuf_class_t);
 743
 744 /* flags for m_copyback0 */
 745 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
 746 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
 747 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
 748 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
 749
 750 /*
 751  * This flag is set for all mbufs that come out of and into the composite
 752  * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 753  * are marked with such a flag have clusters attached to them, and will be
 754  * treated differently when they are freed; instead of being placed back
 755  * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 756  * are placed back into the appropriate composite cache's freelist, and the
 757  * actual freeing is deferred until the composite objects are purged.  At
 758  * such a time, this flag will be cleared from the mbufs and the objects
 759  * will be freed into their own separate freelists.
 760  */
 761 #define EXTF_COMPOSITE  0x1
 762
 763 /*
 764  * This flag indicates that the external cluster is read-only, i.e. it is
 765  * or was referred to by more than one mbufs.  Once set, this flag is never
 766  * cleared.
 767  */
 768 #define EXTF_READONLY   0x2
 769 #define EXTF_MASK       (EXTF_COMPOSITE | EXTF_READONLY)
 770
 771 #define MEXT_RFA(m)             ((m)->m_ext.ext_refflags)
 772 #define MEXT_REF(m)             (MEXT_RFA(m)->refcnt)
 773 #define MEXT_FLAGS(m)           (MEXT_RFA(m)->flags)
 774 #define MBUF_IS_COMPOSITE(m)    \
 775         (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
 776
 777 /*
 778  * Macros used to verify the integrity of the mbuf.
 779  */
 780 #define _MCHECK(m) {                                                    \
 781         if ((m)->m_type != MT_FREE) {                                   \
 782                 if (mclaudit == NULL)                                   \
 783                         panic("MCHECK: m_type=%d m=%p",                 \
 784                             (u_int16_t)(m)->m_type, m);                 \
 785                 else                                                    \
 786                         mcl_audit_mcheck_panic(m);                      \
 787         }                                                               \
 788 }
 789
 790 #define MBUF_IN_MAP(addr)                                               \
 791         ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
 792
 793 #define MRANGE(addr) {                                                  \
 794         if (!MBUF_IN_MAP(addr))                                         \
 795                 panic("MRANGE: address out of range 0x%p", addr);       \
 796 }
 797
 798 /*
 799  * Macro version of mtod.
 800  */
 801 #define MTOD(m, t)      ((t)((m)->m_data))
 802
 803 /*
 804  * Macros to obtain (4KB) cluster index and base cluster address.
 805  */
 806
 807 #define MTOBG(x)        (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
 808 #define BGTOM(x)        ((union mbigcluster *)(mbutl + (x)))
 809
 810 /*
 811  * Macro to find the mbuf index relative to a base.
 812  */
 813 #define MCLIDX(c, m)    (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
 814
 815 /*
 816  * Same thing for 2KB cluster index.
 817  */
 818 #define CLBGIDX(c, m)   (((char *)(m) - (char *)(c)) >> MCLSHIFT)
 819
 820 /*
 821  * Macros used during mbuf and cluster initialization.
 822  */
 823 #define MBUF_INIT_PKTHDR(m) {                                           \
 824         (m)->m_pkthdr.rcvif = NULL;                                     \
 825         (m)->m_pkthdr.pkt_hdr = NULL;                                   \
 826         (m)->m_pkthdr.len = 0;                                          \
 827         (m)->m_pkthdr.csum_flags = 0;                                   \
 828         (m)->m_pkthdr.csum_data = 0;                                    \
 829         (m)->m_pkthdr.vlan_tag = 0;                                     \
 830         m_classifier_init(m, 0);                                        \
 831         m_tag_init(m, 1);                                               \
 832         m_scratch_init(m);                                              \
 833         m_redzone_init(m);                                              \
 834 }
 835
 836 #define MBUF_INIT(m, pkthdr, type) {                                    \
 837         _MCHECK(m);                                                     \
 838         (m)->m_next = (m)->m_nextpkt = NULL;                            \
 839         (m)->m_len = 0;                                                 \
 840         (m)->m_type = type;                                             \
 841         if ((pkthdr) == 0) {                                            \
 842                 (m)->m_data = (m)->m_dat;                               \
 843                 (m)->m_flags = 0;                                       \
 844         } else {                                                        \
 845                 (m)->m_data = (m)->m_pktdat;                            \
 846                 (m)->m_flags = M_PKTHDR;                                \
 847                 MBUF_INIT_PKTHDR(m);                                    \
 848         }                                                               \
 849 }
 850
 851 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {            \
 852         (m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
 853         (m)->m_flags |= M_EXT;                                          \
 854         (m)->m_ext.ext_size = (size);                                   \
 855         (m)->m_ext.ext_free = (free);                                   \
 856         (m)->m_ext.ext_arg = (arg);                                     \
 857         (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =    \
 858             &(m)->m_ext.ext_refs;                                       \
 859         MEXT_RFA(m) = (rfa);                                            \
 860         MEXT_REF(m) = (ref);                                            \
 861         MEXT_FLAGS(m) = (flag);                                         \
 862 }
 863
 864 #define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
 865         MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
 866
 867 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
 868         MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
 869
 870 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
 871         MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
 872
 873 /*
 874  * Macro to convert BSD malloc sleep flag to mcache's
 875  */
 876 #define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
 877
 878 /*
 879  * The structure that holds all mbuf class statistics exportable via sysctl.
 880  * Similar to mbstat structure, the mb_stat structure is protected by the
 881  * global mbuf lock.  It contains additional information about the classes
 882  * that allows for a more accurate view of the state of the allocator.
 883  */
 884 struct mb_stat *mb_stat;
 885 struct omb_stat *omb_stat;      /* For backwards compatibility */
 886
 887 #define MB_STAT_SIZE(n) \
 888         ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
 889 #define OMB_STAT_SIZE(n) \
 890         ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
 891
 892 /*
 893  * The legacy structure holding all of the mbuf allocation statistics.
 894  * The actual statistics used by the kernel are stored in the mbuf_table
 895  * instead, and are updated atomically while the global mbuf lock is held.
 896  * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 897  * Unlike before, the kernel no longer relies on the contents of mbstat for
 898  * its operations (e.g. cluster expansion) because the structure is exposed
 899  * to outside and could possibly be modified, therefore making it unsafe.
 900  * With the exception of the mbstat.m_mtypes array (see below), all of the
 901  * statistics are updated as they change.
 902  */
 903 struct mbstat mbstat;
 904
 905 #define MBSTAT_MTYPES_MAX \
 906         (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
 907
 908 /*
 909  * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 910  * atomically and stored in a per-CPU structure which is lock-free; this is
 911  * done in order to avoid writing to the global mbstat data structure which
 912  * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 913  * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 914  * array and returned to the application.  Any updates for types greater or
 915  * equal than MT_MAX would be done atomically to the mbstat; this slows down
 916  * performance but is okay since the kernel uses only up to MT_MAX-1 while
 917  * anything beyond that (up to type 255) is considered a corner case.
 918  */
 919 typedef struct {
 920         unsigned int    cpu_mtypes[MT_MAX];
 921 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
 922
 923 typedef struct {
 924         mtypes_cpu_t    mbs_cpu[1];
 925 } mbuf_mtypes_t;
 926
 927 static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
 928
 929 #define MBUF_MTYPES_SIZE(n) \
 930         ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
 931
 932 #define MTYPES_CPU(p) \
 933         ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
 934
 935 #define mtype_stat_add(type, n) {                                       \
 936         if ((unsigned)(type) < MT_MAX) {                                \
 937                 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
 938                 atomic_add_32(&mbs->cpu_mtypes[type], n);               \
 939         } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
 940                 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
 941         }                                                               \
 942 }
 943
 944 #define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
 945 #define mtype_stat_inc(t)       mtype_stat_add(t, 1)
 946 #define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
 947
 948 static void
 949 mbuf_mtypes_sync(boolean_t locked)
 950 {
 951         int m, n;
 952         mtypes_cpu_t mtc;
 953
 954         if (locked)
 955                 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
 956
 957         bzero(&mtc, sizeof (mtc));
 958         for (m = 0; m < ncpu; m++) {
 959                 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
 960                 mtypes_cpu_t temp;
 961
 962                 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
 963                     sizeof (temp.cpu_mtypes));
 964
 965                 for (n = 0; n < MT_MAX; n++)
 966                         mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
 967         }
 968         if (!locked)
 969                 lck_mtx_lock(mbuf_mlock);
 970         for (n = 0; n < MT_MAX; n++)
 971                 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
 972         if (!locked)
 973                 lck_mtx_unlock(mbuf_mlock);
 974 }
 975
 976 static int
 977 mbstat_sysctl SYSCTL_HANDLER_ARGS
 978 {
 979 #pragma unused(oidp, arg1, arg2)
 980         mbuf_mtypes_sync(FALSE);
 981
 982         return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
 983 }
 984
 985 static void
 986 mbuf_stat_sync(void)
 987 {
 988         mb_class_stat_t *sp;
 989         mcache_cpu_t *ccp;
 990         mcache_t *cp;
 991         int k, m, bktsize;
 992
 993         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
 994
 995         for (k = 0; k < NELEM(mbuf_table); k++) {
 996                 cp = m_cache(k);
 997                 ccp = &cp->mc_cpu[0];
 998                 bktsize = ccp->cc_bktsize;
 999                 sp = mbuf_table[k].mtbl_stats;
1000
1001                 if (cp->mc_flags & MCF_NOCPUCACHE)
1002                         sp->mbcl_mc_state = MCS_DISABLED;
1003                 else if (cp->mc_purge_cnt > 0)
1004                         sp->mbcl_mc_state = MCS_PURGING;
1005                 else if (bktsize == 0)
1006                         sp->mbcl_mc_state = MCS_OFFLINE;
1007                 else
1008                         sp->mbcl_mc_state = MCS_ONLINE;
1009
1010                 sp->mbcl_mc_cached = 0;
1011                 for (m = 0; m < ncpu; m++) {
1012                         ccp = &cp->mc_cpu[m];
1013                         if (ccp->cc_objs > 0)
1014                                 sp->mbcl_mc_cached += ccp->cc_objs;
1015                         if (ccp->cc_pobjs > 0)
1016                                 sp->mbcl_mc_cached += ccp->cc_pobjs;
1017                 }
1018                 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1019                 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1020                     sp->mbcl_infree;
1021
1022                 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1023                 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1024                 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1025
1026                 /* Calculate total count specific to each class */
1027                 sp->mbcl_ctotal = sp->mbcl_total;
1028                 switch (m_class(k)) {
1029                 case MC_MBUF:
1030                         /* Deduct mbufs used in composite caches */
1031                         sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1032                             m_total(MC_MBUF_BIGCL));
1033                         break;
1034
1035                 case MC_CL:
1036                         /* Deduct clusters used in composite cache */
1037                         sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1038                         break;
1039
1040                 case MC_BIGCL:
1041                         /* Deduct clusters used in composite cache */
1042                         sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1043                         break;
1044
1045                 case MC_16KCL:
1046                         /* Deduct clusters used in composite cache */
1047                         sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1048                         break;
1049
1050                 default:
1051                         break;
1052                 }
1053         }
1054 }
1055
1056 static int
1057 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1058 {
1059 #pragma unused(oidp, arg1, arg2)
1060         void *statp;
1061         int k, statsz, proc64 = proc_is64bit(req->p);
1062
1063         lck_mtx_lock(mbuf_mlock);
1064         mbuf_stat_sync();
1065
1066         if (!proc64) {
1067                 struct omb_class_stat *oc;
1068                 struct mb_class_stat *c;
1069
1070                 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1071                 oc = &omb_stat->mbs_class[0];
1072                 c = &mb_stat->mbs_class[0];
1073                 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1074                         (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1075                             "%s", c->mbcl_cname);
1076                         oc->mbcl_size = c->mbcl_size;
1077                         oc->mbcl_total = c->mbcl_total;
1078                         oc->mbcl_active = c->mbcl_active;
1079                         oc->mbcl_infree = c->mbcl_infree;
1080                         oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1081                         oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1082                         oc->mbcl_free_cnt = c->mbcl_free_cnt;
1083                         oc->mbcl_notified = c->mbcl_notified;
1084                         oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1085                         oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1086                         oc->mbcl_ctotal = c->mbcl_ctotal;
1087                         oc->mbcl_release_cnt = c->mbcl_release_cnt;
1088                         oc->mbcl_mc_state = c->mbcl_mc_state;
1089                         oc->mbcl_mc_cached = c->mbcl_mc_cached;
1090                         oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1091                         oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1092                         oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1093                 }
1094                 statp = omb_stat;
1095                 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1096         } else {
1097                 statp = mb_stat;
1098                 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1099         }
1100
1101         lck_mtx_unlock(mbuf_mlock);
1102
1103         return (SYSCTL_OUT(req, statp, statsz));
1104 }
1105
1106 static int
1107 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1108 {
1109 #pragma unused(oidp, arg1, arg2)
1110         int i;
1111
1112         /* Ensure leak tracing turned on */
1113         if (!mclfindleak || !mclexpleak)
1114                 return (ENXIO);
1115
1116         lck_mtx_lock(mleak_lock);
1117         mleak_update_stats();
1118         i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1119         lck_mtx_unlock(mleak_lock);
1120
1121         return (i);
1122 }
1123
1124 static int
1125 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1126 {
1127 #pragma unused(oidp, arg1, arg2)
1128         int i = 0;
1129
1130         /* Ensure leak tracing turned on */
1131         if (!mclfindleak || !mclexpleak)
1132                 return (ENXIO);
1133
1134         lck_mtx_lock(mleak_lock);
1135         i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1136         lck_mtx_unlock(mleak_lock);
1137
1138         return (i);
1139 }
1140
1141 static inline void
1142 m_incref(struct mbuf *m)
1143 {
1144         UInt32 old, new;
1145         volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1146
1147         do {
1148                 old = *addr;
1149                 new = old + 1;
1150                 ASSERT(new != 0);
1151         } while (!OSCompareAndSwap(old, new, addr));
1152
1153         /*
1154          * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1155          * we don't clear the flag when the refcount goes back to 1
1156          * to simplify code calling m_mclhasreference().
1157          */
1158         if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1159                 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1160 }
1161
1162 static inline u_int32_t
1163 m_decref(struct mbuf *m)
1164 {
1165         UInt32 old, new;
1166         volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1167
1168         do {
1169                 old = *addr;
1170                 new = old - 1;
1171                 ASSERT(old != 0);
1172         } while (!OSCompareAndSwap(old, new, addr));
1173
1174         return (new);
1175 }
1176
1177 static void
1178 mbuf_table_init(void)
1179 {
1180         unsigned int b, c, s;
1181         int m;
1182
1183         MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1184             M_TEMP, M_WAITOK | M_ZERO);
1185         VERIFY(omb_stat != NULL);
1186
1187         MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1188             M_TEMP, M_WAITOK | M_ZERO);
1189         VERIFY(mb_stat != NULL);
1190
1191         mb_stat->mbs_cnt = NELEM(mbuf_table);
1192         for (m = 0; m < NELEM(mbuf_table); m++)
1193                 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1194
1195 #if CONFIG_MBUF_JUMBO
1196         /*
1197          * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1198          * this only on platforms where jumbo cluster pool is enabled.
1199          */
1200         njcl = nmbclusters / 3;
1201         njclbytes = M16KCLBYTES;
1202 #endif /* CONFIG_MBUF_JUMBO */
1203
1204         /*
1205          * nclusters holds both the 2KB and 4KB pools, so ensure it's
1206          * a multiple of 4KB clusters.
1207          */
1208         nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1209         if (njcl > 0) {
1210                 /*
1211                  * Each jumbo cluster takes 8 2KB clusters, so make
1212                  * sure that the pool size is evenly divisible by 8;
1213                  * njcl is in 2KB unit, hence treated as such.
1214                  */
1215                 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1216
1217                 /* Update nclusters with rounded down value of njcl */
1218                 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1219         }
1220
1221         /*
1222          * njcl is valid only on platforms with 16KB jumbo clusters, where
1223          * it is configured to 1/3 of the pool size.  On these platforms,
1224          * the remaining is used for 2KB and 4KB clusters.  On platforms
1225          * without 16KB jumbo clusters, the entire pool is used for both
1226          * 2KB and 4KB clusters.  A 4KB cluster can either be splitted into
1227          * 16 mbufs, or into 2 2KB clusters.
1228          *
1229          *  +---+---+------------ ... -----------+------- ... -------+
1230          *  | c | b |              s             |        njcl       |
1231          *  +---+---+------------ ... -----------+------- ... -------+
1232          *
1233          * 1/32th of the shared region is reserved for pure 2KB and 4KB
1234          * clusters (1/64th each.)
1235          */
1236         c = P2ROUNDDOWN((nclusters >> 6), 2);           /* in 2KB unit */
1237         b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1238         s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */
1239
1240         /*
1241          * 1/64th (c) is reserved for 2KB clusters.
1242          */
1243         m_minlimit(MC_CL) = c;
1244         m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
1245         m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1246         (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1247
1248         /*
1249          * Another 1/64th (b) of the map is reserved for 4KB clusters.
1250          * It cannot be turned into 2KB clusters or mbufs.
1251          */
1252         m_minlimit(MC_BIGCL) = b;
1253         m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
1254         m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1255         (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1256
1257         /*
1258          * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1259          */
1260         m_minlimit(MC_MBUF) = 0;
1261         m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
1262         m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1263         (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1264
1265         /*
1266          * Set limits for the composite classes.
1267          */
1268         m_minlimit(MC_MBUF_CL) = 0;
1269         m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1270         m_maxsize(MC_MBUF_CL) = MCLBYTES;
1271         m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1272         (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1273
1274         m_minlimit(MC_MBUF_BIGCL) = 0;
1275         m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1276         m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1277         m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1278         (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1279
1280         /*
1281          * And for jumbo classes.
1282          */
1283         m_minlimit(MC_16KCL) = 0;
1284         m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
1285         m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1286         (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1287
1288         m_minlimit(MC_MBUF_16KCL) = 0;
1289         m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1290         m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1291         m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1292         (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1293
1294         /*
1295          * Initialize the legacy mbstat structure.
1296          */
1297         bzero(&mbstat, sizeof (mbstat));
1298         mbstat.m_msize = m_maxsize(MC_MBUF);
1299         mbstat.m_mclbytes = m_maxsize(MC_CL);
1300         mbstat.m_minclsize = MINCLSIZE;
1301         mbstat.m_mlen = MLEN;
1302         mbstat.m_mhlen = MHLEN;
1303         mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1304 }
1305
1306 #if defined(__LP64__)
1307 typedef struct ncl_tbl {
1308         uint64_t nt_maxmem;     /* memory (sane) size */
1309         uint32_t nt_mbpool;     /* mbuf pool size */
1310 } ncl_tbl_t;
1311
1312 /* Non-server */
1313 static ncl_tbl_t ncl_table[] = {
1314         { (1ULL << GBSHIFT)       /*  1 GB */,  (64 << MBSHIFT)  /*  64 MB */ },
1315         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1316         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1317         { 0, 0 }
1318 };
1319
1320 /* Server */
1321 static ncl_tbl_t ncl_table_srv[] = {
1322         { (1ULL << GBSHIFT)       /*  1 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1323         { (1ULL << (GBSHIFT + 2)) /*  4 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1324         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (160 << MBSHIFT) /* 160 MB */ },
1325         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (192 << MBSHIFT) /* 192 MB */ },
1326         { (1ULL << (GBSHIFT + 5)) /* 32 GB */,  (256 << MBSHIFT) /* 256 MB */ },
1327         { (1ULL << (GBSHIFT + 6)) /* 64 GB */,  (384 << MBSHIFT) /* 384 MB */ },
1328         { 0, 0 }
1329 };
1330 #endif /* __LP64__ */
1331
1332 __private_extern__ unsigned int
1333 mbuf_default_ncl(int server, uint64_t mem)
1334 {
1335 #if !defined(__LP64__)
1336 #pragma unused(server)
1337         unsigned int n;
1338         /*
1339          * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1340          */
1341         if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1342                 n = 32768;
1343 #else
1344         unsigned int n, i;
1345         ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1346         /*
1347          * 64-bit kernel (mbuf pool size based on table).
1348          */
1349         n = tbl[0].nt_mbpool;
1350         for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1351                 if (mem < tbl[i].nt_maxmem)
1352                         break;
1353                 n = tbl[i].nt_mbpool;
1354         }
1355         n >>= MCLSHIFT;
1356 #endif /* !__LP64__ */
1357         return (n);
1358 }
1359
1360 __private_extern__ void
1361 mbinit(void)
1362 {
1363         unsigned int m;
1364         unsigned int initmcl = 0;
1365         void *buf;
1366         thread_t thread = THREAD_NULL;
1367
1368         microuptime(&mb_start);
1369
1370         /*
1371          * These MBUF_ values must be equal to their private counterparts.
1372          */
1373         _CASSERT(MBUF_EXT == M_EXT);
1374         _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1375         _CASSERT(MBUF_EOR == M_EOR);
1376         _CASSERT(MBUF_LOOP == M_LOOP);
1377         _CASSERT(MBUF_BCAST == M_BCAST);
1378         _CASSERT(MBUF_MCAST == M_MCAST);
1379         _CASSERT(MBUF_FRAG == M_FRAG);
1380         _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1381         _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1382         _CASSERT(MBUF_PROMISC == M_PROMISC);
1383         _CASSERT(MBUF_HASFCS == M_HASFCS);
1384
1385         _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1386         _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1387         _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1388         _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1389         _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1390         _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1391         _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1392         _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1393         _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1394         _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1395         _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1396         _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1397         _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1398         _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1399         _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1400
1401         _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1402         _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1403         _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1404         _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1405         _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1406         _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1407         _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1408         _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1409         _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1410         _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1411         _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1412         _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1413         _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1414
1415         _CASSERT(MBUF_WAITOK == M_WAIT);
1416         _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1417         _CASSERT(MBUF_COPYALL == M_COPYALL);
1418
1419         _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1420         _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1421         _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1422         _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1423         _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1424         _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1425         _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1426         _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1427         _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1428         _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1429
1430         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1431         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1432         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1433         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1434
1435         /* Module specific scratch space (32-bit alignment requirement) */
1436         _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1437             sizeof (uint32_t)));
1438
1439         /* Initialize random red zone cookie value */
1440         _CASSERT(sizeof (mb_redzone_cookie) ==
1441             sizeof (((struct pkthdr *)0)->redzone));
1442         read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1443
1444         /* Make sure we don't save more than we should */
1445         _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1446
1447         if (nmbclusters == 0)
1448                 nmbclusters = NMBCLUSTERS;
1449
1450         /* This should be a sane (at least even) value by now */
1451         VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1452
1453         /* Setup the mbuf table */
1454         mbuf_table_init();
1455
1456         /* Global lock for common layer */
1457         mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1458         mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1459         mbuf_mlock_attr = lck_attr_alloc_init();
1460         lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1461
1462         /*
1463          * Allocate cluster slabs table:
1464          *
1465          *      maxslabgrp = (N * 2048) / (1024 * 1024)
1466          *
1467          * Where N is nmbclusters rounded up to the nearest 512.  This yields
1468          * mcl_slab_g_t units, each one representing a MB of memory.
1469          */
1470         maxslabgrp =
1471             (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1472         MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1473             M_TEMP, M_WAITOK | M_ZERO);
1474         VERIFY(slabstbl != NULL);
1475
1476         /*
1477          * Allocate audit structures, if needed:
1478          *
1479          *      maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1480          *
1481          * This yields mcl_audit_t units, each one representing a page.
1482          */
1483         PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1484         mbuf_debug |= mcache_getflags();
1485         if (mbuf_debug & MCF_DEBUG) {
1486                 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1487                 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1488                     M_TEMP, M_WAITOK | M_ZERO);
1489                 VERIFY(mclaudit != NULL);
1490
1491                 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1492                     AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1493                 VERIFY(mcl_audit_con_cache != NULL);
1494         }
1495         mclverify = (mbuf_debug & MCF_VERIFY);
1496         mcltrace = (mbuf_debug & MCF_TRACE);
1497         mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1498         mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1499
1500         /* Enable mbuf leak logging, with a lock to protect the tables */
1501
1502         mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1503         mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1504         mleak_lock_attr = lck_attr_alloc_init();
1505         lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1506
1507         mleak_activate();
1508
1509         /* Calculate the number of pages assigned to the cluster pool */
1510         mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1511         MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1512             M_TEMP, M_WAITOK);
1513         VERIFY(mcl_paddr != NULL);
1514
1515         /* Register with the I/O Bus mapper */
1516         mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1517         bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1518
1519         embutl = (union mbigcluster *)
1520             ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1521         VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1522
1523         /* Prime up the freelist */
1524         PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1525         if (initmcl != 0) {
1526                 initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
1527                 if (initmcl > m_maxlimit(MC_BIGCL))
1528                         initmcl = m_maxlimit(MC_BIGCL);
1529         }
1530         if (initmcl < m_minlimit(MC_BIGCL))
1531                 initmcl = m_minlimit(MC_BIGCL);
1532
1533         lck_mtx_lock(mbuf_mlock);
1534
1535         /*
1536          * For classes with non-zero minimum limits, populate their freelists
1537          * so that m_total(class) is at least m_minlimit(class).
1538          */
1539         VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1540         freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1541         VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1542         freelist_init(m_class(MC_CL));
1543
1544         for (m = 0; m < NELEM(mbuf_table); m++) {
1545                 /* Make sure we didn't miss any */
1546                 VERIFY(m_minlimit(m_class(m)) == 0 ||
1547                     m_total(m_class(m)) >= m_minlimit(m_class(m)));
1548
1549                 /* populate the initial sizes and report from there on */
1550                 m_peak(m_class(m)) = m_total(m_class(m));
1551         }
1552         mb_peak_newreport = FALSE;
1553
1554         lck_mtx_unlock(mbuf_mlock);
1555
1556         (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1557             NULL, &thread);
1558         thread_deallocate(thread);
1559
1560         ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1561             0, 0, MCR_SLEEP);
1562
1563         /* Create the cache for each class */
1564         for (m = 0; m < NELEM(mbuf_table); m++) {
1565                 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1566                 u_int32_t flags;
1567
1568                 flags = mbuf_debug;
1569                 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1570                     m_class(m) == MC_MBUF_16KCL) {
1571                         allocfunc = mbuf_cslab_alloc;
1572                         freefunc = mbuf_cslab_free;
1573                         auditfunc = mbuf_cslab_audit;
1574                         logfunc = mleak_logger;
1575                 } else {
1576                         allocfunc = mbuf_slab_alloc;
1577                         freefunc = mbuf_slab_free;
1578                         auditfunc = mbuf_slab_audit;
1579                         logfunc = mleak_logger;
1580                 }
1581
1582                 /*
1583                  * Disable per-CPU caches for jumbo classes if there
1584                  * is no jumbo cluster pool available in the system.
1585                  * The cache itself is still created (but will never
1586                  * be populated) since it simplifies the code.
1587                  */
1588                 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1589                     njcl == 0)
1590                         flags |= MCF_NOCPUCACHE;
1591
1592                 if (!mclfindleak)
1593                         flags |= MCF_NOLEAKLOG;
1594
1595                 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1596                     allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1597                     (void *)(uintptr_t)m, flags, MCR_SLEEP);
1598         }
1599
1600         /*
1601          * Allocate structure for per-CPU statistics that's aligned
1602          * on the CPU cache boundary; this code assumes that we never
1603          * uninitialize this framework, since the original address
1604          * before alignment is not saved.
1605          */
1606         ncpu = ml_get_max_cpus();
1607         MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1608             M_TEMP, M_WAITOK);
1609         VERIFY(buf != NULL);
1610
1611         mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1612             CPU_CACHE_LINE_SIZE);
1613         bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1614
1615         /*
1616          * Set the max limit on sb_max to be 1/16 th of the size of
1617          * memory allocated for mbuf clusters.
1618          */
1619         high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1620         if (high_sb_max < sb_max) {
1621                 /* sb_max is too large for this configuration, scale it down */
1622                 if (high_sb_max > (1 << MBSHIFT)) {
1623                         /* We have atleast 16 M of mbuf pool */
1624                         sb_max = high_sb_max;
1625                 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1626                         /*
1627                          * If we have more than 1M of mbufpool, cap the size of
1628                          * max sock buf at 1M
1629                          */
1630                         sb_max = high_sb_max = (1 << MBSHIFT);
1631                 } else {
1632                         sb_max = high_sb_max;
1633                 }
1634         }
1635
1636         /* allocate space for mbuf_dump_buf */
1637         MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1638         VERIFY(mbuf_dump_buf != NULL);
1639
1640         if (mbuf_debug & MCF_DEBUG) {
1641                 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1642                     (int)_MLEN, (int)_MHLEN);
1643         }
1644
1645         printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1646             (nmbclusters << MCLSHIFT) >> MBSHIFT,
1647             (nclusters << MCLSHIFT) >> MBSHIFT,
1648             (njcl << MCLSHIFT) >> MBSHIFT);
1649 }
1650
1651 /*
1652  * Obtain a slab of object(s) from the class's freelist.
1653  */
1654 static mcache_obj_t *
1655 slab_alloc(mbuf_class_t class, int wait)
1656 {
1657         mcl_slab_t *sp;
1658         mcache_obj_t *buf;
1659
1660         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1661
1662         VERIFY(class != MC_16KCL || njcl > 0);
1663
1664         /* This should always be NULL for us */
1665         VERIFY(m_cobjlist(class) == NULL);
1666
1667         /*
1668          * Treat composite objects as having longer lifespan by using
1669          * a slab from the reverse direction, in hoping that this could
1670          * reduce the probability of fragmentation for slabs that hold
1671          * more than one buffer chunks (e.g. mbuf slabs).  For other
1672          * slabs, this probably doesn't make much of a difference.
1673          */
1674         if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1675                 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1676         else
1677                 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1678
1679         if (sp == NULL) {
1680                 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1681                 /* The slab list for this class is empty */
1682                 return (NULL);
1683         }
1684
1685         VERIFY(m_infree(class) > 0);
1686         VERIFY(!slab_is_detached(sp));
1687         VERIFY(sp->sl_class == class &&
1688             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1689         buf = sp->sl_head;
1690         VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1691
1692         if (class == MC_MBUF) {
1693                 sp->sl_head = buf->obj_next;
1694                 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1695         } else if (class == MC_CL) {
1696                 sp->sl_head = buf->obj_next;
1697                 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1698         } else {
1699                 sp->sl_head = NULL;
1700         }
1701         if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1702                 slab_nextptr_panic(sp, sp->sl_head);
1703                 /* In case sl_head is in the map but not in the slab */
1704                 VERIFY(slab_inrange(sp, sp->sl_head));
1705                 /* NOTREACHED */
1706         }
1707
1708         /* Increment slab reference */
1709         sp->sl_refcnt++;
1710
1711         if (mclaudit != NULL) {
1712                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1713                 mca->mca_uflags = 0;
1714                 /* Save contents on mbuf objects only */
1715                 if (class == MC_MBUF)
1716                         mca->mca_uflags |= MB_SCVALID;
1717         }
1718
1719         if (class == MC_CL) {
1720                 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1721                 /*
1722                  * A 2K cluster slab can have at most NCLPBG references.
1723                  */
1724                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1725                     sp->sl_chunks == NCLPBG &&
1726                     sp->sl_len == m_maxsize(MC_BIGCL));
1727                 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1728         } else if (class == MC_BIGCL) {
1729                 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1730                     m_infree(MC_MBUF_BIGCL);
1731                 /*
1732                  * A 4K cluster slab can have at most 1 reference.
1733                  */
1734                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1735                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1736         } else if (class == MC_16KCL) {
1737                 mcl_slab_t *nsp;
1738                 int k;
1739
1740                 --m_infree(MC_16KCL);
1741                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1742                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1743                 /*
1744                  * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1745                  * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1746                  * most 1 reference.
1747                  */
1748                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1749                         nsp = nsp->sl_next;
1750                         /* Next slab must already be present */
1751                         VERIFY(nsp != NULL);
1752                         nsp->sl_refcnt++;
1753                         VERIFY(!slab_is_detached(nsp));
1754                         VERIFY(nsp->sl_class == MC_16KCL &&
1755                             nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1756                             nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1757                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1758                             nsp->sl_head == NULL);
1759                 }
1760         } else {
1761                 VERIFY(class == MC_MBUF);
1762                 --m_infree(MC_MBUF);
1763                 /*
1764                  * If auditing is turned on, this check is
1765                  * deferred until later in mbuf_slab_audit().
1766                  */
1767                 if (mclaudit == NULL)
1768                         _MCHECK((struct mbuf *)buf);
1769                 /*
1770                  * Since we have incremented the reference count above,
1771                  * an mbuf slab (formerly a 4KB cluster slab that was cut
1772                  * up into mbufs) must have a reference count between 1
1773                  * and NMBPBG at this point.
1774                  */
1775                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1776                     sp->sl_chunks == NMBPBG &&
1777                     sp->sl_len == m_maxsize(MC_BIGCL));
1778                 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1779         }
1780
1781         /* If empty, remove this slab from the class's freelist */
1782         if (sp->sl_head == NULL) {
1783                 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1784                 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1785                 slab_remove(sp, class);
1786         }
1787
1788         return (buf);
1789 }
1790
1791 /*
1792  * Place a slab of object(s) back into a class's slab list.
1793  */
1794 static void
1795 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1796 {
1797         mcl_slab_t *sp;
1798
1799         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1800
1801         VERIFY(class != MC_16KCL || njcl > 0);
1802         VERIFY(buf->obj_next == NULL);
1803         sp = slab_get(buf);
1804         VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1805             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1806
1807         /* Decrement slab reference */
1808         sp->sl_refcnt--;
1809
1810         if (class == MC_CL) {
1811                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1812                 /*
1813                  * A slab that has been splitted for 2KB clusters can have
1814                  * at most 1 outstanding reference at this point.
1815                  */
1816                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1817                     sp->sl_chunks == NCLPBG &&
1818                     sp->sl_len == m_maxsize(MC_BIGCL));
1819                 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1820                     (slab_is_detached(sp) && sp->sl_head == NULL));
1821         } else if (class == MC_BIGCL) {
1822                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1823                 /*
1824                  * A 4KB cluster slab can have at most 1 reference
1825                  * which must be 0 at this point.
1826                  */
1827                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1828                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1829                 VERIFY(slab_is_detached(sp));
1830         } else if (class == MC_16KCL) {
1831                 mcl_slab_t *nsp;
1832                 int k;
1833                 /*
1834                  * A 16KB cluster takes NSLABSP16KB slabs, all must
1835                  * now have 0 reference.
1836                  */
1837                 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1838                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1839                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1840                 VERIFY(slab_is_detached(sp));
1841                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1842                         nsp = nsp->sl_next;
1843                         /* Next slab must already be present */
1844                         VERIFY(nsp != NULL);
1845                         nsp->sl_refcnt--;
1846                         VERIFY(slab_is_detached(nsp));
1847                         VERIFY(nsp->sl_class == MC_16KCL &&
1848                             (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1849                             nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1850                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1851                             nsp->sl_head == NULL);
1852                 }
1853         } else {
1854                 /*
1855                  * A slab that has been splitted for mbufs has at most NMBPBG
1856                  * reference counts.  Since we have decremented one reference
1857                  * above, it must now be between 0 and NMBPBG-1.
1858                  */
1859                 VERIFY(class == MC_MBUF);
1860                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1861                     sp->sl_chunks == NMBPBG &&
1862                     sp->sl_len == m_maxsize(MC_BIGCL));
1863                 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1864                     (slab_is_detached(sp) && sp->sl_head == NULL));
1865         }
1866
1867         /*
1868          * When auditing is enabled, ensure that the buffer still
1869          * contains the free pattern.  Otherwise it got corrupted
1870          * while at the CPU cache layer.
1871          */
1872         if (mclaudit != NULL) {
1873                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1874                 if (mclverify) {
1875                         mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1876                 }
1877                 mca->mca_uflags &= ~MB_SCVALID;
1878         }
1879
1880         if (class == MC_CL) {
1881                 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1882                 buf->obj_next = sp->sl_head;
1883         } else if (class == MC_BIGCL) {
1884                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1885                     m_infree(MC_MBUF_BIGCL);
1886         } else if (class == MC_16KCL) {
1887                 ++m_infree(MC_16KCL);
1888         } else {
1889                 ++m_infree(MC_MBUF);
1890                 buf->obj_next = sp->sl_head;
1891         }
1892         sp->sl_head = buf;
1893
1894         /*
1895          * If a slab has been splitted to either one which holds 2KB clusters,
1896          * or one which holds mbufs, turn it back to one which holds a 4KB
1897          * cluster.
1898          */
1899         if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1900             m_total(class) > m_minlimit(class) &&
1901             m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1902                 int i = NMBPBG;
1903
1904                 m_total(MC_BIGCL)++;
1905                 mbstat.m_bigclusters = m_total(MC_BIGCL);
1906                 m_total(MC_MBUF) -= NMBPBG;
1907                 mbstat.m_mbufs = m_total(MC_MBUF);
1908                 m_infree(MC_MBUF) -= NMBPBG;
1909                 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1910
1911                 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1912                 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1913
1914                 while (i--) {
1915                         struct mbuf *m = sp->sl_head;
1916                         VERIFY(m != NULL);
1917                         sp->sl_head = m->m_next;
1918                         m->m_next = NULL;
1919                 }
1920                 VERIFY(sp->sl_head == NULL);
1921
1922                 /* Remove the slab from the mbuf class's slab list */
1923                 slab_remove(sp, class);
1924
1925                 /* Reinitialize it as a 4KB cluster slab */
1926                 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1927                     sp->sl_len, 0, 1);
1928
1929                 if (mclverify) {
1930                         mcache_set_pattern(MCACHE_FREE_PATTERN,
1931                             (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1932                 }
1933                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1934                     m_infree(MC_MBUF_BIGCL);
1935
1936                 VERIFY(slab_is_detached(sp));
1937                 /* And finally switch class */
1938                 class = MC_BIGCL;
1939         } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1940             m_total(class) > m_minlimit(class) &&
1941             m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1942                 int i = NCLPBG;
1943
1944                 m_total(MC_BIGCL)++;
1945                 mbstat.m_bigclusters = m_total(MC_BIGCL);
1946                 m_total(MC_CL) -= NCLPBG;
1947                 mbstat.m_clusters = m_total(MC_CL);
1948                 m_infree(MC_CL) -= NCLPBG;
1949                 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1950                 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1951
1952                 while (i--) {
1953                         union mcluster *c = sp->sl_head;
1954                         VERIFY(c != NULL);
1955                         sp->sl_head = c->mcl_next;
1956                         c->mcl_next = NULL;
1957                 }
1958                 VERIFY(sp->sl_head == NULL);
1959
1960                 /* Remove the slab from the 2KB cluster class's slab list */
1961                 slab_remove(sp, class);
1962
1963                 /* Reinitialize it as a 4KB cluster slab */
1964                 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1965                     sp->sl_len, 0, 1);
1966
1967                 if (mclverify) {
1968                         mcache_set_pattern(MCACHE_FREE_PATTERN,
1969                             (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1970                 }
1971                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1972                     m_infree(MC_MBUF_BIGCL);
1973
1974                 VERIFY(slab_is_detached(sp));
1975                 /* And finally switch class */
1976                 class = MC_BIGCL;
1977         }
1978
1979         /* Reinsert the slab to the class's slab list */
1980         if (slab_is_detached(sp))
1981                 slab_insert(sp, class);
1982 }
1983
1984 /*
1985  * Common allocator for rudimentary objects called by the CPU cache layer
1986  * during an allocation request whenever there is no available element in the
1987  * bucket layer.  It returns one or more elements from the appropriate global
1988  * freelist.  If the freelist is empty, it will attempt to populate it and
1989  * retry the allocation.
1990  */
1991 static unsigned int
1992 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1993 {
1994         mbuf_class_t class = (mbuf_class_t)arg;
1995         unsigned int need = num;
1996         mcache_obj_t **list = *plist;
1997
1998         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1999         ASSERT(need > 0);
2000
2001         lck_mtx_lock(mbuf_mlock);
2002
2003         for (;;) {
2004                 if ((*list = slab_alloc(class, wait)) != NULL) {
2005                         (*list)->obj_next = NULL;
2006                         list = *plist = &(*list)->obj_next;
2007
2008                         if (--need == 0) {
2009                                 /*
2010                                  * If the number of elements in freelist has
2011                                  * dropped below low watermark, asynchronously
2012                                  * populate the freelist now rather than doing
2013                                  * it later when we run out of elements.
2014                                  */
2015                                 if (!mbuf_cached_above(class, wait) &&
2016                                     m_infree(class) < m_total(class) >> 5) {
2017                                         (void) freelist_populate(class, 1,
2018                                             M_DONTWAIT);
2019                                 }
2020                                 break;
2021                         }
2022                 } else {
2023                         VERIFY(m_infree(class) == 0 || class == MC_CL);
2024
2025                         (void) freelist_populate(class, 1,
2026                             (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2027
2028                         if (m_infree(class) > 0)
2029                                 continue;
2030
2031                         /* Check if there's anything at the cache layer */
2032                         if (mbuf_cached_above(class, wait))
2033                                 break;
2034
2035                         /* watchdog checkpoint */
2036                         mbuf_watchdog();
2037
2038                         /* We have nothing and cannot block; give up */
2039                         if (wait & MCR_NOSLEEP) {
2040                                 if (!(wait & MCR_TRYHARD)) {
2041                                         m_fail_cnt(class)++;
2042                                         mbstat.m_drops++;
2043                                         break;
2044                                 }
2045                         }
2046
2047                         /*
2048                          * If the freelist is still empty and the caller is
2049                          * willing to be blocked, sleep on the wait channel
2050                          * until an element is available.  Otherwise, if
2051                          * MCR_TRYHARD is set, do our best to satisfy the
2052                          * request without having to go to sleep.
2053                          */
2054                         if (mbuf_worker_ready &&
2055                             mbuf_sleep(class, need, wait))
2056                                 break;
2057
2058                         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2059                 }
2060         }
2061
2062         m_alloc_cnt(class) += num - need;
2063         lck_mtx_unlock(mbuf_mlock);
2064
2065         return (num - need);
2066 }
2067
2068 /*
2069  * Common de-allocator for rudimentary objects called by the CPU cache
2070  * layer when one or more elements need to be returned to the appropriate
2071  * global freelist.
2072  */
2073 static void
2074 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2075 {
2076         mbuf_class_t class = (mbuf_class_t)arg;
2077         mcache_obj_t *nlist;
2078         unsigned int num = 0;
2079         int w;
2080
2081         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2082
2083         lck_mtx_lock(mbuf_mlock);
2084
2085         for (;;) {
2086                 nlist = list->obj_next;
2087                 list->obj_next = NULL;
2088                 slab_free(class, list);
2089                 ++num;
2090                 if ((list = nlist) == NULL)
2091                         break;
2092         }
2093         m_free_cnt(class) += num;
2094
2095         if ((w = mb_waiters) > 0)
2096                 mb_waiters = 0;
2097
2098         lck_mtx_unlock(mbuf_mlock);
2099
2100         if (w != 0)
2101                 wakeup(mb_waitchan);
2102 }
2103
2104 /*
2105  * Common auditor for rudimentary objects called by the CPU cache layer
2106  * during an allocation or free request.  For the former, this is called
2107  * after the objects are obtained from either the bucket or slab layer
2108  * and before they are returned to the caller.  For the latter, this is
2109  * called immediately during free and before placing the objects into
2110  * the bucket or slab layer.
2111  */
2112 static void
2113 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2114 {
2115         mbuf_class_t class = (mbuf_class_t)arg;
2116         mcache_audit_t *mca;
2117
2118         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2119
2120         while (list != NULL) {
2121                 lck_mtx_lock(mbuf_mlock);
2122                 mca = mcl_audit_buf2mca(class, list);
2123
2124                 /* Do the sanity checks */
2125                 if (class == MC_MBUF) {
2126                         mcl_audit_mbuf(mca, list, FALSE, alloc);
2127                         ASSERT(mca->mca_uflags & MB_SCVALID);
2128                 } else {
2129                         mcl_audit_cluster(mca, list, m_maxsize(class),
2130                             alloc, TRUE);
2131                         ASSERT(!(mca->mca_uflags & MB_SCVALID));
2132                 }
2133                 /* Record this transaction */
2134                 if (mcltrace)
2135                         mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2136
2137                 if (alloc)
2138                         mca->mca_uflags |= MB_INUSE;
2139                 else
2140                         mca->mca_uflags &= ~MB_INUSE;
2141                 /* Unpair the object (unconditionally) */
2142                 mca->mca_uptr = NULL;
2143                 lck_mtx_unlock(mbuf_mlock);
2144
2145                 list = list->obj_next;
2146         }
2147 }
2148
2149 /*
2150  * Common notify routine for all caches.  It is called by mcache when
2151  * one or more objects get freed.  We use this indication to trigger
2152  * the wakeup of any sleeping threads so that they can retry their
2153  * allocation requests.
2154  */
2155 static void
2156 mbuf_slab_notify(void *arg, u_int32_t reason)
2157 {
2158         mbuf_class_t class = (mbuf_class_t)arg;
2159         int w;
2160
2161         ASSERT(MBUF_CLASS_VALID(class));
2162
2163         if (reason != MCN_RETRYALLOC)
2164                 return;
2165
2166         lck_mtx_lock(mbuf_mlock);
2167         if ((w = mb_waiters) > 0) {
2168                 m_notified(class)++;
2169                 mb_waiters = 0;
2170         }
2171         lck_mtx_unlock(mbuf_mlock);
2172
2173         if (w != 0)
2174                 wakeup(mb_waitchan);
2175 }
2176
2177 /*
2178  * Obtain object(s) from the composite class's freelist.
2179  */
2180 static unsigned int
2181 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2182 {
2183         unsigned int need = num;
2184         mcl_slab_t *sp, *clsp, *nsp;
2185         struct mbuf *m;
2186         mcache_obj_t **list = *plist;
2187         void *cl;
2188
2189         VERIFY(need > 0);
2190         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2191         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2192
2193         /* Get what we can from the freelist */
2194         while ((*list = m_cobjlist(class)) != NULL) {
2195                 MRANGE(*list);
2196
2197                 m = (struct mbuf *)*list;
2198                 sp = slab_get(m);
2199                 cl = m->m_ext.ext_buf;
2200                 clsp = slab_get(cl);
2201                 VERIFY(m->m_flags == M_EXT && cl != NULL);
2202                 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2203
2204                 if (class == MC_MBUF_CL) {
2205                         VERIFY(clsp->sl_refcnt >= 1 &&
2206                             clsp->sl_refcnt <= NCLPBG);
2207                 } else {
2208                         VERIFY(clsp->sl_refcnt == 1);
2209                 }
2210
2211                 if (class == MC_MBUF_16KCL) {
2212                         int k;
2213                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2214                                 nsp = nsp->sl_next;
2215                                 /* Next slab must already be present */
2216                                 VERIFY(nsp != NULL);
2217                                 VERIFY(nsp->sl_refcnt == 1);
2218                         }
2219                 }
2220
2221                 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2222                     !MBUF_IN_MAP(m_cobjlist(class))) {
2223                         slab_nextptr_panic(sp, m_cobjlist(class));
2224                         /* NOTREACHED */
2225                 }
2226                 (*list)->obj_next = NULL;
2227                 list = *plist = &(*list)->obj_next;
2228
2229                 if (--need == 0)
2230                         break;
2231         }
2232         m_infree(class) -= (num - need);
2233
2234         return (num - need);
2235 }
2236
2237 /*
2238  * Place object(s) back into a composite class's freelist.
2239  */
2240 static unsigned int
2241 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2242 {
2243         mcache_obj_t *o, *tail;
2244         unsigned int num = 0;
2245         struct mbuf *m, *ms;
2246         mcache_audit_t *mca = NULL;
2247         mcache_obj_t *ref_list = NULL;
2248         mcl_slab_t *clsp, *nsp;
2249         void *cl;
2250         mbuf_class_t cl_class;
2251
2252         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2253         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2254         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2255
2256         if (class == MC_MBUF_CL) {
2257                 cl_class = MC_CL;
2258         } else if (class == MC_MBUF_BIGCL) {
2259                 cl_class = MC_BIGCL;
2260         } else {
2261                 VERIFY(class == MC_MBUF_16KCL);
2262                 cl_class = MC_16KCL;
2263         }
2264
2265         o = tail = list;
2266
2267         while ((m = ms = (struct mbuf *)o) != NULL) {
2268                 mcache_obj_t *rfa, *nexto = o->obj_next;
2269
2270                 /* Do the mbuf sanity checks */
2271                 if (mclaudit != NULL) {
2272                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2273                         if (mclverify) {
2274                                 mcache_audit_free_verify(mca, m, 0,
2275                                     m_maxsize(MC_MBUF));
2276                         }
2277                         ms = MCA_SAVED_MBUF_PTR(mca);
2278                 }
2279
2280                 /* Do the cluster sanity checks */
2281                 cl = ms->m_ext.ext_buf;
2282                 clsp = slab_get(cl);
2283                 if (mclverify) {
2284                         size_t size = m_maxsize(cl_class);
2285                         mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2286                             (mcache_obj_t *)cl), cl, 0, size);
2287                 }
2288                 VERIFY(ms->m_type == MT_FREE);
2289                 VERIFY(ms->m_flags == M_EXT);
2290                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2291                 if (cl_class == MC_CL) {
2292                         VERIFY(clsp->sl_refcnt >= 1 &&
2293                             clsp->sl_refcnt <= NCLPBG);
2294                 } else {
2295                         VERIFY(clsp->sl_refcnt == 1);
2296                 }
2297                 if (cl_class == MC_16KCL) {
2298                         int k;
2299                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2300                                 nsp = nsp->sl_next;
2301                                 /* Next slab must already be present */
2302                                 VERIFY(nsp != NULL);
2303                                 VERIFY(nsp->sl_refcnt == 1);
2304                         }
2305                 }
2306
2307                 /*
2308                  * If we're asked to purge, restore the actual mbuf using
2309                  * contents of the shadow structure (if auditing is enabled)
2310                  * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2311                  * about to free it and the attached cluster into their caches.
2312                  */
2313                 if (purged) {
2314                         /* Restore constructed mbuf fields */
2315                         if (mclaudit != NULL)
2316                                 mcl_audit_restore_mbuf(m, mca, TRUE);
2317
2318                         MEXT_REF(m) = 0;
2319                         MEXT_FLAGS(m) = 0;
2320
2321                         rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2322                         rfa->obj_next = ref_list;
2323                         ref_list = rfa;
2324                         MEXT_RFA(m) = NULL;
2325
2326                         m->m_type = MT_FREE;
2327                         m->m_flags = m->m_len = 0;
2328                         m->m_next = m->m_nextpkt = NULL;
2329
2330                         /* Save mbuf fields and make auditing happy */
2331                         if (mclaudit != NULL)
2332                                 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2333
2334                         VERIFY(m_total(class) > 0);
2335                         m_total(class)--;
2336
2337                         /* Free the mbuf */
2338                         o->obj_next = NULL;
2339                         slab_free(MC_MBUF, o);
2340
2341                         /* And free the cluster */
2342                         ((mcache_obj_t *)cl)->obj_next = NULL;
2343                         if (class == MC_MBUF_CL)
2344                                 slab_free(MC_CL, cl);
2345                         else if (class == MC_MBUF_BIGCL)
2346                                 slab_free(MC_BIGCL, cl);
2347                         else
2348                                 slab_free(MC_16KCL, cl);
2349                 }
2350
2351                 ++num;
2352                 tail = o;
2353                 o = nexto;
2354         }
2355
2356         if (!purged) {
2357                 tail->obj_next = m_cobjlist(class);
2358                 m_cobjlist(class) = list;
2359                 m_infree(class) += num;
2360         } else if (ref_list != NULL) {
2361                 mcache_free_ext(ref_cache, ref_list);
2362         }
2363
2364         return (num);
2365 }
2366
2367 /*
2368  * Common allocator for composite objects called by the CPU cache layer
2369  * during an allocation request whenever there is no available element in
2370  * the bucket layer.  It returns one or more composite elements from the
2371  * appropriate global freelist.  If the freelist is empty, it will attempt
2372  * to obtain the rudimentary objects from their caches and construct them
2373  * into composite mbuf + cluster objects.
2374  */
2375 static unsigned int
2376 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2377     int wait)
2378 {
2379         mbuf_class_t class = (mbuf_class_t)arg;
2380         mbuf_class_t cl_class = 0;
2381         unsigned int num = 0, cnum = 0, want = needed;
2382         mcache_obj_t *ref_list = NULL;
2383         mcache_obj_t *mp_list = NULL;
2384         mcache_obj_t *clp_list = NULL;
2385         mcache_obj_t **list;
2386         struct ext_ref *rfa;
2387         struct mbuf *m;
2388         void *cl;
2389
2390         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2391         ASSERT(needed > 0);
2392
2393         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2394
2395         /* There should not be any slab for this class */
2396         VERIFY(m_slab_cnt(class) == 0 &&
2397             m_slablist(class).tqh_first == NULL &&
2398             m_slablist(class).tqh_last == NULL);
2399
2400         lck_mtx_lock(mbuf_mlock);
2401
2402         /* Try using the freelist first */
2403         num = cslab_alloc(class, plist, needed);
2404         list = *plist;
2405         if (num == needed) {
2406                 m_alloc_cnt(class) += num;
2407                 lck_mtx_unlock(mbuf_mlock);
2408                 return (needed);
2409         }
2410
2411         lck_mtx_unlock(mbuf_mlock);
2412
2413         /*
2414          * We could not satisfy the request using the freelist alone;
2415          * allocate from the appropriate rudimentary caches and use
2416          * whatever we can get to construct the composite objects.
2417          */
2418         needed -= num;
2419
2420         /*
2421          * Mark these allocation requests as coming from a composite cache.
2422          * Also, if the caller is willing to be blocked, mark the request
2423          * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2424          * slab layer waiting for the individual object when one or more
2425          * of the already-constructed composite objects are available.
2426          */
2427         wait |= MCR_COMP;
2428         if (!(wait & MCR_NOSLEEP))
2429                 wait |= MCR_FAILOK;
2430
2431         /* allocate mbufs */
2432         needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2433         if (needed == 0) {
2434                 ASSERT(mp_list == NULL);
2435                 goto fail;
2436         }
2437
2438         /* allocate clusters */
2439         if (class == MC_MBUF_CL) {
2440                 cl_class = MC_CL;
2441         } else if (class == MC_MBUF_BIGCL) {
2442                 cl_class = MC_BIGCL;
2443         } else {
2444                 VERIFY(class == MC_MBUF_16KCL);
2445                 cl_class = MC_16KCL;
2446         }
2447         needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2448         if (needed == 0) {
2449                 ASSERT(clp_list == NULL);
2450                 goto fail;
2451         }
2452
2453         needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2454         if (needed == 0) {
2455                 ASSERT(ref_list == NULL);
2456                 goto fail;
2457         }
2458
2459         /*
2460          * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2461          * overs will get freed accordingly before we return to caller.
2462          */
2463         for (cnum = 0; cnum < needed; cnum++) {
2464                 struct mbuf *ms;
2465
2466                 m = ms = (struct mbuf *)mp_list;
2467                 mp_list = mp_list->obj_next;
2468
2469                 cl = clp_list;
2470                 clp_list = clp_list->obj_next;
2471                 ((mcache_obj_t *)cl)->obj_next = NULL;
2472
2473                 rfa = (struct ext_ref *)ref_list;
2474                 ref_list = ref_list->obj_next;
2475                 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2476
2477                 /*
2478                  * If auditing is enabled, construct the shadow mbuf
2479                  * in the audit structure instead of in the actual one.
2480                  * mbuf_cslab_audit() will take care of restoring the
2481                  * contents after the integrity check.
2482                  */
2483                 if (mclaudit != NULL) {
2484                         mcache_audit_t *mca, *cl_mca;
2485
2486                         lck_mtx_lock(mbuf_mlock);
2487                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2488                         ms = MCA_SAVED_MBUF_PTR(mca);
2489                         cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2490
2491                         /*
2492                          * Pair them up.  Note that this is done at the time
2493                          * the mbuf+cluster objects are constructed.  This
2494                          * information should be treated as "best effort"
2495                          * debugging hint since more than one mbufs can refer
2496                          * to a cluster.  In that case, the cluster might not
2497                          * be freed along with the mbuf it was paired with.
2498                          */
2499                         mca->mca_uptr = cl_mca;
2500                         cl_mca->mca_uptr = mca;
2501
2502                         ASSERT(mca->mca_uflags & MB_SCVALID);
2503                         ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2504                         lck_mtx_unlock(mbuf_mlock);
2505
2506                         /* Technically, they are in the freelist */
2507                         if (mclverify) {
2508                                 size_t size;
2509
2510                                 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2511                                     m_maxsize(MC_MBUF));
2512
2513                                 if (class == MC_MBUF_CL)
2514                                         size = m_maxsize(MC_CL);
2515                                 else if (class == MC_MBUF_BIGCL)
2516                                         size = m_maxsize(MC_BIGCL);
2517                                 else
2518                                         size = m_maxsize(MC_16KCL);
2519
2520                                 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2521                                     size);
2522                         }
2523                 }
2524
2525                 MBUF_INIT(ms, 0, MT_FREE);
2526                 if (class == MC_MBUF_16KCL) {
2527                         MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2528                 } else if (class == MC_MBUF_BIGCL) {
2529                         MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2530                 } else {
2531                         MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2532                 }
2533                 VERIFY(ms->m_flags == M_EXT);
2534                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2535
2536                 *list = (mcache_obj_t *)m;
2537                 (*list)->obj_next = NULL;
2538                 list = *plist = &(*list)->obj_next;
2539         }
2540
2541 fail:
2542         /*
2543          * Free up what's left of the above.
2544          */
2545         if (mp_list != NULL)
2546                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2547         if (clp_list != NULL)
2548                 mcache_free_ext(m_cache(cl_class), clp_list);
2549         if (ref_list != NULL)
2550                 mcache_free_ext(ref_cache, ref_list);
2551
2552         lck_mtx_lock(mbuf_mlock);
2553         if (num > 0 || cnum > 0) {
2554                 m_total(class) += cnum;
2555                 VERIFY(m_total(class) <= m_maxlimit(class));
2556                 m_alloc_cnt(class) += num + cnum;
2557         }
2558         if ((num + cnum) < want)
2559                 m_fail_cnt(class) += (want - (num + cnum));
2560         lck_mtx_unlock(mbuf_mlock);
2561
2562         return (num + cnum);
2563 }
2564
2565 /*
2566  * Common de-allocator for composite objects called by the CPU cache
2567  * layer when one or more elements need to be returned to the appropriate
2568  * global freelist.
2569  */
2570 static void
2571 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2572 {
2573         mbuf_class_t class = (mbuf_class_t)arg;
2574         unsigned int num;
2575         int w;
2576
2577         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2578
2579         lck_mtx_lock(mbuf_mlock);
2580
2581         num = cslab_free(class, list, purged);
2582         m_free_cnt(class) += num;
2583
2584         if ((w = mb_waiters) > 0)
2585                 mb_waiters = 0;
2586
2587         lck_mtx_unlock(mbuf_mlock);
2588
2589         if (w != 0)
2590                 wakeup(mb_waitchan);
2591 }
2592
2593 /*
2594  * Common auditor for composite objects called by the CPU cache layer
2595  * during an allocation or free request.  For the former, this is called
2596  * after the objects are obtained from either the bucket or slab layer
2597  * and before they are returned to the caller.  For the latter, this is
2598  * called immediately during free and before placing the objects into
2599  * the bucket or slab layer.
2600  */
2601 static void
2602 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2603 {
2604         mbuf_class_t class = (mbuf_class_t)arg;
2605         mcache_audit_t *mca;
2606         struct mbuf *m, *ms;
2607         mcl_slab_t *clsp, *nsp;
2608         size_t size;
2609         void *cl;
2610
2611         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2612
2613         while ((m = ms = (struct mbuf *)list) != NULL) {
2614                 lck_mtx_lock(mbuf_mlock);
2615                 /* Do the mbuf sanity checks and record its transaction */
2616                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2617                 mcl_audit_mbuf(mca, m, TRUE, alloc);
2618                 if (mcltrace)
2619                         mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2620
2621                 if (alloc)
2622                         mca->mca_uflags |= MB_COMP_INUSE;
2623                 else
2624                         mca->mca_uflags &= ~MB_COMP_INUSE;
2625
2626                 /*
2627                  * Use the shadow mbuf in the audit structure if we are
2628                  * freeing, since the contents of the actual mbuf has been
2629                  * pattern-filled by the above call to mcl_audit_mbuf().
2630                  */
2631                 if (!alloc && mclverify)
2632                         ms = MCA_SAVED_MBUF_PTR(mca);
2633
2634                 /* Do the cluster sanity checks and record its transaction */
2635                 cl = ms->m_ext.ext_buf;
2636                 clsp = slab_get(cl);
2637                 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2638                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2639                 if (class == MC_MBUF_CL)
2640                         VERIFY(clsp->sl_refcnt >= 1 &&
2641                             clsp->sl_refcnt <= NCLPBG);
2642                 else
2643                         VERIFY(clsp->sl_refcnt == 1);
2644
2645                 if (class == MC_MBUF_16KCL) {
2646                         int k;
2647                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2648                                 nsp = nsp->sl_next;
2649                                 /* Next slab must already be present */
2650                                 VERIFY(nsp != NULL);
2651                                 VERIFY(nsp->sl_refcnt == 1);
2652                         }
2653                 }
2654
2655                 mca = mcl_audit_buf2mca(MC_CL, cl);
2656                 if (class == MC_MBUF_CL)
2657                         size = m_maxsize(MC_CL);
2658                 else if (class == MC_MBUF_BIGCL)
2659                         size = m_maxsize(MC_BIGCL);
2660                 else
2661                         size = m_maxsize(MC_16KCL);
2662                 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2663                 if (mcltrace)
2664                         mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2665
2666                 if (alloc)
2667                         mca->mca_uflags |= MB_COMP_INUSE;
2668                 else
2669                         mca->mca_uflags &= ~MB_COMP_INUSE;
2670                 lck_mtx_unlock(mbuf_mlock);
2671
2672                 list = list->obj_next;
2673         }
2674 }
2675
2676 /*
2677  * Allocate some number of mbuf clusters and place on cluster freelist.
2678  */
2679 static int
2680 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2681 {
2682         int i;
2683         vm_size_t size = 0;
2684         int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2685         vm_offset_t page = 0;
2686         mcache_audit_t *mca_list = NULL;
2687         mcache_obj_t *con_list = NULL;
2688         mcl_slab_t *sp;
2689
2690         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2691             bufsize == m_maxsize(MC_16KCL));
2692
2693         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2694
2695         /*
2696          * Multiple threads may attempt to populate the cluster map one
2697          * after another.  Since we drop the lock below prior to acquiring
2698          * the physical page(s), our view of the cluster map may no longer
2699          * be accurate, and we could end up over-committing the pages beyond
2700          * the maximum allowed for each class.  To prevent it, this entire
2701          * operation (including the page mapping) is serialized.
2702          */
2703         while (mb_clalloc_busy) {
2704                 mb_clalloc_waiters++;
2705                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2706                     (PZERO-1), "m_clalloc", NULL);
2707                 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2708         }
2709
2710         /* We are busy now; tell everyone else to go away */
2711         mb_clalloc_busy = TRUE;
2712
2713         /*
2714          * Honor the caller's wish to block or not block.  We have a way
2715          * to grow the pool asynchronously using the mbuf worker thread.
2716          */
2717         i = m_howmany(num, bufsize);
2718         if (i == 0 || (wait & M_DONTWAIT))
2719                 goto out;
2720
2721         lck_mtx_unlock(mbuf_mlock);
2722
2723         size = round_page(i * bufsize);
2724         page = kmem_mb_alloc(mb_map, size, large_buffer);
2725
2726         /*
2727          * If we did ask for "n" 16KB physically contiguous chunks
2728          * and didn't get them, then please try again without this
2729          * restriction.
2730          */
2731         if (large_buffer && page == 0)
2732                 page = kmem_mb_alloc(mb_map, size, 0);
2733
2734         if (page == 0) {
2735                 if (bufsize == m_maxsize(MC_BIGCL)) {
2736                         /* Try for 1 page if failed, only 4KB request */
2737                         size = NBPG;
2738                         page = kmem_mb_alloc(mb_map, size, 0);
2739                 }
2740
2741                 if (page == 0) {
2742                         lck_mtx_lock(mbuf_mlock);
2743                         goto out;
2744                 }
2745         }
2746
2747         VERIFY(IS_P2ALIGNED(page, NBPG));
2748         numpages = size / NBPG;
2749
2750         /* If auditing is enabled, allocate the audit structures now */
2751         if (mclaudit != NULL) {
2752                 int needed;
2753
2754                 /*
2755                  * Yes, I realize this is a waste of memory for clusters
2756                  * that never get transformed into mbufs, as we may end
2757                  * up with NMBPBG-1 unused audit structures per cluster.
2758                  * But doing so tremendously simplifies the allocation
2759                  * strategy, since at this point we are not holding the
2760                  * mbuf lock and the caller is okay to be blocked.
2761                  */
2762                 if (bufsize == m_maxsize(MC_BIGCL)) {
2763                         needed = numpages * NMBPBG;
2764
2765                         i = mcache_alloc_ext(mcl_audit_con_cache,
2766                             &con_list, needed, MCR_SLEEP);
2767
2768                         VERIFY(con_list != NULL && i == needed);
2769                 } else {
2770                         needed = numpages / NSLABSP16KB;
2771                 }
2772
2773                 i = mcache_alloc_ext(mcache_audit_cache,
2774                     (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2775
2776                 VERIFY(mca_list != NULL && i == needed);
2777         }
2778
2779         lck_mtx_lock(mbuf_mlock);
2780
2781         for (i = 0; i < numpages; i++, page += NBPG) {
2782                 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2783                 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2784                 mbuf_class_t class = MC_BIGCL;
2785
2786                 /*
2787                  * If there is a mapper the appropriate I/O page is returned;
2788                  * zero out the page to discard its past contents to prevent
2789                  * exposing leftover kernel memory.
2790                  */
2791                 VERIFY(offset < mcl_pages);
2792                 if (mcl_paddr_base != 0) {
2793                         bzero((void *)(uintptr_t) page, page_size);
2794                         new_page = IOMapperInsertPage(mcl_paddr_base,
2795                             offset, new_page);
2796                 }
2797                 mcl_paddr[offset] = new_page;
2798
2799                 /* Pattern-fill this fresh page */
2800                 if (mclverify) {
2801                         mcache_set_pattern(MCACHE_FREE_PATTERN,
2802                             (caddr_t)page, NBPG);
2803                 }
2804                 if (bufsize == m_maxsize(MC_BIGCL)) {
2805                         union mbigcluster *mbc = (union mbigcluster *)page;
2806
2807                         /* One for the entire page */
2808                         sp = slab_get(mbc);
2809                         if (mclaudit != NULL) {
2810                                 mcl_audit_init(mbc, &mca_list, &con_list,
2811                                     AUDIT_CONTENTS_SIZE, NMBPBG);
2812                         }
2813                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2814                         slab_init(sp, MC_BIGCL, SLF_MAPPED,
2815                             mbc, mbc, bufsize, 0, 1);
2816
2817                         /* Insert this slab */
2818                         slab_insert(sp, MC_BIGCL);
2819
2820                         /* Update stats now since slab_get() drops the lock */
2821                         mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2822                             m_infree(MC_MBUF_BIGCL);
2823                         mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2824                         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2825                         class = MC_BIGCL;
2826                 } else if ((i % NSLABSP16KB) == 0) {
2827                         union m16kcluster *m16kcl = (union m16kcluster *)page;
2828                         mcl_slab_t *nsp;
2829                         int k;
2830
2831                         VERIFY(njcl > 0);
2832                         /* One for the entire 16KB */
2833                         sp = slab_get(m16kcl);
2834                         if (mclaudit != NULL)
2835                                 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2836
2837                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2838                         slab_init(sp, MC_16KCL, SLF_MAPPED,
2839                             m16kcl, m16kcl, bufsize, 0, 1);
2840
2841                         /*
2842                          * 2nd-Nth page's slab is part of the first one,
2843                          * where N is NSLABSP16KB.
2844                          */
2845                         for (k = 1; k < NSLABSP16KB; k++) {
2846                                 nsp = slab_get(((union mbigcluster *)page) + k);
2847                                 VERIFY(nsp->sl_refcnt == 0 &&
2848                                     nsp->sl_flags == 0);
2849                                 slab_init(nsp, MC_16KCL,
2850                                     SLF_MAPPED | SLF_PARTIAL,
2851                                     m16kcl, NULL, 0, 0, 0);
2852                         }
2853
2854                         /* Insert this slab */
2855                         slab_insert(sp, MC_16KCL);
2856
2857                         /* Update stats now since slab_get() drops the lock */
2858                         m_infree(MC_16KCL)++;
2859                         m_total(MC_16KCL)++;
2860                         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2861                         class = MC_16KCL;
2862                 }
2863                 if (!mb_peak_newreport && mbuf_report_usage(class))
2864                         mb_peak_newreport = TRUE;
2865         }
2866         VERIFY(mca_list == NULL && con_list == NULL);
2867
2868         /* We're done; let others enter */
2869         mb_clalloc_busy = FALSE;
2870         if (mb_clalloc_waiters > 0) {
2871                 mb_clalloc_waiters = 0;
2872                 wakeup(mb_clalloc_waitchan);
2873         }
2874
2875         if (bufsize == m_maxsize(MC_BIGCL))
2876                 return (numpages);
2877
2878         VERIFY(bufsize == m_maxsize(MC_16KCL));
2879         return (numpages / NSLABSP16KB);
2880
2881 out:
2882         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2883
2884         /* We're done; let others enter */
2885         mb_clalloc_busy = FALSE;
2886         if (mb_clalloc_waiters > 0) {
2887                 mb_clalloc_waiters = 0;
2888                 wakeup(mb_clalloc_waitchan);
2889         }
2890
2891         /*
2892          * When non-blocking we kick a thread if we have to grow the
2893          * pool or if the number of free clusters is less than requested.
2894          */
2895         if (bufsize == m_maxsize(MC_BIGCL)) {
2896                 if (i > 0) {
2897                         /*
2898                          * Remember total number of 4KB clusters needed
2899                          * at this time.
2900                          */
2901                         i += m_total(MC_BIGCL);
2902                         if (i > mbuf_expand_big) {
2903                                 mbuf_expand_big = i;
2904                                 if (mbuf_worker_ready)
2905                                         wakeup((caddr_t)&mbuf_worker_run);
2906                         }
2907                 }
2908
2909                 if (m_infree(MC_BIGCL) >= num)
2910                         return (1);
2911         } else {
2912                 if (i > 0) {
2913                         /*
2914                          * Remember total number of 16KB clusters needed
2915                          * at this time.
2916                          */
2917                         i += m_total(MC_16KCL);
2918                         if (i > mbuf_expand_16k) {
2919                                 mbuf_expand_16k = i;
2920                                 if (mbuf_worker_ready)
2921                                         wakeup((caddr_t)&mbuf_worker_run);
2922                         }
2923                 }
2924
2925                 if (m_infree(MC_16KCL) >= num)
2926                         return (1);
2927         }
2928         return (0);
2929 }
2930
2931 /*
2932  * Populate the global freelist of the corresponding buffer class.
2933  */
2934 static int
2935 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2936 {
2937         mcache_obj_t *o = NULL;
2938         int i, numpages = 0, count;
2939
2940         VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2941             class == MC_16KCL);
2942
2943         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2944
2945         switch (class) {
2946         case MC_MBUF:
2947         case MC_CL:
2948         case MC_BIGCL:
2949                 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2950                 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2951
2952                 /* Respect the 4KB clusters minimum limit */
2953                 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2954                     m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2955                         if (class != MC_BIGCL || (wait & MCR_COMP))
2956                                 return (0);
2957                 }
2958                 if (class == MC_BIGCL)
2959                         return (i != 0);
2960                 break;
2961
2962         case MC_16KCL:
2963                 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2964                 /* NOTREACHED */
2965
2966         default:
2967                 VERIFY(0);
2968                 /* NOTREACHED */
2969         }
2970
2971         VERIFY(class == MC_MBUF || class == MC_CL);
2972
2973         /* how many objects will we cut the page into? */
2974         int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2975
2976         for (count = 0; count < numpages; count++) {
2977
2978                 /* respect totals, minlimit, maxlimit */
2979                 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2980                     m_total(class) >= m_maxlimit(class))
2981                         break;
2982
2983                 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2984                         break;
2985
2986                 struct mbuf *m = (struct mbuf *)o;
2987                 union mcluster *c = (union mcluster *)o;
2988                 mcl_slab_t *sp = slab_get(o);
2989                 mcache_audit_t *mca = NULL;
2990
2991                 VERIFY(slab_is_detached(sp) &&
2992                     (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2993
2994                 /*
2995                  * Make sure that the cluster is unmolested
2996                  * while in freelist
2997                  */
2998                 if (mclverify) {
2999                         mca = mcl_audit_buf2mca(MC_BIGCL, o);
3000                         mcache_audit_free_verify(mca, o, 0,
3001                             m_maxsize(MC_BIGCL));
3002                 }
3003
3004                 /* Reinitialize it as an mbuf or 2K slab */
3005                 slab_init(sp, class, sp->sl_flags,
3006                     sp->sl_base, NULL, sp->sl_len, 0, numobj);
3007
3008                 VERIFY(o == (mcache_obj_t *)sp->sl_base);
3009                 VERIFY(sp->sl_head == NULL);
3010
3011                 VERIFY(m_total(MC_BIGCL) > 0);
3012                 m_total(MC_BIGCL)--;
3013                 mbstat.m_bigclusters = m_total(MC_BIGCL);
3014
3015                 m_total(class) += numobj;
3016                 m_infree(class) += numobj;
3017
3018                 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
3019                 VERIFY(m_total(class) <= m_maxlimit(class));
3020                 if (!mb_peak_newreport && mbuf_report_usage(class))
3021                         mb_peak_newreport = TRUE;
3022
3023                 i = numobj;
3024                 if (class == MC_MBUF) {
3025                         mbstat.m_mbufs = m_total(MC_MBUF);
3026                         mtype_stat_add(MT_FREE, NMBPBG);
3027                         while (i--) {
3028                                 /*
3029                                  * If auditing is enabled, construct the
3030                                  * shadow mbuf in the audit structure
3031                                  * instead of the actual one.
3032                                  * mbuf_slab_audit() will take care of
3033                                  * restoring the contents after the
3034                                  * integrity check.
3035                                  */
3036                                 if (mclaudit != NULL) {
3037                                         struct mbuf *ms;
3038                                         mca = mcl_audit_buf2mca(MC_MBUF,
3039                                             (mcache_obj_t *)m);
3040                                         ms = MCA_SAVED_MBUF_PTR(mca);
3041                                         ms->m_type = MT_FREE;
3042                                 } else {
3043                                         m->m_type = MT_FREE;
3044                                 }
3045                                 m->m_next = sp->sl_head;
3046                                 sp->sl_head = (void *)m++;
3047                         }
3048                 } else { /* MC_CL */
3049                         mbstat.m_clfree =
3050                             m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3051                         mbstat.m_clusters = m_total(MC_CL);
3052                         while (i--) {
3053                                 c->mcl_next = sp->sl_head;
3054                                 sp->sl_head = (void *)c++;
3055                         }
3056                 }
3057
3058                 /* Insert into the mbuf or 2k slab list */
3059                 slab_insert(sp, class);
3060
3061                 if ((i = mb_waiters) > 0)
3062                         mb_waiters = 0;
3063                 if (i != 0)
3064                         wakeup(mb_waitchan);
3065         }
3066         return (count != 0);
3067 }
3068
3069 /*
3070  * For each class, initialize the freelist to hold m_minlimit() objects.
3071  */
3072 static void
3073 freelist_init(mbuf_class_t class)
3074 {
3075         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3076
3077         VERIFY(class == MC_CL || class == MC_BIGCL);
3078         VERIFY(m_total(class) == 0);
3079         VERIFY(m_minlimit(class) > 0);
3080
3081         while (m_total(class) < m_minlimit(class))
3082                 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3083
3084         VERIFY(m_total(class) >= m_minlimit(class));
3085 }
3086
3087 /*
3088  * (Inaccurately) check if it might be worth a trip back to the
3089  * mcache layer due the availability of objects there.  We'll
3090  * end up back here if there's nothing up there.
3091  */
3092 static boolean_t
3093 mbuf_cached_above(mbuf_class_t class, int wait)
3094 {
3095         switch (class) {
3096         case MC_MBUF:
3097                 if (wait & MCR_COMP)
3098                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3099                             !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3100                 break;
3101
3102         case MC_CL:
3103                 if (wait & MCR_COMP)
3104                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3105                 break;
3106
3107         case MC_BIGCL:
3108                 if (wait & MCR_COMP)
3109                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3110                 break;
3111
3112         case MC_16KCL:
3113                 if (wait & MCR_COMP)
3114                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3115                 break;
3116
3117         case MC_MBUF_CL:
3118         case MC_MBUF_BIGCL:
3119         case MC_MBUF_16KCL:
3120                 break;
3121
3122         default:
3123                 VERIFY(0);
3124                 /* NOTREACHED */
3125         }
3126
3127         return (!mcache_bkt_isempty(m_cache(class)));
3128 }
3129
3130 /*
3131  * If possible, convert constructed objects to raw ones.
3132  */
3133 static boolean_t
3134 mbuf_steal(mbuf_class_t class, unsigned int num)
3135 {
3136         mcache_obj_t *top = NULL;
3137         mcache_obj_t **list = &top;
3138         unsigned int tot = 0;
3139
3140         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3141
3142         switch (class) {
3143         case MC_MBUF:
3144         case MC_CL:
3145         case MC_BIGCL:
3146         case MC_16KCL:
3147                 return (FALSE);
3148
3149         case MC_MBUF_CL:
3150         case MC_MBUF_BIGCL:
3151         case MC_MBUF_16KCL:
3152                 /* Get the required number of constructed objects if possible */
3153                 if (m_infree(class) > m_minlimit(class)) {
3154                         tot = cslab_alloc(class, &list,
3155                             MIN(num, m_infree(class)));
3156                 }
3157
3158                 /* And destroy them to get back the raw objects */
3159                 if (top != NULL)
3160                         (void) cslab_free(class, top, 1);
3161                 break;
3162
3163         default:
3164                 VERIFY(0);
3165                 /* NOTREACHED */
3166         }
3167
3168         return (tot == num);
3169 }
3170
3171 static void
3172 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3173 {
3174         int m, bmap = 0;
3175
3176         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3177
3178         VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3179         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3180         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3181
3182         /*
3183          * This logic can be made smarter; for now, simply mark
3184          * all other related classes as potential victims.
3185          */
3186         switch (class) {
3187         case MC_MBUF:
3188                 m_wantpurge(MC_CL)++;
3189                 m_wantpurge(MC_BIGCL)++;
3190                 m_wantpurge(MC_MBUF_CL)++;
3191                 m_wantpurge(MC_MBUF_BIGCL)++;
3192                 break;
3193
3194         case MC_CL:
3195                 m_wantpurge(MC_MBUF)++;
3196                 m_wantpurge(MC_BIGCL)++;
3197                 m_wantpurge(MC_MBUF_BIGCL)++;
3198                 if (!comp)
3199                         m_wantpurge(MC_MBUF_CL)++;
3200                 break;
3201
3202         case MC_BIGCL:
3203                 m_wantpurge(MC_MBUF)++;
3204                 m_wantpurge(MC_CL)++;
3205                 m_wantpurge(MC_MBUF_CL)++;
3206                 if (!comp)
3207                         m_wantpurge(MC_MBUF_BIGCL)++;
3208                 break;
3209
3210         case MC_16KCL:
3211                 if (!comp)
3212                         m_wantpurge(MC_MBUF_16KCL)++;
3213                 break;
3214
3215         default:
3216                 VERIFY(0);
3217                 /* NOTREACHED */
3218         }
3219
3220         /*
3221          * Run through each marked class and check if we really need to
3222          * purge (and therefore temporarily disable) the per-CPU caches
3223          * layer used by the class.  If so, remember the classes since
3224          * we are going to drop the lock below prior to purging.
3225          */
3226         for (m = 0; m < NELEM(mbuf_table); m++) {
3227                 if (m_wantpurge(m) > 0) {
3228                         m_wantpurge(m) = 0;
3229                         /*
3230                          * Try hard to steal the required number of objects
3231                          * from the freelist of other mbuf classes.  Only
3232                          * purge and disable the per-CPU caches layer when
3233                          * we don't have enough; it's the last resort.
3234                          */
3235                         if (!mbuf_steal(m, num))
3236                                 bmap |= (1 << m);
3237                 }
3238         }
3239
3240         lck_mtx_unlock(mbuf_mlock);
3241
3242         if (bmap != 0) {
3243                 /* signal the domains to drain */
3244                 net_drain_domains();
3245
3246                 /* Sigh; we have no other choices but to ask mcache to purge */
3247                 for (m = 0; m < NELEM(mbuf_table); m++) {
3248                         if ((bmap & (1 << m)) &&
3249                             mcache_purge_cache(m_cache(m), TRUE)) {
3250                                 lck_mtx_lock(mbuf_mlock);
3251                                 m_purge_cnt(m)++;
3252                                 mbstat.m_drain++;
3253                                 lck_mtx_unlock(mbuf_mlock);
3254                         }
3255                 }
3256         } else {
3257                 /*
3258                  * Request mcache to reap extra elements from all of its caches;
3259                  * note that all reaps are serialized and happen only at a fixed
3260                  * interval.
3261                  */
3262                 mcache_reap();
3263         }
3264         lck_mtx_lock(mbuf_mlock);
3265 }
3266
3267 static inline struct mbuf *
3268 m_get_common(int wait, short type, int hdr)
3269 {
3270         struct mbuf *m;
3271         int mcflags = MSLEEPF(wait);
3272
3273         /* Is this due to a non-blocking retry?  If so, then try harder */
3274         if (mcflags & MCR_NOSLEEP)
3275                 mcflags |= MCR_TRYHARD;
3276
3277         m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3278         if (m != NULL) {
3279                 MBUF_INIT(m, hdr, type);
3280                 mtype_stat_inc(type);
3281                 mtype_stat_dec(MT_FREE);
3282 #if CONFIG_MACF_NET
3283                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3284                         m_free(m);
3285                         return (NULL);
3286                 }
3287 #endif /* MAC_NET */
3288         }
3289         return (m);
3290 }
3291
3292 /*
3293  * Space allocation routines; these are also available as macros
3294  * for critical paths.
3295  */
3296 #define _M_GET(wait, type)      m_get_common(wait, type, 0)
3297 #define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
3298 #define _M_RETRY(wait, type)    _M_GET(wait, type)
3299 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3300 #define _MGET(m, how, type)     ((m) = _M_GET(how, type))
3301 #define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
3302
3303 struct mbuf *
3304 m_get(int wait, int type)
3305 {
3306         return (_M_GET(wait, type));
3307 }
3308
3309 struct mbuf *
3310 m_gethdr(int wait, int type)
3311 {
3312         return (_M_GETHDR(wait, type));
3313 }
3314
3315 struct mbuf *
3316 m_retry(int wait, int type)
3317 {
3318         return (_M_RETRY(wait, type));
3319 }
3320
3321 struct mbuf *
3322 m_retryhdr(int wait, int type)
3323 {
3324         return (_M_RETRYHDR(wait, type));
3325 }
3326
3327 struct mbuf *
3328 m_getclr(int wait, int type)
3329 {
3330         struct mbuf *m;
3331
3332         _MGET(m, wait, type);
3333         if (m != NULL)
3334                 bzero(MTOD(m, caddr_t), MLEN);
3335         return (m);
3336 }
3337
3338 struct mbuf *
3339 m_free(struct mbuf *m)
3340 {
3341         struct mbuf *n = m->m_next;
3342
3343         if (m->m_type == MT_FREE)
3344                 panic("m_free: freeing an already freed mbuf");
3345
3346         if (m->m_flags & M_PKTHDR) {
3347                 /* Check for scratch area overflow */
3348                 m_redzone_verify(m);
3349                 /* Free the aux data and tags if there is any */
3350                 m_tag_delete_chain(m, NULL);
3351         }
3352
3353         if (m->m_flags & M_EXT) {
3354                 u_int32_t refcnt;
3355                 u_int32_t composite;
3356
3357                 refcnt = m_decref(m);
3358                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3359                 if (refcnt == 0 && !composite) {
3360                         if (m->m_ext.ext_free == NULL) {
3361                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3362                         } else if (m->m_ext.ext_free == m_bigfree) {
3363                                 mcache_free(m_cache(MC_BIGCL),
3364                                     m->m_ext.ext_buf);
3365                         } else if (m->m_ext.ext_free == m_16kfree) {
3366                                 mcache_free(m_cache(MC_16KCL),
3367                                     m->m_ext.ext_buf);
3368                         } else {
3369                                 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3370                                     m->m_ext.ext_size, m->m_ext.ext_arg);
3371                         }
3372                         mcache_free(ref_cache, MEXT_RFA(m));
3373                         MEXT_RFA(m) = NULL;
3374                 } else if (refcnt == 0 && composite) {
3375                         VERIFY(m->m_type != MT_FREE);
3376
3377                         mtype_stat_dec(m->m_type);
3378                         mtype_stat_inc(MT_FREE);
3379
3380                         m->m_type = MT_FREE;
3381                         m->m_flags = M_EXT;
3382                         m->m_len = 0;
3383                         m->m_next = m->m_nextpkt = NULL;
3384
3385                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3386
3387                         /* "Free" into the intermediate cache */
3388                         if (m->m_ext.ext_free == NULL) {
3389                                 mcache_free(m_cache(MC_MBUF_CL), m);
3390                         } else if (m->m_ext.ext_free == m_bigfree) {
3391                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3392                         } else {
3393                                 VERIFY(m->m_ext.ext_free == m_16kfree);
3394                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3395                         }
3396                         return (n);
3397                 }
3398         }
3399
3400         if (m->m_type != MT_FREE) {
3401                 mtype_stat_dec(m->m_type);
3402                 mtype_stat_inc(MT_FREE);
3403         }
3404
3405         m->m_type = MT_FREE;
3406         m->m_flags = m->m_len = 0;
3407         m->m_next = m->m_nextpkt = NULL;
3408
3409         mcache_free(m_cache(MC_MBUF), m);
3410
3411         return (n);
3412 }
3413
3414 __private_extern__ struct mbuf *
3415 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3416     void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3417     int wait)
3418 {
3419         struct ext_ref *rfa = NULL;
3420
3421         if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3422                 return (NULL);
3423
3424         if (m->m_flags & M_EXT) {
3425                 u_int32_t refcnt;
3426                 u_int32_t composite;
3427
3428                 refcnt = m_decref(m);
3429                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3430                 if (refcnt == 0 && !composite) {
3431                         if (m->m_ext.ext_free == NULL) {
3432                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3433                         } else if (m->m_ext.ext_free == m_bigfree) {
3434                                 mcache_free(m_cache(MC_BIGCL),
3435                                     m->m_ext.ext_buf);
3436                         } else if (m->m_ext.ext_free == m_16kfree) {
3437                                 mcache_free(m_cache(MC_16KCL),
3438                                     m->m_ext.ext_buf);
3439                         } else {
3440                                 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3441                                     m->m_ext.ext_size, m->m_ext.ext_arg);
3442                         }
3443                         /* Re-use the reference structure */
3444                         rfa = MEXT_RFA(m);
3445                 } else if (refcnt == 0 && composite) {
3446                         VERIFY(m->m_type != MT_FREE);
3447
3448                         mtype_stat_dec(m->m_type);
3449                         mtype_stat_inc(MT_FREE);
3450
3451                         m->m_type = MT_FREE;
3452                         m->m_flags = M_EXT;
3453                         m->m_len = 0;
3454                         m->m_next = m->m_nextpkt = NULL;
3455
3456                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3457
3458                         /* "Free" into the intermediate cache */
3459                         if (m->m_ext.ext_free == NULL) {
3460                                 mcache_free(m_cache(MC_MBUF_CL), m);
3461                         } else if (m->m_ext.ext_free == m_bigfree) {
3462                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3463                         } else {
3464                                 VERIFY(m->m_ext.ext_free == m_16kfree);
3465                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3466                         }
3467                         /*
3468                          * Allocate a new mbuf, since we didn't divorce
3469                          * the composite mbuf + cluster pair above.
3470                          */
3471                         if ((m = _M_GETHDR(wait, type)) == NULL)
3472                                 return (NULL);
3473                 }
3474         }
3475
3476         if (rfa == NULL &&
3477             (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3478                 m_free(m);
3479                 return (NULL);
3480         }
3481
3482         MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3483
3484         return (m);
3485 }
3486
3487 /*
3488  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3489  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3490  */
3491 struct mbuf *
3492 m_getcl(int wait, int type, int flags)
3493 {
3494         struct mbuf *m;
3495         int mcflags = MSLEEPF(wait);
3496         int hdr = (flags & M_PKTHDR);
3497
3498         /* Is this due to a non-blocking retry?  If so, then try harder */
3499         if (mcflags & MCR_NOSLEEP)
3500                 mcflags |= MCR_TRYHARD;
3501
3502         m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3503         if (m != NULL) {
3504                 u_int32_t flag;
3505                 struct ext_ref *rfa;
3506                 void *cl;
3507
3508                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3509                 cl = m->m_ext.ext_buf;
3510                 rfa = MEXT_RFA(m);
3511
3512                 ASSERT(cl != NULL && rfa != NULL);
3513                 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3514
3515                 flag = MEXT_FLAGS(m);
3516
3517                 MBUF_INIT(m, hdr, type);
3518                 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3519
3520                 mtype_stat_inc(type);
3521                 mtype_stat_dec(MT_FREE);
3522 #if CONFIG_MACF_NET
3523                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3524                         m_freem(m);
3525                         return (NULL);
3526                 }
3527 #endif /* MAC_NET */
3528         }
3529         return (m);
3530 }
3531
3532 /* m_mclget() add an mbuf cluster to a normal mbuf */
3533 struct mbuf *
3534 m_mclget(struct mbuf *m, int wait)
3535 {
3536         struct ext_ref *rfa;
3537
3538         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3539                 return (m);
3540
3541         m->m_ext.ext_buf = m_mclalloc(wait);
3542         if (m->m_ext.ext_buf != NULL) {
3543                 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3544         } else {
3545                 mcache_free(ref_cache, rfa);
3546         }
3547         return (m);
3548 }
3549
3550 /* Allocate an mbuf cluster */
3551 caddr_t
3552 m_mclalloc(int wait)
3553 {
3554         int mcflags = MSLEEPF(wait);
3555
3556         /* Is this due to a non-blocking retry?  If so, then try harder */
3557         if (mcflags & MCR_NOSLEEP)
3558                 mcflags |= MCR_TRYHARD;
3559
3560         return (mcache_alloc(m_cache(MC_CL), mcflags));
3561 }
3562
3563 /* Free an mbuf cluster */
3564 void
3565 m_mclfree(caddr_t p)
3566 {
3567         mcache_free(m_cache(MC_CL), p);
3568 }
3569
3570 /*
3571  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3572  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3573  */
3574 int
3575 m_mclhasreference(struct mbuf *m)
3576 {
3577         if (!(m->m_flags & M_EXT))
3578                 return (0);
3579
3580         ASSERT(MEXT_RFA(m) != NULL);
3581
3582         return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3583 }
3584
3585 __private_extern__ caddr_t
3586 m_bigalloc(int wait)
3587 {
3588         int mcflags = MSLEEPF(wait);
3589
3590         /* Is this due to a non-blocking retry?  If so, then try harder */
3591         if (mcflags & MCR_NOSLEEP)
3592                 mcflags |= MCR_TRYHARD;
3593
3594         return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3595 }
3596
3597 __private_extern__ void
3598 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3599 {
3600         mcache_free(m_cache(MC_BIGCL), p);
3601 }
3602
3603 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3604 __private_extern__ struct mbuf *
3605 m_mbigget(struct mbuf *m, int wait)
3606 {
3607         struct ext_ref *rfa;
3608
3609         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3610                 return (m);
3611
3612         m->m_ext.ext_buf =  m_bigalloc(wait);
3613         if (m->m_ext.ext_buf != NULL) {
3614                 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3615         } else {
3616                 mcache_free(ref_cache, rfa);
3617         }
3618         return (m);
3619 }
3620
3621 __private_extern__ caddr_t
3622 m_16kalloc(int wait)
3623 {
3624         int mcflags = MSLEEPF(wait);
3625
3626         /* Is this due to a non-blocking retry?  If so, then try harder */
3627         if (mcflags & MCR_NOSLEEP)
3628                 mcflags |= MCR_TRYHARD;
3629
3630         return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3631 }
3632
3633 __private_extern__ void
3634 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3635 {
3636         mcache_free(m_cache(MC_16KCL), p);
3637 }
3638
3639 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3640 __private_extern__ struct mbuf *
3641 m_m16kget(struct mbuf *m, int wait)
3642 {
3643         struct ext_ref *rfa;
3644
3645         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3646                 return (m);
3647
3648         m->m_ext.ext_buf =  m_16kalloc(wait);
3649         if (m->m_ext.ext_buf != NULL) {
3650                 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3651         } else {
3652                 mcache_free(ref_cache, rfa);
3653         }
3654         return (m);
3655 }
3656
3657 /*
3658  * "Move" mbuf pkthdr from "from" to "to".
3659  * "from" must have M_PKTHDR set, and "to" must be empty.
3660  */
3661 void
3662 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3663 {
3664         VERIFY(from->m_flags & M_PKTHDR);
3665
3666         /* Check for scratch area overflow */
3667         m_redzone_verify(from);
3668
3669         if (to->m_flags & M_PKTHDR) {
3670                 /* Check for scratch area overflow */
3671                 m_redzone_verify(to);
3672                 /* We will be taking over the tags of 'to' */
3673                 m_tag_delete_chain(to, NULL);
3674         }
3675         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
3676         m_classifier_init(from, 0);             /* purge classifier info */
3677         m_tag_init(from, 1);                    /* purge all tags from src */
3678         m_scratch_init(from);                   /* clear src scratch area */
3679         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3680         if ((to->m_flags & M_EXT) == 0)
3681                 to->m_data = to->m_pktdat;
3682         m_redzone_init(to);                     /* setup red zone on dst */
3683 }
3684
3685 /*
3686  * Duplicate "from"'s mbuf pkthdr in "to".
3687  * "from" must have M_PKTHDR set, and "to" must be empty.
3688  * In particular, this does a deep copy of the packet tags.
3689  */
3690 static int
3691 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3692 {
3693         VERIFY(from->m_flags & M_PKTHDR);
3694
3695         /* Check for scratch area overflow */
3696         m_redzone_verify(from);
3697
3698         if (to->m_flags & M_PKTHDR) {
3699                 /* Check for scratch area overflow */
3700                 m_redzone_verify(to);
3701                 /* We will be taking over the tags of 'to' */
3702                 m_tag_delete_chain(to, NULL);
3703         }
3704         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3705         if ((to->m_flags & M_EXT) == 0)
3706                 to->m_data = to->m_pktdat;
3707         to->m_pkthdr = from->m_pkthdr;
3708         m_redzone_init(to);                     /* setup red zone on dst */
3709         m_tag_init(to, 0);                      /* preserve dst static tags */
3710         return (m_tag_copy_chain(to, from, how));
3711 }
3712
3713 void
3714 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3715 {
3716         to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3717 #if PF_ECN
3718         to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3719         to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3720 #endif /* PF_ECN */
3721 }
3722
3723 void
3724 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3725 {
3726         VERIFY(m->m_flags & M_PKTHDR);
3727
3728         m->m_pkthdr.pkt_proto = 0;
3729         m->m_pkthdr.pkt_flowsrc = 0;
3730         m->m_pkthdr.pkt_flowid = 0;
3731         m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
3732         /* preserve service class and interface info for loopback packets */
3733         if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3734                 (void) m_set_service_class(m, MBUF_SC_BE);
3735         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3736                 m->m_pkthdr.pkt_ifainfo = 0;
3737 #if MEASURE_BW
3738         m->m_pkthdr.pkt_bwseq  = 0;
3739 #endif /* MEASURE_BW */
3740 }
3741
3742 void
3743 m_copy_classifier(struct mbuf *to, struct mbuf *from)
3744 {
3745         VERIFY(to->m_flags & M_PKTHDR);
3746         VERIFY(from->m_flags & M_PKTHDR);
3747
3748         to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3749         to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3750         to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3751         to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3752         (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3753         to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
3754 #if MEASURE_BW
3755         to->m_pkthdr.pkt_bwseq  = from->m_pkthdr.pkt_bwseq;
3756 #endif /* MEASURE_BW */
3757 }
3758
3759 /*
3760  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
3761  * if wantall is not set, return whatever number were available.  Set up the
3762  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3763  * are chained on the m_nextpkt field.  Any packets requested beyond this
3764  * are chained onto the last packet header's m_next field.  The size of
3765  * the cluster is controlled by the parameter bufsize.
3766  */
3767 __private_extern__ struct mbuf *
3768 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3769     int wait, int wantall, size_t bufsize)
3770 {
3771         struct mbuf *m;
3772         struct mbuf **np, *top;
3773         unsigned int pnum, needed = *num_needed;
3774         mcache_obj_t *mp_list = NULL;
3775         int mcflags = MSLEEPF(wait);
3776         u_int32_t flag;
3777         struct ext_ref *rfa;
3778         mcache_t *cp;
3779         void *cl;
3780
3781         ASSERT(bufsize == m_maxsize(MC_CL) ||
3782             bufsize == m_maxsize(MC_BIGCL) ||
3783             bufsize == m_maxsize(MC_16KCL));
3784
3785         /*
3786          * Caller must first check for njcl because this
3787          * routine is internal and not exposed/used via KPI.
3788          */
3789         VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3790
3791         top = NULL;
3792         np = &top;
3793         pnum = 0;
3794
3795         /*
3796          * The caller doesn't want all the requested buffers; only some.
3797          * Try hard to get what we can, but don't block.  This effectively
3798          * overrides MCR_SLEEP, since this thread will not go to sleep
3799          * if we can't get all the buffers.
3800          */
3801         if (!wantall || (mcflags & MCR_NOSLEEP))
3802                 mcflags |= MCR_TRYHARD;
3803
3804         /* Allocate the composite mbuf + cluster elements from the cache */
3805         if (bufsize == m_maxsize(MC_CL))
3806                 cp = m_cache(MC_MBUF_CL);
3807         else if (bufsize == m_maxsize(MC_BIGCL))
3808                 cp = m_cache(MC_MBUF_BIGCL);
3809         else
3810                 cp = m_cache(MC_MBUF_16KCL);
3811         needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3812
3813         for (pnum = 0; pnum < needed; pnum++) {
3814                 m = (struct mbuf *)mp_list;
3815                 mp_list = mp_list->obj_next;
3816
3817                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3818                 cl = m->m_ext.ext_buf;
3819                 rfa = MEXT_RFA(m);
3820
3821                 ASSERT(cl != NULL && rfa != NULL);
3822                 VERIFY(MBUF_IS_COMPOSITE(m));
3823
3824                 flag = MEXT_FLAGS(m);
3825
3826                 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3827                 if (bufsize == m_maxsize(MC_16KCL)) {
3828                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3829                 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3830                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3831                 } else {
3832                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
3833                 }
3834
3835                 if (num_with_pkthdrs > 0) {
3836                         --num_with_pkthdrs;
3837 #if CONFIG_MACF_NET
3838                         if (mac_mbuf_label_init(m, wait) != 0) {
3839                                 m_freem(m);
3840                                 break;
3841                         }
3842 #endif /* MAC_NET */
3843                 }
3844
3845                 *np = m;
3846                 if (num_with_pkthdrs > 0)
3847                         np = &m->m_nextpkt;
3848                 else
3849                         np = &m->m_next;
3850         }
3851         ASSERT(pnum != *num_needed || mp_list == NULL);
3852         if (mp_list != NULL)
3853                 mcache_free_ext(cp, mp_list);
3854
3855         if (pnum > 0) {
3856                 mtype_stat_add(MT_DATA, pnum);
3857                 mtype_stat_sub(MT_FREE, pnum);
3858         }
3859
3860         if (wantall && (pnum != *num_needed)) {
3861                 if (top != NULL)
3862                         m_freem_list(top);
3863                 return (NULL);
3864         }
3865
3866         if (pnum > *num_needed) {
3867                 printf("%s: File a radar related to <rdar://10146739>. \
3868                         needed = %u, pnum = %u, num_needed = %u \n",
3869                         __func__, needed, pnum, *num_needed);
3870         }
3871
3872         *num_needed = pnum;
3873         return (top);
3874 }
3875
3876 /*
3877  * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
3878  * wantall is not set, return whatever number were available.  The size of
3879  * each mbuf in the list is controlled by the parameter packetlen.  Each
3880  * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
3881  * in the chain is called a segment.  If maxsegments is not null and the
3882  * value pointed to is not null, this specify the maximum number of segments
3883  * for a chain of mbufs.  If maxsegments is zero or the value pointed to
3884  * is zero the caller does not have any restriction on the number of segments.
3885  * The actual  number of segments of a mbuf chain is return in the value
3886  * pointed to by maxsegments.
3887  */
3888 __private_extern__ struct mbuf *
3889 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3890     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3891 {
3892         struct mbuf **np, *top, *first = NULL;
3893         size_t bufsize, r_bufsize;
3894         unsigned int num = 0;
3895         unsigned int nsegs = 0;
3896         unsigned int needed, resid;
3897         int mcflags = MSLEEPF(wait);
3898         mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3899         mcache_t *cp = NULL, *rcp = NULL;
3900
3901         if (*numlist == 0)
3902                 return (NULL);
3903
3904         top = NULL;
3905         np = &top;
3906
3907         if (wantsize == 0) {
3908                 if (packetlen <= MINCLSIZE) {
3909                         bufsize = packetlen;
3910                 } else if (packetlen > m_maxsize(MC_CL)) {
3911                         /* Use 4KB if jumbo cluster pool isn't available */
3912                         if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3913                                 bufsize = m_maxsize(MC_BIGCL);
3914                         else
3915                                 bufsize = m_maxsize(MC_16KCL);
3916                 } else {
3917                         bufsize = m_maxsize(MC_CL);
3918                 }
3919         } else if (wantsize == m_maxsize(MC_CL) ||
3920             wantsize == m_maxsize(MC_BIGCL) ||
3921             (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3922                 bufsize = wantsize;
3923         } else {
3924                 return (NULL);
3925         }
3926
3927         if (bufsize <= MHLEN) {
3928                 nsegs = 1;
3929         } else if (bufsize <= MINCLSIZE) {
3930                 if (maxsegments != NULL && *maxsegments == 1) {
3931                         bufsize = m_maxsize(MC_CL);
3932                         nsegs = 1;
3933                 } else {
3934                         nsegs = 2;
3935                 }
3936         } else if (bufsize == m_maxsize(MC_16KCL)) {
3937                 VERIFY(njcl > 0);
3938                 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3939         } else if (bufsize == m_maxsize(MC_BIGCL)) {
3940                 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3941         } else {
3942                 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3943         }
3944         if (maxsegments != NULL) {
3945                 if (*maxsegments && nsegs > *maxsegments) {
3946                         *maxsegments = nsegs;
3947                         return (NULL);
3948                 }
3949                 *maxsegments = nsegs;
3950         }
3951
3952         /*
3953          * The caller doesn't want all the requested buffers; only some.
3954          * Try hard to get what we can, but don't block.  This effectively
3955          * overrides MCR_SLEEP, since this thread will not go to sleep
3956          * if we can't get all the buffers.
3957          */
3958         if (!wantall || (mcflags & MCR_NOSLEEP))
3959                 mcflags |= MCR_TRYHARD;
3960
3961         /*
3962          * Simple case where all elements in the lists/chains are mbufs.
3963          * Unless bufsize is greater than MHLEN, each segment chain is made
3964          * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
3965          * of 2 mbufs; the second one is used for the residual data, i.e.
3966          * the remaining data that cannot fit into the first mbuf.
3967          */
3968         if (bufsize <= MINCLSIZE) {
3969                 /* Allocate the elements in one shot from the mbuf cache */
3970                 ASSERT(bufsize <= MHLEN || nsegs == 2);
3971                 cp = m_cache(MC_MBUF);
3972                 needed = mcache_alloc_ext(cp, &mp_list,
3973                     (*numlist) * nsegs, mcflags);
3974
3975                 /*
3976                  * The number of elements must be even if we are to use an
3977                  * mbuf (instead of a cluster) to store the residual data.
3978                  * If we couldn't allocate the requested number of mbufs,
3979                  * trim the number down (if it's odd) in order to avoid
3980                  * creating a partial segment chain.
3981                  */
3982                 if (bufsize > MHLEN && (needed & 0x1))
3983                         needed--;
3984
3985                 while (num < needed) {
3986                         struct mbuf *m;
3987
3988                         m = (struct mbuf *)mp_list;
3989                         mp_list = mp_list->obj_next;
3990                         ASSERT(m != NULL);
3991
3992                         MBUF_INIT(m, 1, MT_DATA);
3993 #if CONFIG_MACF_NET
3994                         if (mac_init_mbuf(m, wait) != 0) {
3995                                 m_free(m);
3996                                 break;
3997                         }
3998 #endif /* MAC_NET */
3999                         num++;
4000                         if (bufsize > MHLEN) {
4001                                 /* A second mbuf for this segment chain */
4002                                 m->m_next = (struct mbuf *)mp_list;
4003                                 mp_list = mp_list->obj_next;
4004                                 ASSERT(m->m_next != NULL);
4005
4006                                 MBUF_INIT(m->m_next, 0, MT_DATA);
4007                                 num++;
4008                         }
4009                         *np = m;
4010                         np = &m->m_nextpkt;
4011                 }
4012                 ASSERT(num != *numlist || mp_list == NULL);
4013
4014                 if (num > 0) {
4015                         mtype_stat_add(MT_DATA, num);
4016                         mtype_stat_sub(MT_FREE, num);
4017                 }
4018                 num /= nsegs;
4019
4020                 /* We've got them all; return to caller */
4021                 if (num == *numlist)
4022                         return (top);
4023
4024                 goto fail;
4025         }
4026
4027         /*
4028          * Complex cases where elements are made up of one or more composite
4029          * mbufs + cluster, depending on packetlen.  Each N-segment chain can
4030          * be illustrated as follows:
4031          *
4032          * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4033          *
4034          * Every composite mbuf + cluster element comes from the intermediate
4035          * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
4036          * the last composite element will come from the MC_MBUF_CL cache,
4037          * unless the residual data is larger than 2KB where we use the
4038          * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4039          * data is defined as extra data beyond the first element that cannot
4040          * fit into the previous element, i.e. there is no residual data if
4041          * the chain only has 1 segment.
4042          */
4043         r_bufsize = bufsize;
4044         resid = packetlen > bufsize ? packetlen % bufsize : 0;
4045         if (resid > 0) {
4046                 /* There is residual data; figure out the cluster size */
4047                 if (wantsize == 0 && packetlen > MINCLSIZE) {
4048                         /*
4049                          * Caller didn't request that all of the segments
4050                          * in the chain use the same cluster size; use the
4051                          * smaller of the cluster sizes.
4052                          */
4053                         if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4054                                 r_bufsize = m_maxsize(MC_16KCL);
4055                         else if (resid > m_maxsize(MC_CL))
4056                                 r_bufsize = m_maxsize(MC_BIGCL);
4057                         else
4058                                 r_bufsize = m_maxsize(MC_CL);
4059                 } else {
4060                         /* Use the same cluster size as the other segments */
4061                         resid = 0;
4062                 }
4063         }
4064
4065         needed = *numlist;
4066         if (resid > 0) {
4067                 /*
4068                  * Attempt to allocate composite mbuf + cluster elements for
4069                  * the residual data in each chain; record the number of such
4070                  * elements that can be allocated so that we know how many
4071                  * segment chains we can afford to create.
4072                  */
4073                 if (r_bufsize <= m_maxsize(MC_CL))
4074                         rcp = m_cache(MC_MBUF_CL);
4075                 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4076                         rcp = m_cache(MC_MBUF_BIGCL);
4077                 else
4078                         rcp = m_cache(MC_MBUF_16KCL);
4079                 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4080
4081                 if (needed == 0)
4082                         goto fail;
4083
4084                 /* This is temporarily reduced for calculation */
4085                 ASSERT(nsegs > 1);
4086                 nsegs--;
4087         }
4088
4089         /*
4090          * Attempt to allocate the rest of the composite mbuf + cluster
4091          * elements for the number of segment chains that we need.
4092          */
4093         if (bufsize <= m_maxsize(MC_CL))
4094                 cp = m_cache(MC_MBUF_CL);
4095         else if (bufsize <= m_maxsize(MC_BIGCL))
4096                 cp = m_cache(MC_MBUF_BIGCL);
4097         else
4098                 cp = m_cache(MC_MBUF_16KCL);
4099         needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4100
4101         /* Round it down to avoid creating a partial segment chain */
4102         needed = (needed / nsegs) * nsegs;
4103         if (needed == 0)
4104                 goto fail;
4105
4106         if (resid > 0) {
4107                 /*
4108                  * We're about to construct the chain(s); take into account
4109                  * the number of segments we have created above to hold the
4110                  * residual data for each chain, as well as restore the
4111                  * original count of segments per chain.
4112                  */
4113                 ASSERT(nsegs > 0);
4114                 needed += needed / nsegs;
4115                 nsegs++;
4116         }
4117
4118         for (;;) {
4119                 struct mbuf *m;
4120                 u_int32_t flag;
4121                 struct ext_ref *rfa;
4122                 void *cl;
4123                 int pkthdr;
4124
4125                 ++num;
4126                 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4127                         m = (struct mbuf *)mp_list;
4128                         mp_list = mp_list->obj_next;
4129                 } else {
4130                         m = (struct mbuf *)rmp_list;
4131                         rmp_list = rmp_list->obj_next;
4132                 }
4133                 ASSERT(m != NULL);
4134                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4135                 VERIFY(m->m_ext.ext_free == NULL ||
4136                     m->m_ext.ext_free == m_bigfree ||
4137                     m->m_ext.ext_free == m_16kfree);
4138
4139                 cl = m->m_ext.ext_buf;
4140                 rfa = MEXT_RFA(m);
4141
4142                 ASSERT(cl != NULL && rfa != NULL);
4143                 VERIFY(MBUF_IS_COMPOSITE(m));
4144
4145                 flag = MEXT_FLAGS(m);
4146
4147                 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4148                 if (pkthdr)
4149                         first = m;
4150                 MBUF_INIT(m, pkthdr, MT_DATA);
4151                 if (m->m_ext.ext_free == m_16kfree) {
4152                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4153                 } else if (m->m_ext.ext_free == m_bigfree) {
4154                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4155                 } else {
4156                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4157                 }
4158 #if CONFIG_MACF_NET
4159                 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4160                         --num;
4161                         m_freem(m);
4162                         break;
4163                 }
4164 #endif /* MAC_NET */
4165
4166                 *np = m;
4167                 if ((num % nsegs) == 0)
4168                         np = &first->m_nextpkt;
4169                 else
4170                         np = &m->m_next;
4171
4172                 if (num == needed)
4173                         break;
4174         }
4175
4176         if (num > 0) {
4177                 mtype_stat_add(MT_DATA, num);
4178                 mtype_stat_sub(MT_FREE, num);
4179         }
4180
4181         num /= nsegs;
4182
4183         /* We've got them all; return to caller */
4184         if (num == *numlist) {
4185                 ASSERT(mp_list == NULL && rmp_list == NULL);
4186                 return (top);
4187         }
4188
4189 fail:
4190         /* Free up what's left of the above */
4191         if (mp_list != NULL)
4192                 mcache_free_ext(cp, mp_list);
4193         if (rmp_list != NULL)
4194                 mcache_free_ext(rcp, rmp_list);
4195         if (wantall && top != NULL) {
4196                 m_freem(top);
4197                 return (NULL);
4198         }
4199         *numlist = num;
4200         return (top);
4201 }
4202
4203 /*
4204  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4205  * packets on receive ring.
4206  */
4207 __private_extern__ struct mbuf *
4208 m_getpacket_how(int wait)
4209 {
4210         unsigned int num_needed = 1;
4211
4212         return (m_getpackets_internal(&num_needed, 1, wait, 1,
4213             m_maxsize(MC_CL)));
4214 }
4215
4216 /*
4217  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4218  * packets on receive ring.
4219  */
4220 struct mbuf *
4221 m_getpacket(void)
4222 {
4223         unsigned int num_needed = 1;
4224
4225         return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4226             m_maxsize(MC_CL)));
4227 }
4228
4229 /*
4230  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4231  * if this can't be met, return whatever number were available.  Set up the
4232  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4233  * are chained on the m_nextpkt field.  Any packets requested beyond this are
4234  * chained onto the last packet header's m_next field.
4235  */
4236 struct mbuf *
4237 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4238 {
4239         unsigned int n = num_needed;
4240
4241         return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4242             m_maxsize(MC_CL)));
4243 }
4244
4245 /*
4246  * Return a list of mbuf hdrs set up as packet hdrs chained together
4247  * on the m_nextpkt field
4248  */
4249 struct mbuf *
4250 m_getpackethdrs(int num_needed, int how)
4251 {
4252         struct mbuf *m;
4253         struct mbuf **np, *top;
4254
4255         top = NULL;
4256         np = &top;
4257
4258         while (num_needed--) {
4259                 m = _M_RETRYHDR(how, MT_DATA);
4260                 if (m == NULL)
4261                         break;
4262
4263                 *np = m;
4264                 np = &m->m_nextpkt;
4265         }
4266
4267         return (top);
4268 }
4269
4270 /*
4271  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4272  * for mbufs packets freed.  Used by the drivers.
4273  */
4274 int
4275 m_freem_list(struct mbuf *m)
4276 {
4277         struct mbuf *nextpkt;
4278         mcache_obj_t *mp_list = NULL;
4279         mcache_obj_t *mcl_list = NULL;
4280         mcache_obj_t *mbc_list = NULL;
4281         mcache_obj_t *m16k_list = NULL;
4282         mcache_obj_t *m_mcl_list = NULL;
4283         mcache_obj_t *m_mbc_list = NULL;
4284         mcache_obj_t *m_m16k_list = NULL;
4285         mcache_obj_t *ref_list = NULL;
4286         int pktcount = 0;
4287         int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4288
4289         while (m != NULL) {
4290                 pktcount++;
4291
4292                 nextpkt = m->m_nextpkt;
4293                 m->m_nextpkt = NULL;
4294
4295                 while (m != NULL) {
4296                         struct mbuf *next = m->m_next;
4297                         mcache_obj_t *o, *rfa;
4298                         u_int32_t refcnt, composite;
4299
4300                         if (m->m_type == MT_FREE)
4301                                 panic("m_free: freeing an already freed mbuf");
4302
4303                         if (m->m_type != MT_FREE)
4304                                 mt_free++;
4305
4306                         if (m->m_flags & M_PKTHDR) {
4307                                 /* Check for scratch area overflow */
4308                                 m_redzone_verify(m);
4309                                 /* Free the aux data and tags if there is any */
4310                                 m_tag_delete_chain(m, NULL);
4311                         }
4312
4313                         if (!(m->m_flags & M_EXT))
4314                                 goto simple_free;
4315
4316                         o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4317                         refcnt = m_decref(m);
4318                         composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4319                         if (refcnt == 0 && !composite) {
4320                                 if (m->m_ext.ext_free == NULL) {
4321                                         o->obj_next = mcl_list;
4322                                         mcl_list = o;
4323                                 } else if (m->m_ext.ext_free == m_bigfree) {
4324                                         o->obj_next = mbc_list;
4325                                         mbc_list = o;
4326                                 } else if (m->m_ext.ext_free == m_16kfree) {
4327                                         o->obj_next = m16k_list;
4328                                         m16k_list = o;
4329                                 } else {
4330                                         (*(m->m_ext.ext_free))((caddr_t)o,
4331                                             m->m_ext.ext_size,
4332                                             m->m_ext.ext_arg);
4333                                 }
4334                                 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4335                                 rfa->obj_next = ref_list;
4336                                 ref_list = rfa;
4337                                 MEXT_RFA(m) = NULL;
4338                         } else if (refcnt == 0 && composite) {
4339                                 VERIFY(m->m_type != MT_FREE);
4340                                 /*
4341                                  * Amortize the costs of atomic operations
4342                                  * by doing them at the end, if possible.
4343                                  */
4344                                 if (m->m_type == MT_DATA)
4345                                         mt_data++;
4346                                 else if (m->m_type == MT_HEADER)
4347                                         mt_header++;
4348                                 else if (m->m_type == MT_SONAME)
4349                                         mt_soname++;
4350                                 else if (m->m_type == MT_TAG)
4351                                         mt_tag++;
4352                                 else
4353                                         mtype_stat_dec(m->m_type);
4354
4355                                 m->m_type = MT_FREE;
4356                                 m->m_flags = M_EXT;
4357                                 m->m_len = 0;
4358                                 m->m_next = m->m_nextpkt = NULL;
4359
4360                                 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4361
4362                                 /* "Free" into the intermediate cache */
4363                                 o = (mcache_obj_t *)m;
4364                                 if (m->m_ext.ext_free == NULL) {
4365                                         o->obj_next = m_mcl_list;
4366                                         m_mcl_list = o;
4367                                 } else if (m->m_ext.ext_free == m_bigfree) {
4368                                         o->obj_next = m_mbc_list;
4369                                         m_mbc_list = o;
4370                                 } else {
4371                                         VERIFY(m->m_ext.ext_free == m_16kfree);
4372                                         o->obj_next = m_m16k_list;
4373                                         m_m16k_list = o;
4374                                 }
4375                                 m = next;
4376                                 continue;
4377                         }
4378 simple_free:
4379                         /*
4380                          * Amortize the costs of atomic operations
4381                          * by doing them at the end, if possible.
4382                          */
4383                         if (m->m_type == MT_DATA)
4384                                 mt_data++;
4385                         else if (m->m_type == MT_HEADER)
4386                                 mt_header++;
4387                         else if (m->m_type == MT_SONAME)
4388                                 mt_soname++;
4389                         else if (m->m_type == MT_TAG)
4390                                 mt_tag++;
4391                         else if (m->m_type != MT_FREE)
4392                                 mtype_stat_dec(m->m_type);
4393
4394                         m->m_type = MT_FREE;
4395                         m->m_flags = m->m_len = 0;
4396                         m->m_next = m->m_nextpkt = NULL;
4397
4398                         ((mcache_obj_t *)m)->obj_next = mp_list;
4399                         mp_list = (mcache_obj_t *)m;
4400
4401                         m = next;
4402                 }
4403
4404                 m = nextpkt;
4405         }
4406
4407         if (mt_free > 0)
4408                 mtype_stat_add(MT_FREE, mt_free);
4409         if (mt_data > 0)
4410                 mtype_stat_sub(MT_DATA, mt_data);
4411         if (mt_header > 0)
4412                 mtype_stat_sub(MT_HEADER, mt_header);
4413         if (mt_soname > 0)
4414                 mtype_stat_sub(MT_SONAME, mt_soname);
4415         if (mt_tag > 0)
4416                 mtype_stat_sub(MT_TAG, mt_tag);
4417
4418         if (mp_list != NULL)
4419                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4420         if (mcl_list != NULL)
4421                 mcache_free_ext(m_cache(MC_CL), mcl_list);
4422         if (mbc_list != NULL)
4423                 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4424         if (m16k_list != NULL)
4425                 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4426         if (m_mcl_list != NULL)
4427                 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4428         if (m_mbc_list != NULL)
4429                 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4430         if (m_m16k_list != NULL)
4431                 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4432         if (ref_list != NULL)
4433                 mcache_free_ext(ref_cache, ref_list);
4434
4435         return (pktcount);
4436 }
4437
4438 void
4439 m_freem(struct mbuf *m)
4440 {
4441         while (m != NULL)
4442                 m = m_free(m);
4443 }
4444
4445 /*
4446  * Mbuffer utility routines.
4447  */
4448
4449 /*
4450  * Compute the amount of space available before the current start
4451  * of data in an mbuf.
4452  */
4453 int
4454 m_leadingspace(struct mbuf *m)
4455 {
4456         if (m->m_flags & M_EXT) {
4457                 if (MCLHASREFERENCE(m))
4458                         return (0);
4459                 return (m->m_data - m->m_ext.ext_buf);
4460         }
4461         if (m->m_flags & M_PKTHDR)
4462                 return (m->m_data - m->m_pktdat);
4463         return (m->m_data - m->m_dat);
4464 }
4465
4466 /*
4467  * Compute the amount of space available after the end of data in an mbuf.
4468  */
4469 int
4470 m_trailingspace(struct mbuf *m)
4471 {
4472         if (m->m_flags & M_EXT) {
4473                 if (MCLHASREFERENCE(m))
4474                         return (0);
4475                 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4476                     (m->m_data + m->m_len));
4477         }
4478         return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4479 }
4480
4481 /*
4482  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4483  * copy junk along.  Does not adjust packet header length.
4484  */
4485 struct mbuf *
4486 m_prepend(struct mbuf *m, int len, int how)
4487 {
4488         struct mbuf *mn;
4489
4490         _MGET(mn, how, m->m_type);
4491         if (mn == NULL) {
4492                 m_freem(m);
4493                 return (NULL);
4494         }
4495         if (m->m_flags & M_PKTHDR) {
4496                 M_COPY_PKTHDR(mn, m);
4497                 m->m_flags &= ~M_PKTHDR;
4498         }
4499         mn->m_next = m;
4500         m = mn;
4501         if (len < MHLEN)
4502                 MH_ALIGN(m, len);
4503         m->m_len = len;
4504         return (m);
4505 }
4506
4507 /*
4508  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4509  * chain, copy junk along, and adjust length.
4510  */
4511 struct mbuf *
4512 m_prepend_2(struct mbuf *m, int len, int how)
4513 {
4514         if (M_LEADINGSPACE(m) >= len) {
4515                 m->m_data -= len;
4516                 m->m_len += len;
4517         } else {
4518                 m = m_prepend(m, len, how);
4519         }
4520         if ((m) && (m->m_flags & M_PKTHDR))
4521                 m->m_pkthdr.len += len;
4522         return (m);
4523 }
4524
4525 /*
4526  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4527  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4528  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4529  */
4530 int MCFail;
4531
4532 struct mbuf *
4533 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4534 {
4535         struct mbuf *n, *mhdr = NULL, **np;
4536         int off = off0;
4537         struct mbuf *top;
4538         int copyhdr = 0;
4539
4540         if (off < 0 || len < 0)
4541                 panic("m_copym: invalid offset %d or len %d", off, len);
4542
4543         VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4544             mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4545
4546         if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4547             mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4548                 mhdr = m;
4549                 copyhdr = 1;
4550         }
4551
4552         while (off >= m->m_len) {
4553                 if (m->m_next == NULL)
4554                         panic("m_copym: invalid mbuf chain");
4555                 off -= m->m_len;
4556                 m = m->m_next;
4557         }
4558         np = &top;
4559         top = NULL;
4560
4561         while (len > 0) {
4562                 if (m == NULL) {
4563                         if (len != M_COPYALL)
4564                                 panic("m_copym: len != M_COPYALL");
4565                         break;
4566                 }
4567
4568                 if (copyhdr)
4569                         n = _M_RETRYHDR(wait, m->m_type);
4570                 else
4571                         n = _M_RETRY(wait, m->m_type);
4572                 *np = n;
4573
4574                 if (n == NULL)
4575                         goto nospace;
4576
4577                 if (copyhdr != 0) {
4578                         if ((mode == M_COPYM_MOVE_HDR) ||
4579                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4580                                 M_COPY_PKTHDR(n, mhdr);
4581                         } else if ((mode == M_COPYM_COPY_HDR) ||
4582                             (mode == M_COPYM_MUST_COPY_HDR)) {
4583                                 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4584                                         goto nospace;
4585                         }
4586                         if (len == M_COPYALL)
4587                                 n->m_pkthdr.len -= off0;
4588                         else
4589                                 n->m_pkthdr.len = len;
4590                         copyhdr = 0;
4591                         /*
4592                          * There is data to copy from the packet header mbuf
4593                          * if it is empty or it is before the starting offset
4594                          */
4595                         if (mhdr != m) {
4596                                 np = &n->m_next;
4597                                 continue;
4598                         }
4599                 }
4600                 n->m_len = MIN(len, (m->m_len - off));
4601                 if (m->m_flags & M_EXT) {
4602                         n->m_ext = m->m_ext;
4603                         m_incref(m);
4604                         n->m_data = m->m_data + off;
4605                         n->m_flags |= M_EXT;
4606                 } else {
4607                         /*
4608                          * Limit to the capacity of the destination
4609                          */
4610                         if (n->m_flags & M_PKTHDR)
4611                                 n->m_len = MIN(n->m_len, MHLEN);
4612                         else
4613                                 n->m_len = MIN(n->m_len, MLEN);
4614
4615                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4616                                 panic("%s n %p copy overflow",
4617                                         __func__, n);
4618
4619                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4620                             (unsigned)n->m_len);
4621                 }
4622                 if (len != M_COPYALL)
4623                         len -= n->m_len;
4624                 off = 0;
4625                 m = m->m_next;
4626                 np = &n->m_next;
4627         }
4628
4629         if (top == NULL)
4630                 MCFail++;
4631
4632         return (top);
4633 nospace:
4634
4635         m_freem(top);
4636         MCFail++;
4637         return (NULL);
4638 }
4639
4640
4641 struct mbuf *
4642 m_copym(struct mbuf *m, int off0, int len, int wait)
4643 {
4644         return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4645 }
4646
4647 /*
4648  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4649  * within this routine also, the last mbuf and offset accessed are passed
4650  * out and can be passed back in to avoid having to rescan the entire mbuf
4651  * list (normally hung off of the socket)
4652  */
4653 struct mbuf *
4654 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
4655     struct mbuf **m_lastm, int *m_off, uint32_t mode)
4656 {
4657         struct mbuf *m = m0, *n, **np = NULL;
4658         int off = off0, len = len0;
4659         struct mbuf *top = NULL;
4660         int mcflags = MSLEEPF(wait);
4661         int copyhdr = 0;
4662         int type = 0;
4663         mcache_obj_t *list = NULL;
4664         int needed = 0;
4665
4666         if (off == 0 && (m->m_flags & M_PKTHDR))
4667                 copyhdr = 1;
4668
4669         if (m_lastm != NULL && *m_lastm != NULL) {
4670                 m = *m_lastm;
4671                 off = *m_off;
4672         } else {
4673                 while (off >= m->m_len) {
4674                         off -= m->m_len;
4675                         m = m->m_next;
4676                 }
4677         }
4678
4679         n = m;
4680         while (len > 0) {
4681                 needed++;
4682                 ASSERT(n != NULL);
4683                 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4684                 n = n->m_next;
4685         }
4686         needed++;
4687         len = len0;
4688
4689         /*
4690          * If the caller doesn't want to be put to sleep, mark it with
4691          * MCR_TRYHARD so that we may reclaim buffers from other places
4692          * before giving up.
4693          */
4694         if (mcflags & MCR_NOSLEEP)
4695                 mcflags |= MCR_TRYHARD;
4696
4697         if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4698             mcflags) != needed)
4699                 goto nospace;
4700
4701         needed = 0;
4702         while (len > 0) {
4703                 n = (struct mbuf *)list;
4704                 list = list->obj_next;
4705                 ASSERT(n != NULL && m != NULL);
4706
4707                 type = (top == NULL) ? MT_HEADER : m->m_type;
4708                 MBUF_INIT(n, (top == NULL), type);
4709 #if CONFIG_MACF_NET
4710                 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4711                         mtype_stat_inc(MT_HEADER);
4712                         mtype_stat_dec(MT_FREE);
4713                         m_free(n);
4714                         goto nospace;
4715                 }
4716 #endif /* MAC_NET */
4717
4718                 if (top == NULL) {
4719                         top = n;
4720                         np = &top->m_next;
4721                         continue;
4722                 } else {
4723                         needed++;
4724                         *np = n;
4725                 }
4726
4727                 if (copyhdr) {
4728                         if ((mode == M_COPYM_MOVE_HDR) ||
4729                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4730                                 M_COPY_PKTHDR(n, m);
4731                         } else if ((mode == M_COPYM_COPY_HDR) ||
4732                             (mode == M_COPYM_MUST_COPY_HDR)) {
4733                                 if (m_dup_pkthdr(n, m, wait) == 0)
4734                                         goto nospace;
4735                         }
4736                         n->m_pkthdr.len = len;
4737                         copyhdr = 0;
4738                 }
4739                 n->m_len = MIN(len, (m->m_len - off));
4740
4741                 if (m->m_flags & M_EXT) {
4742                         n->m_ext = m->m_ext;
4743                         m_incref(m);
4744                         n->m_data = m->m_data + off;
4745                         n->m_flags |= M_EXT;
4746                 } else {
4747                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4748                                 panic("%s n %p copy overflow",
4749                                         __func__, n);
4750
4751                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4752                             (unsigned)n->m_len);
4753                 }
4754                 len -= n->m_len;
4755
4756                 if (len == 0) {
4757                         if (m_lastm != NULL && m_off != NULL) {
4758                                 if ((off + n->m_len) == m->m_len) {
4759                                         *m_lastm = m->m_next;
4760                                         *m_off  = 0;
4761                                 } else {
4762                                         *m_lastm = m;
4763                                         *m_off  = off + n->m_len;
4764                                 }
4765                         }
4766                         break;
4767                 }
4768                 off = 0;
4769                 m = m->m_next;
4770                 np = &n->m_next;
4771         }
4772
4773         mtype_stat_inc(MT_HEADER);
4774         mtype_stat_add(type, needed);
4775         mtype_stat_sub(MT_FREE, needed + 1);
4776
4777         ASSERT(list == NULL);
4778         return (top);
4779
4780 nospace:
4781         if (list != NULL)
4782                 mcache_free_ext(m_cache(MC_MBUF), list);
4783         if (top != NULL)
4784                 m_freem(top);
4785         MCFail++;
4786         return (NULL);
4787 }
4788
4789 /*
4790  * Copy data from an mbuf chain starting "off" bytes from the beginning,
4791  * continuing for "len" bytes, into the indicated buffer.
4792  */
4793 void
4794 m_copydata(struct mbuf *m, int off, int len, void *vp)
4795 {
4796         unsigned count;
4797         char *cp = vp;
4798
4799         if (off < 0 || len < 0)
4800                 panic("m_copydata: invalid offset %d or len %d", off, len);
4801
4802         while (off > 0) {
4803                 if (m == NULL)
4804                         panic("m_copydata: invalid mbuf chain");
4805                 if (off < m->m_len)
4806                         break;
4807                 off -= m->m_len;
4808                 m = m->m_next;
4809         }
4810         while (len > 0) {
4811                 if (m == NULL)
4812                         panic("m_copydata: invalid mbuf chain");
4813                 count = MIN(m->m_len - off, len);
4814                 bcopy(MTOD(m, caddr_t) + off, cp, count);
4815                 len -= count;
4816                 cp += count;
4817                 off = 0;
4818                 m = m->m_next;
4819         }
4820 }
4821
4822 /*
4823  * Concatenate mbuf chain n to m.  Both chains must be of the same type
4824  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
4825  */
4826 void
4827 m_cat(struct mbuf *m, struct mbuf *n)
4828 {
4829         while (m->m_next)
4830                 m = m->m_next;
4831         while (n) {
4832                 if ((m->m_flags & M_EXT) ||
4833                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4834                         /* just join the two chains */
4835                         m->m_next = n;
4836                         return;
4837                 }
4838                 /* splat the data from one into the other */
4839                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4840                     (u_int)n->m_len);
4841                 m->m_len += n->m_len;
4842                 n = m_free(n);
4843         }
4844 }
4845
4846 void
4847 m_adj(struct mbuf *mp, int req_len)
4848 {
4849         int len = req_len;
4850         struct mbuf *m;
4851         int count;
4852
4853         if ((m = mp) == NULL)
4854                 return;
4855         if (len >= 0) {
4856                 /*
4857                  * Trim from head.
4858                  */
4859                 while (m != NULL && len > 0) {
4860                         if (m->m_len <= len) {
4861                                 len -= m->m_len;
4862                                 m->m_len = 0;
4863                                 m = m->m_next;
4864                         } else {
4865                                 m->m_len -= len;
4866                                 m->m_data += len;
4867                                 len = 0;
4868                         }
4869                 }
4870                 m = mp;
4871                 if (m->m_flags & M_PKTHDR)
4872                         m->m_pkthdr.len -= (req_len - len);
4873         } else {
4874                 /*
4875                  * Trim from tail.  Scan the mbuf chain,
4876                  * calculating its length and finding the last mbuf.
4877                  * If the adjustment only affects this mbuf, then just
4878                  * adjust and return.  Otherwise, rescan and truncate
4879                  * after the remaining size.
4880                  */
4881                 len = -len;
4882                 count = 0;
4883                 for (;;) {
4884                         count += m->m_len;
4885                         if (m->m_next == (struct mbuf *)0)
4886                                 break;
4887                         m = m->m_next;
4888                 }
4889                 if (m->m_len >= len) {
4890                         m->m_len -= len;
4891                         m = mp;
4892                         if (m->m_flags & M_PKTHDR)
4893                                 m->m_pkthdr.len -= len;
4894                         return;
4895                 }
4896                 count -= len;
4897                 if (count < 0)
4898                         count = 0;
4899                 /*
4900                  * Correct length for chain is "count".
4901                  * Find the mbuf with last data, adjust its length,
4902                  * and toss data from remaining mbufs on chain.
4903                  */
4904                 m = mp;
4905                 if (m->m_flags & M_PKTHDR)
4906                         m->m_pkthdr.len = count;
4907                 for (; m; m = m->m_next) {
4908                         if (m->m_len >= count) {
4909                                 m->m_len = count;
4910                                 break;
4911                         }
4912                         count -= m->m_len;
4913                 }
4914                 while ((m = m->m_next))
4915                         m->m_len = 0;
4916         }
4917 }
4918
4919 /*
4920  * Rearange an mbuf chain so that len bytes are contiguous
4921  * and in the data area of an mbuf (so that mtod and dtom
4922  * will work for a structure of size len).  Returns the resulting
4923  * mbuf chain on success, frees it and returns null on failure.
4924  * If there is room, it will add up to max_protohdr-len extra bytes to the
4925  * contiguous region in an attempt to avoid being called next time.
4926  */
4927 int MPFail;
4928
4929 struct mbuf *
4930 m_pullup(struct mbuf *n, int len)
4931 {
4932         struct mbuf *m;
4933         int count;
4934         int space;
4935
4936         /*
4937          * If first mbuf has no cluster, and has room for len bytes
4938          * without shifting current data, pullup into it,
4939          * otherwise allocate a new mbuf to prepend to the chain.
4940          */
4941         if ((n->m_flags & M_EXT) == 0 &&
4942             n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4943                 if (n->m_len >= len)
4944                         return (n);
4945                 m = n;
4946                 n = n->m_next;
4947                 len -= m->m_len;
4948         } else {
4949                 if (len > MHLEN)
4950                         goto bad;
4951                 _MGET(m, M_DONTWAIT, n->m_type);
4952                 if (m == 0)
4953                         goto bad;
4954                 m->m_len = 0;
4955                 if (n->m_flags & M_PKTHDR) {
4956                         M_COPY_PKTHDR(m, n);
4957                         n->m_flags &= ~M_PKTHDR;
4958                 }
4959         }
4960         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4961         do {
4962                 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4963                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4964                     (unsigned)count);
4965                 len -= count;
4966                 m->m_len += count;
4967                 n->m_len -= count;
4968                 space -= count;
4969                 if (n->m_len)
4970                         n->m_data += count;
4971                 else
4972                         n = m_free(n);
4973         } while (len > 0 && n);
4974         if (len > 0) {
4975                 (void) m_free(m);
4976                 goto bad;
4977         }
4978         m->m_next = n;
4979         return (m);
4980 bad:
4981         m_freem(n);
4982         MPFail++;
4983         return (0);
4984 }
4985
4986 /*
4987  * Like m_pullup(), except a new mbuf is always allocated, and we allow
4988  * the amount of empty space before the data in the new mbuf to be specified
4989  * (in the event that the caller expects to prepend later).
4990  */
4991 __private_extern__ int MSFail = 0;
4992
4993 __private_extern__ struct mbuf *
4994 m_copyup(struct mbuf *n, int len, int dstoff)
4995 {
4996         struct mbuf *m;
4997         int count, space;
4998
4999         if (len > (MHLEN - dstoff))
5000                 goto bad;
5001         MGET(m, M_DONTWAIT, n->m_type);
5002         if (m == NULL)
5003                 goto bad;
5004         m->m_len = 0;
5005         if (n->m_flags & M_PKTHDR) {
5006                 m_copy_pkthdr(m, n);
5007                 n->m_flags &= ~M_PKTHDR;
5008         }
5009         m->m_data += dstoff;
5010         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5011         do {
5012                 count = min(min(max(len, max_protohdr), space), n->m_len);
5013                 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5014                     (unsigned)count);
5015                 len -= count;
5016                 m->m_len += count;
5017                 n->m_len -= count;
5018                 space -= count;
5019                 if (n->m_len)
5020                         n->m_data += count;
5021                 else
5022                         n = m_free(n);
5023         } while (len > 0 && n);
5024         if (len > 0) {
5025                 (void) m_free(m);
5026                 goto bad;
5027         }
5028         m->m_next = n;
5029         return (m);
5030 bad:
5031         m_freem(n);
5032         MSFail++;
5033         return (NULL);
5034 }
5035
5036 /*
5037  * Partition an mbuf chain in two pieces, returning the tail --
5038  * all but the first len0 bytes.  In case of failure, it returns NULL and
5039  * attempts to restore the chain to its original state.
5040  */
5041 struct mbuf *
5042 m_split(struct mbuf *m0, int len0, int wait)
5043 {
5044         return (m_split0(m0, len0, wait, 1));
5045 }
5046
5047 static struct mbuf *
5048 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5049 {
5050         struct mbuf *m, *n;
5051         unsigned len = len0, remain;
5052
5053         for (m = m0; m && len > m->m_len; m = m->m_next)
5054                 len -= m->m_len;
5055         if (m == NULL)
5056                 return (NULL);
5057         remain = m->m_len - len;
5058         if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5059                 _MGETHDR(n, wait, m0->m_type);
5060                 if (n == NULL)
5061                         return (NULL);
5062                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5063                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5064                 m0->m_pkthdr.len = len0;
5065                 if (m->m_flags & M_EXT)
5066                         goto extpacket;
5067                 if (remain > MHLEN) {
5068                         /* m can't be the lead packet */
5069                         MH_ALIGN(n, 0);
5070                         n->m_next = m_split(m, len, wait);
5071                         if (n->m_next == NULL) {
5072                                 (void) m_free(n);
5073                                 return (NULL);
5074                         } else
5075                                 return (n);
5076                 } else
5077                         MH_ALIGN(n, remain);
5078         } else if (remain == 0) {
5079                 n = m->m_next;
5080                 m->m_next = NULL;
5081                 return (n);
5082         } else {
5083                 _MGET(n, wait, m->m_type);
5084                 if (n == NULL)
5085                         return (NULL);
5086                 M_ALIGN(n, remain);
5087         }
5088 extpacket:
5089         if (m->m_flags & M_EXT) {
5090                 n->m_flags |= M_EXT;
5091                 n->m_ext = m->m_ext;
5092                 m_incref(m);
5093                 n->m_data = m->m_data + len;
5094         } else {
5095                 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5096         }
5097         n->m_len = remain;
5098         m->m_len = len;
5099         n->m_next = m->m_next;
5100         m->m_next = NULL;
5101         return (n);
5102 }
5103
5104 /*
5105  * Routine to copy from device local memory into mbufs.
5106  */
5107 struct mbuf *
5108 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5109     void (*copy)(const void *, void *, size_t))
5110 {
5111         struct mbuf *m;
5112         struct mbuf *top = NULL, **mp = &top;
5113         int off = off0, len;
5114         char *cp;
5115         char *epkt;
5116
5117         cp = buf;
5118         epkt = cp + totlen;
5119         if (off) {
5120                 /*
5121                  * If 'off' is non-zero, packet is trailer-encapsulated,
5122                  * so we have to skip the type and length fields.
5123                  */
5124                 cp += off + 2 * sizeof (u_int16_t);
5125                 totlen -= 2 * sizeof (u_int16_t);
5126         }
5127         _MGETHDR(m, M_DONTWAIT, MT_DATA);
5128         if (m == NULL)
5129                 return (NULL);
5130         m->m_pkthdr.rcvif = ifp;
5131         m->m_pkthdr.len = totlen;
5132         m->m_len = MHLEN;
5133
5134         while (totlen > 0) {
5135                 if (top != NULL) {
5136                         _MGET(m, M_DONTWAIT, MT_DATA);
5137                         if (m == NULL) {
5138                                 m_freem(top);
5139                                 return (NULL);
5140                         }
5141                         m->m_len = MLEN;
5142                 }
5143                 len = MIN(totlen, epkt - cp);
5144                 if (len >= MINCLSIZE) {
5145                         MCLGET(m, M_DONTWAIT);
5146                         if (m->m_flags & M_EXT) {
5147                                 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5148                         } else {
5149                                 /* give up when it's out of cluster mbufs */
5150                                 if (top != NULL)
5151                                         m_freem(top);
5152                                 m_freem(m);
5153                                 return (NULL);
5154                         }
5155                 } else {
5156                         /*
5157                          * Place initial small packet/header at end of mbuf.
5158                          */
5159                         if (len < m->m_len) {
5160                                 if (top == NULL &&
5161                                     len + max_linkhdr <= m->m_len)
5162                                         m->m_data += max_linkhdr;
5163                                 m->m_len = len;
5164                         } else {
5165                                 len = m->m_len;
5166                         }
5167                 }
5168                 if (copy)
5169                         copy(cp, MTOD(m, caddr_t), (unsigned)len);
5170                 else
5171                         bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5172                 cp += len;
5173                 *mp = m;
5174                 mp = &m->m_next;
5175                 totlen -= len;
5176                 if (cp == epkt)
5177                         cp = buf;
5178         }
5179         return (top);
5180 }
5181
5182 #ifndef MBUF_GROWTH_NORMAL_THRESH
5183 #define MBUF_GROWTH_NORMAL_THRESH 25
5184 #endif
5185
5186 /*
5187  * Cluster freelist allocation check.
5188  */
5189 static int
5190 m_howmany(int num, size_t bufsize)
5191 {
5192         int i = 0, j = 0;
5193         u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5194         u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5195         u_int32_t sumclusters, freeclusters;
5196         u_int32_t percent_pool, percent_kmem;
5197         u_int32_t mb_growth, mb_growth_thresh;
5198
5199         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5200             bufsize == m_maxsize(MC_16KCL));
5201
5202         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5203
5204         /* Numbers in 2K cluster units */
5205         m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5206         m_clusters = m_total(MC_CL);
5207         m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5208         m_16kclusters = m_total(MC_16KCL);
5209         sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5210
5211         m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5212         m_clfree = m_infree(MC_CL);
5213         m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5214         m_16kclfree = m_infree(MC_16KCL);
5215         freeclusters = m_mbfree + m_clfree + m_bigclfree;
5216
5217         /* Bail if we've maxed out the mbuf memory map */
5218         if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5219             (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5220             (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5221                 return (0);
5222         }
5223
5224         if (bufsize == m_maxsize(MC_BIGCL)) {
5225                 /* Under minimum */
5226                 if (m_bigclusters < m_minlimit(MC_BIGCL))
5227                         return (m_minlimit(MC_BIGCL) - m_bigclusters);
5228
5229                 percent_pool =
5230                     ((sumclusters - freeclusters) * 100) / sumclusters;
5231                 percent_kmem = (sumclusters * 100) / nclusters;
5232
5233                 /*
5234                  * If a light/normal user, grow conservatively (75%)
5235                  * If a heavy user, grow aggressively (50%)
5236                  */
5237                 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5238                         mb_growth = MB_GROWTH_NORMAL;
5239                 else
5240                         mb_growth = MB_GROWTH_AGGRESSIVE;
5241
5242                 if (percent_kmem < 5) {
5243                         /* For initial allocations */
5244                         i = num;
5245                 } else {
5246                         /* Return if >= MBIGCL_LOWAT clusters available */
5247                         if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5248                             m_total(MC_BIGCL) >=
5249                             MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5250                                 return (0);
5251
5252                         /* Ensure at least num clusters are accessible */
5253                         if (num >= m_infree(MC_BIGCL))
5254                                 i = num - m_infree(MC_BIGCL);
5255                         if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5256                                 j = num - (m_total(MC_BIGCL) -
5257                                     m_minlimit(MC_BIGCL));
5258
5259                         i = MAX(i, j);
5260
5261                         /*
5262                          * Grow pool if percent_pool > 75 (normal growth)
5263                          * or percent_pool > 50 (aggressive growth).
5264                          */
5265                         mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5266                         if (percent_pool > mb_growth_thresh)
5267                                 j = ((sumclusters + num) >> mb_growth) -
5268                                     freeclusters;
5269                         i = MAX(i, j);
5270                 }
5271
5272                 /* Check to ensure we didn't go over limits */
5273                 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5274                         i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5275                 if ((i << 1) + sumclusters >= nclusters)
5276                         i = (nclusters - sumclusters) >> 1;
5277                 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5278                 VERIFY(sumclusters + (i << 1) <= nclusters);
5279
5280         } else { /* 16K CL */
5281                 VERIFY(njcl > 0);
5282                 /* Under minimum */
5283                 if (m_16kclusters < MIN16KCL)
5284                         return (MIN16KCL - m_16kclusters);
5285                 if (m_16kclfree >= M16KCL_LOWAT)
5286                         return (0);
5287
5288                 /* Ensure at least num clusters are available */
5289                 if (num >= m_16kclfree)
5290                         i = num - m_16kclfree;
5291
5292                 /* Always grow 16KCL pool aggressively */
5293                 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5294                         j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5295                 i = MAX(i, j);
5296
5297                 /* Check to ensure we don't go over limit */
5298                 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5299                         i = m_maxlimit(MC_16KCL) - m_16kclusters;
5300                 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5301         }
5302         return (i);
5303 }
5304 /*
5305  * Return the number of bytes in the mbuf chain, m.
5306  */
5307 unsigned int
5308 m_length(struct mbuf *m)
5309 {
5310         struct mbuf *m0;
5311         unsigned int pktlen;
5312
5313         if (m->m_flags & M_PKTHDR)
5314                 return (m->m_pkthdr.len);
5315
5316         pktlen = 0;
5317         for (m0 = m; m0 != NULL; m0 = m0->m_next)
5318                 pktlen += m0->m_len;
5319         return (pktlen);
5320 }
5321
5322 /*
5323  * Copy data from a buffer back into the indicated mbuf chain,
5324  * starting "off" bytes from the beginning, extending the mbuf
5325  * chain if necessary.
5326  */
5327 void
5328 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5329 {
5330 #if DEBUG
5331         struct mbuf *origm = m0;
5332         int error;
5333 #endif /* DEBUG */
5334
5335         if (m0 == NULL)
5336                 return;
5337
5338 #if DEBUG
5339         error =
5340 #endif /* DEBUG */
5341         m_copyback0(&m0, off, len, cp,
5342             M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5343
5344 #if DEBUG
5345         if (error != 0 || (m0 != NULL && origm != m0))
5346                 panic("m_copyback");
5347 #endif /* DEBUG */
5348 }
5349
5350 struct mbuf *
5351 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5352 {
5353         int error;
5354
5355         /* don't support chain expansion */
5356         VERIFY(off + len <= m_length(m0));
5357
5358         error = m_copyback0(&m0, off, len, cp,
5359             M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5360         if (error) {
5361                 /*
5362                  * no way to recover from partial success.
5363                  * just free the chain.
5364                  */
5365                 m_freem(m0);
5366                 return (NULL);
5367         }
5368         return (m0);
5369 }
5370
5371 /*
5372  * m_makewritable: ensure the specified range writable.
5373  */
5374 int
5375 m_makewritable(struct mbuf **mp, int off, int len, int how)
5376 {
5377         int error;
5378 #if DEBUG
5379         struct mbuf *n;
5380         int origlen, reslen;
5381
5382         origlen = m_length(*mp);
5383 #endif /* DEBUG */
5384
5385 #if 0 /* M_COPYALL is large enough */
5386         if (len == M_COPYALL)
5387                 len = m_length(*mp) - off; /* XXX */
5388 #endif
5389
5390         error = m_copyback0(mp, off, len, NULL,
5391             M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5392
5393 #if DEBUG
5394         reslen = 0;
5395         for (n = *mp; n; n = n->m_next)
5396                 reslen += n->m_len;
5397         if (origlen != reslen)
5398                 panic("m_makewritable: length changed");
5399         if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5400                 panic("m_makewritable: inconsist");
5401 #endif /* DEBUG */
5402
5403         return (error);
5404 }
5405
5406 static int
5407 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5408     int how)
5409 {
5410         int mlen;
5411         struct mbuf *m, *n;
5412         struct mbuf **mp;
5413         int totlen = 0;
5414         const char *cp = vp;
5415
5416         VERIFY(mp0 != NULL);
5417         VERIFY(*mp0 != NULL);
5418         VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5419         VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5420
5421         /*
5422          * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5423          * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5424          */
5425
5426         VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5427
5428         mp = mp0;
5429         m = *mp;
5430         while (off > (mlen = m->m_len)) {
5431                 off -= mlen;
5432                 totlen += mlen;
5433                 if (m->m_next == NULL) {
5434                         int tspace;
5435 extend:
5436                         if (!(flags & M_COPYBACK0_EXTEND))
5437                                 goto out;
5438
5439                         /*
5440                          * try to make some space at the end of "m".
5441                          */
5442
5443                         mlen = m->m_len;
5444                         if (off + len >= MINCLSIZE &&
5445                             !(m->m_flags & M_EXT) && m->m_len == 0) {
5446                                 MCLGET(m, how);
5447                         }
5448                         tspace = M_TRAILINGSPACE(m);
5449                         if (tspace > 0) {
5450                                 tspace = MIN(tspace, off + len);
5451                                 VERIFY(tspace > 0);
5452                                 bzero(mtod(m, char *) + m->m_len,
5453                                     MIN(off, tspace));
5454                                 m->m_len += tspace;
5455                                 off += mlen;
5456                                 totlen -= mlen;
5457                                 continue;
5458                         }
5459
5460                         /*
5461                          * need to allocate an mbuf.
5462                          */
5463
5464                         if (off + len >= MINCLSIZE) {
5465                                 n = m_getcl(how, m->m_type, 0);
5466                         } else {
5467                                 n = _M_GET(how, m->m_type);
5468                         }
5469                         if (n == NULL) {
5470                                 goto out;
5471                         }
5472                         n->m_len = 0;
5473                         n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5474                         bzero(mtod(n, char *), MIN(n->m_len, off));
5475                         m->m_next = n;
5476                 }
5477                 mp = &m->m_next;
5478                 m = m->m_next;
5479         }
5480         while (len > 0) {
5481                 mlen = m->m_len - off;
5482                 if (mlen != 0 && m_mclhasreference(m)) {
5483                         char *datap;
5484                         int eatlen;
5485
5486                         /*
5487                          * this mbuf is read-only.
5488                          * allocate a new writable mbuf and try again.
5489                          */
5490
5491 #if DIAGNOSTIC
5492                         if (!(flags & M_COPYBACK0_COW))
5493                                 panic("m_copyback0: read-only");
5494 #endif /* DIAGNOSTIC */
5495
5496                         /*
5497                          * if we're going to write into the middle of
5498                          * a mbuf, split it first.
5499                          */
5500                         if (off > 0 && len < mlen) {
5501                                 n = m_split0(m, off, how, 0);
5502                                 if (n == NULL)
5503                                         goto enobufs;
5504                                 m->m_next = n;
5505                                 mp = &m->m_next;
5506                                 m = n;
5507                                 off = 0;
5508                                 continue;
5509                         }
5510
5511                         /*
5512                          * XXX TODO coalesce into the trailingspace of
5513                          * the previous mbuf when possible.
5514                          */
5515
5516                         /*
5517                          * allocate a new mbuf.  copy packet header if needed.
5518                          */
5519                         n = _M_GET(how, m->m_type);
5520                         if (n == NULL)
5521                                 goto enobufs;
5522                         if (off == 0 && (m->m_flags & M_PKTHDR)) {
5523                                 M_COPY_PKTHDR(n, m);
5524                                 n->m_len = MHLEN;
5525                         } else {
5526                                 if (len >= MINCLSIZE)
5527                                         MCLGET(n, M_DONTWAIT);
5528                                 n->m_len =
5529                                     (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5530                         }
5531                         if (n->m_len > len)
5532                                 n->m_len = len;
5533
5534                         /*
5535                          * free the region which has been overwritten.
5536                          * copying data from old mbufs if requested.
5537                          */
5538                         if (flags & M_COPYBACK0_PRESERVE)
5539                                 datap = mtod(n, char *);
5540                         else
5541                                 datap = NULL;
5542                         eatlen = n->m_len;
5543                         VERIFY(off == 0 || eatlen >= mlen);
5544                         if (off > 0) {
5545                                 VERIFY(len >= mlen);
5546                                 m->m_len = off;
5547                                 m->m_next = n;
5548                                 if (datap) {
5549                                         m_copydata(m, off, mlen, datap);
5550                                         datap += mlen;
5551                                 }
5552                                 eatlen -= mlen;
5553                                 mp = &m->m_next;
5554                                 m = m->m_next;
5555                         }
5556                         while (m != NULL && m_mclhasreference(m) &&
5557                             n->m_type == m->m_type && eatlen > 0) {
5558                                 mlen = MIN(eatlen, m->m_len);
5559                                 if (datap) {
5560                                         m_copydata(m, 0, mlen, datap);
5561                                         datap += mlen;
5562                                 }
5563                                 m->m_data += mlen;
5564                                 m->m_len -= mlen;
5565                                 eatlen -= mlen;
5566                                 if (m->m_len == 0)
5567                                         *mp = m = m_free(m);
5568                         }
5569                         if (eatlen > 0)
5570                                 n->m_len -= eatlen;
5571                         n->m_next = m;
5572                         *mp = m = n;
5573                         continue;
5574                 }
5575                 mlen = MIN(mlen, len);
5576                 if (flags & M_COPYBACK0_COPYBACK) {
5577                         bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5578                         cp += mlen;
5579                 }
5580                 len -= mlen;
5581                 mlen += off;
5582                 off = 0;
5583                 totlen += mlen;
5584                 if (len == 0)
5585                         break;
5586                 if (m->m_next == NULL) {
5587                         goto extend;
5588                 }
5589                 mp = &m->m_next;
5590                 m = m->m_next;
5591         }
5592 out:
5593         if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5594                 VERIFY(flags & M_COPYBACK0_EXTEND);
5595                 m->m_pkthdr.len = totlen;
5596         }
5597
5598         return (0);
5599
5600 enobufs:
5601         return (ENOBUFS);
5602 }
5603
5604 uint64_t
5605 mcl_to_paddr(char *addr)
5606 {
5607         vm_offset_t base_phys;
5608
5609         if (!MBUF_IN_MAP(addr))
5610                 return (0);
5611         base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5612
5613         if (base_phys == 0)
5614                 return (0);
5615         return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5616 }
5617
5618 /*
5619  * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5620  * And really copy the thing.  That way, we don't "precompute" checksums
5621  * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
5622  * small packets, don't dup into a cluster.  That way received  packets
5623  * don't take up too much room in the sockbuf (cf. sbspace()).
5624  */
5625 int MDFail;
5626
5627 struct mbuf *
5628 m_dup(struct mbuf *m, int how)
5629 {
5630         struct mbuf *n, **np;
5631         struct mbuf *top;
5632         int copyhdr = 0;
5633
5634         np = &top;
5635         top = NULL;
5636         if (m->m_flags & M_PKTHDR)
5637                 copyhdr = 1;
5638
5639         /*
5640          * Quick check: if we have one mbuf and its data fits in an
5641          *  mbuf with packet header, just copy and go.
5642          */
5643         if (m->m_next == NULL) {
5644                 /* Then just move the data into an mbuf and be done... */
5645                 if (copyhdr) {
5646                         if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5647                                 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5648                                         return (NULL);
5649                                 n->m_len = m->m_len;
5650                                 m_dup_pkthdr(n, m, how);
5651                                 bcopy(m->m_data, n->m_data, m->m_len);
5652                                 return (n);
5653                         }
5654                 } else if (m->m_len <= MLEN) {
5655                         if ((n = _M_GET(how, m->m_type)) == NULL)
5656                                 return (NULL);
5657                         bcopy(m->m_data, n->m_data, m->m_len);
5658                         n->m_len = m->m_len;
5659                         return (n);
5660                 }
5661         }
5662         while (m != NULL) {
5663 #if BLUE_DEBUG
5664                 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5665                     m->m_data);
5666 #endif
5667                 if (copyhdr)
5668                         n = _M_GETHDR(how, m->m_type);
5669                 else
5670                         n = _M_GET(how, m->m_type);
5671                 if (n == NULL)
5672                         goto nospace;
5673                 if (m->m_flags & M_EXT) {
5674                         if (m->m_len <= m_maxsize(MC_CL))
5675                                 MCLGET(n, how);
5676                         else if (m->m_len <= m_maxsize(MC_BIGCL))
5677                                 n = m_mbigget(n, how);
5678                         else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5679                                 n = m_m16kget(n, how);
5680                         if (!(n->m_flags & M_EXT)) {
5681                                 (void) m_free(n);
5682                                 goto nospace;
5683                         }
5684                 }
5685                 *np = n;
5686                 if (copyhdr) {
5687                         /* Don't use M_COPY_PKTHDR: preserve m_data */
5688                         m_dup_pkthdr(n, m, how);
5689                         copyhdr = 0;
5690                         if (!(n->m_flags & M_EXT))
5691                                 n->m_data = n->m_pktdat;
5692                 }
5693                 n->m_len = m->m_len;
5694                 /*
5695                  * Get the dup on the same bdry as the original
5696                  * Assume that the two mbufs have the same offset to data area
5697                  * (up to word boundaries)
5698                  */
5699                 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5700                 m = m->m_next;
5701                 np = &n->m_next;
5702 #if BLUE_DEBUG
5703                 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5704                     n->m_data);
5705 #endif
5706         }
5707
5708         if (top == NULL)
5709                 MDFail++;
5710         return (top);
5711
5712 nospace:
5713         m_freem(top);
5714         MDFail++;
5715         return (NULL);
5716 }
5717
5718 #define MBUF_MULTIPAGES(m)                                              \
5719         (((m)->m_flags & M_EXT) &&                                      \
5720         ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||      \
5721         (!IS_P2ALIGNED((m)->m_data, NBPG) &&                            \
5722         P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5723
5724 static struct mbuf *
5725 m_expand(struct mbuf *m, struct mbuf **last)
5726 {
5727         struct mbuf *top = NULL;
5728         struct mbuf **nm = &top;
5729         uintptr_t data0, data;
5730         unsigned int len0, len;
5731
5732         VERIFY(MBUF_MULTIPAGES(m));
5733         VERIFY(m->m_next == NULL);
5734         data0 = (uintptr_t)m->m_data;
5735         len0 = m->m_len;
5736         *last = top;
5737
5738         for (;;) {
5739                 struct mbuf *n;
5740
5741                 data = data0;
5742                 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5743                         len = NBPG;
5744                 else if (!IS_P2ALIGNED(data, NBPG) &&
5745                     P2ROUNDUP(data, NBPG) < (data + len0))
5746                         len = P2ROUNDUP(data, NBPG) - data;
5747                 else
5748                         len = len0;
5749
5750                 VERIFY(len > 0);
5751                 VERIFY(m->m_flags & M_EXT);
5752                 m->m_data = (void *)data;
5753                 m->m_len = len;
5754
5755                 *nm = *last = m;
5756                 nm = &m->m_next;
5757                 m->m_next = NULL;
5758
5759                 data0 += len;
5760                 len0 -= len;
5761                 if (len0 == 0)
5762                         break;
5763
5764                 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5765                 if (n == NULL) {
5766                         m_freem(top);
5767                         top = *last = NULL;
5768                         break;
5769                 }
5770
5771                 n->m_ext = m->m_ext;
5772                 m_incref(m);
5773                 n->m_flags |= M_EXT;
5774                 m = n;
5775         }
5776         return (top);
5777 }
5778
5779 struct mbuf *
5780 m_normalize(struct mbuf *m)
5781 {
5782         struct mbuf *top = NULL;
5783         struct mbuf **nm = &top;
5784         boolean_t expanded = FALSE;
5785
5786         while (m != NULL) {
5787                 struct mbuf *n;
5788
5789                 n = m->m_next;
5790                 m->m_next = NULL;
5791
5792                 /* Does the data cross one or more page boundaries? */
5793                 if (MBUF_MULTIPAGES(m)) {
5794                         struct mbuf *last;
5795                         if ((m = m_expand(m, &last)) == NULL) {
5796                                 m_freem(n);
5797                                 m_freem(top);
5798                                 top = NULL;
5799                                 break;
5800                         }
5801                         *nm = m;
5802                         nm = &last->m_next;
5803                         expanded = TRUE;
5804                 } else {
5805                         *nm = m;
5806                         nm = &m->m_next;
5807                 }
5808                 m = n;
5809         }
5810         if (expanded)
5811                 atomic_add_32(&mb_normalized, 1);
5812         return (top);
5813 }
5814
5815 /*
5816  * Append the specified data to the indicated mbuf chain,
5817  * Extend the mbuf chain if the new data does not fit in
5818  * existing space.
5819  *
5820  * Return 1 if able to complete the job; otherwise 0.
5821  */
5822 int
5823 m_append(struct mbuf *m0, int len, caddr_t cp)
5824 {
5825         struct mbuf *m, *n;
5826         int remainder, space;
5827
5828         for (m = m0; m->m_next != NULL; m = m->m_next)
5829                 ;
5830         remainder = len;
5831         space = M_TRAILINGSPACE(m);
5832         if (space > 0) {
5833                 /*
5834                  * Copy into available space.
5835                  */
5836                 if (space > remainder)
5837                         space = remainder;
5838                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5839                 m->m_len += space;
5840                 cp += space, remainder -= space;
5841         }
5842         while (remainder > 0) {
5843                 /*
5844                  * Allocate a new mbuf; could check space
5845                  * and allocate a cluster instead.
5846                  */
5847                 n = m_get(M_WAITOK, m->m_type);
5848                 if (n == NULL)
5849                         break;
5850                 n->m_len = min(MLEN, remainder);
5851                 bcopy(cp, mtod(n, caddr_t), n->m_len);
5852                 cp += n->m_len;
5853                 remainder -= n->m_len;
5854                 m->m_next = n;
5855                 m = n;
5856         }
5857         if (m0->m_flags & M_PKTHDR)
5858                 m0->m_pkthdr.len += len - remainder;
5859         return (remainder == 0);
5860 }
5861
5862 struct mbuf *
5863 m_last(struct mbuf *m)
5864 {
5865         while (m->m_next != NULL)
5866                 m = m->m_next;
5867         return (m);
5868 }
5869
5870 unsigned int
5871 m_fixhdr(struct mbuf *m0)
5872 {
5873         u_int len;
5874
5875         VERIFY(m0->m_flags & M_PKTHDR);
5876
5877         len = m_length2(m0, NULL);
5878         m0->m_pkthdr.len = len;
5879         return (len);
5880 }
5881
5882 unsigned int
5883 m_length2(struct mbuf *m0, struct mbuf **last)
5884 {
5885         struct mbuf *m;
5886         u_int len;
5887
5888         len = 0;
5889         for (m = m0; m != NULL; m = m->m_next) {
5890                 len += m->m_len;
5891                 if (m->m_next == NULL)
5892                         break;
5893         }
5894         if (last != NULL)
5895                 *last = m;
5896         return (len);
5897 }
5898
5899 /*
5900  * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5901  * and clusters.  If allocation fails and this cannot be completed, NULL will
5902  * be returned, but the passed in chain will be unchanged.  Upon success,
5903  * the original chain will be freed, and the new chain will be returned.
5904  *
5905  * If a non-packet header is passed in, the original mbuf (chain?) will
5906  * be returned unharmed.
5907  *
5908  * If offset is specfied, the first mbuf in the chain will have a leading
5909  * space of the amount stated by the "off" parameter.
5910  *
5911  * This routine requires that the m_pkthdr.header field of the original
5912  * mbuf chain is cleared by the caller.
5913  */
5914 struct mbuf *
5915 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5916 {
5917         struct mbuf *m_new = NULL, *m_final = NULL;
5918         int progress = 0, length, pktlen;
5919
5920         if (!(m0->m_flags & M_PKTHDR))
5921                 return (m0);
5922
5923         VERIFY(off < MHLEN);
5924         m_fixhdr(m0); /* Needed sanity check */
5925
5926         pktlen = m0->m_pkthdr.len + off;
5927         if (pktlen > MHLEN)
5928                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5929         else
5930                 m_final = m_gethdr(how, MT_DATA);
5931
5932         if (m_final == NULL)
5933                 goto nospace;
5934
5935         if (off > 0) {
5936                 pktlen -= off;
5937                 m_final->m_data += off;
5938         }
5939
5940         /*
5941          * Caller must have handled the contents pointed to by this
5942          * pointer before coming here, as otherwise it will point to
5943          * the original mbuf which will get freed upon success.
5944          */
5945         VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
5946
5947         if (m_dup_pkthdr(m_final, m0, how) == 0)
5948                 goto nospace;
5949
5950         m_new = m_final;
5951
5952         while (progress < pktlen) {
5953                 length = pktlen - progress;
5954                 if (length > MCLBYTES)
5955                         length = MCLBYTES;
5956                 length -= ((m_new == m_final) ? off : 0);
5957
5958                 if (m_new == NULL) {
5959                         if (length > MLEN)
5960                                 m_new = m_getcl(how, MT_DATA, 0);
5961                         else
5962                                 m_new = m_get(how, MT_DATA);
5963                         if (m_new == NULL)
5964                                 goto nospace;
5965                 }
5966
5967                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5968                 progress += length;
5969                 m_new->m_len = length;
5970                 if (m_new != m_final)
5971                         m_cat(m_final, m_new);
5972                 m_new = NULL;
5973         }
5974         m_freem(m0);
5975         m0 = m_final;
5976         return (m0);
5977 nospace:
5978         if (m_final)
5979                 m_freem(m_final);
5980         return (NULL);
5981 }
5982
5983 struct mbuf *
5984 m_defrag(struct mbuf *m0, int how)
5985 {
5986         return (m_defrag_offset(m0, 0, how));
5987 }
5988
5989 void
5990 m_mchtype(struct mbuf *m, int t)
5991 {
5992         mtype_stat_inc(t);
5993         mtype_stat_dec(m->m_type);
5994         (m)->m_type = t;
5995 }
5996
5997 void *
5998 m_mtod(struct mbuf *m)
5999 {
6000         return (MTOD(m, void *));
6001 }
6002
6003 struct mbuf *
6004 m_dtom(void *x)
6005 {
6006         return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6007 }
6008
6009 void
6010 m_mcheck(struct mbuf *m)
6011 {
6012         _MCHECK(m);
6013 }
6014
6015 /*
6016  * Return a pointer to mbuf/offset of location in mbuf chain.
6017  */
6018 struct mbuf *
6019 m_getptr(struct mbuf *m, int loc, int *off)
6020 {
6021
6022         while (loc >= 0) {
6023                 /* Normal end of search. */
6024                 if (m->m_len > loc) {
6025                         *off = loc;
6026                         return (m);
6027                 } else {
6028                         loc -= m->m_len;
6029                         if (m->m_next == NULL) {
6030                                 if (loc == 0) {
6031                                         /* Point at the end of valid data. */
6032                                         *off = m->m_len;
6033                                         return (m);
6034                                 }
6035                                 return (NULL);
6036                         }
6037                         m = m->m_next;
6038                 }
6039         }
6040         return (NULL);
6041 }
6042
6043 /*
6044  * Inform the corresponding mcache(s) that there's a waiter below.
6045  */
6046 static void
6047 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6048 {
6049         mcache_waiter_inc(m_cache(class));
6050         if (comp) {
6051                 if (class == MC_CL) {
6052                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6053                 } else if (class == MC_BIGCL) {
6054                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6055                 } else if (class == MC_16KCL) {
6056                         mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6057                 } else {
6058                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6059                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6060                 }
6061         }
6062 }
6063
6064 /*
6065  * Inform the corresponding mcache(s) that there's no more waiter below.
6066  */
6067 static void
6068 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6069 {
6070         mcache_waiter_dec(m_cache(class));
6071         if (comp) {
6072                 if (class == MC_CL) {
6073                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6074                 } else if (class == MC_BIGCL) {
6075                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6076                 } else if (class == MC_16KCL) {
6077                         mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6078                 } else {
6079                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6080                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6081                 }
6082         }
6083 }
6084
6085 /*
6086  * Called during slab (blocking and non-blocking) allocation.  If there
6087  * is at least one waiter, and the time since the first waiter is blocked
6088  * is greater than the watchdog timeout, panic the system.
6089  */
6090 static void
6091 mbuf_watchdog(void)
6092 {
6093         struct timeval now;
6094         unsigned int since;
6095
6096         if (mb_waiters == 0 || !mb_watchdog)
6097                 return;
6098
6099         microuptime(&now);
6100         since = now.tv_sec - mb_wdtstart.tv_sec;
6101         if (since >= MB_WDT_MAXTIME) {
6102                 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6103                     mb_waiters, since, mbuf_dump());
6104                 /* NOTREACHED */
6105         }
6106 }
6107
6108 /*
6109  * Called during blocking allocation.  Returns TRUE if one or more objects
6110  * are available at the per-CPU caches layer and that allocation should be
6111  * retried at that level.
6112  */
6113 static boolean_t
6114 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6115 {
6116         boolean_t mcache_retry = FALSE;
6117
6118         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6119
6120         /* Check if there's anything at the cache layer */
6121         if (mbuf_cached_above(class, wait)) {
6122                 mcache_retry = TRUE;
6123                 goto done;
6124         }
6125
6126         /* Nothing?  Then try hard to get it from somewhere */
6127         m_reclaim(class, num, (wait & MCR_COMP));
6128
6129         /* We tried hard and got something? */
6130         if (m_infree(class) > 0) {
6131                 mbstat.m_wait++;
6132                 goto done;
6133         } else if (mbuf_cached_above(class, wait)) {
6134                 mbstat.m_wait++;
6135                 mcache_retry = TRUE;
6136                 goto done;
6137         } else if (wait & MCR_TRYHARD) {
6138                 mcache_retry = TRUE;
6139                 goto done;
6140         }
6141
6142         /*
6143          * There's really nothing for us right now; inform the
6144          * cache(s) that there is a waiter below and go to sleep.
6145          */
6146         mbuf_waiter_inc(class, (wait & MCR_COMP));
6147
6148         VERIFY(!(wait & MCR_NOSLEEP));
6149
6150         /*
6151          * If this is the first waiter, arm the watchdog timer.  Otherwise
6152          * check if we need to panic the system due to watchdog timeout.
6153          */
6154         if (mb_waiters == 0)
6155                 microuptime(&mb_wdtstart);
6156         else
6157                 mbuf_watchdog();
6158
6159         mb_waiters++;
6160         (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6161
6162         /* We are now up; stop getting notified until next round */
6163         mbuf_waiter_dec(class, (wait & MCR_COMP));
6164
6165         /* We waited and got something */
6166         if (m_infree(class) > 0) {
6167                 mbstat.m_wait++;
6168                 goto done;
6169         } else if (mbuf_cached_above(class, wait)) {
6170                 mbstat.m_wait++;
6171                 mcache_retry = TRUE;
6172         }
6173 done:
6174         return (mcache_retry);
6175 }
6176
6177 static void
6178 mbuf_worker_thread(void)
6179 {
6180         int mbuf_expand;
6181
6182         while (1) {
6183                 lck_mtx_lock(mbuf_mlock);
6184
6185                 mbuf_expand = 0;
6186                 if (mbuf_expand_mcl) {
6187                         int n;
6188
6189                         /* Adjust to current number of cluster in use */
6190                         n = mbuf_expand_mcl -
6191                             (m_total(MC_CL) - m_infree(MC_CL));
6192                         if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6193                                 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6194                         mbuf_expand_mcl = 0;
6195
6196                         if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6197                                 mbuf_expand++;
6198                 }
6199                 if (mbuf_expand_big) {
6200                         int n;
6201
6202                         /* Adjust to current number of 4 KB cluster in use */
6203                         n = mbuf_expand_big -
6204                             (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6205                         if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6206                                 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6207                         mbuf_expand_big = 0;
6208
6209                         if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6210                                 mbuf_expand++;
6211                 }
6212                 if (mbuf_expand_16k) {
6213                         int n;
6214
6215                         /* Adjust to current number of 16 KB cluster in use */
6216                         n = mbuf_expand_16k -
6217                             (m_total(MC_16KCL) - m_infree(MC_16KCL));
6218                         if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6219                                 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6220                         mbuf_expand_16k = 0;
6221
6222                         if (n > 0)
6223                                 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6224                 }
6225
6226                 /*
6227                  * Because we can run out of memory before filling the mbuf
6228                  * map, we should not allocate more clusters than they are
6229                  * mbufs -- otherwise we could have a large number of useless
6230                  * clusters allocated.
6231                  */
6232                 if (mbuf_expand) {
6233                         while (m_total(MC_MBUF) <
6234                             (m_total(MC_BIGCL) + m_total(MC_CL))) {
6235                                 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6236                                         break;
6237                         }
6238                 }
6239
6240                 lck_mtx_unlock(mbuf_mlock);
6241
6242                 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6243                 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6244         }
6245 }
6246
6247 static void
6248 mbuf_worker_thread_init(void)
6249 {
6250         mbuf_worker_ready++;
6251         mbuf_worker_thread();
6252 }
6253
6254 static mcl_slab_t *
6255 slab_get(void *buf)
6256 {
6257         mcl_slabg_t *slg;
6258         unsigned int ix, k;
6259
6260         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6261
6262         VERIFY(MBUF_IN_MAP(buf));
6263         ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6264         VERIFY(ix < maxslabgrp);
6265
6266         if ((slg = slabstbl[ix]) == NULL) {
6267                 /*
6268                  * In the current implementation, we never shrink the slabs
6269                  * table; if we attempt to reallocate a cluster group when
6270                  * it's already allocated, panic since this is a sign of a
6271                  * memory corruption (slabstbl[ix] got nullified).
6272                  */
6273                 ++slabgrp;
6274                 VERIFY(ix < slabgrp);
6275                 /*
6276                  * Slabs expansion can only be done single threaded; when
6277                  * we get here, it must be as a result of m_clalloc() which
6278                  * is serialized and therefore mb_clalloc_busy must be set.
6279                  */
6280                 VERIFY(mb_clalloc_busy);
6281                 lck_mtx_unlock(mbuf_mlock);
6282
6283                 /* This is a new buffer; create the slabs group for it */
6284                 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6285                     M_WAITOK | M_ZERO);
6286                 VERIFY(slg != NULL);
6287
6288                 lck_mtx_lock(mbuf_mlock);
6289                 /*
6290                  * No other thread could have gone into m_clalloc() after
6291                  * we dropped the lock above, so verify that it's true.
6292                  */
6293                 VERIFY(mb_clalloc_busy);
6294
6295                 slabstbl[ix] = slg;
6296
6297                 /* Chain each slab in the group to its forward neighbor */
6298                 for (k = 1; k < NSLABSPMB; k++)
6299                         slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6300                 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6301
6302                 /* And chain the last slab in the previous group to this */
6303                 if (ix > 0) {
6304                         VERIFY(slabstbl[ix - 1]->
6305                             slg_slab[NSLABSPMB - 1].sl_next == NULL);
6306                         slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6307                             &slg->slg_slab[0];
6308                 }
6309         }
6310
6311         ix = MTOBG(buf) % NSLABSPMB;
6312         VERIFY(ix < NSLABSPMB);
6313
6314         return (&slg->slg_slab[ix]);
6315 }
6316
6317 static void
6318 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6319     void *base, void *head, unsigned int len, int refcnt, int chunks)
6320 {
6321         sp->sl_class = class;
6322         sp->sl_flags = flags;
6323         sp->sl_base = base;
6324         sp->sl_head = head;
6325         sp->sl_len = len;
6326         sp->sl_refcnt = refcnt;
6327         sp->sl_chunks = chunks;
6328         slab_detach(sp);
6329 }
6330
6331 static void
6332 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6333 {
6334         VERIFY(slab_is_detached(sp));
6335         m_slab_cnt(class)++;
6336         TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6337         sp->sl_flags &= ~SLF_DETACHED;
6338         if (class == MC_16KCL) {
6339                 int k;
6340                 for (k = 1; k < NSLABSP16KB; k++) {
6341                         sp = sp->sl_next;
6342                         /* Next slab must already be present */
6343                         VERIFY(sp != NULL);
6344                         VERIFY(slab_is_detached(sp));
6345                         sp->sl_flags &= ~SLF_DETACHED;
6346                 }
6347         }
6348 }
6349
6350 static void
6351 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6352 {
6353         VERIFY(!slab_is_detached(sp));
6354         VERIFY(m_slab_cnt(class) > 0);
6355         m_slab_cnt(class)--;
6356         TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6357         slab_detach(sp);
6358         if (class == MC_16KCL) {
6359                 int k;
6360                 for (k = 1; k < NSLABSP16KB; k++) {
6361                         sp = sp->sl_next;
6362                         /* Next slab must already be present */
6363                         VERIFY(sp != NULL);
6364                         VERIFY(!slab_is_detached(sp));
6365                         slab_detach(sp);
6366                 }
6367         }
6368 }
6369
6370 static boolean_t
6371 slab_inrange(mcl_slab_t *sp, void *buf)
6372 {
6373         return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6374             (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6375 }
6376
6377 #undef panic
6378
6379 static void
6380 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6381 {
6382         int i;
6383         unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6384         uintptr_t buf = (uintptr_t)sp->sl_base;
6385
6386         for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6387                 void *next = ((mcache_obj_t *)buf)->obj_next;
6388                 if (next != addr)
6389                         continue;
6390                 if (!mclverify) {
6391                         if (next != NULL && !MBUF_IN_MAP(next)) {
6392                                 mcache_t *cp = m_cache(sp->sl_class);
6393                                 panic("%s: %s buffer %p in slab %p modified "
6394                                     "after free at offset 0: %p out of range "
6395                                     "[%p-%p)\n", __func__, cp->mc_name,
6396                                     (void *)buf, sp, next, mbutl, embutl);
6397                                 /* NOTREACHED */
6398                         }
6399                 } else {
6400                         mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6401                             (mcache_obj_t *)buf);
6402                         mcl_audit_verify_nextptr(next, mca);
6403                 }
6404         }
6405 }
6406
6407 static void
6408 slab_detach(mcl_slab_t *sp)
6409 {
6410         sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6411         sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6412         sp->sl_flags |= SLF_DETACHED;
6413 }
6414
6415 static boolean_t
6416 slab_is_detached(mcl_slab_t *sp)
6417 {
6418         return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6419             (intptr_t)sp->sl_link.tqe_prev == -1 &&
6420             (sp->sl_flags & SLF_DETACHED));
6421 }
6422
6423 static void
6424 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6425     mcache_obj_t **con_list, size_t con_size, unsigned int num)
6426 {
6427         mcache_audit_t *mca, *mca_tail;
6428         mcache_obj_t *con = NULL;
6429         boolean_t save_contents = (con_list != NULL);
6430         unsigned int i, ix;
6431
6432         ASSERT(num <= NMBPBG);
6433         ASSERT(con_list == NULL || con_size != 0);
6434
6435         ix = MTOBG(buf);
6436         VERIFY(ix < maxclaudit);
6437
6438         /* Make sure we haven't been here before */
6439         for (i = 0; i < NMBPBG; i++)
6440                 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6441
6442         mca = mca_tail = *mca_list;
6443         if (save_contents)
6444                 con = *con_list;
6445
6446         for (i = 0; i < num; i++) {
6447                 mcache_audit_t *next;
6448
6449                 next = mca->mca_next;
6450                 bzero(mca, sizeof (*mca));
6451                 mca->mca_next = next;
6452                 mclaudit[ix].cl_audit[i] = mca;
6453
6454                 /* Attach the contents buffer if requested */
6455                 if (save_contents) {
6456                         mcl_saved_contents_t *msc =
6457                             (mcl_saved_contents_t *)(void *)con;
6458
6459                         VERIFY(msc != NULL);
6460                         VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6461                         VERIFY(con_size == sizeof (*msc));
6462                         mca->mca_contents_size = con_size;
6463                         mca->mca_contents = msc;
6464                         con = con->obj_next;
6465                         bzero(mca->mca_contents, mca->mca_contents_size);
6466                 }
6467
6468                 mca_tail = mca;
6469                 mca = mca->mca_next;
6470         }
6471
6472         if (save_contents)
6473                 *con_list = con;
6474
6475         *mca_list = mca_tail->mca_next;
6476         mca_tail->mca_next = NULL;
6477 }
6478
6479 static void
6480 mcl_audit_free(void *buf, unsigned int num)
6481 {
6482         unsigned int i, ix;
6483         mcache_audit_t *mca, *mca_list;
6484
6485         ix = MTOBG(buf);
6486         VERIFY(ix < maxclaudit);
6487
6488         if (mclaudit[ix].cl_audit[0] != NULL) {
6489                 mca_list = mclaudit[ix].cl_audit[0];
6490                 for (i = 0; i < num; i++) {
6491                         mca = mclaudit[ix].cl_audit[i];
6492                         mclaudit[ix].cl_audit[i] = NULL;
6493                         if (mca->mca_contents)
6494                                 mcache_free(mcl_audit_con_cache,
6495                                     mca->mca_contents);
6496                 }
6497                 mcache_free_ext(mcache_audit_cache,
6498                     (mcache_obj_t *)mca_list);
6499         }
6500 }
6501
6502 /*
6503  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6504  * the corresponding audit structure for that buffer.
6505  */
6506 static mcache_audit_t *
6507 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6508 {
6509         mcache_audit_t *mca = NULL;
6510         int ix = MTOBG(o);
6511
6512         VERIFY(ix < maxclaudit);
6513         VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6514
6515         switch (class) {
6516         case MC_MBUF:
6517                 /*
6518                  * For the mbuf case, find the index of the page
6519                  * used by the mbuf and use that index to locate the
6520                  * base address of the page.  Then find out the
6521                  * mbuf index relative to the page base and use
6522                  * it to locate the audit structure.
6523                  */
6524                 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6525                 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6526                 break;
6527
6528         case MC_CL:
6529                 /*
6530                  * Same thing as above, but for 2KB clusters in a page.
6531                  */
6532                 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6533                 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6534                 break;
6535
6536         case MC_BIGCL:
6537         case MC_16KCL:
6538                 /*
6539                  * Same as above, but only return the first element.
6540                  */
6541                 mca = mclaudit[ix].cl_audit[0];
6542                 break;
6543
6544         default:
6545                 VERIFY(0);
6546                 /* NOTREACHED */
6547         }
6548
6549         return (mca);
6550 }
6551
6552 static void
6553 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6554     boolean_t alloc)
6555 {
6556         struct mbuf *m = addr;
6557         mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6558
6559         VERIFY(mca->mca_contents != NULL &&
6560             mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6561
6562         if (mclverify)
6563                 mcl_audit_verify_nextptr(next, mca);
6564
6565         if (!alloc) {
6566                 /* Save constructed mbuf fields */
6567                 mcl_audit_save_mbuf(m, mca);
6568                 if (mclverify) {
6569                         mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6570                             m_maxsize(MC_MBUF));
6571                 }
6572                 ((mcache_obj_t *)m)->obj_next = next;
6573                 return;
6574         }
6575
6576         /* Check if the buffer has been corrupted while in freelist */
6577         if (mclverify) {
6578                 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6579         }
6580         /* Restore constructed mbuf fields */
6581         mcl_audit_restore_mbuf(m, mca, composite);
6582 }
6583
6584 static void
6585 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6586 {
6587         struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6588
6589         if (composite) {
6590                 struct mbuf *next = m->m_next;
6591                 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6592                     MBUF_IS_COMPOSITE(ms));
6593                 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6594                 /*
6595                  * We could have hand-picked the mbuf fields and restore
6596                  * them individually, but that will be a maintenance
6597                  * headache.  Instead, restore everything that was saved;
6598                  * the mbuf layer will recheck and reinitialize anyway.
6599                  */
6600                 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6601                 m->m_next = next;
6602         } else {
6603                 /*
6604                  * For a regular mbuf (no cluster attached) there's nothing
6605                  * to restore other than the type field, which is expected
6606                  * to be MT_FREE.
6607                  */
6608                 m->m_type = ms->m_type;
6609         }
6610         _MCHECK(m);
6611 }
6612
6613 static void
6614 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6615 {
6616         VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6617         _MCHECK(m);
6618         bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
6619 }
6620
6621 static void
6622 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6623     boolean_t save_next)
6624 {
6625         mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6626
6627         if (!alloc) {
6628                 if (mclverify) {
6629                         mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6630                 }
6631                 if (save_next) {
6632                         mcl_audit_verify_nextptr(next, mca);
6633                         ((mcache_obj_t *)addr)->obj_next = next;
6634                 }
6635         } else if (mclverify) {
6636                 /* Check if the buffer has been corrupted while in freelist */
6637                 mcl_audit_verify_nextptr(next, mca);
6638                 mcache_audit_free_verify_set(mca, addr, 0, size);
6639         }
6640 }
6641
6642 static void
6643 mcl_audit_scratch(mcache_audit_t *mca)
6644 {
6645         void *stack[MCACHE_STACK_DEPTH + 1];
6646         mcl_scratch_audit_t *msa;
6647         struct timeval now;
6648
6649         VERIFY(mca->mca_contents != NULL);
6650         msa = MCA_SAVED_SCRATCH_PTR(mca);
6651
6652         msa->msa_pthread = msa->msa_thread;
6653         msa->msa_thread = current_thread();
6654         bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6655         msa->msa_pdepth = msa->msa_depth;
6656         bzero(stack, sizeof (stack));
6657         msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
6658         bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
6659
6660         msa->msa_ptstamp = msa->msa_tstamp;
6661         microuptime(&now);
6662         /* tstamp is in ms relative to base_ts */
6663         msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6664         if ((now.tv_sec - mb_start.tv_sec) > 0)
6665                 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6666 }
6667
6668 static void
6669 mcl_audit_mcheck_panic(struct mbuf *m)
6670 {
6671         mcache_audit_t *mca;
6672
6673         MRANGE(m);
6674         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6675
6676         panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6677             m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6678         /* NOTREACHED */
6679 }
6680
6681 static void
6682 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6683 {
6684         if (next != NULL && !MBUF_IN_MAP(next) &&
6685             (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6686                 panic("mcl_audit: buffer %p modified after free at offset 0: "
6687                     "%p out of range [%p-%p)\n%s\n",
6688                     mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6689                 /* NOTREACHED */
6690         }
6691 }
6692
6693 /* This function turns on mbuf leak detection */
6694 static void
6695 mleak_activate(void)
6696 {
6697         mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6698         PE_parse_boot_argn("mleak_sample_factor",
6699             &mleak_table.mleak_sample_factor,
6700             sizeof (mleak_table.mleak_sample_factor));
6701
6702         if (mleak_table.mleak_sample_factor == 0)
6703                 mclfindleak = 0;
6704
6705         if (mclfindleak == 0)
6706                 return;
6707
6708         vm_size_t alloc_size =
6709             mleak_alloc_buckets * sizeof (struct mallocation);
6710         vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6711
6712         MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6713             M_TEMP, M_WAITOK | M_ZERO);
6714         VERIFY(mleak_allocations != NULL);
6715
6716         MALLOC(mleak_traces, struct mtrace *, trace_size,
6717             M_TEMP, M_WAITOK | M_ZERO);
6718         VERIFY(mleak_traces != NULL);
6719
6720         MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6721             M_TEMP, M_WAITOK | M_ZERO);
6722         VERIFY(mleak_stat != NULL);
6723         mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6724 #ifdef __LP64__
6725         mleak_stat->ml_isaddr64 = 1;
6726 #endif /* __LP64__ */
6727 }
6728
6729 static void
6730 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6731 {
6732         int temp;
6733
6734         if (mclfindleak == 0)
6735                 return;
6736
6737         if (!alloc)
6738                 return (mleak_free(addr));
6739
6740         temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6741
6742         if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6743                 uintptr_t bt[MLEAK_STACK_DEPTH];
6744                 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6745                 mleak_log(bt, addr, logged, num);
6746         }
6747 }
6748
6749 /*
6750  * This function records the allocation in the mleak_allocations table
6751  * and the backtrace in the mleak_traces table; if allocation slot is in use,
6752  * replace old allocation with new one if the trace slot is in use, return
6753  * (or increment refcount if same trace).
6754  */
6755 static boolean_t
6756 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6757 {
6758         struct mallocation *allocation;
6759         struct mtrace *trace;
6760         uint32_t trace_index;
6761
6762         /* Quit if someone else modifying the tables */
6763         if (!lck_mtx_try_lock_spin(mleak_lock)) {
6764                 mleak_table.total_conflicts++;
6765                 return (FALSE);
6766         }
6767
6768         allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6769             mleak_alloc_buckets)];
6770         trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6771         trace = &mleak_traces[trace_index];
6772
6773         VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6774         VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6775
6776         allocation->hitcount++;
6777         trace->hitcount++;
6778
6779         /*
6780          * If the allocation bucket we want is occupied
6781          * and the occupier has the same trace, just bail.
6782          */
6783         if (allocation->element != NULL &&
6784             trace_index == allocation->trace_index) {
6785                 mleak_table.alloc_collisions++;
6786                 lck_mtx_unlock(mleak_lock);
6787                 return (TRUE);
6788         }
6789
6790         /*
6791          * Store the backtrace in the traces array;
6792          * Size of zero = trace bucket is free.
6793          */
6794         if (trace->allocs > 0 &&
6795             bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6796                 /* Different, unique trace, but the same hash! Bail out. */
6797                 trace->collisions++;
6798                 mleak_table.trace_collisions++;
6799                 lck_mtx_unlock(mleak_lock);
6800                 return (TRUE);
6801         } else if (trace->allocs > 0) {
6802                 /* Same trace, already added, so increment refcount */
6803                 trace->allocs++;
6804         } else {
6805                 /* Found an unused trace bucket, so record the trace here */
6806                 if (trace->depth != 0) {
6807                         /* this slot previously used but not currently in use */
6808                         mleak_table.trace_overwrites++;
6809                 }
6810                 mleak_table.trace_recorded++;
6811                 trace->allocs = 1;
6812                 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6813                 trace->depth = depth;
6814                 trace->collisions = 0;
6815         }
6816
6817         /* Step 2: Store the allocation record in the allocations array */
6818         if (allocation->element != NULL) {
6819                 /*
6820                  * Replace an existing allocation.  No need to preserve
6821                  * because only a subset of the allocations are being
6822                  * recorded anyway.
6823                  */
6824                 mleak_table.alloc_collisions++;
6825         } else if (allocation->trace_index != 0) {
6826                 mleak_table.alloc_overwrites++;
6827         }
6828         allocation->element = addr;
6829         allocation->trace_index = trace_index;
6830         allocation->count = num;
6831         mleak_table.alloc_recorded++;
6832         mleak_table.outstanding_allocs++;
6833
6834         lck_mtx_unlock(mleak_lock);
6835         return (TRUE);
6836 }
6837
6838 static void
6839 mleak_free(mcache_obj_t *addr)
6840 {
6841         while (addr != NULL) {
6842                 struct mallocation *allocation = &mleak_allocations
6843                     [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6844
6845                 if (allocation->element == addr &&
6846                     allocation->trace_index < mleak_trace_buckets) {
6847                         lck_mtx_lock_spin(mleak_lock);
6848                         if (allocation->element == addr &&
6849                             allocation->trace_index < mleak_trace_buckets) {
6850                                 struct mtrace *trace;
6851                                 trace = &mleak_traces[allocation->trace_index];
6852                                 /* allocs = 0 means trace bucket is unused */
6853                                 if (trace->allocs > 0)
6854                                         trace->allocs--;
6855                                 if (trace->allocs == 0)
6856                                         trace->depth = 0;
6857                                 /* NULL element means alloc bucket is unused */
6858                                 allocation->element = NULL;
6859                                 mleak_table.outstanding_allocs--;
6860                         }
6861                         lck_mtx_unlock(mleak_lock);
6862                 }
6863                 addr = addr->obj_next;
6864         }
6865 }
6866
6867 static void
6868 mleak_sort_traces()
6869 {
6870         int i, j, k;
6871         struct mtrace *swap;
6872
6873         for(i = 0; i < MLEAK_NUM_TRACES; i++)
6874                 mleak_top_trace[i] = NULL;
6875
6876         for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6877         {
6878                 if (mleak_traces[i].allocs <= 0)
6879                         continue;
6880
6881                 mleak_top_trace[j] = &mleak_traces[i];
6882                 for (k = j; k > 0; k--) {
6883                         if (mleak_top_trace[k]->allocs <=
6884                             mleak_top_trace[k-1]->allocs)
6885                                 break;
6886
6887                         swap = mleak_top_trace[k-1];
6888                         mleak_top_trace[k-1] = mleak_top_trace[k];
6889                         mleak_top_trace[k] = swap;
6890                 }
6891                 j++;
6892         }
6893
6894         j--;
6895         for(; i < mleak_trace_buckets; i++) {
6896                 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6897                         continue;
6898
6899                 mleak_top_trace[j] = &mleak_traces[i];
6900
6901                 for (k = j; k > 0; k--) {
6902                         if (mleak_top_trace[k]->allocs <=
6903                             mleak_top_trace[k-1]->allocs)
6904                                 break;
6905
6906                         swap = mleak_top_trace[k-1];
6907                         mleak_top_trace[k-1] = mleak_top_trace[k];
6908                         mleak_top_trace[k] = swap;
6909                 }
6910         }
6911 }
6912
6913 static void
6914 mleak_update_stats()
6915 {
6916         mleak_trace_stat_t *mltr;
6917         int i;
6918
6919         VERIFY(mleak_stat != NULL);
6920 #ifdef __LP64__
6921         VERIFY(mleak_stat->ml_isaddr64);
6922 #else
6923         VERIFY(!mleak_stat->ml_isaddr64);
6924 #endif /* !__LP64__ */
6925         VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6926
6927         mleak_sort_traces();
6928
6929         mltr = &mleak_stat->ml_trace[0];
6930         bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6931         for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6932         int j;
6933
6934                 if (mleak_top_trace[i] == NULL ||
6935                     mleak_top_trace[i]->allocs == 0)
6936                         continue;
6937
6938                 mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
6939                 mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
6940                 mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
6941                 mltr->mltr_depth        = mleak_top_trace[i]->depth;
6942
6943                 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6944                 for (j = 0; j < mltr->mltr_depth; j++)
6945                         mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6946
6947                 mltr++;
6948         }
6949 }
6950
6951 static struct mbtypes {
6952         int             mt_type;
6953         const char      *mt_name;
6954 } mbtypes[] = {
6955         { MT_DATA,      "data" },
6956         { MT_OOBDATA,   "oob data" },
6957         { MT_CONTROL,   "ancillary data" },
6958         { MT_HEADER,    "packet headers" },
6959         { MT_SOCKET,    "socket structures" },
6960         { MT_PCB,       "protocol control blocks" },
6961         { MT_RTABLE,    "routing table entries" },
6962         { MT_HTABLE,    "IMP host table entries" },
6963         { MT_ATABLE,    "address resolution tables" },
6964         { MT_FTABLE,    "fragment reassembly queue headers" },
6965         { MT_SONAME,    "socket names and addresses" },
6966         { MT_SOOPTS,    "socket options" },
6967         { MT_RIGHTS,    "access rights" },
6968         { MT_IFADDR,    "interface addresses" },
6969         { MT_TAG,       "packet tags" },
6970         { 0,            NULL }
6971 };
6972
6973 #define MBUF_DUMP_BUF_CHK() {   \
6974         clen -= k;              \
6975         if (clen < 1)           \
6976                 goto done;      \
6977         c += k;                 \
6978 }
6979
6980 static char *
6981 mbuf_dump(void)
6982 {
6983         unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6984         u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6985         u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6986         u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6987         int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6988         uint8_t seen[256];
6989         struct mbtypes *mp;
6990         mb_class_stat_t *sp;
6991         mleak_trace_stat_t *mltr;
6992         char *c = mbuf_dump_buf;
6993         int i, k, clen = MBUF_DUMP_BUF_SIZE;
6994
6995         mbuf_dump_buf[0] = '\0';
6996
6997         /* synchronize all statistics in the mbuf table */
6998         mbuf_stat_sync();
6999         mbuf_mtypes_sync(TRUE);
7000
7001         sp = &mb_stat->mbs_class[0];
7002         for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7003                 u_int32_t mem;
7004
7005                 if (m_class(i) == MC_MBUF) {
7006                         m_mbufs = sp->mbcl_active;
7007                 } else if (m_class(i) == MC_CL) {
7008                         m_clfree = sp->mbcl_total - sp->mbcl_active;
7009                 } else if (m_class(i) == MC_BIGCL) {
7010                         m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7011                 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7012                         m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7013                         m_16kclusters = sp->mbcl_total;
7014                 } else if (m_class(i) == MC_MBUF_CL) {
7015                         m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7016                 } else if (m_class(i) == MC_MBUF_BIGCL) {
7017                         m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7018                 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7019                         m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7020                 }
7021
7022                 mem = sp->mbcl_ctotal * sp->mbcl_size;
7023                 totmem += mem;
7024                 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7025                     sp->mbcl_size;
7026
7027         }
7028
7029         /* adjust free counts to include composite caches */
7030         m_clfree += m_mbufclfree;
7031         m_bigclfree += m_mbufbigclfree;
7032         m_16kclfree += m_mbuf16kclfree;
7033
7034         totmbufs = 0;
7035         for (mp = mbtypes; mp->mt_name != NULL; mp++)
7036                 totmbufs += mbstat.m_mtypes[mp->mt_type];
7037         if (totmbufs > m_mbufs)
7038                 totmbufs = m_mbufs;
7039         k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7040         MBUF_DUMP_BUF_CHK();
7041
7042         bzero(&seen, sizeof (seen));
7043         for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7044                 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7045                         seen[mp->mt_type] = 1;
7046                         k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7047                             mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7048                         MBUF_DUMP_BUF_CHK();
7049                 }
7050         }
7051         seen[MT_FREE] = 1;
7052         for (i = 0; i < nmbtypes; i++)
7053                 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7054                         k = snprintf(c, clen, "\t%u mbufs allocated to "
7055                             "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7056                         MBUF_DUMP_BUF_CHK();
7057                 }
7058         if ((m_mbufs - totmbufs) > 0) {
7059                 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7060                     m_mbufs - totmbufs);
7061                 MBUF_DUMP_BUF_CHK();
7062         }
7063         k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7064             "%u/%u mbuf 4KB clusters in use\n",
7065             (unsigned int)(mbstat.m_clusters - m_clfree),
7066             (unsigned int)mbstat.m_clusters,
7067             (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7068             (unsigned int)mbstat.m_bigclusters);
7069         MBUF_DUMP_BUF_CHK();
7070
7071         if (njcl > 0) {
7072                 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7073                     m_16kclusters - m_16kclfree, m_16kclusters,
7074                     njclbytes / 1024);
7075                 MBUF_DUMP_BUF_CHK();
7076         }
7077         totused = totmem - totfree;
7078         if (totmem == 0) {
7079                 totpct = 0;
7080         } else if (totused < (ULONG_MAX / 100)) {
7081                 totpct = (totused * 100) / totmem;
7082         } else {
7083                 u_long totmem1 = totmem / 100;
7084                 u_long totused1 = totused / 100;
7085                 totpct = (totused1 * 100) / totmem1;
7086         }
7087         k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7088             "in use)\n", totmem / 1024, totpct);
7089         MBUF_DUMP_BUF_CHK();
7090
7091         /* mbuf leak detection statistics */
7092         mleak_update_stats();
7093
7094         k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7095         MBUF_DUMP_BUF_CHK();
7096         k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7097             mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7098             mleak_table.mleak_sample_factor);
7099         MBUF_DUMP_BUF_CHK();
7100         k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7101             mleak_table.outstanding_allocs);
7102         MBUF_DUMP_BUF_CHK();
7103         k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7104             mleak_table.alloc_recorded, mleak_table.trace_recorded);
7105         MBUF_DUMP_BUF_CHK();
7106         k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7107             mleak_table.alloc_collisions, mleak_table.trace_collisions);
7108         MBUF_DUMP_BUF_CHK();
7109         k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7110             mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7111         MBUF_DUMP_BUF_CHK();
7112         k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7113             mleak_table.total_conflicts);
7114         MBUF_DUMP_BUF_CHK();
7115
7116         k = snprintf(c, clen, "top %d outstanding traces:\n",
7117             mleak_stat->ml_cnt);
7118         MBUF_DUMP_BUF_CHK();
7119         for (i = 0; i < mleak_stat->ml_cnt; i++) {
7120                 mltr = &mleak_stat->ml_trace[i];
7121                 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7122                     "%llu hit(s), %llu collision(s)\n", (i + 1),
7123                     mltr->mltr_allocs, mltr->mltr_hitcount,
7124                     mltr->mltr_collisions);
7125                 MBUF_DUMP_BUF_CHK();
7126         }
7127
7128         if (mleak_stat->ml_isaddr64)
7129                 k = snprintf(c, clen, MB_LEAK_HDR_64);
7130         else
7131                 k = snprintf(c, clen, MB_LEAK_HDR_32);
7132         MBUF_DUMP_BUF_CHK();
7133
7134         for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7135                 int j;
7136                 k = snprintf(c, clen, "%2d: ", (i + 1));
7137                 MBUF_DUMP_BUF_CHK();
7138                 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7139                         mltr = &mleak_stat->ml_trace[j];
7140                         if (i < mltr->mltr_depth) {
7141                                 if (mleak_stat->ml_isaddr64) {
7142                                         k = snprintf(c, clen, "0x%0llx  ",
7143                                             (uint64_t)VM_KERNEL_UNSLIDE(
7144                                                 mltr->mltr_addr[i]));
7145                                 } else {
7146                                         k = snprintf(c, clen,
7147                                             "0x%08x  ",
7148                                             (uint32_t)VM_KERNEL_UNSLIDE(
7149                                                 mltr->mltr_addr[i]));
7150                                 }
7151                         } else {
7152                                 if (mleak_stat->ml_isaddr64)
7153                                         k = snprintf(c, clen,
7154                                             MB_LEAK_SPACING_64);
7155                                 else
7156                                         k = snprintf(c, clen,
7157                                             MB_LEAK_SPACING_32);
7158                         }
7159                         MBUF_DUMP_BUF_CHK();
7160                 }
7161                 k = snprintf(c, clen, "\n");
7162                 MBUF_DUMP_BUF_CHK();
7163         }
7164 done:
7165         return (mbuf_dump_buf);
7166 }
7167
7168 #undef MBUF_DUMP_BUF_CHK
7169
7170 /*
7171  * Convert between a regular and a packet header mbuf.  Caller is responsible
7172  * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7173  */
7174 int
7175 m_reinit(struct mbuf *m, int hdr)
7176 {
7177         int ret = 0;
7178
7179         if (hdr) {
7180                 VERIFY(!(m->m_flags & M_PKTHDR));
7181                 if (!(m->m_flags & M_EXT) &&
7182                     (m->m_data != m->m_dat || m->m_len > 0)) {
7183                         /*
7184                          * If there's no external cluster attached and the
7185                          * mbuf appears to contain user data, we cannot
7186                          * safely convert this to a packet header mbuf,
7187                          * as the packet header structure might overlap
7188                          * with the data.
7189                          */
7190                         printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7191                             "m_data %llx (expected %llx), "
7192                             "m_len %d (expected 0)\n",
7193                             __func__,
7194                             (uint64_t)VM_KERNEL_ADDRPERM(m),
7195                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7196                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7197                         ret = EBUSY;
7198                 } else {
7199                         VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7200                         m->m_flags |= M_PKTHDR;
7201                         MBUF_INIT_PKTHDR(m);
7202                 }
7203         } else {
7204                 /* Check for scratch area overflow */
7205                 m_redzone_verify(m);
7206                 /* Free the aux data and tags if there is any */
7207                 m_tag_delete_chain(m, NULL);
7208                 m->m_flags &= ~M_PKTHDR;
7209         }
7210
7211         return (ret);
7212 }
7213
7214 void
7215 m_scratch_init(struct mbuf *m)
7216 {
7217         struct pkthdr *pkt = &m->m_pkthdr;
7218
7219         VERIFY(m->m_flags & M_PKTHDR);
7220
7221         /* See comments in <rdar://problem/14040693> */
7222         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7223                 panic_plain("Invalid attempt to modify guarded module-private "
7224                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7225                 /* NOTREACHED */
7226         }
7227
7228         bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7229 }
7230
7231 /*
7232  * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7233  * xnu that intend on utilizing the module-private area should directly
7234  * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
7235  * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7236  * to handing it off to another module, respectively.
7237  */
7238 u_int32_t
7239 m_scratch_get(struct mbuf *m, u_int8_t **p)
7240 {
7241         struct pkthdr *pkt = &m->m_pkthdr;
7242
7243         VERIFY(m->m_flags & M_PKTHDR);
7244
7245         /* See comments in <rdar://problem/14040693> */
7246         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7247                 panic_plain("Invalid attempt to access guarded module-private "
7248                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7249                 /* NOTREACHED */
7250         }
7251
7252         if (mcltrace) {
7253                 mcache_audit_t *mca;
7254
7255                 lck_mtx_lock(mbuf_mlock);
7256                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7257                 if (mca->mca_uflags & MB_SCVALID)
7258                         mcl_audit_scratch(mca);
7259                 lck_mtx_unlock(mbuf_mlock);
7260         }
7261
7262         *p = (u_int8_t *)&pkt->pkt_mpriv;
7263         return (sizeof (pkt->pkt_mpriv));
7264 }
7265
7266 static void
7267 m_redzone_init(struct mbuf *m)
7268 {
7269         VERIFY(m->m_flags & M_PKTHDR);
7270         /*
7271          * Each mbuf has a unique red zone pattern, which is a XOR
7272          * of the red zone cookie and the address of the mbuf.
7273          */
7274         m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7275 }
7276
7277 static void
7278 m_redzone_verify(struct mbuf *m)
7279 {
7280         u_int32_t mb_redzone;
7281
7282         VERIFY(m->m_flags & M_PKTHDR);
7283
7284         mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7285         if (m->m_pkthdr.redzone != mb_redzone) {
7286                 panic("mbuf %p redzone violation with value 0x%x "
7287                     "(instead of 0x%x, using cookie 0x%x)\n",
7288                     m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7289                 /* NOTREACHED */
7290         }
7291 }
7292
7293 /*
7294  * Send a report of mbuf usage if the usage is at least 6% of max limit
7295  * or if there has been at least 3% increase since the last report.
7296  *
7297  * The values 6% and 3% are chosen so that we can do simple arithmetic
7298  * with shift operations.
7299  */
7300 static boolean_t
7301 mbuf_report_usage(mbuf_class_t cl)
7302 {
7303         /* if a report is already in progress, nothing to do */
7304         if (mb_peak_newreport)
7305                 return (TRUE);
7306
7307         if (m_total(cl) > m_peak(cl) &&
7308             m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7309             (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7310                 return (TRUE);
7311         return (FALSE);
7312 }
7313
7314 __private_extern__ void
7315 mbuf_report_peak_usage(void)
7316 {
7317         int i = 0;
7318         u_int64_t uptime;
7319         struct nstat_sysinfo_data ns_data;
7320         uint32_t memreleased = 0;
7321
7322         uptime = net_uptime();
7323         lck_mtx_lock(mbuf_mlock);
7324
7325         /* Generate an initial report after 1 week of uptime */
7326         if (!mb_peak_firstreport &&
7327             uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7328                 mb_peak_newreport = TRUE;
7329                 mb_peak_firstreport = TRUE;
7330         }
7331
7332         if (!mb_peak_newreport) {
7333                 lck_mtx_unlock(mbuf_mlock);
7334                 return;
7335         }
7336
7337         /*
7338          * Since a report is being generated before 1 week,
7339          * we do not need to force another one later
7340          */
7341         if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7342                 mb_peak_firstreport = TRUE;
7343
7344         for (i = 0; i < NELEM(mbuf_table); i++) {
7345                 m_peak(m_class(i)) = m_total(m_class(i));
7346                 memreleased += m_release_cnt(i);
7347         }
7348         mb_peak_newreport = FALSE;
7349         lck_mtx_unlock(mbuf_mlock);
7350
7351         bzero(&ns_data, sizeof(ns_data));
7352         ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7353         ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7354         ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7355         ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7356         ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7357         ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7358         ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7359         ns_data.u.mb_stats.memreleased = memreleased;
7360
7361         nstat_sysinfo_send_data(&ns_data);
7362 }
7363
7364 /*
7365  * Called by the VM when there's memory pressure.
7366  */
7367 __private_extern__ void
7368 m_drain(void)
7369 {
7370         mbuf_class_t mc;
7371         mcl_slab_t *sp, *sp_tmp, *nsp;
7372         unsigned int num, k, interval, released = 0;
7373         unsigned int total_mem = 0, use_mem = 0;
7374         boolean_t ret, purge_caches = FALSE;
7375         ppnum_t offset;
7376         mcache_obj_t *obj;
7377         float per;
7378         static uint64_t last_drain = 0;
7379         static unsigned char scratch[32];
7380         static ppnum_t scratch_pa = 0;
7381
7382         if (mb_drain_maxint == 0 || mb_waiters)
7383                 return;
7384         if (scratch_pa == 0) {
7385                 bzero(scratch, sizeof(scratch));
7386                 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
7387                 VERIFY(scratch_pa);
7388         } else if (mclverify) {
7389                 /*
7390                  * Panic if a driver wrote to our scratch memory.
7391                  */
7392                 for (k = 0; k < sizeof(scratch); k++)
7393                         if (scratch[k])
7394                                 panic("suspect DMA to freed address");
7395         }
7396         /*
7397          * Don't free memory too often as that could cause excessive
7398          * waiting times for mbufs.  Purge caches if we were asked to drain
7399          * in the last 5 minutes.
7400          */
7401         lck_mtx_lock(mbuf_mlock);
7402         if (last_drain == 0) {
7403                 last_drain = net_uptime();
7404                 lck_mtx_unlock(mbuf_mlock);
7405                 return;
7406         }
7407         interval = net_uptime() - last_drain;
7408         if (interval <= mb_drain_maxint) {
7409                 lck_mtx_unlock(mbuf_mlock);
7410                 return;
7411         }
7412         if (interval <= mb_drain_maxint * 5)
7413                 purge_caches = TRUE;
7414         last_drain = net_uptime();
7415         /*
7416          * Don't free any memory if we're using 60% or more.
7417          */
7418         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7419                 total_mem += m_total(mc) * m_maxsize(mc);
7420                 use_mem += m_active(mc) * m_maxsize(mc);
7421         }
7422         per = (float)use_mem / (float)total_mem;
7423         if (per >= 0.6) {
7424                 lck_mtx_unlock(mbuf_mlock);
7425                 return;
7426         }
7427         /*
7428          * Purge all the caches.  This effectively disables
7429          * caching for a few seconds, but the mbuf worker thread will
7430          * re-enable them again.
7431          */
7432         if (purge_caches == TRUE)
7433                 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7434                         if (m_total(mc) < m_avgtotal(mc))
7435                                 continue;
7436                         lck_mtx_unlock(mbuf_mlock);
7437                         ret = mcache_purge_cache(m_cache(mc), FALSE);
7438                         lck_mtx_lock(mbuf_mlock);
7439                         if (ret == TRUE)
7440                                 m_purge_cnt(mc)++;
7441                 }
7442         /*
7443          * Move the objects from the composite class freelist to
7444          * the rudimentary slabs list, but keep at least 10% of the average
7445          * total in the freelist.
7446          */
7447         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7448                 while (m_cobjlist(mc) &&
7449                     m_total(mc) < m_avgtotal(mc) &&
7450                     m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
7451                         obj = m_cobjlist(mc);
7452                         m_cobjlist(mc) = obj->obj_next;
7453                         obj->obj_next = NULL;
7454                         num = cslab_free(mc, obj, 1);
7455                         VERIFY(num == 1);
7456                         m_free_cnt(mc)++;
7457                         m_infree(mc)--;
7458                         /* cslab_free() handles m_total */
7459                 }
7460         }
7461         /*
7462          * Free the buffers present in the slab list up to 10% of the total
7463          * average per class.
7464          *
7465          * We walk the list backwards in an attempt to reduce fragmentation.
7466          */
7467         for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
7468                 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
7469                         /*
7470                          * Process only unused slabs occupying memory.
7471                          */
7472                         if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
7473                             sp->sl_base == NULL)
7474                                 continue;
7475                         if (m_total(mc) < m_avgtotal(mc) ||
7476                             m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
7477                                 break;
7478                         slab_remove(sp, mc);
7479                         switch (mc) {
7480                         case MC_MBUF:
7481                                 m_infree(mc) -= NMBPBG;
7482                                 m_total(mc) -= NMBPBG;
7483                                 if (mclaudit != NULL)
7484                                         mcl_audit_free(sp->sl_base, NMBPBG);
7485                                 break;
7486                         case MC_CL:
7487                                 m_infree(mc) -= NCLPBG;
7488                                 m_total(mc) -= NCLPBG;
7489                                 if (mclaudit != NULL)
7490                                         mcl_audit_free(sp->sl_base, NMBPBG);
7491                                 break;
7492                         case MC_BIGCL:
7493                                 m_infree(mc)--;
7494                                 m_total(mc)--;
7495                                 if (mclaudit != NULL)
7496                                         mcl_audit_free(sp->sl_base, NMBPBG);
7497                                 break;
7498                         case MC_16KCL:
7499                                 m_infree(mc)--;
7500                                 m_total(mc)--;
7501                                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
7502                                         nsp = nsp->sl_next;
7503                                         VERIFY(nsp->sl_refcnt == 0 &&
7504                                             nsp->sl_base != NULL &&
7505                                             nsp->sl_len == 0);
7506                                         slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
7507                                             0);
7508                                         nsp->sl_flags = 0;
7509                                 }
7510                                 if (mclaudit != NULL)
7511                                         mcl_audit_free(sp->sl_base, 1);
7512                                 break;
7513                         default:
7514                                 /*
7515                                  * The composite classes have their own
7516                                  * freelist (m_cobjlist), so we only
7517                                  * process rudimentary classes here.
7518                                  */
7519                                 VERIFY(0);
7520                         }
7521                         m_release_cnt(mc) += m_size(mc);
7522                         released += m_size(mc);
7523                         offset = ((char *)sp->sl_base - (char *)mbutl) / NBPG;
7524                         /*
7525                          * Make sure the IOMapper points to a valid, but
7526                          * bogus, address.  This should prevent further DMA
7527                          * accesses to freed memory.
7528                          */
7529                         IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
7530                         mcl_paddr[offset] = 0;
7531                         kmem_free(mb_map, (vm_offset_t)sp->sl_base,
7532                             sp->sl_len);
7533                         slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
7534                         sp->sl_flags = 0;
7535                 }
7536         }
7537         mbstat.m_drain++;
7538         mbstat.m_bigclusters = m_total(MC_BIGCL);
7539         mbstat.m_clusters = m_total(MC_CL);
7540         mbstat.m_mbufs = m_total(MC_MBUF);
7541         mbuf_stat_sync();
7542         mbuf_mtypes_sync(TRUE);
7543         lck_mtx_unlock(mbuf_mlock);
7544 }
7545
7546 static int
7547 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
7548 {
7549 #pragma unused(arg1, arg2)
7550         int val = 0, err;
7551
7552         err = sysctl_handle_int(oidp, &val, 0, req);
7553         if (err != 0 || req->newptr == USER_ADDR_NULL)
7554                 return (err);
7555         if (val)
7556                 m_drain();
7557
7558         return (err);
7559 }
7560
7561 SYSCTL_DECL(_kern_ipc);
7562 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7563     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7564     0, 0, mbstat_sysctl, "S,mbstat", "");
7565 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7566     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7567     0, 0, mb_stat_sysctl, "S,mb_stat", "");
7568 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7569     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7570     0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7571 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7572     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7573     0, 0, mleak_table_sysctl, "S,mleak_table", "");
7574 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7575     CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7576 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7577     CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7578 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7579     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
7580 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
7581     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
7582     m_drain_force_sysctl, "I",
7583     "Forces the mbuf garbage collection to run");
7584 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
7585     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
7586     "Minimum time interval between garbage collection");