[apple/xnu.git] / bsd / kern / uipc_mbuf.c (xnu-2422.110.17)
1 /*
2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <dev/random/randomdev.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/simple_lock.h>
86 #include <kern/queue.h>
87 #include <kern/sched_prim.h>
88 #include <kern/cpu_number.h>
89 #include <kern/zalloc.h>
90
91 #include <libkern/OSAtomic.h>
92 #include <libkern/OSDebug.h>
93 #include <libkern/libkern.h>
94
95 #include <IOKit/IOMapper.h>
96
97 #include <machine/limits.h>
98 #include <machine/machine_routines.h>
99
100 #if CONFIG_MACF_NET
101 #include <security/mac_framework.h>
102 #endif /* CONFIG_MACF_NET */
103
104 #include <sys/mcache.h>
105
106 /*
107 * MBUF IMPLEMENTATION NOTES.
108 *
109 * There is a total of 5 per-CPU caches:
110 *
111 * MC_MBUF:
112 * This is a cache of rudimentary objects of MSIZE in size; each
113 * object represents an mbuf structure. This cache preserves only
114 * the m_type field of the mbuf during its transactions.
115 *
116 * MC_CL:
117 * This is a cache of rudimentary objects of MCLBYTES in size; each
118 * object represents an mcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_BIGCL:
122 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
123 * object represents an mbigcluster structure. This cache does not
124 * preserve the contents of the objects during its transactions.
125 *
126 * MC_MBUF_CL:
127 * This is a cache of mbufs each having a cluster attached to it.
128 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
129 * fields of the mbuf related to the external cluster are preserved
130 * during transactions.
131 *
132 * MC_MBUF_BIGCL:
133 * This is a cache of mbufs each having a big cluster attached to it.
134 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
135 * fields of the mbuf related to the external cluster are preserved
136 * during transactions.
137 *
138 * OBJECT ALLOCATION:
139 *
140 * Allocation requests are handled first at the per-CPU (mcache) layer
141 * before falling back to the slab layer. Performance is optimal when
142 * the request is satisfied at the CPU layer because global data/lock
143 * never gets accessed. When the slab layer is entered for allocation,
144 * the slab freelist will be checked first for available objects before
145 * the VM backing store is invoked. Slab layer operations are serialized
146 * for all of the caches as the mbuf global lock is held most of the time.
147 * Allocation paths are different depending on the class of objects:
148 *
149 * a. Rudimentary object:
150 *
151 * { m_get_common(), m_clattach(), m_mclget(),
152 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
153 * composite object allocation }
154 * | ^
155 * | |
156 * | +-----------------------+
157 * v |
158 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
159 * | ^
160 * v |
161 * [CPU cache] -------> (found?) -------+
162 * | |
163 * v |
164 * mbuf_slab_alloc() |
165 * | |
166 * v |
167 * +---------> [freelist] -------> (found?) -------+
168 * | |
169 * | v
170 * | m_clalloc()
171 * | |
172 * | v
173 * +---<<---- kmem_mb_alloc()
174 *
175 * b. Composite object:
176 *
177 * { m_getpackets_internal(), m_allocpacket_internal() }
178 * | ^
179 * | |
180 * | +------ (done) ---------+
181 * v |
182 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
183 * | ^
184 * v |
185 * [CPU cache] -------> (found?) -------+
186 * | |
187 * v |
188 * mbuf_cslab_alloc() |
189 * | |
190 * v |
191 * [freelist] -------> (found?) -------+
192 * | |
193 * v |
194 * (rudimentary object) |
195 * mcache_alloc/mcache_alloc_ext() ------>>-----+
196 *
197 * Auditing notes: If auditing is enabled, buffers will be subjected to
198 * integrity checks by the audit routine. This is done by verifying their
199 * contents against DEADBEEF (free) pattern before returning them to caller.
200 * As part of this step, the routine will also record the transaction and
201 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
202 * also restore any constructed data structure fields if necessary.
203 *
204 * OBJECT DEALLOCATION:
205 *
206 * Freeing an object simply involves placing it into the CPU cache; this
207 * pollutes the cache to benefit subsequent allocations. The slab layer
208 * will only be entered if the object is to be purged out of the cache.
209 * During normal operations, this happens only when the CPU layer resizes
210 * its bucket while it's adjusting to the allocation load. Deallocation
211 * paths are different depending on the class of objects:
212 *
213 * a. Rudimentary object:
214 *
215 * { m_free(), m_freem_list(), composite object deallocation }
216 * | ^
217 * | |
218 * | +------ (done) ---------+
219 * v |
220 * mcache_free/mcache_free_ext() |
221 * | |
222 * v |
223 * mbuf_slab_audit() |
224 * | |
225 * v |
226 * [CPU cache] ---> (not purging?) -----+
227 * | |
228 * v |
229 * mbuf_slab_free() |
230 * | |
231 * v |
232 * [freelist] ----------->>------------+
233 * (objects never get purged to VM)
234 *
235 * b. Composite object:
236 *
237 * { m_free(), m_freem_list() }
238 * | ^
239 * | |
240 * | +------ (done) ---------+
241 * v |
242 * mcache_free/mcache_free_ext() |
243 * | |
244 * v |
245 * mbuf_cslab_audit() |
246 * | |
247 * v |
248 * [CPU cache] ---> (not purging?) -----+
249 * | |
250 * v |
251 * mbuf_cslab_free() |
252 * | |
253 * v |
254 * [freelist] ---> (not purging?) -----+
255 * | |
256 * v |
257 * (rudimentary object) |
258 * mcache_free/mcache_free_ext() ------->>------+
259 *
260 * Auditing notes: If auditing is enabled, the audit routine will save
261 * any constructed data structure fields (if necessary) before filling the
262 * contents of the buffers with DEADBEEF (free) pattern and recording the
263 * transaction. Buffers that are freed (whether at CPU or slab layer) are
264 * expected to contain the free pattern.
265 *
266 * DEBUGGING:
267 *
268 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
269 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
270 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
271 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
272 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
273 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
274 *
275 * Each object is associated with exactly one mcache_audit_t structure that
276 * contains the information related to its last buffer transaction. Given
277 * the address of an object, the audit structure can be retrieved by finding
278 * the position of the object relative to the base address of the cluster:
279 *
280 * +------------+ +=============+
281 * | mbuf addr | | mclaudit[i] |
282 * +------------+ +=============+
283 * | | cl_audit[0] |
284 * i = MTOBG(addr) +-------------+
285 * | +-----> | cl_audit[1] | -----> mcache_audit_t
286 * b = BGTOM(i) | +-------------+
287 * | | | ... |
288 * x = MCLIDX(b, addr) | +-------------+
289 * | | | cl_audit[7] |
290 * +-----------------+ +-------------+
291 * (e.g. x == 1)
292 *
293 * The mclaudit[] array is allocated at initialization time, but its contents
294 * get populated when the corresponding cluster is created. Because a page
295 * can be turned into NMBPBG mbufs, we preserve enough space for the
296 * mbufs so that there is a 1-to-1 mapping between them. A page that never
297 * gets (or has not yet been) turned into mbufs will use only cl_audit[0],
298 * with the remaining entries unused. For a 16KB cluster, only one entry
299 * from the first page is allocated and used for the entire object.
300 */
301
302 /* TODO: should be in header file */
303 /* kernel translator */
304 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
305 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
306 extern vm_map_t mb_map; /* special map */
307
308 /* Global lock */
309 decl_lck_mtx_data(static, mbuf_mlock_data);
310 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
311 static lck_attr_t *mbuf_mlock_attr;
312 static lck_grp_t *mbuf_mlock_grp;
313 static lck_grp_attr_t *mbuf_mlock_grp_attr;
314
315 /* Back-end (common) layer */
316 static void *mbuf_worker_run; /* wait channel for worker thread */
317 static int mbuf_worker_ready; /* worker thread is runnable */
318 static int mbuf_expand_mcl; /* number of cluster creation requests */
319 static int mbuf_expand_big; /* number of big cluster creation requests */
320 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
321 static int ncpu; /* number of CPUs */
322 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
323 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
324 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
325 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
326 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
327 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
328 static unsigned int mb_normalized; /* number of packets "normalized" */
329
330 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
331 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
332
333 typedef enum {
334 MC_MBUF = 0, /* Regular mbuf */
335 MC_CL, /* Cluster */
336 MC_BIGCL, /* Large (4KB) cluster */
337 MC_16KCL, /* Jumbo (16KB) cluster */
338 MC_MBUF_CL, /* mbuf + cluster */
339 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
340 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
341 } mbuf_class_t;
342
343 #define MBUF_CLASS_MIN MC_MBUF
344 #define MBUF_CLASS_MAX MC_MBUF_16KCL
345 #define MBUF_CLASS_LAST MC_16KCL
346 #define MBUF_CLASS_VALID(c) \
347 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
348 #define MBUF_CLASS_COMPOSITE(c) \
349 ((int)(c) > MBUF_CLASS_LAST)
350
351
352 /*
353 * mbuf specific mcache allocation request flags.
354 */
355 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
356
357 /*
358 * Per-cluster slab structure.
359 *
360 * A slab is a cluster control structure that contains one or more object
361 * chunks; the available chunks are chained in the slab's freelist (sl_head).
362 * Each time a chunk is taken out of the slab, the slab's reference count
363 * gets incremented. When all chunks have been taken out, the empty slab
364 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
365 * returned to a slab causes the slab's reference count to be decremented;
366 * it also causes the slab to be reinserted back into the class's slab
367 * list, if that has not already been done.
368 *
369 * Compartmentalizing the object chunks into slabs allows us to easily
370 * merge one or more slabs together when the adjacent slabs are idle, as
371 * well as to convert or move a slab from one class to another; e.g. the
372 * mbuf cluster slab can be converted to a regular cluster slab when all
373 * mbufs in the slab have been freed.
374 *
375 * A slab may also span multiple clusters for chunks larger than
376 * a cluster's size. In this case, only the slab of the first cluster is
377 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
378 * that they are part of the larger slab.
379 *
380 * Each slab controls a page of memory.
381 */
382 typedef struct mcl_slab {
383 struct mcl_slab *sl_next; /* neighboring slab */
384 u_int8_t sl_class; /* controlling mbuf class */
385 int8_t sl_refcnt; /* outstanding allocations */
386 int8_t sl_chunks; /* chunks (bufs) in this slab */
387 u_int16_t sl_flags; /* slab flags (see below) */
388 u_int16_t sl_len; /* slab length */
389 void *sl_base; /* base of allocated memory */
390 void *sl_head; /* first free buffer */
391 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
392 } mcl_slab_t;
393
394 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
395 #define SLF_PARTIAL 0x0002 /* part of another slab */
396 #define SLF_DETACHED 0x0004 /* not in slab freelist */
397
398 /*
399 * The array of slabs is broken into groups of arrays, one per 1MB of kernel
400 * memory to reduce the footprint. Each group is allocated on demand
401 * whenever a new piece of memory mapped in from the VM crosses the 1MB
402 * boundary.
403 */
404 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
405
406 typedef struct mcl_slabg {
407 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
408 } mcl_slabg_t;
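/*
 * A minimal sketch (not the actual routine) of how a buffer address maps
 * to its controlling slab under this grouping: one group per MB of mapped
 * memory, one slab per page within the group.  Here "base" stands for the
 * start of the cluster map (mbutl); slab_get(), declared further below, is
 * the real implementation.
 */
static inline mcl_slab_t *
example_addr_to_slab(mcl_slabg_t **tbl, char *base, void *addr)
{
	size_t off = (size_t)((char *)addr - base);
	unsigned int grp = (unsigned int)(off >> MBSHIFT);	/* which MB */
	unsigned int idx = (unsigned int)((off >> PGSHIFT) % NSLABSPMB);

	return (&tbl[grp]->slg_slab[idx]);	/* slab for that page */
}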
409
410 /*
411 * Number of slabs needed to control a 16KB cluster object.
412 */
413 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
414
415 /*
416 * Per-cluster audit structure.
417 */
418 typedef struct {
419 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
420 } mcl_audit_t;
421
422 typedef struct {
423 struct thread *msa_thread; /* thread doing transaction */
424 struct thread *msa_pthread; /* previous transaction thread */
425 uint32_t msa_tstamp; /* transaction timestamp (ms) */
426 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
427 uint16_t msa_depth; /* pc stack depth */
428 uint16_t msa_pdepth; /* previous transaction pc stack */
429 void *msa_stack[MCACHE_STACK_DEPTH];
430 void *msa_pstack[MCACHE_STACK_DEPTH];
431 } mcl_scratch_audit_t;
432
433 typedef struct {
434 /*
435 * Size of data from the beginning of an mbuf that covers m_hdr,
436 * pkthdr and m_ext structures. If auditing is enabled, we allocate
437 * a shadow mbuf structure of this size inside each audit structure,
438 * and the contents of the real mbuf gets copied into it when the mbuf
439 * is freed. This allows us to pattern-fill the mbuf for integrity
440 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
441 * cluster cache case). Note that we don't save the contents of
442 * clusters when they are freed; we simply pattern-fill them.
443 */
444 u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
445 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
446 } mcl_saved_contents_t;
447
448 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
449
450 #define MCA_SAVED_MBUF_PTR(_mca) \
451 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
452 (_mca)->mca_contents)->sc_mbuf)
453 #define MCA_SAVED_MBUF_SIZE \
454 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
455 #define MCA_SAVED_SCRATCH_PTR(_mca) \
456 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
457
458 /*
459 * mbuf specific mcache audit flags
460 */
461 #define MB_INUSE 0x01 /* object has not been returned to slab */
462 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
463 #define MB_SCVALID 0x04 /* object has valid saved contents */
464
465 /*
466 * Each of the following two arrays holds up to nmbclusters elements.
467 */
468 static mcl_audit_t *mclaudit; /* array of cluster audit information */
469 static unsigned int maxclaudit; /* max # of entries in audit table */
470 static mcl_slabg_t **slabstbl; /* cluster slabs table */
471 static unsigned int maxslabgrp; /* max # of entries in slabs table */
472 static unsigned int slabgrp; /* # of entries in slabs table */
473
474 /* Globals */
475 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
476 int njcl; /* # of clusters for jumbo sizes */
477 int njclbytes; /* size of a jumbo cluster */
478 union mbigcluster *mbutl; /* first mapped cluster address */
479 union mbigcluster *embutl; /* ending virtual address of mclusters */
480 int _max_linkhdr; /* largest link-level header */
481 int _max_protohdr; /* largest protocol header */
482 int max_hdr; /* largest link+protocol header */
483 int max_datalen; /* MHLEN - max_hdr */
484
485 static boolean_t mclverify; /* debug: pattern-checking */
486 static boolean_t mcltrace; /* debug: stack tracing */
487 static boolean_t mclfindleak; /* debug: leak detection */
488 static boolean_t mclexpleak; /* debug: expose leak info to user space */
489
490 static struct timeval mb_start; /* beginning of time */
491
492 /* mbuf leak detection variables */
493 static struct mleak_table mleak_table;
494 static mleak_stat_t *mleak_stat;
495
496 #define MLEAK_STAT_SIZE(n) \
497 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
498
499 struct mallocation {
500 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
501 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
502 u_int32_t count; /* How many objects were requested */
503 u_int64_t hitcount; /* for determining hash effectiveness */
504 };
505
506 struct mtrace {
507 u_int64_t collisions;
508 u_int64_t hitcount;
509 u_int64_t allocs;
510 u_int64_t depth;
511 uintptr_t addr[MLEAK_STACK_DEPTH];
512 };
513
514 /* Size must be a power of two for the zhash to be able to just mask off bits */
515 #define MLEAK_ALLOCATION_MAP_NUM 512
516 #define MLEAK_TRACE_MAP_NUM 256
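/*
 * Because the map sizes above are powers of two, bucket selection can be
 * a simple mask instead of a modulo; a minimal sketch of that idea (the
 * actual hashing is done inside the mleak_log()/mleak_logger() routines
 * declared further below):
 */
static inline uint32_t
example_leak_bucket(uintptr_t key, uint32_t nbuckets)
{
	/* nbuckets is assumed to be a power of two */
	return ((uint32_t)key & (nbuckets - 1));
}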
517
518 /*
519 * Sample factor for how often to record a trace. This can be overridden
520 * via the boot-arg mleak_sample_factor.
521 */
522 #define MLEAK_SAMPLE_FACTOR 500
523
524 /*
525 * Number of top leakers recorded.
526 */
527 #define MLEAK_NUM_TRACES 5
528
529 #define MB_LEAK_SPACING_64 " "
530 #define MB_LEAK_SPACING_32 " "
531
532
533 #define MB_LEAK_HDR_32 "\n\
534 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
535 ---------- ---------- ---------- ---------- ---------- \n\
536 "
537
538 #define MB_LEAK_HDR_64 "\n\
539 trace [1] trace [2] trace [3] \
540 trace [4] trace [5] \n\
541 ------------------ ------------------ ------------------ \
542 ------------------ ------------------ \n\
543 "
544
545 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
546 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
547
548 /* Hashmaps of allocations and their corresponding traces */
549 static struct mallocation *mleak_allocations;
550 static struct mtrace *mleak_traces;
551 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
552
553 /* Lock to protect mleak tables from concurrent modification */
554 decl_lck_mtx_data(static, mleak_lock_data);
555 static lck_mtx_t *mleak_lock = &mleak_lock_data;
556 static lck_attr_t *mleak_lock_attr;
557 static lck_grp_t *mleak_lock_grp;
558 static lck_grp_attr_t *mleak_lock_grp_attr;
559
560 extern u_int32_t high_sb_max;
561
562 /* The minimum number of objects that are allocated, to start. */
563 #define MINCL 32
564 #define MINBIGCL (MINCL >> 1)
565 #define MIN16KCL (MINCL >> 2)
566
567 /* Low watermarks (only map in pages once free counts go below) */
568 #define MBIGCL_LOWAT MINBIGCL
569 #define M16KCL_LOWAT MIN16KCL
570
571 typedef struct {
572 mbuf_class_t mtbl_class; /* class type */
573 mcache_t *mtbl_cache; /* mcache for this buffer class */
574 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
575 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
576 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
577 u_int32_t mtbl_maxsize; /* maximum buffer size */
578 int mtbl_minlimit; /* minimum allowed */
579 int mtbl_maxlimit; /* maximum allowed */
580 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
581 } mbuf_table_t;
582
583 #define m_class(c) mbuf_table[c].mtbl_class
584 #define m_cache(c) mbuf_table[c].mtbl_cache
585 #define m_slablist(c) mbuf_table[c].mtbl_slablist
586 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
587 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
588 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
589 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
590 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
591 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
592 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
593 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
594 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
595 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
596 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
597 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
598 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
599 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
600 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
601 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
602 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
603
604 static mbuf_table_t mbuf_table[] = {
605 /*
606 * The caches for mbufs, regular clusters and big clusters.
607 */
608 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
609 NULL, NULL, 0, 0, 0, 0 },
610 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
611 NULL, NULL, 0, 0, 0, 0 },
612 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
613 NULL, NULL, 0, 0, 0, 0 },
614 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
615 NULL, NULL, 0, 0, 0, 0 },
616 /*
617 * The following are special caches; they serve as intermediate
618 * caches backed by the above rudimentary caches. Each object
619 * in the cache is an mbuf with a cluster attached to it. Unlike
620 * the above caches, these intermediate caches do not directly
621 * deal with the slab structures; instead, the constructed
622 * cached elements are simply stored in the freelists.
623 */
624 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
625 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
626 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
627 };
628
629 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
630
631 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
632 static int mb_waiters; /* number of waiters */
633
634 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
635 static struct timeval mb_wdtstart; /* watchdog start timestamp */
636 static char *mbuf_dump_buf;
637
638 #define MBUF_DUMP_BUF_SIZE 2048
639
640 /*
641 * mbuf watchdog is enabled by default on embedded platforms. It is
642 * also toggleable via the kern.ipc.mb_watchdog sysctl.
643 */
644 static unsigned int mb_watchdog = 0;
645
646 /* Red zone */
647 static u_int32_t mb_redzone_cookie;
648 static void m_redzone_init(struct mbuf *);
649 static void m_redzone_verify(struct mbuf *m);
650
651 /* The following are used to serialize m_clalloc() */
652 static boolean_t mb_clalloc_busy;
653 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
654 static int mb_clalloc_waiters;
655
656 static void mbuf_mtypes_sync(boolean_t);
657 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
658 static void mbuf_stat_sync(void);
659 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
660 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
661 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
662 static char *mbuf_dump(void);
663 static void mbuf_table_init(void);
664 static inline void m_incref(struct mbuf *);
665 static inline u_int32_t m_decref(struct mbuf *);
666 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
667 static void mbuf_worker_thread_init(void);
668 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
669 static void slab_free(mbuf_class_t, mcache_obj_t *);
670 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
671 unsigned int, int);
672 static void mbuf_slab_free(void *, mcache_obj_t *, int);
673 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
674 static void mbuf_slab_notify(void *, u_int32_t);
675 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
676 unsigned int);
677 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
678 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
679 unsigned int, int);
680 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
681 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
682 static int freelist_populate(mbuf_class_t, unsigned int, int);
683 static void freelist_init(mbuf_class_t);
684 static boolean_t mbuf_cached_above(mbuf_class_t, int);
685 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
686 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
687 static int m_howmany(int, size_t);
688 static void mbuf_worker_thread(void);
689 static void mbuf_watchdog(void);
690 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
691
692 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
693 size_t, unsigned int);
694 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
695 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
696 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
697 boolean_t);
698 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
699 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
700 static void mcl_audit_scratch(mcache_audit_t *);
701 static void mcl_audit_mcheck_panic(struct mbuf *);
702 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
703
704 static void mleak_activate(void);
705 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
706 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
707 static void mleak_free(mcache_obj_t *);
708 static void mleak_sort_traces(void);
709 static void mleak_update_stats(void);
710
711 static mcl_slab_t *slab_get(void *);
712 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
713 void *, void *, unsigned int, int, int);
714 static void slab_insert(mcl_slab_t *, mbuf_class_t);
715 static void slab_remove(mcl_slab_t *, mbuf_class_t);
716 static boolean_t slab_inrange(mcl_slab_t *, void *);
717 static void slab_nextptr_panic(mcl_slab_t *, void *);
718 static void slab_detach(mcl_slab_t *);
719 static boolean_t slab_is_detached(mcl_slab_t *);
720
721 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
722 static struct mbuf *m_split0(struct mbuf *, int, int, int);
723
724 /* flags for m_copyback0 */
725 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
726 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
727 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
728 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
729
730 /*
731 * This flag is set for all mbufs that come out of and into the composite
732 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
733 * are marked with such a flag have clusters attached to them, and will be
734 * treated differently when they are freed; instead of being placed back
735 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
736 * are placed back into the appropriate composite cache's freelist, and the
737 * actual freeing is deferred until the composite objects are purged. At
738 * such a time, this flag will be cleared from the mbufs and the objects
739 * will be freed into their own separate freelists.
740 */
741 #define EXTF_COMPOSITE 0x1
742
743 /*
744 * This flag indicates that the external cluster is read-only, i.e. it is
745 * or was referred to by more than one mbuf. Once set, this flag is never
746 * cleared.
747 */
748 #define EXTF_READONLY 0x2
749 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
750
751 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
752 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
753 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
754 #define MBUF_IS_COMPOSITE(m) \
755 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
756
757 /*
758 * Macros used to verify the integrity of the mbuf.
759 */
760 #define _MCHECK(m) { \
761 if ((m)->m_type != MT_FREE) { \
762 if (mclaudit == NULL) \
763 panic("MCHECK: m_type=%d m=%p", \
764 (u_int16_t)(m)->m_type, m); \
765 else \
766 mcl_audit_mcheck_panic(m); \
767 } \
768 }
769
770 #define MBUF_IN_MAP(addr) \
771 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
772
773 #define MRANGE(addr) { \
774 if (!MBUF_IN_MAP(addr)) \
775 panic("MRANGE: address out of range 0x%p", addr); \
776 }
777
778 /*
779 * Macro version of mtod.
780 */
781 #define MTOD(m, t) ((t)((m)->m_data))
782
783 /*
784 * Macros to obtain (4KB) cluster index and base cluster address.
785 */
786
787 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
788 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
789
790 /*
791 * Macro to find the mbuf index relative to a base.
792 */
793 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
794
795 /*
796 * Same thing for 2KB cluster index.
797 */
798 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
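/*
 * A minimal sketch of the audit lookup pictured in the implementation
 * notes above, put together from the MTOBG()/BGTOM()/MCLIDX() macros;
 * mcl_audit_buf2mca() is the routine actually used by the audit code.
 */
static inline mcache_audit_t *
example_addr_to_audit(void *addr)
{
	unsigned int i = MTOBG(addr);		/* index of the 4KB cluster */
	union mbigcluster *b = BGTOM(i);	/* base address of that cluster */
	unsigned int x = MCLIDX(b, addr);	/* mbuf slot within the cluster */

	return (mclaudit[i].cl_audit[x]);
}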
799
800 /*
801 * Macros used during mbuf and cluster initialization.
802 */
803 #define MBUF_INIT_PKTHDR(m) { \
804 (m)->m_pkthdr.rcvif = NULL; \
805 (m)->m_pkthdr.pkt_hdr = NULL; \
806 (m)->m_pkthdr.len = 0; \
807 (m)->m_pkthdr.csum_flags = 0; \
808 (m)->m_pkthdr.csum_data = 0; \
809 (m)->m_pkthdr.vlan_tag = 0; \
810 m_classifier_init(m, 0); \
811 m_tag_init(m, 1); \
812 m_scratch_init(m); \
813 m_redzone_init(m); \
814 }
815
816 #define MBUF_INIT(m, pkthdr, type) { \
817 _MCHECK(m); \
818 (m)->m_next = (m)->m_nextpkt = NULL; \
819 (m)->m_len = 0; \
820 (m)->m_type = type; \
821 if ((pkthdr) == 0) { \
822 (m)->m_data = (m)->m_dat; \
823 (m)->m_flags = 0; \
824 } else { \
825 (m)->m_data = (m)->m_pktdat; \
826 (m)->m_flags = M_PKTHDR; \
827 MBUF_INIT_PKTHDR(m); \
828 } \
829 }
830
831 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
832 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
833 (m)->m_flags |= M_EXT; \
834 (m)->m_ext.ext_size = (size); \
835 (m)->m_ext.ext_free = (free); \
836 (m)->m_ext.ext_arg = (arg); \
837 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
838 &(m)->m_ext.ext_refs; \
839 MEXT_RFA(m) = (rfa); \
840 MEXT_REF(m) = (ref); \
841 MEXT_FLAGS(m) = (flag); \
842 }
843
844 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
845 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
846
847 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
848 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
849
850 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
851 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
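/*
 * A minimal sketch of how these initializers compose, assuming "m" is a
 * freshly allocated raw mbuf (still MT_FREE), "cl" is a 2KB cluster and
 * "rfa" is a struct ext_ref obtained from ref_cache; the real users are
 * the allocation paths and composite-cache constructors later in this file.
 */
static inline void
example_mbuf_cl_setup(struct mbuf *m, void *cl, struct ext_ref *rfa)
{
	MBUF_INIT(m, 0, MT_DATA);	/* plain (non-pkthdr) data mbuf */
	MBUF_CL_INIT(m, cl, rfa, 1, 0);	/* attach the cluster, refcount 1 */
}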
852
853 /*
854 * Macro to convert BSD malloc sleep flag to mcache's
855 */
856 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
857
858 /*
859 * The structure that holds all mbuf class statistics exportable via sysctl.
860 * Similar to the mbstat structure, the mb_stat structure is protected by the
861 * global mbuf lock. It contains additional information about the classes
862 * that allows for a more accurate view of the state of the allocator.
863 */
864 struct mb_stat *mb_stat;
865 struct omb_stat *omb_stat; /* For backwards compatibility */
866
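/*
 * The following two macros evaluate to the byte offset of mbs_class[n],
 * i.e. the space needed for a stats structure holding n class entries in
 * its trailing mbs_class[] array.
 */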
867 #define MB_STAT_SIZE(n) \
868 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
869 #define OMB_STAT_SIZE(n) \
870 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
871
872 /*
873 * The legacy structure holding all of the mbuf allocation statistics.
874 * The actual statistics used by the kernel are stored in the mbuf_table
875 * instead, and are updated atomically while the global mbuf lock is held.
876 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
877 * Unlike before, the kernel no longer relies on the contents of mbstat for
878 * its operations (e.g. cluster expansion) because the structure is exposed
879 * to the outside and could possibly be modified, making it unsafe to rely on.
880 * With the exception of the mbstat.m_mtypes array (see below), all of the
881 * statistics are updated as they change.
882 */
883 struct mbstat mbstat;
884
885 #define MBSTAT_MTYPES_MAX \
886 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
887
888 /*
889 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
890 * atomically and stored in a per-CPU structure which is lock-free; this is
891 * done in order to avoid writing to the global mbstat data structure which
892 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
893 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
894 * array and returned to the application. Any updates for types greater
895 * than or equal to MT_MAX are done atomically on the mbstat; this slows down
896 * performance but is okay since the kernel uses only up to MT_MAX-1 while
897 * anything beyond that (up to type 255) is considered a corner case.
898 */
899 typedef struct {
900 unsigned int cpu_mtypes[MT_MAX];
901 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
902
903 typedef struct {
904 mtypes_cpu_t mbs_cpu[1];
905 } mbuf_mtypes_t;
906
907 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
908
909 #define MBUF_MTYPES_SIZE(n) \
910 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
911
912 #define MTYPES_CPU(p) \
913 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
914
915 #define mtype_stat_add(type, n) { \
916 if ((unsigned)(type) < MT_MAX) { \
917 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
918 atomic_add_32(&mbs->cpu_mtypes[type], n); \
919 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
920 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
921 } \
922 }
923
924 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
925 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
926 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
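/*
 * Example usage (a sketch of the pattern used by the allocation and free
 * paths later in this file): when a free mbuf is handed out as MT_DATA,
 * one counter goes up and the other down, each on the local CPU.
 */
static inline void
example_account_mbuf_alloc(void)
{
	mtype_stat_inc(MT_DATA);	/* one more MT_DATA mbuf in use */
	mtype_stat_dec(MT_FREE);	/* one fewer free mbuf */
}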
927
928 static void
929 mbuf_mtypes_sync(boolean_t locked)
930 {
931 int m, n;
932 mtypes_cpu_t mtc;
933
934 if (locked)
935 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
936
937 bzero(&mtc, sizeof (mtc));
938 for (m = 0; m < ncpu; m++) {
939 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
940 mtypes_cpu_t temp;
941
942 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
943 sizeof (temp.cpu_mtypes));
944
945 for (n = 0; n < MT_MAX; n++)
946 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
947 }
948 if (!locked)
949 lck_mtx_lock(mbuf_mlock);
950 for (n = 0; n < MT_MAX; n++)
951 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
952 if (!locked)
953 lck_mtx_unlock(mbuf_mlock);
954 }
955
956 static int
957 mbstat_sysctl SYSCTL_HANDLER_ARGS
958 {
959 #pragma unused(oidp, arg1, arg2)
960 mbuf_mtypes_sync(FALSE);
961
962 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
963 }
964
965 static void
966 mbuf_stat_sync(void)
967 {
968 mb_class_stat_t *sp;
969 mcache_cpu_t *ccp;
970 mcache_t *cp;
971 int k, m, bktsize;
972
973 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
974
975 for (k = 0; k < NELEM(mbuf_table); k++) {
976 cp = m_cache(k);
977 ccp = &cp->mc_cpu[0];
978 bktsize = ccp->cc_bktsize;
979 sp = mbuf_table[k].mtbl_stats;
980
981 if (cp->mc_flags & MCF_NOCPUCACHE)
982 sp->mbcl_mc_state = MCS_DISABLED;
983 else if (cp->mc_purge_cnt > 0)
984 sp->mbcl_mc_state = MCS_PURGING;
985 else if (bktsize == 0)
986 sp->mbcl_mc_state = MCS_OFFLINE;
987 else
988 sp->mbcl_mc_state = MCS_ONLINE;
989
990 sp->mbcl_mc_cached = 0;
991 for (m = 0; m < ncpu; m++) {
992 ccp = &cp->mc_cpu[m];
993 if (ccp->cc_objs > 0)
994 sp->mbcl_mc_cached += ccp->cc_objs;
995 if (ccp->cc_pobjs > 0)
996 sp->mbcl_mc_cached += ccp->cc_pobjs;
997 }
998 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
999 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1000 sp->mbcl_infree;
1001
1002 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1003 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1004 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1005
1006 /* Calculate total count specific to each class */
1007 sp->mbcl_ctotal = sp->mbcl_total;
1008 switch (m_class(k)) {
1009 case MC_MBUF:
1010 /* Deduct mbufs used in composite caches */
1011 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1012 m_total(MC_MBUF_BIGCL));
1013 break;
1014
1015 case MC_CL:
1016 /* Deduct clusters used in composite cache */
1017 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1018 break;
1019
1020 case MC_BIGCL:
1021 /* Deduct clusters used in composite cache */
1022 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1023 break;
1024
1025 case MC_16KCL:
1026 /* Deduct clusters used in composite cache */
1027 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1028 break;
1029
1030 default:
1031 break;
1032 }
1033 }
1034 }
1035
1036 static int
1037 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1038 {
1039 #pragma unused(oidp, arg1, arg2)
1040 void *statp;
1041 int k, statsz, proc64 = proc_is64bit(req->p);
1042
1043 lck_mtx_lock(mbuf_mlock);
1044 mbuf_stat_sync();
1045
1046 if (!proc64) {
1047 struct omb_class_stat *oc;
1048 struct mb_class_stat *c;
1049
1050 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1051 oc = &omb_stat->mbs_class[0];
1052 c = &mb_stat->mbs_class[0];
1053 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1054 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1055 "%s", c->mbcl_cname);
1056 oc->mbcl_size = c->mbcl_size;
1057 oc->mbcl_total = c->mbcl_total;
1058 oc->mbcl_active = c->mbcl_active;
1059 oc->mbcl_infree = c->mbcl_infree;
1060 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1061 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1062 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1063 oc->mbcl_notified = c->mbcl_notified;
1064 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1065 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1066 oc->mbcl_ctotal = c->mbcl_ctotal;
1067 oc->mbcl_mc_state = c->mbcl_mc_state;
1068 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1069 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1070 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1071 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1072 }
1073 statp = omb_stat;
1074 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1075 } else {
1076 statp = mb_stat;
1077 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1078 }
1079
1080 lck_mtx_unlock(mbuf_mlock);
1081
1082 return (SYSCTL_OUT(req, statp, statsz));
1083 }
1084
1085 static int
1086 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1087 {
1088 #pragma unused(oidp, arg1, arg2)
1089 int i;
1090
1091 /* Ensure leak tracing turned on */
1092 if (!mclfindleak || !mclexpleak)
1093 return (ENXIO);
1094
1095 lck_mtx_lock(mleak_lock);
1096 mleak_update_stats();
1097 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1098 lck_mtx_unlock(mleak_lock);
1099
1100 return (i);
1101 }
1102
1103 static int
1104 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1105 {
1106 #pragma unused(oidp, arg1, arg2)
1107 int i = 0;
1108
1109 /* Ensure leak tracing turned on */
1110 if (!mclfindleak || !mclexpleak)
1111 return (ENXIO);
1112
1113 lck_mtx_lock(mleak_lock);
1114 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1115 lck_mtx_unlock(mleak_lock);
1116
1117 return (i);
1118 }
1119
1120 static inline void
1121 m_incref(struct mbuf *m)
1122 {
1123 UInt32 old, new;
1124 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1125
1126 do {
1127 old = *addr;
1128 new = old + 1;
1129 ASSERT(new != 0);
1130 } while (!OSCompareAndSwap(old, new, addr));
1131
1132 /*
1133 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1134 * we don't clear the flag when the refcount goes back to 1
1135 * to simplify code calling m_mclhasreference().
1136 */
1137 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1138 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1139 }
1140
1141 static inline u_int32_t
1142 m_decref(struct mbuf *m)
1143 {
1144 UInt32 old, new;
1145 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1146
1147 do {
1148 old = *addr;
1149 new = old - 1;
1150 ASSERT(old != 0);
1151 } while (!OSCompareAndSwap(old, new, addr));
1152
1153 return (new);
1154 }
1155
1156 static void
1157 mbuf_table_init(void)
1158 {
1159 unsigned int b, c, s;
1160 int m;
1161
1162 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1163 M_TEMP, M_WAITOK | M_ZERO);
1164 VERIFY(omb_stat != NULL);
1165
1166 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1167 M_TEMP, M_WAITOK | M_ZERO);
1168 VERIFY(mb_stat != NULL);
1169
1170 mb_stat->mbs_cnt = NELEM(mbuf_table);
1171 for (m = 0; m < NELEM(mbuf_table); m++)
1172 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1173
1174 #if CONFIG_MBUF_JUMBO
1175 /*
1176 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1177 * this only on platforms where jumbo cluster pool is enabled.
1178 */
1179 njcl = nmbclusters / 3;
1180 njclbytes = M16KCLBYTES;
1181 #endif /* CONFIG_MBUF_JUMBO */
1182
1183 /*
1184 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1185 * a multiple of 4KB clusters.
1186 */
1187 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1188 if (njcl > 0) {
1189 /*
1190 * Each jumbo cluster takes 8 2KB clusters, so make
1191 * sure that the pool size is evenly divisible by 8;
1192 * njcl is in 2KB unit, hence treated as such.
1193 */
1194 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1195
1196 /* Update nclusters with rounded down value of njcl */
1197 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1198 }
1199
1200 /*
1201 * njcl is valid only on platforms with 16KB jumbo clusters, where
1202 * it is configured to 1/3 of the pool size. On these platforms,
1203 * the remainder is used for 2KB and 4KB clusters. On platforms
1204 * without 16KB jumbo clusters, the entire pool is used for both
1205 * 2KB and 4KB clusters. A 4KB cluster can be split into either
1206 * 16 mbufs or 2 2KB clusters.
1207 *
1208 * +---+---+------------ ... -----------+------- ... -------+
1209 * | c | b | s | njcl |
1210 * +---+---+------------ ... -----------+------- ... -------+
1211 *
1212 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1213 * clusters (1/64th each.)
1214 */
1215 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1216 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1217 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
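/*
 * For example (illustrative numbers only): with a 64 MB pool and no
 * jumbo pool, nclusters == 32768 2KB clusters, so c == 512 (1 MB of
 * pure 2KB clusters), b == 256 (1 MB of pure 4KB clusters), and
 * s == 31744 2KB units (62 MB) of all-purpose space.
 */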
1218
1219 /*
1220 * 1/64th (c) is reserved for 2KB clusters.
1221 */
1222 m_minlimit(MC_CL) = c;
1223 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1224 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1225 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1226
1227 /*
1228 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1229 * It cannot be turned into 2KB clusters or mbufs.
1230 */
1231 m_minlimit(MC_BIGCL) = b;
1232 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1233 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1234 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1235
1236 /*
1237 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1238 */
1239 m_minlimit(MC_MBUF) = 0;
1240 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1241 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1242 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1243
1244 /*
1245 * Set limits for the composite classes.
1246 */
1247 m_minlimit(MC_MBUF_CL) = 0;
1248 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1249 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1250 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1251 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1252
1253 m_minlimit(MC_MBUF_BIGCL) = 0;
1254 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1255 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1256 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1257 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1258
1259 /*
1260 * And for jumbo classes.
1261 */
1262 m_minlimit(MC_16KCL) = 0;
1263 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1264 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1265 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1266
1267 m_minlimit(MC_MBUF_16KCL) = 0;
1268 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1269 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1270 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1271 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1272
1273 /*
1274 * Initialize the legacy mbstat structure.
1275 */
1276 bzero(&mbstat, sizeof (mbstat));
1277 mbstat.m_msize = m_maxsize(MC_MBUF);
1278 mbstat.m_mclbytes = m_maxsize(MC_CL);
1279 mbstat.m_minclsize = MINCLSIZE;
1280 mbstat.m_mlen = MLEN;
1281 mbstat.m_mhlen = MHLEN;
1282 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1283 }
1284
1285 #if defined(__LP64__)
1286 typedef struct ncl_tbl {
1287 uint64_t nt_maxmem; /* memory (sane) size */
1288 uint32_t nt_mbpool; /* mbuf pool size */
1289 } ncl_tbl_t;
1290
1291 /* Non-server */
1292 static ncl_tbl_t ncl_table[] = {
1293 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1294 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1295 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1296 { 0, 0 }
1297 };
1298
1299 /* Server */
1300 static ncl_tbl_t ncl_table_srv[] = {
1301 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1302 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1303 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1304 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1305 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1306 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1307 { 0, 0 }
1308 };
1309 #endif /* __LP64__ */
1310
1311 __private_extern__ unsigned int
1312 mbuf_default_ncl(int server, uint64_t mem)
1313 {
1314 #if !defined(__LP64__)
1315 #pragma unused(server)
1316 unsigned int n;
1317 /*
1318 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1319 */
1320 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1321 n = 32768;
1322 #else
1323 unsigned int n, i;
1324 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1325 /*
1326 * 64-bit kernel (mbuf pool size based on table).
1327 */
1328 n = tbl[0].nt_mbpool;
1329 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1330 if (mem < tbl[i].nt_maxmem)
1331 break;
1332 n = tbl[i].nt_mbpool;
1333 }
1334 n >>= MCLSHIFT;
1335 #endif /* !__LP64__ */
1336 return (n);
1337 }
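/*
 * For example (illustrative): on a 64-bit, non-server configuration with
 * 16 GB of memory, the table above selects a 128 MB pool, so
 * mbuf_default_ncl(0, 16ULL << GBSHIFT) returns
 * (128 << MBSHIFT) >> MCLSHIFT == 65536 2KB clusters.
 */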
1338
1339 __private_extern__ void
1340 mbinit(void)
1341 {
1342 unsigned int m;
1343 unsigned int initmcl = 0;
1344 void *buf;
1345 thread_t thread = THREAD_NULL;
1346
1347 microuptime(&mb_start);
1348
1349 /*
1350 * These MBUF_ values must be equal to their private counterparts.
1351 */
1352 _CASSERT(MBUF_EXT == M_EXT);
1353 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1354 _CASSERT(MBUF_EOR == M_EOR);
1355 _CASSERT(MBUF_LOOP == M_LOOP);
1356 _CASSERT(MBUF_BCAST == M_BCAST);
1357 _CASSERT(MBUF_MCAST == M_MCAST);
1358 _CASSERT(MBUF_FRAG == M_FRAG);
1359 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1360 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1361 _CASSERT(MBUF_PROMISC == M_PROMISC);
1362 _CASSERT(MBUF_HASFCS == M_HASFCS);
1363
1364 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1365 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1366 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1367 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1368 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1369 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1370 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1371 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1372 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1373 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1374 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1375 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1376 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1377 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1378 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1379
1380 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1381 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1382 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1383 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1384 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1385 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1386 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1387 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1388 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1389 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1390 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1391 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1392 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1393
1394 _CASSERT(MBUF_WAITOK == M_WAIT);
1395 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1396 _CASSERT(MBUF_COPYALL == M_COPYALL);
1397
1398 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1399 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1400 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1401 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1402 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1403 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1404 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1405 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1406 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1407 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1408
1409 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1410 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1411 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1412 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1413
1414 /* Module specific scratch space (32-bit alignment requirement) */
1415 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1416 sizeof (uint32_t)));
1417
1418 /* Initialize random red zone cookie value */
1419 _CASSERT(sizeof (mb_redzone_cookie) ==
1420 sizeof (((struct pkthdr *)0)->redzone));
1421 read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1422
1423 /* Make sure we don't save more than we should */
1424 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1425
1426 if (nmbclusters == 0)
1427 nmbclusters = NMBCLUSTERS;
1428
1429 /* This should be a sane (at least even) value by now */
1430 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1431
1432 /* Setup the mbuf table */
1433 mbuf_table_init();
1434
1435 /* Global lock for common layer */
1436 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1437 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1438 mbuf_mlock_attr = lck_attr_alloc_init();
1439 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1440
1441 /*
1442 * Allocate cluster slabs table:
1443 *
1444 * maxslabgrp = (N * 2048) / (1024 * 1024)
1445 *
1446 * Where N is nmbclusters rounded up to the nearest 512. This yields
1447 * mcl_slabg_t units, each one representing 1 MB of memory.
1448 */
1449 maxslabgrp =
1450 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
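/*
 * E.g. with nmbclusters == 32768 (a 64 MB map), this works out to 64
 * slab groups, one mcl_slabg_t per MB of mapped cluster memory.
 */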
1451 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1452 M_TEMP, M_WAITOK | M_ZERO);
1453 VERIFY(slabstbl != NULL);
1454
1455 /*
1456 * Allocate audit structures, if needed:
1457 *
1458 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1459 *
1460 * This yields mcl_audit_t units, each one representing a page.
1461 */
1462 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1463 mbuf_debug |= mcache_getflags();
1464 if (mbuf_debug & MCF_DEBUG) {
1465 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1466 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1467 M_TEMP, M_WAITOK | M_ZERO);
1468 VERIFY(mclaudit != NULL);
1469
1470 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1471 AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1472 VERIFY(mcl_audit_con_cache != NULL);
1473 }
1474 mclverify = (mbuf_debug & MCF_VERIFY);
1475 mcltrace = (mbuf_debug & MCF_TRACE);
1476 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1477 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1478
1479 /* Enable mbuf leak logging, with a lock to protect the tables */
1480
1481 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1482 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1483 mleak_lock_attr = lck_attr_alloc_init();
1484 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1485
1486 mleak_activate();
1487
1488 /* Calculate the number of pages assigned to the cluster pool */
1489 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1490 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1491 M_TEMP, M_WAITOK);
1492 VERIFY(mcl_paddr != NULL);
1493
1494 /* Register with the I/O Bus mapper */
1495 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1496 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1497
1498 embutl = (union mbigcluster *)
1499 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1500 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1501
1502 /* Prime up the freelist */
1503 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1504 if (initmcl != 0) {
1505 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1506 if (initmcl > m_maxlimit(MC_BIGCL))
1507 initmcl = m_maxlimit(MC_BIGCL);
1508 }
1509 if (initmcl < m_minlimit(MC_BIGCL))
1510 initmcl = m_minlimit(MC_BIGCL);
1511
1512 lck_mtx_lock(mbuf_mlock);
1513
1514 /*
1515 * For classes with non-zero minimum limits, populate their freelists
1516 * so that m_total(class) is at least m_minlimit(class).
1517 */
1518 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1519 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1520 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1521 freelist_init(m_class(MC_CL));
1522
1523 for (m = 0; m < NELEM(mbuf_table); m++) {
1524 /* Make sure we didn't miss any */
1525 VERIFY(m_minlimit(m_class(m)) == 0 ||
1526 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1527 }
1528
1529 lck_mtx_unlock(mbuf_mlock);
1530
1531 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1532 NULL, &thread);
1533 thread_deallocate(thread);
1534
1535 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1536 0, 0, MCR_SLEEP);
1537
1538 /* Create the cache for each class */
1539 for (m = 0; m < NELEM(mbuf_table); m++) {
1540 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1541 u_int32_t flags;
1542
1543 flags = mbuf_debug;
1544 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1545 m_class(m) == MC_MBUF_16KCL) {
1546 allocfunc = mbuf_cslab_alloc;
1547 freefunc = mbuf_cslab_free;
1548 auditfunc = mbuf_cslab_audit;
1549 logfunc = mleak_logger;
1550 } else {
1551 allocfunc = mbuf_slab_alloc;
1552 freefunc = mbuf_slab_free;
1553 auditfunc = mbuf_slab_audit;
1554 logfunc = mleak_logger;
1555 }
1556
1557 /*
1558 * Disable per-CPU caches for jumbo classes if there
1559 * is no jumbo cluster pool available in the system.
1560 * The cache itself is still created (but will never
1561 * be populated) since it simplifies the code.
1562 */
1563 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1564 njcl == 0)
1565 flags |= MCF_NOCPUCACHE;
1566
1567 if (!mclfindleak)
1568 flags |= MCF_NOLEAKLOG;
1569
1570 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1571 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1572 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1573 }
1574
1575 /*
1576 * Allocate the structure for per-CPU statistics, aligned on a
1577 * CPU cache line boundary; this code assumes that we never
1578 * uninitialize this framework, since the original address
1579 * before alignment is not saved.
1580 */
1581 ncpu = ml_get_max_cpus();
1582 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1583 M_TEMP, M_WAITOK);
1584 VERIFY(buf != NULL);
1585
1586 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1587 CPU_CACHE_LINE_SIZE);
1588 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1589
1590 /*
1591 * Set the max limit on sb_max to be 1/16th of the size of the
1592 * memory allocated for mbuf clusters.
1593 */
1594 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1595 if (high_sb_max < sb_max) {
1596 /* sb_max is too large for this configuration, scale it down */
1597 if (high_sb_max > (1 << MBSHIFT)) {
1598 /* We have at least 16 MB of mbuf pool */
1599 sb_max = high_sb_max;
1600 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1601 /*
1602 * If we have more than 1 MB of mbuf pool, cap the size of
1603 * the max socket buffer at 1 MB
1604 */
1605 sb_max = high_sb_max = (1 << MBSHIFT);
1606 } else {
1607 sb_max = high_sb_max;
1608 }
1609 }
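/*
 * Worked example (illustrative, assuming MCLSHIFT == 11 and
 * MBSHIFT == 20): with a hypothetical nmbclusters of 4096 the pool
 * is 8 MB and high_sb_max == 4096 << 7 == 512 KB.  If that is below
 * the configured sb_max, the first test fails (512 KB <= 1 MB) but
 * the pool exceeds 1 MB, so sb_max is capped at 1 MB.  With
 * nmbclusters == 32768, high_sb_max == 4 MB, and sb_max would be
 * lowered to 4 MB instead.
 */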
1610
1611 /* allocate space for mbuf_dump_buf */
1612 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1613 VERIFY(mbuf_dump_buf != NULL);
1614
1615 if (mbuf_debug & MCF_DEBUG) {
1616 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1617 (int)_MLEN, (int)_MHLEN);
1618 }
1619
1620 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1621 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1622 (nclusters << MCLSHIFT) >> MBSHIFT,
1623 (njcl << MCLSHIFT) >> MBSHIFT);
1624 }
1625
1626 /*
1627 * Obtain a slab of object(s) from the class's freelist.
1628 */
1629 static mcache_obj_t *
1630 slab_alloc(mbuf_class_t class, int wait)
1631 {
1632 mcl_slab_t *sp;
1633 mcache_obj_t *buf;
1634
1635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1636
1637 VERIFY(class != MC_16KCL || njcl > 0);
1638
1639 /* This should always be NULL for us */
1640 VERIFY(m_cobjlist(class) == NULL);
1641
1642 /*
1643 * Treat composite objects as having a longer lifespan by taking
1644 * a slab from the reverse direction, in the hope that this could
1645 * reduce the probability of fragmentation for slabs that hold
1646 * more than one buffer chunk (e.g. mbuf slabs). For other
1647 * slabs, this probably doesn't make much of a difference.
1648 */
1649 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1650 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1651 else
1652 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1653
1654 if (sp == NULL) {
1655 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1656 /* The slab list for this class is empty */
1657 return (NULL);
1658 }
1659
1660 VERIFY(m_infree(class) > 0);
1661 VERIFY(!slab_is_detached(sp));
1662 VERIFY(sp->sl_class == class &&
1663 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1664 buf = sp->sl_head;
1665 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1666
1667 if (class == MC_MBUF) {
1668 sp->sl_head = buf->obj_next;
1669 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1670 } else if (class == MC_CL) {
1671 sp->sl_head = buf->obj_next;
1672 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1673 } else {
1674 sp->sl_head = NULL;
1675 }
1676 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1677 slab_nextptr_panic(sp, sp->sl_head);
1678 /* In case sl_head is in the map but not in the slab */
1679 VERIFY(slab_inrange(sp, sp->sl_head));
1680 /* NOTREACHED */
1681 }
1682
1683 /* Increment slab reference */
1684 sp->sl_refcnt++;
1685
1686 if (mclaudit != NULL) {
1687 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1688 mca->mca_uflags = 0;
1689 /* Save contents on mbuf objects only */
1690 if (class == MC_MBUF)
1691 mca->mca_uflags |= MB_SCVALID;
1692 }
1693
1694 if (class == MC_CL) {
1695 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1696 /*
1697 * A 2K cluster slab can have at most NCLPBG references.
1698 */
1699 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1700 sp->sl_chunks == NCLPBG &&
1701 sp->sl_len == m_maxsize(MC_BIGCL));
1702 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1703 } else if (class == MC_BIGCL) {
1704 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1705 m_infree(MC_MBUF_BIGCL);
1706 /*
1707 * A 4K cluster slab can have at most 1 reference.
1708 */
1709 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1710 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1711 } else if (class == MC_16KCL) {
1712 mcl_slab_t *nsp;
1713 int k;
1714
1715 --m_infree(MC_16KCL);
1716 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1717 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1718 /*
1719 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1720 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1721 * most 1 reference.
1722 */
1723 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1724 nsp = nsp->sl_next;
1725 /* Next slab must already be present */
1726 VERIFY(nsp != NULL);
1727 nsp->sl_refcnt++;
1728 VERIFY(!slab_is_detached(nsp));
1729 VERIFY(nsp->sl_class == MC_16KCL &&
1730 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1731 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1732 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1733 nsp->sl_head == NULL);
1734 }
1735 } else {
1736 VERIFY(class == MC_MBUF);
1737 --m_infree(MC_MBUF);
1738 /*
1739 * If auditing is turned on, this check is
1740 * deferred until later in mbuf_slab_audit().
1741 */
1742 if (mclaudit == NULL)
1743 _MCHECK((struct mbuf *)buf);
1744 /*
1745 * Since we have incremented the reference count above,
1746 * an mbuf slab (formerly a 4KB cluster slab that was cut
1747 * up into mbufs) must have a reference count between 1
1748 * and NMBPBG at this point.
1749 */
1750 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1751 sp->sl_chunks == NMBPBG &&
1752 sp->sl_len == m_maxsize(MC_BIGCL));
1753 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1754 }
1755
1756 /* If empty, remove this slab from the class's freelist */
1757 if (sp->sl_head == NULL) {
1758 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1759 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1760 slab_remove(sp, class);
1761 }
1762
1763 return (buf);
1764 }
1765
1766 /*
1767 * Place a slab of object(s) back into a class's slab list.
1768 */
1769 static void
1770 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1771 {
1772 mcl_slab_t *sp;
1773
1774 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1775
1776 VERIFY(class != MC_16KCL || njcl > 0);
1777 VERIFY(buf->obj_next == NULL);
1778 sp = slab_get(buf);
1779 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1780 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1781
1782 /* Decrement slab reference */
1783 sp->sl_refcnt--;
1784
1785 if (class == MC_CL) {
1786 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1787 /*
1788 * A slab that has been split into 2KB clusters can have
1789 * at most 1 outstanding reference at this point.
1790 */
1791 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1792 sp->sl_chunks == NCLPBG &&
1793 sp->sl_len == m_maxsize(MC_BIGCL));
1794 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1795 (slab_is_detached(sp) && sp->sl_head == NULL));
1796 } else if (class == MC_BIGCL) {
1797 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1798 /*
1799 * A 4KB cluster slab can have at most 1 reference
1800 * which must be 0 at this point.
1801 */
1802 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1803 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1804 VERIFY(slab_is_detached(sp));
1805 } else if (class == MC_16KCL) {
1806 mcl_slab_t *nsp;
1807 int k;
1808 /*
1809 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1810 * must now have a reference count of 0.
1811 */
1812 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1813 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1814 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1815 VERIFY(slab_is_detached(sp));
1816 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1817 nsp = nsp->sl_next;
1818 /* Next slab must already be present */
1819 VERIFY(nsp != NULL);
1820 nsp->sl_refcnt--;
1821 VERIFY(slab_is_detached(nsp));
1822 VERIFY(nsp->sl_class == MC_16KCL &&
1823 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1824 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1825 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1826 nsp->sl_head == NULL);
1827 }
1828 } else {
1829 /*
1830 * A slab that has been split into mbufs can have at most NMBPBG
1831 * references. Since we have decremented one reference above,
1832 * the count must now be between 0 and NMBPBG-1.
1833 */
1834 VERIFY(class == MC_MBUF);
1835 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1836 sp->sl_chunks == NMBPBG &&
1837 sp->sl_len == m_maxsize(MC_BIGCL));
1838 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1839 (slab_is_detached(sp) && sp->sl_head == NULL));
1840 }
1841
1842 /*
1843 * When auditing is enabled, ensure that the buffer still
1844 * contains the free pattern; otherwise it was corrupted
1845 * while sitting at the CPU cache layer.
1846 */
1847 if (mclaudit != NULL) {
1848 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1849 if (mclverify) {
1850 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1851 }
1852 mca->mca_uflags &= ~MB_SCVALID;
1853 }
1854
1855 if (class == MC_CL) {
1856 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1857 buf->obj_next = sp->sl_head;
1858 } else if (class == MC_BIGCL) {
1859 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1860 m_infree(MC_MBUF_BIGCL);
1861 } else if (class == MC_16KCL) {
1862 ++m_infree(MC_16KCL);
1863 } else {
1864 ++m_infree(MC_MBUF);
1865 buf->obj_next = sp->sl_head;
1866 }
1867 sp->sl_head = buf;
1868
1869 /*
1870 * If a slab has been split into either one which holds 2KB
1871 * clusters or one which holds mbufs, turn it back into one which
1872 * holds a single 4KB cluster.
1873 */
1874 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1875 m_total(class) > m_minlimit(class) &&
1876 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1877 int i = NMBPBG;
1878
1879 m_total(MC_BIGCL)++;
1880 mbstat.m_bigclusters = m_total(MC_BIGCL);
1881 m_total(MC_MBUF) -= NMBPBG;
1882 mbstat.m_mbufs = m_total(MC_MBUF);
1883 m_infree(MC_MBUF) -= NMBPBG;
1884 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1885
1886 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1887 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1888
1889 while (i--) {
1890 struct mbuf *m = sp->sl_head;
1891 VERIFY(m != NULL);
1892 sp->sl_head = m->m_next;
1893 m->m_next = NULL;
1894 }
1895 VERIFY(sp->sl_head == NULL);
1896
1897 /* Remove the slab from the mbuf class's slab list */
1898 slab_remove(sp, class);
1899
1900 /* Reinitialize it as a 4KB cluster slab */
1901 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1902 sp->sl_len, 0, 1);
1903
1904 if (mclverify) {
1905 mcache_set_pattern(MCACHE_FREE_PATTERN,
1906 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1907 }
1908 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1909 m_infree(MC_MBUF_BIGCL);
1910
1911 VERIFY(slab_is_detached(sp));
1912 /* And finally switch class */
1913 class = MC_BIGCL;
1914 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1915 m_total(class) > m_minlimit(class) &&
1916 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1917 int i = NCLPBG;
1918
1919 m_total(MC_BIGCL)++;
1920 mbstat.m_bigclusters = m_total(MC_BIGCL);
1921 m_total(MC_CL) -= NCLPBG;
1922 mbstat.m_clusters = m_total(MC_CL);
1923 m_infree(MC_CL) -= NCLPBG;
1924 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1925 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1926
1927 while (i--) {
1928 union mcluster *c = sp->sl_head;
1929 VERIFY(c != NULL);
1930 sp->sl_head = c->mcl_next;
1931 c->mcl_next = NULL;
1932 }
1933 VERIFY(sp->sl_head == NULL);
1934
1935 /* Remove the slab from the 2KB cluster class's slab list */
1936 slab_remove(sp, class);
1937
1938 /* Reinitialize it as a 4KB cluster slab */
1939 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1940 sp->sl_len, 0, 1);
1941
1942 if (mclverify) {
1943 mcache_set_pattern(MCACHE_FREE_PATTERN,
1944 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1945 }
1946 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1947 m_infree(MC_MBUF_BIGCL);
1948
1949 VERIFY(slab_is_detached(sp));
1950 /* And finally switch class */
1951 class = MC_BIGCL;
1952 }
1953
1954 /* Reinsert the slab to the class's slab list */
1955 if (slab_is_detached(sp))
1956 slab_insert(sp, class);
1957 }
1958
1959 /*
1960 * Common allocator for rudimentary objects called by the CPU cache layer
1961 * during an allocation request whenever there is no available element in the
1962 * bucket layer. It returns one or more elements from the appropriate global
1963 * freelist. If the freelist is empty, it will attempt to populate it and
1964 * retry the allocation.
1965 */
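/*
 * Sketch of the contract (illustrative numbers): a request for
 * num == 4 that can only produce 3 objects returns 3.  The objects
 * are chained via obj_next, *plist ends up pointing at the obj_next
 * field of the last object returned, and m_alloc_cnt(class) is
 * bumped by 3.
 */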
1966 static unsigned int
1967 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1968 {
1969 mbuf_class_t class = (mbuf_class_t)arg;
1970 unsigned int need = num;
1971 mcache_obj_t **list = *plist;
1972
1973 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1974 ASSERT(need > 0);
1975
1976 lck_mtx_lock(mbuf_mlock);
1977
1978 for (;;) {
1979 if ((*list = slab_alloc(class, wait)) != NULL) {
1980 (*list)->obj_next = NULL;
1981 list = *plist = &(*list)->obj_next;
1982
1983 if (--need == 0) {
1984 /*
1985 * If the number of elements in the freelist has
1986 * dropped below the low watermark, asynchronously
1987 * populate the freelist now rather than doing
1988 * it later when we run out of elements.
1989 */
1990 if (!mbuf_cached_above(class, wait) &&
1991 m_infree(class) < m_total(class) >> 5) {
1992 (void) freelist_populate(class, 1,
1993 M_DONTWAIT);
1994 }
1995 break;
1996 }
1997 } else {
1998 VERIFY(m_infree(class) == 0 || class == MC_CL);
1999
2000 (void) freelist_populate(class, 1,
2001 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2002
2003 if (m_infree(class) > 0)
2004 continue;
2005
2006 /* Check if there's anything at the cache layer */
2007 if (mbuf_cached_above(class, wait))
2008 break;
2009
2010 /* watchdog checkpoint */
2011 mbuf_watchdog();
2012
2013 /* We have nothing and cannot block; give up */
2014 if (wait & MCR_NOSLEEP) {
2015 if (!(wait & MCR_TRYHARD)) {
2016 m_fail_cnt(class)++;
2017 mbstat.m_drops++;
2018 break;
2019 }
2020 }
2021
2022 /*
2023 * If the freelist is still empty and the caller is
2024 * willing to be blocked, sleep on the wait channel
2025 * until an element is available. Otherwise, if
2026 * MCR_TRYHARD is set, do our best to satisfy the
2027 * request without having to go to sleep.
2028 */
2029 if (mbuf_worker_ready &&
2030 mbuf_sleep(class, need, wait))
2031 break;
2032
2033 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2034 }
2035 }
2036
2037 m_alloc_cnt(class) += num - need;
2038 lck_mtx_unlock(mbuf_mlock);
2039
2040 return (num - need);
2041 }
2042
2043 /*
2044 * Common de-allocator for rudimentary objects called by the CPU cache
2045 * layer when one or more elements need to be returned to the appropriate
2046 * global freelist.
2047 */
2048 static void
2049 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2050 {
2051 mbuf_class_t class = (mbuf_class_t)arg;
2052 mcache_obj_t *nlist;
2053 unsigned int num = 0;
2054 int w;
2055
2056 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2057
2058 lck_mtx_lock(mbuf_mlock);
2059
2060 for (;;) {
2061 nlist = list->obj_next;
2062 list->obj_next = NULL;
2063 slab_free(class, list);
2064 ++num;
2065 if ((list = nlist) == NULL)
2066 break;
2067 }
2068 m_free_cnt(class) += num;
2069
2070 if ((w = mb_waiters) > 0)
2071 mb_waiters = 0;
2072
2073 lck_mtx_unlock(mbuf_mlock);
2074
2075 if (w != 0)
2076 wakeup(mb_waitchan);
2077 }
2078
2079 /*
2080 * Common auditor for rudimentary objects called by the CPU cache layer
2081 * during an allocation or free request. For the former, this is called
2082 * after the objects are obtained from either the bucket or slab layer
2083 * and before they are returned to the caller. For the latter, this is
2084 * called immediately during free and before placing the objects into
2085 * the bucket or slab layer.
2086 */
2087 static void
2088 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2089 {
2090 mbuf_class_t class = (mbuf_class_t)arg;
2091 mcache_audit_t *mca;
2092
2093 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2094
2095 while (list != NULL) {
2096 lck_mtx_lock(mbuf_mlock);
2097 mca = mcl_audit_buf2mca(class, list);
2098
2099 /* Do the sanity checks */
2100 if (class == MC_MBUF) {
2101 mcl_audit_mbuf(mca, list, FALSE, alloc);
2102 ASSERT(mca->mca_uflags & MB_SCVALID);
2103 } else {
2104 mcl_audit_cluster(mca, list, m_maxsize(class),
2105 alloc, TRUE);
2106 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2107 }
2108 /* Record this transaction */
2109 if (mcltrace)
2110 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2111
2112 if (alloc)
2113 mca->mca_uflags |= MB_INUSE;
2114 else
2115 mca->mca_uflags &= ~MB_INUSE;
2116 /* Unpair the object (unconditionally) */
2117 mca->mca_uptr = NULL;
2118 lck_mtx_unlock(mbuf_mlock);
2119
2120 list = list->obj_next;
2121 }
2122 }
2123
2124 /*
2125 * Common notify routine for all caches. It is called by mcache when
2126 * one or more objects get freed. We use this indication to trigger
2127 * the wakeup of any sleeping threads so that they can retry their
2128 * allocation requests.
2129 */
2130 static void
2131 mbuf_slab_notify(void *arg, u_int32_t reason)
2132 {
2133 mbuf_class_t class = (mbuf_class_t)arg;
2134 int w;
2135
2136 ASSERT(MBUF_CLASS_VALID(class));
2137
2138 if (reason != MCN_RETRYALLOC)
2139 return;
2140
2141 lck_mtx_lock(mbuf_mlock);
2142 if ((w = mb_waiters) > 0) {
2143 m_notified(class)++;
2144 mb_waiters = 0;
2145 }
2146 lck_mtx_unlock(mbuf_mlock);
2147
2148 if (w != 0)
2149 wakeup(mb_waitchan);
2150 }
2151
2152 /*
2153 * Obtain object(s) from the composite class's freelist.
2154 */
2155 static unsigned int
2156 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2157 {
2158 unsigned int need = num;
2159 mcl_slab_t *sp, *clsp, *nsp;
2160 struct mbuf *m;
2161 mcache_obj_t **list = *plist;
2162 void *cl;
2163
2164 VERIFY(need > 0);
2165 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2166 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2167
2168 /* Get what we can from the freelist */
2169 while ((*list = m_cobjlist(class)) != NULL) {
2170 MRANGE(*list);
2171
2172 m = (struct mbuf *)*list;
2173 sp = slab_get(m);
2174 cl = m->m_ext.ext_buf;
2175 clsp = slab_get(cl);
2176 VERIFY(m->m_flags == M_EXT && cl != NULL);
2177 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2178
2179 if (class == MC_MBUF_CL) {
2180 VERIFY(clsp->sl_refcnt >= 1 &&
2181 clsp->sl_refcnt <= NCLPBG);
2182 } else {
2183 VERIFY(clsp->sl_refcnt == 1);
2184 }
2185
2186 if (class == MC_MBUF_16KCL) {
2187 int k;
2188 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2189 nsp = nsp->sl_next;
2190 /* Next slab must already be present */
2191 VERIFY(nsp != NULL);
2192 VERIFY(nsp->sl_refcnt == 1);
2193 }
2194 }
2195
2196 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2197 !MBUF_IN_MAP(m_cobjlist(class))) {
2198 slab_nextptr_panic(sp, m_cobjlist(class));
2199 /* NOTREACHED */
2200 }
2201 (*list)->obj_next = NULL;
2202 list = *plist = &(*list)->obj_next;
2203
2204 if (--need == 0)
2205 break;
2206 }
2207 m_infree(class) -= (num - need);
2208
2209 return (num - need);
2210 }
2211
2212 /*
2213 * Place object(s) back into a composite class's freelist.
2214 */
2215 static unsigned int
2216 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2217 {
2218 mcache_obj_t *o, *tail;
2219 unsigned int num = 0;
2220 struct mbuf *m, *ms;
2221 mcache_audit_t *mca = NULL;
2222 mcache_obj_t *ref_list = NULL;
2223 mcl_slab_t *clsp, *nsp;
2224 void *cl;
2225 mbuf_class_t cl_class;
2226
2227 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2228 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2229 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2230
2231 if (class == MC_MBUF_CL) {
2232 cl_class = MC_CL;
2233 } else if (class == MC_MBUF_BIGCL) {
2234 cl_class = MC_BIGCL;
2235 } else {
2236 VERIFY(class == MC_MBUF_16KCL);
2237 cl_class = MC_16KCL;
2238 }
2239
2240 o = tail = list;
2241
2242 while ((m = ms = (struct mbuf *)o) != NULL) {
2243 mcache_obj_t *rfa, *nexto = o->obj_next;
2244
2245 /* Do the mbuf sanity checks */
2246 if (mclaudit != NULL) {
2247 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2248 if (mclverify) {
2249 mcache_audit_free_verify(mca, m, 0,
2250 m_maxsize(MC_MBUF));
2251 }
2252 ms = MCA_SAVED_MBUF_PTR(mca);
2253 }
2254
2255 /* Do the cluster sanity checks */
2256 cl = ms->m_ext.ext_buf;
2257 clsp = slab_get(cl);
2258 if (mclverify) {
2259 size_t size = m_maxsize(cl_class);
2260 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2261 (mcache_obj_t *)cl), cl, 0, size);
2262 }
2263 VERIFY(ms->m_type == MT_FREE);
2264 VERIFY(ms->m_flags == M_EXT);
2265 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2266 if (cl_class == MC_CL) {
2267 VERIFY(clsp->sl_refcnt >= 1 &&
2268 clsp->sl_refcnt <= NCLPBG);
2269 } else {
2270 VERIFY(clsp->sl_refcnt == 1);
2271 }
2272 if (cl_class == MC_16KCL) {
2273 int k;
2274 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2275 nsp = nsp->sl_next;
2276 /* Next slab must already be present */
2277 VERIFY(nsp != NULL);
2278 VERIFY(nsp->sl_refcnt == 1);
2279 }
2280 }
2281
2282 /*
2283 * If we're asked to purge, restore the actual mbuf using the
2284 * contents of the shadow structure (if auditing is enabled)
2285 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2286 * about to free it and the attached cluster into their caches.
2287 */
2288 if (purged) {
2289 /* Restore constructed mbuf fields */
2290 if (mclaudit != NULL)
2291 mcl_audit_restore_mbuf(m, mca, TRUE);
2292
2293 MEXT_REF(m) = 0;
2294 MEXT_FLAGS(m) = 0;
2295
2296 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2297 rfa->obj_next = ref_list;
2298 ref_list = rfa;
2299 MEXT_RFA(m) = NULL;
2300
2301 m->m_type = MT_FREE;
2302 m->m_flags = m->m_len = 0;
2303 m->m_next = m->m_nextpkt = NULL;
2304
2305 /* Save mbuf fields and make auditing happy */
2306 if (mclaudit != NULL)
2307 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2308
2309 VERIFY(m_total(class) > 0);
2310 m_total(class)--;
2311
2312 /* Free the mbuf */
2313 o->obj_next = NULL;
2314 slab_free(MC_MBUF, o);
2315
2316 /* And free the cluster */
2317 ((mcache_obj_t *)cl)->obj_next = NULL;
2318 if (class == MC_MBUF_CL)
2319 slab_free(MC_CL, cl);
2320 else if (class == MC_MBUF_BIGCL)
2321 slab_free(MC_BIGCL, cl);
2322 else
2323 slab_free(MC_16KCL, cl);
2324 }
2325
2326 ++num;
2327 tail = o;
2328 o = nexto;
2329 }
2330
2331 if (!purged) {
2332 tail->obj_next = m_cobjlist(class);
2333 m_cobjlist(class) = list;
2334 m_infree(class) += num;
2335 } else if (ref_list != NULL) {
2336 mcache_free_ext(ref_cache, ref_list);
2337 }
2338
2339 return (num);
2340 }
2341
2342 /*
2343 * Common allocator for composite objects called by the CPU cache layer
2344 * during an allocation request whenever there is no available element in
2345 * the bucket layer. It returns one or more composite elements from the
2346 * appropriate global freelist. If the freelist is empty, it will attempt
2347 * to obtain the rudimentary objects from their caches and construct them
2348 * into composite mbuf + cluster objects.
2349 */
2350 static unsigned int
2351 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2352 int wait)
2353 {
2354 mbuf_class_t class = (mbuf_class_t)arg;
2355 mbuf_class_t cl_class = 0;
2356 unsigned int num = 0, cnum = 0, want = needed;
2357 mcache_obj_t *ref_list = NULL;
2358 mcache_obj_t *mp_list = NULL;
2359 mcache_obj_t *clp_list = NULL;
2360 mcache_obj_t **list;
2361 struct ext_ref *rfa;
2362 struct mbuf *m;
2363 void *cl;
2364
2365 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2366 ASSERT(needed > 0);
2367
2368 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2369
2370 /* There should not be any slab for this class */
2371 VERIFY(m_slab_cnt(class) == 0 &&
2372 m_slablist(class).tqh_first == NULL &&
2373 m_slablist(class).tqh_last == NULL);
2374
2375 lck_mtx_lock(mbuf_mlock);
2376
2377 /* Try using the freelist first */
2378 num = cslab_alloc(class, plist, needed);
2379 list = *plist;
2380 if (num == needed) {
2381 m_alloc_cnt(class) += num;
2382 lck_mtx_unlock(mbuf_mlock);
2383 return (needed);
2384 }
2385
2386 lck_mtx_unlock(mbuf_mlock);
2387
2388 /*
2389 * We could not satisfy the request using the freelist alone;
2390 * allocate from the appropriate rudimentary caches and use
2391 * whatever we can get to construct the composite objects.
2392 */
2393 needed -= num;
2394
2395 /*
2396 * Mark these allocation requests as coming from a composite cache.
2397 * Also, if the caller is willing to be blocked, mark the request
2398 * with MCR_FAILOK so that we don't end up sleeping at the mbuf
2399 * slab layer waiting for individual objects when one or more
2400 * already-constructed composite objects are available.
2401 */
2402 wait |= MCR_COMP;
2403 if (!(wait & MCR_NOSLEEP))
2404 wait |= MCR_FAILOK;
2405
2406 /* allocate mbufs */
2407 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2408 if (needed == 0) {
2409 ASSERT(mp_list == NULL);
2410 goto fail;
2411 }
2412
2413 /* allocate clusters */
2414 if (class == MC_MBUF_CL) {
2415 cl_class = MC_CL;
2416 } else if (class == MC_MBUF_BIGCL) {
2417 cl_class = MC_BIGCL;
2418 } else {
2419 VERIFY(class == MC_MBUF_16KCL);
2420 cl_class = MC_16KCL;
2421 }
2422 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2423 if (needed == 0) {
2424 ASSERT(clp_list == NULL);
2425 goto fail;
2426 }
2427
2428 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2429 if (needed == 0) {
2430 ASSERT(ref_list == NULL);
2431 goto fail;
2432 }
2433
2434 /*
2435 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2436 * leftovers will get freed accordingly before we return to the caller.
2437 */
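/*
 * Illustrative numbers: if the mcache_alloc_ext() calls above yield
 * 8 mbufs, then 6 clusters (requested 8), then 6 ref structures
 * (requested 6), "needed" ends up as 6; the 2 surplus mbufs left on
 * mp_list are returned to their cache at the "fail" label below.
 */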
2438 for (cnum = 0; cnum < needed; cnum++) {
2439 struct mbuf *ms;
2440
2441 m = ms = (struct mbuf *)mp_list;
2442 mp_list = mp_list->obj_next;
2443
2444 cl = clp_list;
2445 clp_list = clp_list->obj_next;
2446 ((mcache_obj_t *)cl)->obj_next = NULL;
2447
2448 rfa = (struct ext_ref *)ref_list;
2449 ref_list = ref_list->obj_next;
2450 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2451
2452 /*
2453 * If auditing is enabled, construct the shadow mbuf
2454 * in the audit structure instead of in the actual one.
2455 * mbuf_cslab_audit() will take care of restoring the
2456 * contents after the integrity check.
2457 */
2458 if (mclaudit != NULL) {
2459 mcache_audit_t *mca, *cl_mca;
2460
2461 lck_mtx_lock(mbuf_mlock);
2462 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2463 ms = MCA_SAVED_MBUF_PTR(mca);
2464 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2465
2466 /*
2467 * Pair them up. Note that this is done at the time
2468 * the mbuf+cluster objects are constructed. This
2469 * information should be treated as "best effort"
2470 * debugging hint since more than one mbufs can refer
2471 * to a cluster. In that case, the cluster might not
2472 * be freed along with the mbuf it was paired with.
2473 */
2474 mca->mca_uptr = cl_mca;
2475 cl_mca->mca_uptr = mca;
2476
2477 ASSERT(mca->mca_uflags & MB_SCVALID);
2478 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2479 lck_mtx_unlock(mbuf_mlock);
2480
2481 /* Technically, they are in the freelist */
2482 if (mclverify) {
2483 size_t size;
2484
2485 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2486 m_maxsize(MC_MBUF));
2487
2488 if (class == MC_MBUF_CL)
2489 size = m_maxsize(MC_CL);
2490 else if (class == MC_MBUF_BIGCL)
2491 size = m_maxsize(MC_BIGCL);
2492 else
2493 size = m_maxsize(MC_16KCL);
2494
2495 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2496 size);
2497 }
2498 }
2499
2500 MBUF_INIT(ms, 0, MT_FREE);
2501 if (class == MC_MBUF_16KCL) {
2502 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2503 } else if (class == MC_MBUF_BIGCL) {
2504 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2505 } else {
2506 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2507 }
2508 VERIFY(ms->m_flags == M_EXT);
2509 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2510
2511 *list = (mcache_obj_t *)m;
2512 (*list)->obj_next = NULL;
2513 list = *plist = &(*list)->obj_next;
2514 }
2515
2516 fail:
2517 /*
2518 * Free up what's left of the above.
2519 */
2520 if (mp_list != NULL)
2521 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2522 if (clp_list != NULL)
2523 mcache_free_ext(m_cache(cl_class), clp_list);
2524 if (ref_list != NULL)
2525 mcache_free_ext(ref_cache, ref_list);
2526
2527 lck_mtx_lock(mbuf_mlock);
2528 if (num > 0 || cnum > 0) {
2529 m_total(class) += cnum;
2530 VERIFY(m_total(class) <= m_maxlimit(class));
2531 m_alloc_cnt(class) += num + cnum;
2532 }
2533 if ((num + cnum) < want)
2534 m_fail_cnt(class) += (want - (num + cnum));
2535 lck_mtx_unlock(mbuf_mlock);
2536
2537 return (num + cnum);
2538 }
2539
2540 /*
2541 * Common de-allocator for composite objects called by the CPU cache
2542 * layer when one or more elements need to be returned to the appropriate
2543 * global freelist.
2544 */
2545 static void
2546 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2547 {
2548 mbuf_class_t class = (mbuf_class_t)arg;
2549 unsigned int num;
2550 int w;
2551
2552 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2553
2554 lck_mtx_lock(mbuf_mlock);
2555
2556 num = cslab_free(class, list, purged);
2557 m_free_cnt(class) += num;
2558
2559 if ((w = mb_waiters) > 0)
2560 mb_waiters = 0;
2561
2562 lck_mtx_unlock(mbuf_mlock);
2563
2564 if (w != 0)
2565 wakeup(mb_waitchan);
2566 }
2567
2568 /*
2569 * Common auditor for composite objects called by the CPU cache layer
2570 * during an allocation or free request. For the former, this is called
2571 * after the objects are obtained from either the bucket or slab layer
2572 * and before they are returned to the caller. For the latter, this is
2573 * called immediately during free and before placing the objects into
2574 * the bucket or slab layer.
2575 */
2576 static void
2577 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2578 {
2579 mbuf_class_t class = (mbuf_class_t)arg;
2580 mcache_audit_t *mca;
2581 struct mbuf *m, *ms;
2582 mcl_slab_t *clsp, *nsp;
2583 size_t size;
2584 void *cl;
2585
2586 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2587
2588 while ((m = ms = (struct mbuf *)list) != NULL) {
2589 lck_mtx_lock(mbuf_mlock);
2590 /* Do the mbuf sanity checks and record its transaction */
2591 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2592 mcl_audit_mbuf(mca, m, TRUE, alloc);
2593 if (mcltrace)
2594 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2595
2596 if (alloc)
2597 mca->mca_uflags |= MB_COMP_INUSE;
2598 else
2599 mca->mca_uflags &= ~MB_COMP_INUSE;
2600
2601 /*
2602 * Use the shadow mbuf in the audit structure if we are
2603 * freeing, since the contents of the actual mbuf have been
2604 * pattern-filled by the above call to mcl_audit_mbuf().
2605 */
2606 if (!alloc && mclverify)
2607 ms = MCA_SAVED_MBUF_PTR(mca);
2608
2609 /* Do the cluster sanity checks and record its transaction */
2610 cl = ms->m_ext.ext_buf;
2611 clsp = slab_get(cl);
2612 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2613 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2614 if (class == MC_MBUF_CL)
2615 VERIFY(clsp->sl_refcnt >= 1 &&
2616 clsp->sl_refcnt <= NCLPBG);
2617 else
2618 VERIFY(clsp->sl_refcnt == 1);
2619
2620 if (class == MC_MBUF_16KCL) {
2621 int k;
2622 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2623 nsp = nsp->sl_next;
2624 /* Next slab must already be present */
2625 VERIFY(nsp != NULL);
2626 VERIFY(nsp->sl_refcnt == 1);
2627 }
2628 }
2629
2630 mca = mcl_audit_buf2mca(MC_CL, cl);
2631 if (class == MC_MBUF_CL)
2632 size = m_maxsize(MC_CL);
2633 else if (class == MC_MBUF_BIGCL)
2634 size = m_maxsize(MC_BIGCL);
2635 else
2636 size = m_maxsize(MC_16KCL);
2637 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2638 if (mcltrace)
2639 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2640
2641 if (alloc)
2642 mca->mca_uflags |= MB_COMP_INUSE;
2643 else
2644 mca->mca_uflags &= ~MB_COMP_INUSE;
2645 lck_mtx_unlock(mbuf_mlock);
2646
2647 list = list->obj_next;
2648 }
2649 }
2650
2651 /*
2652 * Allocate some number of mbuf clusters and place on cluster freelist.
2653 */
2654 static int
2655 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2656 {
2657 int i;
2658 vm_size_t size = 0;
2659 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2660 vm_offset_t page = 0;
2661 mcache_audit_t *mca_list = NULL;
2662 mcache_obj_t *con_list = NULL;
2663 mcl_slab_t *sp;
2664
2665 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2666 bufsize == m_maxsize(MC_16KCL));
2667
2668 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2669
2670 /*
2671 * Multiple threads may attempt to populate the cluster map one
2672 * after another. Since we drop the lock below prior to acquiring
2673 * the physical page(s), our view of the cluster map may no longer
2674 * be accurate, and we could end up over-committing the pages beyond
2675 * the maximum allowed for each class. To prevent this, the entire
2676 * operation (including the page mapping) is serialized.
2677 */
2678 while (mb_clalloc_busy) {
2679 mb_clalloc_waiters++;
2680 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2681 (PZERO-1), "m_clalloc", NULL);
2682 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2683 }
2684
2685 /* We are busy now; tell everyone else to go away */
2686 mb_clalloc_busy = TRUE;
2687
2688 /*
2689 * Honor the caller's wish to block or not block. We have a way
2690 * to grow the pool asynchronously using the mbuf worker thread.
2691 */
2692 i = m_howmany(num, bufsize);
2693 if (i == 0 || (wait & M_DONTWAIT))
2694 goto out;
2695
2696 lck_mtx_unlock(mbuf_mlock);
2697
2698 size = round_page(i * bufsize);
2699 page = kmem_mb_alloc(mb_map, size, large_buffer);
2700
2701 /*
2702 * If we did ask for "n" 16KB physically contiguous chunks
2703 * and didn't get them, then try again without this
2704 * restriction.
2705 */
2706 if (large_buffer && page == 0)
2707 page = kmem_mb_alloc(mb_map, size, 0);
2708
2709 if (page == 0) {
2710 if (bufsize == m_maxsize(MC_BIGCL)) {
2711 /* The allocation failed; for a 4KB request, fall back to a single page */
2712 size = NBPG;
2713 page = kmem_mb_alloc(mb_map, size, 0);
2714 }
2715
2716 if (page == 0) {
2717 lck_mtx_lock(mbuf_mlock);
2718 goto out;
2719 }
2720 }
2721
2722 VERIFY(IS_P2ALIGNED(page, NBPG));
2723 numpages = size / NBPG;
2724
2725 /* If auditing is enabled, allocate the audit structures now */
2726 if (mclaudit != NULL) {
2727 int needed;
2728
2729 /*
2730 * Yes, I realize this is a waste of memory for clusters
2731 * that never get transformed into mbufs, as we may end
2732 * up with NMBPBG-1 unused audit structures per cluster.
2733 * But doing so tremendously simplifies the allocation
2734 * strategy, since at this point we are not holding the
2735 * mbuf lock and the caller is okay to be blocked.
2736 */
2737 if (bufsize == m_maxsize(MC_BIGCL)) {
2738 needed = numpages * NMBPBG;
2739
2740 i = mcache_alloc_ext(mcl_audit_con_cache,
2741 &con_list, needed, MCR_SLEEP);
2742
2743 VERIFY(con_list != NULL && i == needed);
2744 } else {
2745 needed = numpages / NSLABSP16KB;
2746 }
2747
2748 i = mcache_alloc_ext(mcache_audit_cache,
2749 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2750
2751 VERIFY(mca_list != NULL && i == needed);
2752 }
2753
2754 lck_mtx_lock(mbuf_mlock);
2755
2756 for (i = 0; i < numpages; i++, page += NBPG) {
2757 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2758 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2759
2760 /*
2761 * If there is a mapper, the appropriate I/O page is returned;
2762 * zero out the page first to discard its past contents and
2763 * avoid exposing leftover kernel memory.
2764 */
2765 VERIFY(offset < mcl_pages);
2766 if (mcl_paddr_base != 0) {
2767 bzero((void *)(uintptr_t) page, page_size);
2768 new_page = IOMapperInsertPage(mcl_paddr_base,
2769 offset, new_page);
2770 }
2771 mcl_paddr[offset] = new_page;
2772
2773 /* Pattern-fill this fresh page */
2774 if (mclverify) {
2775 mcache_set_pattern(MCACHE_FREE_PATTERN,
2776 (caddr_t)page, NBPG);
2777 }
2778 if (bufsize == m_maxsize(MC_BIGCL)) {
2779 union mbigcluster *mbc = (union mbigcluster *)page;
2780
2781 /* One for the entire page */
2782 sp = slab_get(mbc);
2783 if (mclaudit != NULL) {
2784 mcl_audit_init(mbc, &mca_list, &con_list,
2785 AUDIT_CONTENTS_SIZE, NMBPBG);
2786 }
2787 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2788 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2789 mbc, mbc, bufsize, 0, 1);
2790
2791 /* Insert this slab */
2792 slab_insert(sp, MC_BIGCL);
2793
2794 /* Update stats now since slab_get() drops the lock */
2795 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2796 m_infree(MC_MBUF_BIGCL);
2797 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2798 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2799 } else if ((i % NSLABSP16KB) == 0) {
2800 union m16kcluster *m16kcl = (union m16kcluster *)page;
2801 mcl_slab_t *nsp;
2802 int k;
2803
2804 VERIFY(njcl > 0);
2805 /* One for the entire 16KB */
2806 sp = slab_get(m16kcl);
2807 if (mclaudit != NULL)
2808 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2809
2810 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2811 slab_init(sp, MC_16KCL, SLF_MAPPED,
2812 m16kcl, m16kcl, bufsize, 0, 1);
2813
2814 /*
2815 * 2nd-Nth page's slab is part of the first one,
2816 * where N is NSLABSP16KB.
2817 */
2818 for (k = 1; k < NSLABSP16KB; k++) {
2819 nsp = slab_get(((union mbigcluster *)page) + k);
2820 VERIFY(nsp->sl_refcnt == 0 &&
2821 nsp->sl_flags == 0);
2822 slab_init(nsp, MC_16KCL,
2823 SLF_MAPPED | SLF_PARTIAL,
2824 m16kcl, NULL, 0, 0, 0);
2825 }
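/*
 * For example, with 4KB pages (assumed) NSLABSP16KB == 4: one 16KB
 * cluster owns 4 consecutive slab entries, where only the first
 * carries the buffer and its length, and the remaining 3 are marked
 * SLF_MAPPED | SLF_PARTIAL with a zero length.
 */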
2826
2827 /* Insert this slab */
2828 slab_insert(sp, MC_16KCL);
2829
2830 /* Update stats now since slab_get() drops the lock */
2831 m_infree(MC_16KCL)++;
2832 m_total(MC_16KCL)++;
2833 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2834 }
2835 }
2836 VERIFY(mca_list == NULL && con_list == NULL);
2837
2838 /* We're done; let others enter */
2839 mb_clalloc_busy = FALSE;
2840 if (mb_clalloc_waiters > 0) {
2841 mb_clalloc_waiters = 0;
2842 wakeup(mb_clalloc_waitchan);
2843 }
2844
2845 if (bufsize == m_maxsize(MC_BIGCL))
2846 return (numpages);
2847
2848 VERIFY(bufsize == m_maxsize(MC_16KCL));
2849 return (numpages / NSLABSP16KB);
2850
2851 out:
2852 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2853
2854 /* We're done; let others enter */
2855 mb_clalloc_busy = FALSE;
2856 if (mb_clalloc_waiters > 0) {
2857 mb_clalloc_waiters = 0;
2858 wakeup(mb_clalloc_waitchan);
2859 }
2860
2861 /*
2862 * When non-blocking, we kick the worker thread if we have to grow the
2863 * pool or if the number of free clusters is less than requested.
2864 */
2865 if (bufsize == m_maxsize(MC_BIGCL)) {
2866 if (i > 0) {
2867 /*
2868 * Remember total number of 4KB clusters needed
2869 * at this time.
2870 */
2871 i += m_total(MC_BIGCL);
2872 if (i > mbuf_expand_big) {
2873 mbuf_expand_big = i;
2874 if (mbuf_worker_ready)
2875 wakeup((caddr_t)&mbuf_worker_run);
2876 }
2877 }
2878
2879 if (m_infree(MC_BIGCL) >= num)
2880 return (1);
2881 } else {
2882 if (i > 0) {
2883 /*
2884 * Remember total number of 16KB clusters needed
2885 * at this time.
2886 */
2887 i += m_total(MC_16KCL);
2888 if (i > mbuf_expand_16k) {
2889 mbuf_expand_16k = i;
2890 if (mbuf_worker_ready)
2891 wakeup((caddr_t)&mbuf_worker_run);
2892 }
2893 }
2894
2895 if (m_infree(MC_16KCL) >= num)
2896 return (1);
2897 }
2898 return (0);
2899 }
2900
2901 /*
2902 * Populate the global freelist of the corresponding buffer class.
2903 */
2904 static int
2905 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2906 {
2907 mcache_obj_t *o = NULL;
2908 int i, numpages = 0, count;
2909
2910 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2911 class == MC_16KCL);
2912
2913 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2914
2915 switch (class) {
2916 case MC_MBUF:
2917 case MC_CL:
2918 case MC_BIGCL:
2919 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2920 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2921
2922 /* Respect the 4KB clusters minimum limit */
2923 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2924 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2925 if (class != MC_BIGCL || (wait & MCR_COMP))
2926 return (0);
2927 }
2928 if (class == MC_BIGCL)
2929 return (i != 0);
2930 break;
2931
2932 case MC_16KCL:
2933 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2934 /* NOTREACHED */
2935
2936 default:
2937 VERIFY(0);
2938 /* NOTREACHED */
2939 }
2940
2941 VERIFY(class == MC_MBUF || class == MC_CL);
2942
2943 /* How many objects will we cut the page into? */
2944 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
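/*
 * With the common configuration (assumed here) of 256-byte mbufs,
 * 2KB clusters and 4KB pages, NMBPBG == 16 and NCLPBG == 2: each 4KB
 * slab taken from MC_BIGCL below is carved into 16 mbufs or 2
 * clusters, and m_total()/m_infree() grow by that amount per page.
 */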
2945
2946 for (count = 0; count < numpages; count++) {
2947
2948 /* respect totals, minlimit, maxlimit */
2949 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2950 m_total(class) >= m_maxlimit(class))
2951 break;
2952
2953 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2954 break;
2955
2956 struct mbuf *m = (struct mbuf *)o;
2957 union mcluster *c = (union mcluster *)o;
2958 mcl_slab_t *sp = slab_get(o);
2959 mcache_audit_t *mca = NULL;
2960
2961 VERIFY(slab_is_detached(sp) &&
2962 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2963
2964 /*
2965 * Make sure that the cluster is unmolested
2966 * while in the freelist
2967 */
2968 if (mclverify) {
2969 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2970 mcache_audit_free_verify(mca, o, 0,
2971 m_maxsize(MC_BIGCL));
2972 }
2973
2974 /* Reinitialize it as an mbuf or 2K slab */
2975 slab_init(sp, class, sp->sl_flags,
2976 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2977
2978 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2979 VERIFY(sp->sl_head == NULL);
2980
2981 VERIFY(m_total(MC_BIGCL) > 0);
2982 m_total(MC_BIGCL)--;
2983 mbstat.m_bigclusters = m_total(MC_BIGCL);
2984
2985 m_total(class) += numobj;
2986 m_infree(class) += numobj;
2987
2988 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2989 VERIFY(m_total(class) <= m_maxlimit(class));
2990
2991 i = numobj;
2992 if (class == MC_MBUF) {
2993 mbstat.m_mbufs = m_total(MC_MBUF);
2994 mtype_stat_add(MT_FREE, NMBPBG);
2995 while (i--) {
2996 /*
2997 * If auditing is enabled, construct the
2998 * shadow mbuf in the audit structure
2999 * instead of the actual one.
3000 * mbuf_slab_audit() will take care of
3001 * restoring the contents after the
3002 * integrity check.
3003 */
3004 if (mclaudit != NULL) {
3005 struct mbuf *ms;
3006 mca = mcl_audit_buf2mca(MC_MBUF,
3007 (mcache_obj_t *)m);
3008 ms = MCA_SAVED_MBUF_PTR(mca);
3009 ms->m_type = MT_FREE;
3010 } else {
3011 m->m_type = MT_FREE;
3012 }
3013 m->m_next = sp->sl_head;
3014 sp->sl_head = (void *)m++;
3015 }
3016 } else { /* MC_CL */
3017 mbstat.m_clfree =
3018 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3019 mbstat.m_clusters = m_total(MC_CL);
3020 while (i--) {
3021 c->mcl_next = sp->sl_head;
3022 sp->sl_head = (void *)c++;
3023 }
3024 }
3025
3026 /* Insert into the mbuf or 2k slab list */
3027 slab_insert(sp, class);
3028
3029 if ((i = mb_waiters) > 0)
3030 mb_waiters = 0;
3031 if (i != 0)
3032 wakeup(mb_waitchan);
3033 }
3034 return (count != 0);
3035 }
3036
3037 /*
3038 * For each class, initialize the freelist to hold m_minlimit() objects.
3039 */
3040 static void
3041 freelist_init(mbuf_class_t class)
3042 {
3043 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3044
3045 VERIFY(class == MC_CL || class == MC_BIGCL);
3046 VERIFY(m_total(class) == 0);
3047 VERIFY(m_minlimit(class) > 0);
3048
3049 while (m_total(class) < m_minlimit(class))
3050 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3051
3052 VERIFY(m_total(class) >= m_minlimit(class));
3053 }
3054
3055 /*
3056 * (Inaccurately) check if it might be worth a trip back to the
3057 * mcache layer due to the availability of objects there. We'll
3058 * end up back here if there's nothing up there.
3059 */
3060 static boolean_t
3061 mbuf_cached_above(mbuf_class_t class, int wait)
3062 {
3063 switch (class) {
3064 case MC_MBUF:
3065 if (wait & MCR_COMP)
3066 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3067 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3068 break;
3069
3070 case MC_CL:
3071 if (wait & MCR_COMP)
3072 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3073 break;
3074
3075 case MC_BIGCL:
3076 if (wait & MCR_COMP)
3077 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3078 break;
3079
3080 case MC_16KCL:
3081 if (wait & MCR_COMP)
3082 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3083 break;
3084
3085 case MC_MBUF_CL:
3086 case MC_MBUF_BIGCL:
3087 case MC_MBUF_16KCL:
3088 break;
3089
3090 default:
3091 VERIFY(0);
3092 /* NOTREACHED */
3093 }
3094
3095 return (!mcache_bkt_isempty(m_cache(class)));
3096 }
3097
3098 /*
3099 * If possible, convert constructed objects to raw ones.
3100 */
3101 static boolean_t
3102 mbuf_steal(mbuf_class_t class, unsigned int num)
3103 {
3104 mcache_obj_t *top = NULL;
3105 mcache_obj_t **list = &top;
3106 unsigned int tot = 0;
3107
3108 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3109
3110 switch (class) {
3111 case MC_MBUF:
3112 case MC_CL:
3113 case MC_BIGCL:
3114 case MC_16KCL:
3115 return (FALSE);
3116
3117 case MC_MBUF_CL:
3118 case MC_MBUF_BIGCL:
3119 case MC_MBUF_16KCL:
3120 /* Get the required number of constructed objects if possible */
3121 if (m_infree(class) > m_minlimit(class)) {
3122 tot = cslab_alloc(class, &list,
3123 MIN(num, m_infree(class)));
3124 }
3125
3126 /* And destroy them to get back the raw objects */
3127 if (top != NULL)
3128 (void) cslab_free(class, top, 1);
3129 break;
3130
3131 default:
3132 VERIFY(0);
3133 /* NOTREACHED */
3134 }
3135
3136 return (tot == num);
3137 }
3138
3139 static void
3140 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3141 {
3142 int m, bmap = 0;
3143
3144 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3145
3146 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3147 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3148 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3149
3150 /*
3151 * This logic can be made smarter; for now, simply mark
3152 * all other related classes as potential victims.
3153 */
3154 switch (class) {
3155 case MC_MBUF:
3156 m_wantpurge(MC_CL)++;
3157 m_wantpurge(MC_BIGCL)++;
3158 m_wantpurge(MC_MBUF_CL)++;
3159 m_wantpurge(MC_MBUF_BIGCL)++;
3160 break;
3161
3162 case MC_CL:
3163 m_wantpurge(MC_MBUF)++;
3164 m_wantpurge(MC_BIGCL)++;
3165 m_wantpurge(MC_MBUF_BIGCL)++;
3166 if (!comp)
3167 m_wantpurge(MC_MBUF_CL)++;
3168 break;
3169
3170 case MC_BIGCL:
3171 m_wantpurge(MC_MBUF)++;
3172 m_wantpurge(MC_CL)++;
3173 m_wantpurge(MC_MBUF_CL)++;
3174 if (!comp)
3175 m_wantpurge(MC_MBUF_BIGCL)++;
3176 break;
3177
3178 case MC_16KCL:
3179 if (!comp)
3180 m_wantpurge(MC_MBUF_16KCL)++;
3181 break;
3182
3183 default:
3184 VERIFY(0);
3185 /* NOTREACHED */
3186 }
3187
3188 /*
3189 * Run through each marked class and check if we really need to
3190 * purge (and therefore temporarily disable) the per-CPU cache
3191 * layer used by the class. If so, remember the classes since
3192 * we are going to drop the lock below prior to purging.
3193 */
3194 for (m = 0; m < NELEM(mbuf_table); m++) {
3195 if (m_wantpurge(m) > 0) {
3196 m_wantpurge(m) = 0;
3197 /*
3198 * Try hard to steal the required number of objects
3199 * from the freelist of other mbuf classes. Only
3200 * purge and disable the per-CPU cache layer when
3201 * we don't have enough; it's the last resort.
3202 */
3203 if (!mbuf_steal(m, num))
3204 bmap |= (1 << m);
3205 }
3206 }
3207
3208 lck_mtx_unlock(mbuf_mlock);
3209
3210 if (bmap != 0) {
3211 /* signal the domains to drain */
3212 net_drain_domains();
3213
3214 /* Sigh; we have no other choice but to ask mcache to purge */
3215 for (m = 0; m < NELEM(mbuf_table); m++) {
3216 if ((bmap & (1 << m)) &&
3217 mcache_purge_cache(m_cache(m))) {
3218 lck_mtx_lock(mbuf_mlock);
3219 m_purge_cnt(m)++;
3220 mbstat.m_drain++;
3221 lck_mtx_unlock(mbuf_mlock);
3222 }
3223 }
3224 } else {
3225 /*
3226 * Request mcache to reap extra elements from all of its caches;
3227 * note that all reaps are serialized and happen only at a fixed
3228 * interval.
3229 */
3230 mcache_reap();
3231 }
3232 lck_mtx_lock(mbuf_mlock);
3233 }
3234
3235 static inline struct mbuf *
3236 m_get_common(int wait, short type, int hdr)
3237 {
3238 struct mbuf *m;
3239 int mcflags = MSLEEPF(wait);
3240
3241 /* Is this due to a non-blocking retry? If so, then try harder */
3242 if (mcflags & MCR_NOSLEEP)
3243 mcflags |= MCR_TRYHARD;
3244
3245 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3246 if (m != NULL) {
3247 MBUF_INIT(m, hdr, type);
3248 mtype_stat_inc(type);
3249 mtype_stat_dec(MT_FREE);
3250 #if CONFIG_MACF_NET
3251 if (hdr && mac_init_mbuf(m, wait) != 0) {
3252 m_free(m);
3253 return (NULL);
3254 }
3255 #endif /* CONFIG_MACF_NET */
3256 }
3257 return (m);
3258 }
3259
3260 /*
3261 * Space allocation routines; these are also available as macros
3262 * for critical paths.
3263 */
3264 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3265 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3266 #define _M_RETRY(wait, type) _M_GET(wait, type)
3267 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3268 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3269 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
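/*
 * Typical caller usage (sketch; not part of this file):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_free(m);
 */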
3270
3271 struct mbuf *
3272 m_get(int wait, int type)
3273 {
3274 return (_M_GET(wait, type));
3275 }
3276
3277 struct mbuf *
3278 m_gethdr(int wait, int type)
3279 {
3280 return (_M_GETHDR(wait, type));
3281 }
3282
3283 struct mbuf *
3284 m_retry(int wait, int type)
3285 {
3286 return (_M_RETRY(wait, type));
3287 }
3288
3289 struct mbuf *
3290 m_retryhdr(int wait, int type)
3291 {
3292 return (_M_RETRYHDR(wait, type));
3293 }
3294
3295 struct mbuf *
3296 m_getclr(int wait, int type)
3297 {
3298 struct mbuf *m;
3299
3300 _MGET(m, wait, type);
3301 if (m != NULL)
3302 bzero(MTOD(m, caddr_t), MLEN);
3303 return (m);
3304 }
3305
3306 struct mbuf *
3307 m_free(struct mbuf *m)
3308 {
3309 struct mbuf *n = m->m_next;
3310
3311 if (m->m_type == MT_FREE)
3312 panic("m_free: freeing an already freed mbuf");
3313
3314 if (m->m_flags & M_PKTHDR) {
3315 /* Check for scratch area overflow */
3316 m_redzone_verify(m);
3317 /* Free the aux data and tags if there is any */
3318 m_tag_delete_chain(m, NULL);
3319 }
3320
3321 if (m->m_flags & M_EXT) {
3322 u_int32_t refcnt;
3323 u_int32_t composite;
3324
3325 refcnt = m_decref(m);
3326 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3327 if (refcnt == 0 && !composite) {
3328 if (m->m_ext.ext_free == NULL) {
3329 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3330 } else if (m->m_ext.ext_free == m_bigfree) {
3331 mcache_free(m_cache(MC_BIGCL),
3332 m->m_ext.ext_buf);
3333 } else if (m->m_ext.ext_free == m_16kfree) {
3334 mcache_free(m_cache(MC_16KCL),
3335 m->m_ext.ext_buf);
3336 } else {
3337 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3338 m->m_ext.ext_size, m->m_ext.ext_arg);
3339 }
3340 mcache_free(ref_cache, MEXT_RFA(m));
3341 MEXT_RFA(m) = NULL;
3342 } else if (refcnt == 0 && composite) {
3343 VERIFY(m->m_type != MT_FREE);
3344
3345 mtype_stat_dec(m->m_type);
3346 mtype_stat_inc(MT_FREE);
3347
3348 m->m_type = MT_FREE;
3349 m->m_flags = M_EXT;
3350 m->m_len = 0;
3351 m->m_next = m->m_nextpkt = NULL;
3352
3353 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3354
3355 /* "Free" into the intermediate cache */
3356 if (m->m_ext.ext_free == NULL) {
3357 mcache_free(m_cache(MC_MBUF_CL), m);
3358 } else if (m->m_ext.ext_free == m_bigfree) {
3359 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3360 } else {
3361 VERIFY(m->m_ext.ext_free == m_16kfree);
3362 mcache_free(m_cache(MC_MBUF_16KCL), m);
3363 }
3364 return (n);
3365 }
3366 }
3367
3368 if (m->m_type != MT_FREE) {
3369 mtype_stat_dec(m->m_type);
3370 mtype_stat_inc(MT_FREE);
3371 }
3372
3373 m->m_type = MT_FREE;
3374 m->m_flags = m->m_len = 0;
3375 m->m_next = m->m_nextpkt = NULL;
3376
3377 mcache_free(m_cache(MC_MBUF), m);
3378
3379 return (n);
3380 }
3381
3382 __private_extern__ struct mbuf *
3383 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3384 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3385 int wait)
3386 {
3387 struct ext_ref *rfa = NULL;
3388
3389 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3390 return (NULL);
3391
3392 if (m->m_flags & M_EXT) {
3393 u_int32_t refcnt;
3394 u_int32_t composite;
3395
3396 refcnt = m_decref(m);
3397 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3398 if (refcnt == 0 && !composite) {
3399 if (m->m_ext.ext_free == NULL) {
3400 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3401 } else if (m->m_ext.ext_free == m_bigfree) {
3402 mcache_free(m_cache(MC_BIGCL),
3403 m->m_ext.ext_buf);
3404 } else if (m->m_ext.ext_free == m_16kfree) {
3405 mcache_free(m_cache(MC_16KCL),
3406 m->m_ext.ext_buf);
3407 } else {
3408 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3409 m->m_ext.ext_size, m->m_ext.ext_arg);
3410 }
3411 /* Re-use the reference structure */
3412 rfa = MEXT_RFA(m);
3413 } else if (refcnt == 0 && composite) {
3414 VERIFY(m->m_type != MT_FREE);
3415
3416 mtype_stat_dec(m->m_type);
3417 mtype_stat_inc(MT_FREE);
3418
3419 m->m_type = MT_FREE;
3420 m->m_flags = M_EXT;
3421 m->m_len = 0;
3422 m->m_next = m->m_nextpkt = NULL;
3423
3424 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3425
3426 /* "Free" into the intermediate cache */
3427 if (m->m_ext.ext_free == NULL) {
3428 mcache_free(m_cache(MC_MBUF_CL), m);
3429 } else if (m->m_ext.ext_free == m_bigfree) {
3430 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3431 } else {
3432 VERIFY(m->m_ext.ext_free == m_16kfree);
3433 mcache_free(m_cache(MC_MBUF_16KCL), m);
3434 }
3435 /*
3436 * Allocate a new mbuf, since we didn't divorce
3437 * the composite mbuf + cluster pair above.
3438 */
3439 if ((m = _M_GETHDR(wait, type)) == NULL)
3440 return (NULL);
3441 }
3442 }
3443
3444 if (rfa == NULL &&
3445 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3446 m_free(m);
3447 return (NULL);
3448 }
3449
3450 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3451
3452 return (m);
3453 }
3454
3455 /*
3456 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3457 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3458 */
3459 struct mbuf *
3460 m_getcl(int wait, int type, int flags)
3461 {
3462 struct mbuf *m;
3463 int mcflags = MSLEEPF(wait);
3464 int hdr = (flags & M_PKTHDR);
3465
3466 /* Is this due to a non-blocking retry? If so, then try harder */
3467 if (mcflags & MCR_NOSLEEP)
3468 mcflags |= MCR_TRYHARD;
3469
3470 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3471 if (m != NULL) {
3472 u_int32_t flag;
3473 struct ext_ref *rfa;
3474 void *cl;
3475
3476 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3477 cl = m->m_ext.ext_buf;
3478 rfa = MEXT_RFA(m);
3479
3480 ASSERT(cl != NULL && rfa != NULL);
3481 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3482
3483 flag = MEXT_FLAGS(m);
3484
3485 MBUF_INIT(m, hdr, type);
3486 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3487
3488 mtype_stat_inc(type);
3489 mtype_stat_dec(MT_FREE);
3490 #if CONFIG_MACF_NET
3491 if (hdr && mac_init_mbuf(m, wait) != 0) {
3492 m_freem(m);
3493 return (NULL);
3494 }
3495 #endif /* MAC_NET */
3496 }
3497 return (m);
3498 }
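
/*
 * Illustrative sketch (not from the original source): typical non-blocking
 * use of m_getcl() to obtain an mbuf with a 2KB cluster and a packet header
 * already attached.
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR)) == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = 0;
 */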
3499
3500 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3501 struct mbuf *
3502 m_mclget(struct mbuf *m, int wait)
3503 {
3504 struct ext_ref *rfa;
3505
3506 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3507 return (m);
3508
3509 m->m_ext.ext_buf = m_mclalloc(wait);
3510 if (m->m_ext.ext_buf != NULL) {
3511 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3512 } else {
3513 mcache_free(ref_cache, rfa);
3514 }
3515 return (m);
3516 }
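
/*
 * Illustrative sketch (not from the original source): m_mclget() returns
 * the mbuf whether or not the cluster allocation succeeded, so callers are
 * expected to test M_EXT afterwards.
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m = m_mclget(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);		 no cluster was attached
 *			m = NULL;
 *		}
 *	}
 */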
3517
3518 /* Allocate an mbuf cluster */
3519 caddr_t
3520 m_mclalloc(int wait)
3521 {
3522 int mcflags = MSLEEPF(wait);
3523
3524 /* Is this due to a non-blocking retry? If so, then try harder */
3525 if (mcflags & MCR_NOSLEEP)
3526 mcflags |= MCR_TRYHARD;
3527
3528 return (mcache_alloc(m_cache(MC_CL), mcflags));
3529 }
3530
3531 /* Free an mbuf cluster */
3532 void
3533 m_mclfree(caddr_t p)
3534 {
3535 mcache_free(m_cache(MC_CL), p);
3536 }
3537
3538 /*
3539  * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3540 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3541 */
3542 int
3543 m_mclhasreference(struct mbuf *m)
3544 {
3545 if (!(m->m_flags & M_EXT))
3546 return (0);
3547
3548 ASSERT(MEXT_RFA(m) != NULL);
3549
3550 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3551 }
3552
3553 __private_extern__ caddr_t
3554 m_bigalloc(int wait)
3555 {
3556 int mcflags = MSLEEPF(wait);
3557
3558 /* Is this due to a non-blocking retry? If so, then try harder */
3559 if (mcflags & MCR_NOSLEEP)
3560 mcflags |= MCR_TRYHARD;
3561
3562 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3563 }
3564
3565 __private_extern__ void
3566 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3567 {
3568 mcache_free(m_cache(MC_BIGCL), p);
3569 }
3570
3571 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3572 __private_extern__ struct mbuf *
3573 m_mbigget(struct mbuf *m, int wait)
3574 {
3575 struct ext_ref *rfa;
3576
3577 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3578 return (m);
3579
3580 m->m_ext.ext_buf = m_bigalloc(wait);
3581 if (m->m_ext.ext_buf != NULL) {
3582 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3583 } else {
3584 mcache_free(ref_cache, rfa);
3585 }
3586 return (m);
3587 }
3588
3589 __private_extern__ caddr_t
3590 m_16kalloc(int wait)
3591 {
3592 int mcflags = MSLEEPF(wait);
3593
3594 /* Is this due to a non-blocking retry? If so, then try harder */
3595 if (mcflags & MCR_NOSLEEP)
3596 mcflags |= MCR_TRYHARD;
3597
3598 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3599 }
3600
3601 __private_extern__ void
3602 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3603 {
3604 mcache_free(m_cache(MC_16KCL), p);
3605 }
3606
3607 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3608 __private_extern__ struct mbuf *
3609 m_m16kget(struct mbuf *m, int wait)
3610 {
3611 struct ext_ref *rfa;
3612
3613 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3614 return (m);
3615
3616 m->m_ext.ext_buf = m_16kalloc(wait);
3617 if (m->m_ext.ext_buf != NULL) {
3618 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3619 } else {
3620 mcache_free(ref_cache, rfa);
3621 }
3622 return (m);
3623 }
3624
3625 /*
3626 * "Move" mbuf pkthdr from "from" to "to".
3627 * "from" must have M_PKTHDR set, and "to" must be empty.
3628 */
3629 void
3630 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3631 {
3632 VERIFY(from->m_flags & M_PKTHDR);
3633
3634 /* Check for scratch area overflow */
3635 m_redzone_verify(from);
3636
3637 if (to->m_flags & M_PKTHDR) {
3638 /* Check for scratch area overflow */
3639 m_redzone_verify(to);
3640 /* We will be taking over the tags of 'to' */
3641 m_tag_delete_chain(to, NULL);
3642 }
3643 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3644 m_classifier_init(from, 0); /* purge classifier info */
3645 m_tag_init(from, 1); /* purge all tags from src */
3646 m_scratch_init(from); /* clear src scratch area */
3647 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3648 if ((to->m_flags & M_EXT) == 0)
3649 to->m_data = to->m_pktdat;
3650 m_redzone_init(to); /* setup red zone on dst */
3651 }
3652
3653 /*
3654 * Duplicate "from"'s mbuf pkthdr in "to".
3655 * "from" must have M_PKTHDR set, and "to" must be empty.
3656 * In particular, this does a deep copy of the packet tags.
3657 */
3658 static int
3659 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3660 {
3661 VERIFY(from->m_flags & M_PKTHDR);
3662
3663 /* Check for scratch area overflow */
3664 m_redzone_verify(from);
3665
3666 if (to->m_flags & M_PKTHDR) {
3667 /* Check for scratch area overflow */
3668 m_redzone_verify(to);
3669 /* We will be taking over the tags of 'to' */
3670 m_tag_delete_chain(to, NULL);
3671 }
3672 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3673 if ((to->m_flags & M_EXT) == 0)
3674 to->m_data = to->m_pktdat;
3675 to->m_pkthdr = from->m_pkthdr;
3676 m_redzone_init(to); /* setup red zone on dst */
3677 m_tag_init(to, 0); /* preserve dst static tags */
3678 return (m_tag_copy_chain(to, from, how));
3679 }
3680
3681 void
3682 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3683 {
3684 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3685 #if PF_ECN
3686 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3687 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3688 #endif /* PF_ECN */
3689 }
3690
3691 void
3692 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3693 {
3694 VERIFY(m->m_flags & M_PKTHDR);
3695
3696 m->m_pkthdr.pkt_proto = 0;
3697 m->m_pkthdr.pkt_flowsrc = 0;
3698 m->m_pkthdr.pkt_flowid = 0;
3699 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
3700 /* preserve service class and interface info for loopback packets */
3701 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3702 (void) m_set_service_class(m, MBUF_SC_BE);
3703 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3704 m->m_pkthdr.pkt_ifainfo = 0;
3705 #if MEASURE_BW
3706 m->m_pkthdr.pkt_bwseq = 0;
3707 #endif /* MEASURE_BW */
3708 }
3709
3710 void
3711 m_copy_classifier(struct mbuf *to, struct mbuf *from)
3712 {
3713 VERIFY(to->m_flags & M_PKTHDR);
3714 VERIFY(from->m_flags & M_PKTHDR);
3715
3716 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3717 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3718 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3719 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3720 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3721 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
3722 to->m_pkthdr.ipsec_policy = from->m_pkthdr.ipsec_policy;
3723 #if MEASURE_BW
3724 to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq;
3725 #endif /* MEASURE_BW */
3726 }
3727
3728 /*
3729 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3730  * if wantall is not set, return however many are available. Set up the
3731 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3732 * are chained on the m_nextpkt field. Any packets requested beyond this
3733 * are chained onto the last packet header's m_next field. The size of
3734 * the cluster is controlled by the parameter bufsize.
3735 */
3736 __private_extern__ struct mbuf *
3737 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3738 int wait, int wantall, size_t bufsize)
3739 {
3740 struct mbuf *m;
3741 struct mbuf **np, *top;
3742 unsigned int pnum, needed = *num_needed;
3743 mcache_obj_t *mp_list = NULL;
3744 int mcflags = MSLEEPF(wait);
3745 u_int32_t flag;
3746 struct ext_ref *rfa;
3747 mcache_t *cp;
3748 void *cl;
3749
3750 ASSERT(bufsize == m_maxsize(MC_CL) ||
3751 bufsize == m_maxsize(MC_BIGCL) ||
3752 bufsize == m_maxsize(MC_16KCL));
3753
3754 /*
3755 * Caller must first check for njcl because this
3756 * routine is internal and not exposed/used via KPI.
3757 */
3758 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3759
3760 top = NULL;
3761 np = &top;
3762 pnum = 0;
3763
3764 /*
3765 * The caller doesn't want all the requested buffers; only some.
3766 * Try hard to get what we can, but don't block. This effectively
3767 * overrides MCR_SLEEP, since this thread will not go to sleep
3768 * if we can't get all the buffers.
3769 */
3770 if (!wantall || (mcflags & MCR_NOSLEEP))
3771 mcflags |= MCR_TRYHARD;
3772
3773 /* Allocate the composite mbuf + cluster elements from the cache */
3774 if (bufsize == m_maxsize(MC_CL))
3775 cp = m_cache(MC_MBUF_CL);
3776 else if (bufsize == m_maxsize(MC_BIGCL))
3777 cp = m_cache(MC_MBUF_BIGCL);
3778 else
3779 cp = m_cache(MC_MBUF_16KCL);
3780 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3781
3782 for (pnum = 0; pnum < needed; pnum++) {
3783 m = (struct mbuf *)mp_list;
3784 mp_list = mp_list->obj_next;
3785
3786 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3787 cl = m->m_ext.ext_buf;
3788 rfa = MEXT_RFA(m);
3789
3790 ASSERT(cl != NULL && rfa != NULL);
3791 VERIFY(MBUF_IS_COMPOSITE(m));
3792
3793 flag = MEXT_FLAGS(m);
3794
3795 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3796 if (bufsize == m_maxsize(MC_16KCL)) {
3797 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3798 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3799 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3800 } else {
3801 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3802 }
3803
3804 if (num_with_pkthdrs > 0) {
3805 --num_with_pkthdrs;
3806 #if CONFIG_MACF_NET
3807 if (mac_mbuf_label_init(m, wait) != 0) {
3808 m_freem(m);
3809 break;
3810 }
3811 #endif /* MAC_NET */
3812 }
3813
3814 *np = m;
3815 if (num_with_pkthdrs > 0)
3816 np = &m->m_nextpkt;
3817 else
3818 np = &m->m_next;
3819 }
3820 ASSERT(pnum != *num_needed || mp_list == NULL);
3821 if (mp_list != NULL)
3822 mcache_free_ext(cp, mp_list);
3823
3824 if (pnum > 0) {
3825 mtype_stat_add(MT_DATA, pnum);
3826 mtype_stat_sub(MT_FREE, pnum);
3827 }
3828
3829 if (wantall && (pnum != *num_needed)) {
3830 if (top != NULL)
3831 m_freem_list(top);
3832 return (NULL);
3833 }
3834
3835 if (pnum > *num_needed) {
3836 printf("%s: File a radar related to <rdar://10146739>. \
3837 needed = %u, pnum = %u, num_needed = %u \n",
3838 __func__, needed, pnum, *num_needed);
3839 }
3840
3841 *num_needed = pnum;
3842 return (top);
3843 }
3844
3845 /*
3846  * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3847  * wantall is not set, return however many are available. The size of
3848  * each mbuf in the list is controlled by the parameter packetlen. Each
3849  * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
3850  * in the chain is called a segment. If maxsegments is not NULL and the
3851  * value pointed to is not zero, it specifies the maximum number of segments
3852  * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3853  * is zero, the caller places no restriction on the number of segments.
3854  * The actual number of segments of an mbuf chain is returned in the value
3855  * pointed to by maxsegments.
3856 */
3857 __private_extern__ struct mbuf *
3858 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3859 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3860 {
3861 struct mbuf **np, *top, *first = NULL;
3862 size_t bufsize, r_bufsize;
3863 unsigned int num = 0;
3864 unsigned int nsegs = 0;
3865 unsigned int needed, resid;
3866 int mcflags = MSLEEPF(wait);
3867 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3868 mcache_t *cp = NULL, *rcp = NULL;
3869
3870 if (*numlist == 0)
3871 return (NULL);
3872
3873 top = NULL;
3874 np = &top;
3875
3876 if (wantsize == 0) {
3877 if (packetlen <= MINCLSIZE) {
3878 bufsize = packetlen;
3879 } else if (packetlen > m_maxsize(MC_CL)) {
3880 /* Use 4KB if jumbo cluster pool isn't available */
3881 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3882 bufsize = m_maxsize(MC_BIGCL);
3883 else
3884 bufsize = m_maxsize(MC_16KCL);
3885 } else {
3886 bufsize = m_maxsize(MC_CL);
3887 }
3888 } else if (wantsize == m_maxsize(MC_CL) ||
3889 wantsize == m_maxsize(MC_BIGCL) ||
3890 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3891 bufsize = wantsize;
3892 } else {
3893 return (NULL);
3894 }
3895
3896 if (bufsize <= MHLEN) {
3897 nsegs = 1;
3898 } else if (bufsize <= MINCLSIZE) {
3899 if (maxsegments != NULL && *maxsegments == 1) {
3900 bufsize = m_maxsize(MC_CL);
3901 nsegs = 1;
3902 } else {
3903 nsegs = 2;
3904 }
3905 } else if (bufsize == m_maxsize(MC_16KCL)) {
3906 VERIFY(njcl > 0);
3907 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3908 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3909 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3910 } else {
3911 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3912 }
3913 if (maxsegments != NULL) {
3914 if (*maxsegments && nsegs > *maxsegments) {
3915 *maxsegments = nsegs;
3916 return (NULL);
3917 }
3918 *maxsegments = nsegs;
3919 }
3920
3921 /*
3922 * The caller doesn't want all the requested buffers; only some.
3923 * Try hard to get what we can, but don't block. This effectively
3924 * overrides MCR_SLEEP, since this thread will not go to sleep
3925 * if we can't get all the buffers.
3926 */
3927 if (!wantall || (mcflags & MCR_NOSLEEP))
3928 mcflags |= MCR_TRYHARD;
3929
3930 /*
3931 * Simple case where all elements in the lists/chains are mbufs.
3932 * Unless bufsize is greater than MHLEN, each segment chain is made
3933 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3934 * of 2 mbufs; the second one is used for the residual data, i.e.
3935 * the remaining data that cannot fit into the first mbuf.
3936 */
3937 if (bufsize <= MINCLSIZE) {
3938 /* Allocate the elements in one shot from the mbuf cache */
3939 ASSERT(bufsize <= MHLEN || nsegs == 2);
3940 cp = m_cache(MC_MBUF);
3941 needed = mcache_alloc_ext(cp, &mp_list,
3942 (*numlist) * nsegs, mcflags);
3943
3944 /*
3945 * The number of elements must be even if we are to use an
3946 * mbuf (instead of a cluster) to store the residual data.
3947 * If we couldn't allocate the requested number of mbufs,
3948 * trim the number down (if it's odd) in order to avoid
3949 * creating a partial segment chain.
3950 */
3951 if (bufsize > MHLEN && (needed & 0x1))
3952 needed--;
3953
3954 while (num < needed) {
3955 struct mbuf *m;
3956
3957 m = (struct mbuf *)mp_list;
3958 mp_list = mp_list->obj_next;
3959 ASSERT(m != NULL);
3960
3961 MBUF_INIT(m, 1, MT_DATA);
3962 #if CONFIG_MACF_NET
3963 if (mac_init_mbuf(m, wait) != 0) {
3964 m_free(m);
3965 break;
3966 }
3967 #endif /* MAC_NET */
3968 num++;
3969 if (bufsize > MHLEN) {
3970 /* A second mbuf for this segment chain */
3971 m->m_next = (struct mbuf *)mp_list;
3972 mp_list = mp_list->obj_next;
3973 ASSERT(m->m_next != NULL);
3974
3975 MBUF_INIT(m->m_next, 0, MT_DATA);
3976 num++;
3977 }
3978 *np = m;
3979 np = &m->m_nextpkt;
3980 }
3981 ASSERT(num != *numlist || mp_list == NULL);
3982
3983 if (num > 0) {
3984 mtype_stat_add(MT_DATA, num);
3985 mtype_stat_sub(MT_FREE, num);
3986 }
3987 num /= nsegs;
3988
3989 /* We've got them all; return to caller */
3990 if (num == *numlist)
3991 return (top);
3992
3993 goto fail;
3994 }
3995
3996 /*
3997 * Complex cases where elements are made up of one or more composite
3998 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3999 * be illustrated as follows:
4000 *
4001 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4002 *
4003 * Every composite mbuf + cluster element comes from the intermediate
4004 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4005 * the last composite element will come from the MC_MBUF_CL cache,
4006  * unless the residual data is larger than 2KB, in which case the
4007  * big cluster composite cache (MC_MBUF_BIGCL) is used instead. Residual
4008 * data is defined as extra data beyond the first element that cannot
4009 * fit into the previous element, i.e. there is no residual data if
4010 * the chain only has 1 segment.
4011 */
4012 r_bufsize = bufsize;
4013 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4014 if (resid > 0) {
4015 /* There is residual data; figure out the cluster size */
4016 if (wantsize == 0 && packetlen > MINCLSIZE) {
4017 /*
4018 * Caller didn't request that all of the segments
4019  * in the chain use the same cluster size; use the
4020  * smallest cluster size that can hold the residual data.
4021 */
4022 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4023 r_bufsize = m_maxsize(MC_16KCL);
4024 else if (resid > m_maxsize(MC_CL))
4025 r_bufsize = m_maxsize(MC_BIGCL);
4026 else
4027 r_bufsize = m_maxsize(MC_CL);
4028 } else {
4029 /* Use the same cluster size as the other segments */
4030 resid = 0;
4031 }
4032 }
4033
4034 needed = *numlist;
4035 if (resid > 0) {
4036 /*
4037 * Attempt to allocate composite mbuf + cluster elements for
4038 * the residual data in each chain; record the number of such
4039 * elements that can be allocated so that we know how many
4040 * segment chains we can afford to create.
4041 */
4042 if (r_bufsize <= m_maxsize(MC_CL))
4043 rcp = m_cache(MC_MBUF_CL);
4044 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4045 rcp = m_cache(MC_MBUF_BIGCL);
4046 else
4047 rcp = m_cache(MC_MBUF_16KCL);
4048 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4049
4050 if (needed == 0)
4051 goto fail;
4052
4053 /* This is temporarily reduced for calculation */
4054 ASSERT(nsegs > 1);
4055 nsegs--;
4056 }
4057
4058 /*
4059 * Attempt to allocate the rest of the composite mbuf + cluster
4060 * elements for the number of segment chains that we need.
4061 */
4062 if (bufsize <= m_maxsize(MC_CL))
4063 cp = m_cache(MC_MBUF_CL);
4064 else if (bufsize <= m_maxsize(MC_BIGCL))
4065 cp = m_cache(MC_MBUF_BIGCL);
4066 else
4067 cp = m_cache(MC_MBUF_16KCL);
4068 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4069
4070 /* Round it down to avoid creating a partial segment chain */
4071 needed = (needed / nsegs) * nsegs;
4072 if (needed == 0)
4073 goto fail;
4074
4075 if (resid > 0) {
4076 /*
4077 * We're about to construct the chain(s); take into account
4078 * the number of segments we have created above to hold the
4079 * residual data for each chain, as well as restore the
4080 * original count of segments per chain.
4081 */
4082 ASSERT(nsegs > 0);
4083 needed += needed / nsegs;
4084 nsegs++;
4085 }
4086
4087 for (;;) {
4088 struct mbuf *m;
4089 u_int32_t flag;
4090 struct ext_ref *rfa;
4091 void *cl;
4092 int pkthdr;
4093
4094 ++num;
4095 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4096 m = (struct mbuf *)mp_list;
4097 mp_list = mp_list->obj_next;
4098 } else {
4099 m = (struct mbuf *)rmp_list;
4100 rmp_list = rmp_list->obj_next;
4101 }
4102 ASSERT(m != NULL);
4103 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4104 VERIFY(m->m_ext.ext_free == NULL ||
4105 m->m_ext.ext_free == m_bigfree ||
4106 m->m_ext.ext_free == m_16kfree);
4107
4108 cl = m->m_ext.ext_buf;
4109 rfa = MEXT_RFA(m);
4110
4111 ASSERT(cl != NULL && rfa != NULL);
4112 VERIFY(MBUF_IS_COMPOSITE(m));
4113
4114 flag = MEXT_FLAGS(m);
4115
4116 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4117 if (pkthdr)
4118 first = m;
4119 MBUF_INIT(m, pkthdr, MT_DATA);
4120 if (m->m_ext.ext_free == m_16kfree) {
4121 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4122 } else if (m->m_ext.ext_free == m_bigfree) {
4123 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4124 } else {
4125 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4126 }
4127 #if CONFIG_MACF_NET
4128 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4129 --num;
4130 m_freem(m);
4131 break;
4132 }
4133 #endif /* MAC_NET */
4134
4135 *np = m;
4136 if ((num % nsegs) == 0)
4137 np = &first->m_nextpkt;
4138 else
4139 np = &m->m_next;
4140
4141 if (num == needed)
4142 break;
4143 }
4144
4145 if (num > 0) {
4146 mtype_stat_add(MT_DATA, num);
4147 mtype_stat_sub(MT_FREE, num);
4148 }
4149
4150 num /= nsegs;
4151
4152 /* We've got them all; return to caller */
4153 if (num == *numlist) {
4154 ASSERT(mp_list == NULL && rmp_list == NULL);
4155 return (top);
4156 }
4157
4158 fail:
4159 /* Free up what's left of the above */
4160 if (mp_list != NULL)
4161 mcache_free_ext(cp, mp_list);
4162 if (rmp_list != NULL)
4163 mcache_free_ext(rcp, rmp_list);
4164 if (wantall && top != NULL) {
4165 m_freem(top);
4166 return (NULL);
4167 }
4168 *numlist = num;
4169 return (top);
4170 }
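
/*
 * Illustrative sketch (not from the original source): requesting a batch of
 * 9KB packets through m_allocpacket_internal(), letting the routine pick the
 * cluster size and capping each chain at four segments. The byte counts are
 * arbitrary example values.
 *
 *	unsigned int npkt = 8, nseg = 4;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&npkt, 9000, &nseg, M_DONTWAIT, 0, 0);
 *	 on success, npkt holds how many chains were built and nseg the
 *	 segments per chain; if the cap cannot be met, NULL is returned
 *	 with nseg set to the number of segments that would be required
 */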
4171
4172 /*
4173 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4174 * packets on the receive ring.
4175 */
4176 __private_extern__ struct mbuf *
4177 m_getpacket_how(int wait)
4178 {
4179 unsigned int num_needed = 1;
4180
4181 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4182 m_maxsize(MC_CL)));
4183 }
4184
4185 /*
4186 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4187 * packets on the receive ring.
4188 */
4189 struct mbuf *
4190 m_getpacket(void)
4191 {
4192 unsigned int num_needed = 1;
4193
4194 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4195 m_maxsize(MC_CL)));
4196 }
4197
4198 /*
4199 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4200 * if this can't be met, return however many are available. Set up the
4201 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4202 * are chained on the m_nextpkt field. Any packets requested beyond this are
4203 * chained onto the last packet header's m_next field.
4204 */
4205 struct mbuf *
4206 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4207 {
4208 unsigned int n = num_needed;
4209
4210 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4211 m_maxsize(MC_CL)));
4212 }
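
/*
 * Illustrative sketch (not from the original source): a driver pre-filling
 * part of a receive ring with cluster-backed packets via m_getpackets().
 * drv_post_rx_buffer() and sc are hypothetical driver helpers.
 *
 *	struct mbuf *pkts, *m;
 *	int want = 32;
 *
 *	pkts = m_getpackets(want, want, M_DONTWAIT);
 *	while (pkts != NULL) {
 *		m = pkts;
 *		pkts = pkts->m_nextpkt;
 *		m->m_nextpkt = NULL;
 *		drv_post_rx_buffer(sc, m);
 *	}
 */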
4213
4214 /*
4215 * Return a list of mbuf hdrs set up as packet hdrs chained together
4216 * on the m_nextpkt field.
4217 */
4218 struct mbuf *
4219 m_getpackethdrs(int num_needed, int how)
4220 {
4221 struct mbuf *m;
4222 struct mbuf **np, *top;
4223
4224 top = NULL;
4225 np = &top;
4226
4227 while (num_needed--) {
4228 m = _M_RETRYHDR(how, MT_DATA);
4229 if (m == NULL)
4230 break;
4231
4232 *np = m;
4233 np = &m->m_nextpkt;
4234 }
4235
4236 return (top);
4237 }
4238
4239 /*
4240 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4241 * of packets freed. Used by the drivers.
4242 */
4243 int
4244 m_freem_list(struct mbuf *m)
4245 {
4246 struct mbuf *nextpkt;
4247 mcache_obj_t *mp_list = NULL;
4248 mcache_obj_t *mcl_list = NULL;
4249 mcache_obj_t *mbc_list = NULL;
4250 mcache_obj_t *m16k_list = NULL;
4251 mcache_obj_t *m_mcl_list = NULL;
4252 mcache_obj_t *m_mbc_list = NULL;
4253 mcache_obj_t *m_m16k_list = NULL;
4254 mcache_obj_t *ref_list = NULL;
4255 int pktcount = 0;
4256 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4257
4258 while (m != NULL) {
4259 pktcount++;
4260
4261 nextpkt = m->m_nextpkt;
4262 m->m_nextpkt = NULL;
4263
4264 while (m != NULL) {
4265 struct mbuf *next = m->m_next;
4266 mcache_obj_t *o, *rfa;
4267 u_int32_t refcnt, composite;
4268
4269 if (m->m_type == MT_FREE)
4270 panic("m_free: freeing an already freed mbuf");
4271
4272 if (m->m_type != MT_FREE)
4273 mt_free++;
4274
4275 if (m->m_flags & M_PKTHDR) {
4276 /* Check for scratch area overflow */
4277 m_redzone_verify(m);
4278 /* Free the aux data and tags if there is any */
4279 m_tag_delete_chain(m, NULL);
4280 }
4281
4282 if (!(m->m_flags & M_EXT))
4283 goto simple_free;
4284
4285 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4286 refcnt = m_decref(m);
4287 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4288 if (refcnt == 0 && !composite) {
4289 if (m->m_ext.ext_free == NULL) {
4290 o->obj_next = mcl_list;
4291 mcl_list = o;
4292 } else if (m->m_ext.ext_free == m_bigfree) {
4293 o->obj_next = mbc_list;
4294 mbc_list = o;
4295 } else if (m->m_ext.ext_free == m_16kfree) {
4296 o->obj_next = m16k_list;
4297 m16k_list = o;
4298 } else {
4299 (*(m->m_ext.ext_free))((caddr_t)o,
4300 m->m_ext.ext_size,
4301 m->m_ext.ext_arg);
4302 }
4303 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4304 rfa->obj_next = ref_list;
4305 ref_list = rfa;
4306 MEXT_RFA(m) = NULL;
4307 } else if (refcnt == 0 && composite) {
4308 VERIFY(m->m_type != MT_FREE);
4309 /*
4310 * Amortize the costs of atomic operations
4311 * by doing them at the end, if possible.
4312 */
4313 if (m->m_type == MT_DATA)
4314 mt_data++;
4315 else if (m->m_type == MT_HEADER)
4316 mt_header++;
4317 else if (m->m_type == MT_SONAME)
4318 mt_soname++;
4319 else if (m->m_type == MT_TAG)
4320 mt_tag++;
4321 else
4322 mtype_stat_dec(m->m_type);
4323
4324 m->m_type = MT_FREE;
4325 m->m_flags = M_EXT;
4326 m->m_len = 0;
4327 m->m_next = m->m_nextpkt = NULL;
4328
4329 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4330
4331 /* "Free" into the intermediate cache */
4332 o = (mcache_obj_t *)m;
4333 if (m->m_ext.ext_free == NULL) {
4334 o->obj_next = m_mcl_list;
4335 m_mcl_list = o;
4336 } else if (m->m_ext.ext_free == m_bigfree) {
4337 o->obj_next = m_mbc_list;
4338 m_mbc_list = o;
4339 } else {
4340 VERIFY(m->m_ext.ext_free == m_16kfree);
4341 o->obj_next = m_m16k_list;
4342 m_m16k_list = o;
4343 }
4344 m = next;
4345 continue;
4346 }
4347 simple_free:
4348 /*
4349 * Amortize the costs of atomic operations
4350 * by doing them at the end, if possible.
4351 */
4352 if (m->m_type == MT_DATA)
4353 mt_data++;
4354 else if (m->m_type == MT_HEADER)
4355 mt_header++;
4356 else if (m->m_type == MT_SONAME)
4357 mt_soname++;
4358 else if (m->m_type == MT_TAG)
4359 mt_tag++;
4360 else if (m->m_type != MT_FREE)
4361 mtype_stat_dec(m->m_type);
4362
4363 m->m_type = MT_FREE;
4364 m->m_flags = m->m_len = 0;
4365 m->m_next = m->m_nextpkt = NULL;
4366
4367 ((mcache_obj_t *)m)->obj_next = mp_list;
4368 mp_list = (mcache_obj_t *)m;
4369
4370 m = next;
4371 }
4372
4373 m = nextpkt;
4374 }
4375
4376 if (mt_free > 0)
4377 mtype_stat_add(MT_FREE, mt_free);
4378 if (mt_data > 0)
4379 mtype_stat_sub(MT_DATA, mt_data);
4380 if (mt_header > 0)
4381 mtype_stat_sub(MT_HEADER, mt_header);
4382 if (mt_soname > 0)
4383 mtype_stat_sub(MT_SONAME, mt_soname);
4384 if (mt_tag > 0)
4385 mtype_stat_sub(MT_TAG, mt_tag);
4386
4387 if (mp_list != NULL)
4388 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4389 if (mcl_list != NULL)
4390 mcache_free_ext(m_cache(MC_CL), mcl_list);
4391 if (mbc_list != NULL)
4392 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4393 if (m16k_list != NULL)
4394 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4395 if (m_mcl_list != NULL)
4396 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4397 if (m_mbc_list != NULL)
4398 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4399 if (m_m16k_list != NULL)
4400 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4401 if (ref_list != NULL)
4402 mcache_free_ext(ref_cache, ref_list);
4403
4404 return (pktcount);
4405 }
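
/*
 * Illustrative sketch (not from the original source): freeing a completed
 * transmit queue in one call and noting how many packets were released.
 * done_head and the tx_completed counter are hypothetical.
 *
 *	int freed;
 *
 *	freed = m_freem_list(done_head);
 *	done_head = NULL;
 *	sc->tx_completed += freed;
 */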
4406
4407 void
4408 m_freem(struct mbuf *m)
4409 {
4410 while (m != NULL)
4411 m = m_free(m);
4412 }
4413
4414 /*
4415 * Mbuffer utility routines.
4416 */
4417
4418 /*
4419 * Compute the amount of space available before the current start
4420 * of data in an mbuf.
4421 */
4422 int
4423 m_leadingspace(struct mbuf *m)
4424 {
4425 if (m->m_flags & M_EXT) {
4426 if (MCLHASREFERENCE(m))
4427 return (0);
4428 return (m->m_data - m->m_ext.ext_buf);
4429 }
4430 if (m->m_flags & M_PKTHDR)
4431 return (m->m_data - m->m_pktdat);
4432 return (m->m_data - m->m_dat);
4433 }
4434
4435 /*
4436 * Compute the amount of space available after the end of data in an mbuf.
4437 */
4438 int
4439 m_trailingspace(struct mbuf *m)
4440 {
4441 if (m->m_flags & M_EXT) {
4442 if (MCLHASREFERENCE(m))
4443 return (0);
4444 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4445 (m->m_data + m->m_len));
4446 }
4447 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4448 }
4449
4450 /*
4451 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4452 * copy junk along. Does not adjust packet header length.
4453 */
4454 struct mbuf *
4455 m_prepend(struct mbuf *m, int len, int how)
4456 {
4457 struct mbuf *mn;
4458
4459 _MGET(mn, how, m->m_type);
4460 if (mn == NULL) {
4461 m_freem(m);
4462 return (NULL);
4463 }
4464 if (m->m_flags & M_PKTHDR) {
4465 M_COPY_PKTHDR(mn, m);
4466 m->m_flags &= ~M_PKTHDR;
4467 }
4468 mn->m_next = m;
4469 m = mn;
4470 if (len < MHLEN)
4471 MH_ALIGN(m, len);
4472 m->m_len = len;
4473 return (m);
4474 }
4475
4476 /*
4477 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4478 * chain, copy junk along, and adjust length.
4479 */
4480 struct mbuf *
4481 m_prepend_2(struct mbuf *m, int len, int how)
4482 {
4483 if (M_LEADINGSPACE(m) >= len) {
4484 m->m_data -= len;
4485 m->m_len += len;
4486 } else {
4487 m = m_prepend(m, len, how);
4488 }
4489 if ((m) && (m->m_flags & M_PKTHDR))
4490 m->m_pkthdr.len += len;
4491 return (m);
4492 }
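
/*
 * Illustrative sketch (not from the original source): prepending a 14-byte
 * Ethernet header with m_prepend_2(), which grows in place when there is
 * leading space and falls back to m_prepend() otherwise. ETHER_HDR_LEN is
 * assumed from <net/ethernet.h>; eh is a hypothetical header template.
 *
 *	m = m_prepend_2(m, ETHER_HDR_LEN, M_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);	 original chain already freed
 *	bcopy(&eh, mtod(m, caddr_t), ETHER_HDR_LEN);
 */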
4493
4494 /*
4495 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4496 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4497 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4498 */
4499 int MCFail;
4500
4501 struct mbuf *
4502 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4503 {
4504 struct mbuf *n, *mhdr = NULL, **np;
4505 int off = off0;
4506 struct mbuf *top;
4507 int copyhdr = 0;
4508
4509 if (off < 0 || len < 0)
4510 panic("m_copym: invalid offset %d or len %d", off, len);
4511
4512 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4513 mhdr = m;
4514 copyhdr = 1;
4515 }
4516
4517 while (off >= m->m_len) {
4518 if (m->m_next == NULL)
4519 panic("m_copym: invalid mbuf chain");
4520 off -= m->m_len;
4521 m = m->m_next;
4522 }
4523 np = &top;
4524 top = NULL;
4525
4526 while (len > 0) {
4527 if (m == NULL) {
4528 if (len != M_COPYALL)
4529 panic("m_copym: len != M_COPYALL");
4530 break;
4531 }
4532
4533 n = _M_RETRY(wait, m->m_type);
4534 *np = n;
4535
4536 if (n == NULL)
4537 goto nospace;
4538
4539 if (copyhdr != 0) {
4540 if (mode == M_COPYM_MOVE_HDR) {
4541 M_COPY_PKTHDR(n, mhdr);
4542 } else if (mode == M_COPYM_COPY_HDR) {
4543 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4544 goto nospace;
4545 }
4546 if (len == M_COPYALL)
4547 n->m_pkthdr.len -= off0;
4548 else
4549 n->m_pkthdr.len = len;
4550 copyhdr = 0;
4551 }
4552 if (len == M_COPYALL) {
4553 if (MIN(len, (m->m_len - off)) == len) {
4554 printf("m->m_len %d - off %d = %d, %d\n",
4555 m->m_len, off, m->m_len - off,
4556 MIN(len, (m->m_len - off)));
4557 }
4558 }
4559 n->m_len = MIN(len, (m->m_len - off));
4560 if (n->m_len == M_COPYALL) {
4561 printf("n->m_len == M_COPYALL, fixing\n");
4562 n->m_len = MHLEN;
4563 }
4564 if (m->m_flags & M_EXT) {
4565 n->m_ext = m->m_ext;
4566 m_incref(m);
4567 n->m_data = m->m_data + off;
4568 n->m_flags |= M_EXT;
4569 } else {
4570 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4571 (unsigned)n->m_len);
4572 }
4573 if (len != M_COPYALL)
4574 len -= n->m_len;
4575 off = 0;
4576 m = m->m_next;
4577 np = &n->m_next;
4578 }
4579
4580 if (top == NULL)
4581 MCFail++;
4582
4583 return (top);
4584 nospace:
4585
4586 m_freem(top);
4587 MCFail++;
4588 return (NULL);
4589 }
4590
4591
4592 struct mbuf *
4593 m_copym(struct mbuf *m, int off0, int len, int wait)
4594 {
4595 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4596 }
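
/*
 * Illustrative sketch (not from the original source): taking a reference-
 * counted copy of an entire packet, e.g. before handing the original to a
 * consumer that will modify or free it. Cluster data is shared, not copied.
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */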
4597
4598 /*
4599 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4600 * within this routine. Also, the last mbuf and offset accessed are passed
4601 * out and can be passed back in to avoid having to rescan the entire mbuf
4602 * list (normally hung off of the socket).
4603 */
4604 struct mbuf *
4605 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4606 struct mbuf **m_lastm, int *m_off, uint32_t mode)
4607 {
4608 struct mbuf *n, **np = NULL;
4609 int off = off0, len = len0;
4610 struct mbuf *top = NULL;
4611 int mcflags = MSLEEPF(wait);
4612 int copyhdr = 0;
4613 int type = 0;
4614 mcache_obj_t *list = NULL;
4615 int needed = 0;
4616
4617 if (off == 0 && (m->m_flags & M_PKTHDR))
4618 copyhdr = 1;
4619
4620 if (*m_lastm != NULL) {
4621 m = *m_lastm;
4622 off = *m_off;
4623 } else {
4624 while (off >= m->m_len) {
4625 off -= m->m_len;
4626 m = m->m_next;
4627 }
4628 }
4629
4630 n = m;
4631 while (len > 0) {
4632 needed++;
4633 ASSERT(n != NULL);
4634 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4635 n = n->m_next;
4636 }
4637 needed++;
4638 len = len0;
4639
4640 /*
4641 * If the caller doesn't want to be put to sleep, mark it with
4642 * MCR_TRYHARD so that we may reclaim buffers from other places
4643 * before giving up.
4644 */
4645 if (mcflags & MCR_NOSLEEP)
4646 mcflags |= MCR_TRYHARD;
4647
4648 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4649 mcflags) != needed)
4650 goto nospace;
4651
4652 needed = 0;
4653 while (len > 0) {
4654 n = (struct mbuf *)list;
4655 list = list->obj_next;
4656 ASSERT(n != NULL && m != NULL);
4657
4658 type = (top == NULL) ? MT_HEADER : m->m_type;
4659 MBUF_INIT(n, (top == NULL), type);
4660 #if CONFIG_MACF_NET
4661 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4662 mtype_stat_inc(MT_HEADER);
4663 mtype_stat_dec(MT_FREE);
4664 m_free(n);
4665 goto nospace;
4666 }
4667 #endif /* MAC_NET */
4668
4669 if (top == NULL) {
4670 top = n;
4671 np = &top->m_next;
4672 continue;
4673 } else {
4674 needed++;
4675 *np = n;
4676 }
4677
4678 if (copyhdr) {
4679 if (mode == M_COPYM_MOVE_HDR) {
4680 M_COPY_PKTHDR(n, m);
4681 } else if (mode == M_COPYM_COPY_HDR) {
4682 if (m_dup_pkthdr(n, m, wait) == 0)
4683 goto nospace;
4684 }
4685 n->m_pkthdr.len = len;
4686 copyhdr = 0;
4687 }
4688 n->m_len = MIN(len, (m->m_len - off));
4689
4690 if (m->m_flags & M_EXT) {
4691 n->m_ext = m->m_ext;
4692 m_incref(m);
4693 n->m_data = m->m_data + off;
4694 n->m_flags |= M_EXT;
4695 } else {
4696 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4697 (unsigned)n->m_len);
4698 }
4699 len -= n->m_len;
4700
4701 if (len == 0) {
4702 if ((off + n->m_len) == m->m_len) {
4703 *m_lastm = m->m_next;
4704 *m_off = 0;
4705 } else {
4706 *m_lastm = m;
4707 *m_off = off + n->m_len;
4708 }
4709 break;
4710 }
4711 off = 0;
4712 m = m->m_next;
4713 np = &n->m_next;
4714 }
4715
4716 mtype_stat_inc(MT_HEADER);
4717 mtype_stat_add(type, needed);
4718 mtype_stat_sub(MT_FREE, needed + 1);
4719
4720 ASSERT(list == NULL);
4721 return (top);
4722
4723 nospace:
4724 if (list != NULL)
4725 mcache_free_ext(m_cache(MC_MBUF), list);
4726 if (top != NULL)
4727 m_freem(top);
4728 MCFail++;
4729 return (NULL);
4730 }
4731
4732 /*
4733 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4734 * continuing for "len" bytes, into the indicated buffer.
4735 */
4736 void
4737 m_copydata(struct mbuf *m, int off, int len, void *vp)
4738 {
4739 unsigned count;
4740 char *cp = vp;
4741
4742 if (off < 0 || len < 0)
4743 panic("m_copydata: invalid offset %d or len %d", off, len);
4744
4745 while (off > 0) {
4746 if (m == NULL)
4747 panic("m_copydata: invalid mbuf chain");
4748 if (off < m->m_len)
4749 break;
4750 off -= m->m_len;
4751 m = m->m_next;
4752 }
4753 while (len > 0) {
4754 if (m == NULL)
4755 panic("m_copydata: invalid mbuf chain");
4756 count = MIN(m->m_len - off, len);
4757 bcopy(MTOD(m, caddr_t) + off, cp, count);
4758 len -= count;
4759 cp += count;
4760 off = 0;
4761 m = m->m_next;
4762 }
4763 }
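
/*
 * Illustrative sketch (not from the original source): pulling a fixed-size
 * header out of a possibly fragmented chain into a stack buffer, without
 * modifying the chain. The IP header and offset 0 are example choices.
 *
 *	struct ip iph;
 *
 *	if (m->m_pkthdr.len < (int)sizeof (iph))
 *		return (EINVAL);
 *	m_copydata(m, 0, sizeof (iph), &iph);
 */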
4764
4765 /*
4766 * Concatenate mbuf chain n to m. Both chains must be of the same type
4767 * (e.g. MT_DATA). The m_pkthdr, if present, is not updated.
4768 */
4769 void
4770 m_cat(struct mbuf *m, struct mbuf *n)
4771 {
4772 while (m->m_next)
4773 m = m->m_next;
4774 while (n) {
4775 if ((m->m_flags & M_EXT) ||
4776 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4777 /* just join the two chains */
4778 m->m_next = n;
4779 return;
4780 }
4781 /* splat the data from one into the other */
4782 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4783 (u_int)n->m_len);
4784 m->m_len += n->m_len;
4785 n = m_free(n);
4786 }
4787 }
4788
4789 void
4790 m_adj(struct mbuf *mp, int req_len)
4791 {
4792 int len = req_len;
4793 struct mbuf *m;
4794 int count;
4795
4796 if ((m = mp) == NULL)
4797 return;
4798 if (len >= 0) {
4799 /*
4800 * Trim from head.
4801 */
4802 while (m != NULL && len > 0) {
4803 if (m->m_len <= len) {
4804 len -= m->m_len;
4805 m->m_len = 0;
4806 m = m->m_next;
4807 } else {
4808 m->m_len -= len;
4809 m->m_data += len;
4810 len = 0;
4811 }
4812 }
4813 m = mp;
4814 if (m->m_flags & M_PKTHDR)
4815 m->m_pkthdr.len -= (req_len - len);
4816 } else {
4817 /*
4818 * Trim from tail. Scan the mbuf chain,
4819 * calculating its length and finding the last mbuf.
4820 * If the adjustment only affects this mbuf, then just
4821 * adjust and return. Otherwise, rescan and truncate
4822 * after the remaining size.
4823 */
4824 len = -len;
4825 count = 0;
4826 for (;;) {
4827 count += m->m_len;
4828 if (m->m_next == (struct mbuf *)0)
4829 break;
4830 m = m->m_next;
4831 }
4832 if (m->m_len >= len) {
4833 m->m_len -= len;
4834 m = mp;
4835 if (m->m_flags & M_PKTHDR)
4836 m->m_pkthdr.len -= len;
4837 return;
4838 }
4839 count -= len;
4840 if (count < 0)
4841 count = 0;
4842 /*
4843 * Correct length for chain is "count".
4844 * Find the mbuf with last data, adjust its length,
4845 * and toss data from remaining mbufs on chain.
4846 */
4847 m = mp;
4848 if (m->m_flags & M_PKTHDR)
4849 m->m_pkthdr.len = count;
4850 for (; m; m = m->m_next) {
4851 if (m->m_len >= count) {
4852 m->m_len = count;
4853 break;
4854 }
4855 count -= m->m_len;
4856 }
4857 while ((m = m->m_next))
4858 m->m_len = 0;
4859 }
4860 }
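
/*
 * Illustrative sketch (not from the original source): trimming a received
 * frame with m_adj(); a positive length trims from the head, a negative
 * one from the tail. The Ethernet constants are assumed from
 * <net/ethernet.h>.
 *
 *	m_adj(m, ETHER_HDR_LEN);	 strip the link-layer header
 *	m_adj(m, -ETHER_CRC_LEN);	 strip the trailing FCS
 */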
4861
4862 /*
4863 * Rearrange an mbuf chain so that len bytes are contiguous
4864 * and in the data area of an mbuf (so that mtod and dtom
4865 * will work for a structure of size len). Returns the resulting
4866 * mbuf chain on success, frees it and returns null on failure.
4867 * If there is room, it will add up to max_protohdr-len extra bytes to the
4868 * contiguous region in an attempt to avoid being called next time.
4869 */
4870 int MPFail;
4871
4872 struct mbuf *
4873 m_pullup(struct mbuf *n, int len)
4874 {
4875 struct mbuf *m;
4876 int count;
4877 int space;
4878
4879 /*
4880 * If first mbuf has no cluster, and has room for len bytes
4881 * without shifting current data, pullup into it,
4882 * otherwise allocate a new mbuf to prepend to the chain.
4883 */
4884 if ((n->m_flags & M_EXT) == 0 &&
4885 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4886 if (n->m_len >= len)
4887 return (n);
4888 m = n;
4889 n = n->m_next;
4890 len -= m->m_len;
4891 } else {
4892 if (len > MHLEN)
4893 goto bad;
4894 _MGET(m, M_DONTWAIT, n->m_type);
4895 if (m == 0)
4896 goto bad;
4897 m->m_len = 0;
4898 if (n->m_flags & M_PKTHDR) {
4899 M_COPY_PKTHDR(m, n);
4900 n->m_flags &= ~M_PKTHDR;
4901 }
4902 }
4903 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4904 do {
4905 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4906 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4907 (unsigned)count);
4908 len -= count;
4909 m->m_len += count;
4910 n->m_len -= count;
4911 space -= count;
4912 if (n->m_len)
4913 n->m_data += count;
4914 else
4915 n = m_free(n);
4916 } while (len > 0 && n);
4917 if (len > 0) {
4918 (void) m_free(m);
4919 goto bad;
4920 }
4921 m->m_next = n;
4922 return (m);
4923 bad:
4924 m_freem(n);
4925 MPFail++;
4926 return (0);
4927 }
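
/*
 * Illustrative sketch (not from the original source): the classic m_pullup()
 * idiom used before casting m_data to a header structure; the IP header is
 * an example choice.
 *
 *	if (m->m_len < (int)sizeof (struct ip) &&
 *	    (m = m_pullup(m, sizeof (struct ip))) == NULL)
 *		return;		 chain was freed by m_pullup()
 *	ip = mtod(m, struct ip *);
 */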
4928
4929 /*
4930 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4931 * the amount of empty space before the data in the new mbuf to be specified
4932 * (in the event that the caller expects to prepend later).
4933 */
4934 __private_extern__ int MSFail = 0;
4935
4936 __private_extern__ struct mbuf *
4937 m_copyup(struct mbuf *n, int len, int dstoff)
4938 {
4939 struct mbuf *m;
4940 int count, space;
4941
4942 if (len > (MHLEN - dstoff))
4943 goto bad;
4944 MGET(m, M_DONTWAIT, n->m_type);
4945 if (m == NULL)
4946 goto bad;
4947 m->m_len = 0;
4948 if (n->m_flags & M_PKTHDR) {
4949 m_copy_pkthdr(m, n);
4950 n->m_flags &= ~M_PKTHDR;
4951 }
4952 m->m_data += dstoff;
4953 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4954 do {
4955 count = min(min(max(len, max_protohdr), space), n->m_len);
4956 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4957 (unsigned)count);
4958 len -= count;
4959 m->m_len += count;
4960 n->m_len -= count;
4961 space -= count;
4962 if (n->m_len)
4963 n->m_data += count;
4964 else
4965 n = m_free(n);
4966 } while (len > 0 && n);
4967 if (len > 0) {
4968 (void) m_free(m);
4969 goto bad;
4970 }
4971 m->m_next = n;
4972 return (m);
4973 bad:
4974 m_freem(n);
4975 MSFail++;
4976 return (NULL);
4977 }
4978
4979 /*
4980 * Partition an mbuf chain in two pieces, returning the tail --
4981 * all but the first len0 bytes. In case of failure, it returns NULL and
4982 * attempts to restore the chain to its original state.
4983 */
4984 struct mbuf *
4985 m_split(struct mbuf *m0, int len0, int wait)
4986 {
4987 return (m_split0(m0, len0, wait, 1));
4988 }
4989
4990 static struct mbuf *
4991 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4992 {
4993 struct mbuf *m, *n;
4994 unsigned len = len0, remain;
4995
4996 for (m = m0; m && len > m->m_len; m = m->m_next)
4997 len -= m->m_len;
4998 if (m == NULL)
4999 return (NULL);
5000 remain = m->m_len - len;
5001 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5002 _MGETHDR(n, wait, m0->m_type);
5003 if (n == NULL)
5004 return (NULL);
5005 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5006 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5007 m0->m_pkthdr.len = len0;
5008 if (m->m_flags & M_EXT)
5009 goto extpacket;
5010 if (remain > MHLEN) {
5011 /* m can't be the lead packet */
5012 MH_ALIGN(n, 0);
5013 n->m_next = m_split(m, len, wait);
5014 if (n->m_next == NULL) {
5015 (void) m_free(n);
5016 return (NULL);
5017 } else
5018 return (n);
5019 } else
5020 MH_ALIGN(n, remain);
5021 } else if (remain == 0) {
5022 n = m->m_next;
5023 m->m_next = NULL;
5024 return (n);
5025 } else {
5026 _MGET(n, wait, m->m_type);
5027 if (n == NULL)
5028 return (NULL);
5029 M_ALIGN(n, remain);
5030 }
5031 extpacket:
5032 if (m->m_flags & M_EXT) {
5033 n->m_flags |= M_EXT;
5034 n->m_ext = m->m_ext;
5035 m_incref(m);
5036 n->m_data = m->m_data + len;
5037 } else {
5038 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5039 }
5040 n->m_len = remain;
5041 m->m_len = len;
5042 n->m_next = m->m_next;
5043 m->m_next = NULL;
5044 return (n);
5045 }
5046
5047 /*
5048 * Routine to copy from device local memory into mbufs.
5049 */
5050 struct mbuf *
5051 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5052 void (*copy)(const void *, void *, size_t))
5053 {
5054 struct mbuf *m;
5055 struct mbuf *top = NULL, **mp = &top;
5056 int off = off0, len;
5057 char *cp;
5058 char *epkt;
5059
5060 cp = buf;
5061 epkt = cp + totlen;
5062 if (off) {
5063 /*
5064 * If 'off' is non-zero, packet is trailer-encapsulated,
5065 * so we have to skip the type and length fields.
5066 */
5067 cp += off + 2 * sizeof (u_int16_t);
5068 totlen -= 2 * sizeof (u_int16_t);
5069 }
5070 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5071 if (m == NULL)
5072 return (NULL);
5073 m->m_pkthdr.rcvif = ifp;
5074 m->m_pkthdr.len = totlen;
5075 m->m_len = MHLEN;
5076
5077 while (totlen > 0) {
5078 if (top != NULL) {
5079 _MGET(m, M_DONTWAIT, MT_DATA);
5080 if (m == NULL) {
5081 m_freem(top);
5082 return (NULL);
5083 }
5084 m->m_len = MLEN;
5085 }
5086 len = MIN(totlen, epkt - cp);
5087 if (len >= MINCLSIZE) {
5088 MCLGET(m, M_DONTWAIT);
5089 if (m->m_flags & M_EXT) {
5090 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5091 } else {
5092 /* give up when it's out of cluster mbufs */
5093 if (top != NULL)
5094 m_freem(top);
5095 m_freem(m);
5096 return (NULL);
5097 }
5098 } else {
5099 /*
5100 * Place initial small packet/header at end of mbuf.
5101 */
5102 if (len < m->m_len) {
5103 if (top == NULL &&
5104 len + max_linkhdr <= m->m_len)
5105 m->m_data += max_linkhdr;
5106 m->m_len = len;
5107 } else {
5108 len = m->m_len;
5109 }
5110 }
5111 if (copy)
5112 copy(cp, MTOD(m, caddr_t), (unsigned)len);
5113 else
5114 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5115 cp += len;
5116 *mp = m;
5117 mp = &m->m_next;
5118 totlen -= len;
5119 if (cp == epkt)
5120 cp = buf;
5121 }
5122 return (top);
5123 }
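
/*
 * Illustrative sketch (not from the original source): a driver copying a
 * received frame out of device-local memory into a fresh chain. Passing a
 * zero offset and a NULL copy routine makes m_devget() fall back to bcopy().
 * rxdesc and the hand-off to the stack are hypothetical.
 *
 *	struct mbuf *m;
 *
 *	m = m_devget((char *)rxdesc->buf, rxdesc->len, 0, ifp, NULL);
 *	if (m != NULL)
 *		drv_input(ifp, m);
 */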
5124
5125 #ifndef MBUF_GROWTH_NORMAL_THRESH
5126 #define MBUF_GROWTH_NORMAL_THRESH 25
5127 #endif
5128
5129 /*
5130 * Cluster freelist allocation check.
5131 */
5132 static int
5133 m_howmany(int num, size_t bufsize)
5134 {
5135 int i = 0, j = 0;
5136 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5137 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5138 u_int32_t sumclusters, freeclusters;
5139 u_int32_t percent_pool, percent_kmem;
5140 u_int32_t mb_growth, mb_growth_thresh;
5141
5142 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5143 bufsize == m_maxsize(MC_16KCL));
5144
5145 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5146
5147 /* Numbers in 2K cluster units */
5148 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5149 m_clusters = m_total(MC_CL);
5150 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5151 m_16kclusters = m_total(MC_16KCL);
5152 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5153
5154 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5155 m_clfree = m_infree(MC_CL);
5156 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5157 m_16kclfree = m_infree(MC_16KCL);
5158 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5159
5160 /* Bail if we've maxed out the mbuf memory map */
5161 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5162 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5163 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5164 return (0);
5165 }
5166
5167 if (bufsize == m_maxsize(MC_BIGCL)) {
5168 /* Under minimum */
5169 if (m_bigclusters < m_minlimit(MC_BIGCL))
5170 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5171
5172 percent_pool =
5173 ((sumclusters - freeclusters) * 100) / sumclusters;
5174 percent_kmem = (sumclusters * 100) / nclusters;
5175
5176 /*
5177 * If a light/normal user, grow conservatively (75%)
5178 * If a heavy user, grow aggressively (50%)
5179 */
5180 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5181 mb_growth = MB_GROWTH_NORMAL;
5182 else
5183 mb_growth = MB_GROWTH_AGGRESSIVE;
5184
5185 if (percent_kmem < 5) {
5186 /* For initial allocations */
5187 i = num;
5188 } else {
5189 /* Return if >= MBIGCL_LOWAT clusters available */
5190 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5191 m_total(MC_BIGCL) >=
5192 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5193 return (0);
5194
5195 /* Ensure at least num clusters are accessible */
5196 if (num >= m_infree(MC_BIGCL))
5197 i = num - m_infree(MC_BIGCL);
5198 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5199 j = num - (m_total(MC_BIGCL) -
5200 m_minlimit(MC_BIGCL));
5201
5202 i = MAX(i, j);
5203
5204 /*
5205 * Grow pool if percent_pool > 75 (normal growth)
5206 * or percent_pool > 50 (aggressive growth).
5207 */
5208 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5209 if (percent_pool > mb_growth_thresh)
5210 j = ((sumclusters + num) >> mb_growth) -
5211 freeclusters;
5212 i = MAX(i, j);
5213 }
5214
5215 /* Check to ensure we didn't go over limits */
5216 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5217 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5218 if ((i << 1) + sumclusters >= nclusters)
5219 i = (nclusters - sumclusters) >> 1;
5220 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5221 VERIFY(sumclusters + (i << 1) <= nclusters);
5222
5223 } else { /* 16K CL */
5224 VERIFY(njcl > 0);
5225 /* Under minimum */
5226 if (m_16kclusters < MIN16KCL)
5227 return (MIN16KCL - m_16kclusters);
5228 if (m_16kclfree >= M16KCL_LOWAT)
5229 return (0);
5230
5231 /* Ensure at least num clusters are available */
5232 if (num >= m_16kclfree)
5233 i = num - m_16kclfree;
5234
5235 /* Always grow 16KCL pool aggressively */
5236 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5237 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5238 i = MAX(i, j);
5239
5240 /* Check to ensure we don't go over limit */
5241 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5242 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5243 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5244 }
5245 return (i);
5246 }
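
/*
 * Worked example (not from the original source; macro values assumed): the
 * threshold above is 100 - (100 / (1 << mb_growth)). If MB_GROWTH_NORMAL
 * evaluates to 2, this gives 100 - 25 = 75, matching the "grow
 * conservatively (75%)" note; a value of 1 for MB_GROWTH_AGGRESSIVE gives
 * 100 - 50 = 50.
 */
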
5247 /*
5248 * Return the number of bytes in the mbuf chain, m.
5249 */
5250 unsigned int
5251 m_length(struct mbuf *m)
5252 {
5253 struct mbuf *m0;
5254 unsigned int pktlen;
5255
5256 if (m->m_flags & M_PKTHDR)
5257 return (m->m_pkthdr.len);
5258
5259 pktlen = 0;
5260 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5261 pktlen += m0->m_len;
5262 return (pktlen);
5263 }
5264
5265 /*
5266 * Copy data from a buffer back into the indicated mbuf chain,
5267 * starting "off" bytes from the beginning, extending the mbuf
5268 * chain if necessary.
5269 */
5270 void
5271 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5272 {
5273 #if DEBUG
5274 struct mbuf *origm = m0;
5275 int error;
5276 #endif /* DEBUG */
5277
5278 if (m0 == NULL)
5279 return;
5280
5281 #if DEBUG
5282 error =
5283 #endif /* DEBUG */
5284 m_copyback0(&m0, off, len, cp,
5285 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5286
5287 #if DEBUG
5288 if (error != 0 || (m0 != NULL && origm != m0))
5289 panic("m_copyback");
5290 #endif /* DEBUG */
5291 }
5292
5293 struct mbuf *
5294 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5295 {
5296 int error;
5297
5298 /* don't support chain expansion */
5299 VERIFY(off + len <= m_length(m0));
5300
5301 error = m_copyback0(&m0, off, len, cp,
5302 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5303 if (error) {
5304 /*
5305 * no way to recover from partial success.
5306 * just free the chain.
5307 */
5308 m_freem(m0);
5309 return (NULL);
5310 }
5311 return (m0);
5312 }
5313
5314 /*
5315 * m_makewritable: ensure the specified range is writable.
5316 */
5317 int
5318 m_makewritable(struct mbuf **mp, int off, int len, int how)
5319 {
5320 int error;
5321 #if DEBUG
5322 struct mbuf *n;
5323 int origlen, reslen;
5324
5325 origlen = m_length(*mp);
5326 #endif /* DEBUG */
5327
5328 #if 0 /* M_COPYALL is large enough */
5329 if (len == M_COPYALL)
5330 len = m_length(*mp) - off; /* XXX */
5331 #endif
5332
5333 error = m_copyback0(mp, off, len, NULL,
5334 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5335
5336 #if DEBUG
5337 reslen = 0;
5338 for (n = *mp; n; n = n->m_next)
5339 reslen += n->m_len;
5340 if (origlen != reslen)
5341 panic("m_makewritable: length changed");
5342 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5343 panic("m_makewritable: inconsist");
5344 #endif /* DEBUG */
5345
5346 return (error);
5347 }
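
/*
 * Illustrative sketch (not from the original source): making a header
 * region writable before modifying it in place, as a caller such as a
 * packet filter might. hlen is a hypothetical header length, and freeing
 * the chain on error is one possible caller policy.
 *
 *	int hlen = 40;	 hypothetical header length
 *
 *	if (m_makewritable(&m, 0, hlen, M_DONTWAIT) != 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	 the first hlen bytes of the chain may now be written safely
 */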
5348
5349 static int
5350 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5351 int how)
5352 {
5353 int mlen;
5354 struct mbuf *m, *n;
5355 struct mbuf **mp;
5356 int totlen = 0;
5357 const char *cp = vp;
5358
5359 VERIFY(mp0 != NULL);
5360 VERIFY(*mp0 != NULL);
5361 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5362 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5363
5364 /*
5365 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5366 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5367 */
5368
5369 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5370
5371 mp = mp0;
5372 m = *mp;
5373 while (off > (mlen = m->m_len)) {
5374 off -= mlen;
5375 totlen += mlen;
5376 if (m->m_next == NULL) {
5377 int tspace;
5378 extend:
5379 if (!(flags & M_COPYBACK0_EXTEND))
5380 goto out;
5381
5382 /*
5383 * try to make some space at the end of "m".
5384 */
5385
5386 mlen = m->m_len;
5387 if (off + len >= MINCLSIZE &&
5388 !(m->m_flags & M_EXT) && m->m_len == 0) {
5389 MCLGET(m, how);
5390 }
5391 tspace = M_TRAILINGSPACE(m);
5392 if (tspace > 0) {
5393 tspace = MIN(tspace, off + len);
5394 VERIFY(tspace > 0);
5395 bzero(mtod(m, char *) + m->m_len,
5396 MIN(off, tspace));
5397 m->m_len += tspace;
5398 off += mlen;
5399 totlen -= mlen;
5400 continue;
5401 }
5402
5403 /*
5404 * need to allocate an mbuf.
5405 */
5406
5407 if (off + len >= MINCLSIZE) {
5408 n = m_getcl(how, m->m_type, 0);
5409 } else {
5410 n = _M_GET(how, m->m_type);
5411 }
5412 if (n == NULL) {
5413 goto out;
5414 }
5415 n->m_len = 0;
5416 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5417 bzero(mtod(n, char *), MIN(n->m_len, off));
5418 m->m_next = n;
5419 }
5420 mp = &m->m_next;
5421 m = m->m_next;
5422 }
5423 while (len > 0) {
5424 mlen = m->m_len - off;
5425 if (mlen != 0 && m_mclhasreference(m)) {
5426 char *datap;
5427 int eatlen;
5428
5429 /*
5430 * this mbuf is read-only.
5431 * allocate a new writable mbuf and try again.
5432 */
5433
5434 #if DIAGNOSTIC
5435 if (!(flags & M_COPYBACK0_COW))
5436 panic("m_copyback0: read-only");
5437 #endif /* DIAGNOSTIC */
5438
5439 /*
5440 * if we're going to write into the middle of
5441 * a mbuf, split it first.
5442 */
5443 if (off > 0 && len < mlen) {
5444 n = m_split0(m, off, how, 0);
5445 if (n == NULL)
5446 goto enobufs;
5447 m->m_next = n;
5448 mp = &m->m_next;
5449 m = n;
5450 off = 0;
5451 continue;
5452 }
5453
5454 /*
5455 * XXX TODO coalesce into the trailingspace of
5456 * the previous mbuf when possible.
5457 */
5458
5459 /*
5460 * allocate a new mbuf. copy packet header if needed.
5461 */
5462 n = _M_GET(how, m->m_type);
5463 if (n == NULL)
5464 goto enobufs;
5465 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5466 M_COPY_PKTHDR(n, m);
5467 n->m_len = MHLEN;
5468 } else {
5469 if (len >= MINCLSIZE)
5470 MCLGET(n, M_DONTWAIT);
5471 n->m_len =
5472 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5473 }
5474 if (n->m_len > len)
5475 n->m_len = len;
5476
5477 /*
5478 * free the region which has been overwritten,
5479 * copying data from old mbufs if requested.
5480 */
5481 if (flags & M_COPYBACK0_PRESERVE)
5482 datap = mtod(n, char *);
5483 else
5484 datap = NULL;
5485 eatlen = n->m_len;
5486 VERIFY(off == 0 || eatlen >= mlen);
5487 if (off > 0) {
5488 VERIFY(len >= mlen);
5489 m->m_len = off;
5490 m->m_next = n;
5491 if (datap) {
5492 m_copydata(m, off, mlen, datap);
5493 datap += mlen;
5494 }
5495 eatlen -= mlen;
5496 mp = &m->m_next;
5497 m = m->m_next;
5498 }
5499 while (m != NULL && m_mclhasreference(m) &&
5500 n->m_type == m->m_type && eatlen > 0) {
5501 mlen = MIN(eatlen, m->m_len);
5502 if (datap) {
5503 m_copydata(m, 0, mlen, datap);
5504 datap += mlen;
5505 }
5506 m->m_data += mlen;
5507 m->m_len -= mlen;
5508 eatlen -= mlen;
5509 if (m->m_len == 0)
5510 *mp = m = m_free(m);
5511 }
5512 if (eatlen > 0)
5513 n->m_len -= eatlen;
5514 n->m_next = m;
5515 *mp = m = n;
5516 continue;
5517 }
5518 mlen = MIN(mlen, len);
5519 if (flags & M_COPYBACK0_COPYBACK) {
5520 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5521 cp += mlen;
5522 }
5523 len -= mlen;
5524 mlen += off;
5525 off = 0;
5526 totlen += mlen;
5527 if (len == 0)
5528 break;
5529 if (m->m_next == NULL) {
5530 goto extend;
5531 }
5532 mp = &m->m_next;
5533 m = m->m_next;
5534 }
5535 out:
5536 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5537 VERIFY(flags & M_COPYBACK0_EXTEND);
5538 m->m_pkthdr.len = totlen;
5539 }
5540
5541 return (0);
5542
5543 enobufs:
5544 return (ENOBUFS);
5545 }
5546
5547 uint64_t
5548 mcl_to_paddr(char *addr)
5549 {
5550 vm_offset_t base_phys;
5551
5552 if (!MBUF_IN_MAP(addr))
5553 return (0);
5554 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5555
5556 if (base_phys == 0)
5557 return (0);
5558 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5559 }
5560
5561 /*
5562 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5563 * And really copy the thing. That way, we don't "precompute" checksums
5564 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5565 * small packets, don't dup into a cluster. That way received packets
5566 * don't take up too much room in the sockbuf (cf. sbspace()).
5567 */
5568 int MDFail;
5569
5570 struct mbuf *
5571 m_dup(struct mbuf *m, int how)
5572 {
5573 struct mbuf *n, **np;
5574 struct mbuf *top;
5575 int copyhdr = 0;
5576
5577 np = &top;
5578 top = NULL;
5579 if (m->m_flags & M_PKTHDR)
5580 copyhdr = 1;
5581
5582 /*
5583 * Quick check: if we have one mbuf and its data fits in an
5584 * mbuf with packet header, just copy and go.
5585 */
5586 if (m->m_next == NULL) {
5587 /* Then just move the data into an mbuf and be done... */
5588 if (copyhdr) {
5589 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5590 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5591 return (NULL);
5592 n->m_len = m->m_len;
5593 m_dup_pkthdr(n, m, how);
5594 bcopy(m->m_data, n->m_data, m->m_len);
5595 return (n);
5596 }
5597 } else if (m->m_len <= MLEN) {
5598 if ((n = _M_GET(how, m->m_type)) == NULL)
5599 return (NULL);
5600 bcopy(m->m_data, n->m_data, m->m_len);
5601 n->m_len = m->m_len;
5602 return (n);
5603 }
5604 }
5605 while (m != NULL) {
5606 #if BLUE_DEBUG
5607 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5608 m->m_data);
5609 #endif
5610 if (copyhdr)
5611 n = _M_GETHDR(how, m->m_type);
5612 else
5613 n = _M_GET(how, m->m_type);
5614 if (n == NULL)
5615 goto nospace;
5616 if (m->m_flags & M_EXT) {
5617 if (m->m_len <= m_maxsize(MC_CL))
5618 MCLGET(n, how);
5619 else if (m->m_len <= m_maxsize(MC_BIGCL))
5620 n = m_mbigget(n, how);
5621 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5622 n = m_m16kget(n, how);
5623 if (!(n->m_flags & M_EXT)) {
5624 (void) m_free(n);
5625 goto nospace;
5626 }
5627 }
5628 *np = n;
5629 if (copyhdr) {
5630 /* Don't use M_COPY_PKTHDR: preserve m_data */
5631 m_dup_pkthdr(n, m, how);
5632 copyhdr = 0;
5633 if (!(n->m_flags & M_EXT))
5634 n->m_data = n->m_pktdat;
5635 }
5636 n->m_len = m->m_len;
5637 /*
5638 * Get the dup on the same boundary as the original.
5639 * Assume that the two mbufs have the same offset to the data
5640 * area (up to word boundaries).
5641 */
5642 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5643 m = m->m_next;
5644 np = &n->m_next;
5645 #if BLUE_DEBUG
5646 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5647 n->m_data);
5648 #endif
5649 }
5650
5651 if (top == NULL)
5652 MDFail++;
5653 return (top);
5654
5655 nospace:
5656 m_freem(top);
5657 MDFail++;
5658 return (NULL);
5659 }
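
/*
 * Illustrative sketch (not compiled in): a hypothetical caller that needs
 * a private, deep copy of a chain -- e.g. before modifying payload bytes
 * that may still be referenced elsewhere -- could use m_dup() as shown
 * below.  The example_deep_copy() helper is not part of this file.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *m)
{
	struct mbuf *copy;

	/* Deep-copy the whole chain; M_DONTWAIT fails instead of blocking */
	copy = m_dup(m, M_DONTWAIT);
	if (copy == NULL)
		return (NULL);	/* allocation failed; caller still owns "m" */

	/* "copy" shares no data with "m" and may be modified freely */
	return (copy);
}
#endif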
5660
5661 #define MBUF_MULTIPAGES(m) \
5662 (((m)->m_flags & M_EXT) && \
5663 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5664 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5665 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5666
5667 static struct mbuf *
5668 m_expand(struct mbuf *m, struct mbuf **last)
5669 {
5670 struct mbuf *top = NULL;
5671 struct mbuf **nm = &top;
5672 uintptr_t data0, data;
5673 unsigned int len0, len;
5674
5675 VERIFY(MBUF_MULTIPAGES(m));
5676 VERIFY(m->m_next == NULL);
5677 data0 = (uintptr_t)m->m_data;
5678 len0 = m->m_len;
5679 *last = top;
5680
5681 for (;;) {
5682 struct mbuf *n;
5683
5684 data = data0;
5685 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5686 len = NBPG;
5687 else if (!IS_P2ALIGNED(data, NBPG) &&
5688 P2ROUNDUP(data, NBPG) < (data + len0))
5689 len = P2ROUNDUP(data, NBPG) - data;
5690 else
5691 len = len0;
5692
5693 VERIFY(len > 0);
5694 VERIFY(m->m_flags & M_EXT);
5695 m->m_data = (void *)data;
5696 m->m_len = len;
5697
5698 *nm = *last = m;
5699 nm = &m->m_next;
5700 m->m_next = NULL;
5701
5702 data0 += len;
5703 len0 -= len;
5704 if (len0 == 0)
5705 break;
5706
5707 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5708 if (n == NULL) {
5709 m_freem(top);
5710 top = *last = NULL;
5711 break;
5712 }
5713
5714 n->m_ext = m->m_ext;
5715 m_incref(m);
5716 n->m_flags |= M_EXT;
5717 m = n;
5718 }
5719 return (top);
5720 }
5721
5722 struct mbuf *
5723 m_normalize(struct mbuf *m)
5724 {
5725 struct mbuf *top = NULL;
5726 struct mbuf **nm = &top;
5727 boolean_t expanded = FALSE;
5728
5729 while (m != NULL) {
5730 struct mbuf *n;
5731
5732 n = m->m_next;
5733 m->m_next = NULL;
5734
5735 /* Does the data cross one or more page boundaries? */
5736 if (MBUF_MULTIPAGES(m)) {
5737 struct mbuf *last;
5738 if ((m = m_expand(m, &last)) == NULL) {
5739 m_freem(n);
5740 m_freem(top);
5741 top = NULL;
5742 break;
5743 }
5744 *nm = m;
5745 nm = &last->m_next;
5746 expanded = TRUE;
5747 } else {
5748 *nm = m;
5749 nm = &m->m_next;
5750 }
5751 m = n;
5752 }
5753 if (expanded)
5754 atomic_add_32(&mb_normalized, 1);
5755 return (top);
5756 }
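
/*
 * Illustrative sketch (not compiled in): a hypothetical caller whose
 * hardware requires that no mbuf's data span a page boundary could run
 * the chain through m_normalize() first.  On allocation failure the
 * routine frees the chain and returns NULL, so the caller must not
 * reuse the original pointer.  example_single_page_segments() is not
 * part of this file.
 */
#if 0
static struct mbuf *
example_single_page_segments(struct mbuf *m)
{
	/* Splits any mbuf whose data crosses a page boundary */
	return (m_normalize(m));
}
#endif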
5757
5758 /*
5759 * Append the specified data to the indicated mbuf chain.
5760 * Extend the mbuf chain if the new data does not fit in
5761 * existing space.
5762 *
5763 * Return 1 if able to complete the job; otherwise 0.
5764 */
5765 int
5766 m_append(struct mbuf *m0, int len, caddr_t cp)
5767 {
5768 struct mbuf *m, *n;
5769 int remainder, space;
5770
5771 for (m = m0; m->m_next != NULL; m = m->m_next)
5772 ;
5773 remainder = len;
5774 space = M_TRAILINGSPACE(m);
5775 if (space > 0) {
5776 /*
5777 * Copy into available space.
5778 */
5779 if (space > remainder)
5780 space = remainder;
5781 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5782 m->m_len += space;
5783 cp += space, remainder -= space;
5784 }
5785 while (remainder > 0) {
5786 /*
5787 * Allocate a new mbuf; could check space
5788 * and allocate a cluster instead.
5789 */
5790 n = m_get(M_WAITOK, m->m_type);
5791 if (n == NULL)
5792 break;
5793 n->m_len = min(MLEN, remainder);
5794 bcopy(cp, mtod(n, caddr_t), n->m_len);
5795 cp += n->m_len;
5796 remainder -= n->m_len;
5797 m->m_next = n;
5798 m = n;
5799 }
5800 if (m0->m_flags & M_PKTHDR)
5801 m0->m_pkthdr.len += len - remainder;
5802 return (remainder == 0);
5803 }
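
/*
 * Illustrative sketch (not compiled in): appending a small trailer to an
 * existing chain with m_append().  The trailer bytes and the chain "m"
 * are hypothetical; note that m_append() may block, since it allocates
 * additional mbufs with M_WAITOK.  example_append_trailer() is not part
 * of this file.
 */
#if 0
static int
example_append_trailer(struct mbuf *m)
{
	u_int8_t trailer[4] = { 0xde, 0xad, 0xbe, 0xef };

	/* Returns 1 on success; m_pkthdr.len is updated when M_PKTHDR is set */
	if (m_append(m, sizeof (trailer), (caddr_t)trailer) == 0)
		return (ENOBUFS);
	return (0);
}
#endif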
5804
5805 struct mbuf *
5806 m_last(struct mbuf *m)
5807 {
5808 while (m->m_next != NULL)
5809 m = m->m_next;
5810 return (m);
5811 }
5812
5813 unsigned int
5814 m_fixhdr(struct mbuf *m0)
5815 {
5816 u_int len;
5817
5818 VERIFY(m0->m_flags & M_PKTHDR);
5819
5820 len = m_length2(m0, NULL);
5821 m0->m_pkthdr.len = len;
5822 return (len);
5823 }
5824
5825 unsigned int
5826 m_length2(struct mbuf *m0, struct mbuf **last)
5827 {
5828 struct mbuf *m;
5829 u_int len;
5830
5831 len = 0;
5832 for (m = m0; m != NULL; m = m->m_next) {
5833 len += m->m_len;
5834 if (m->m_next == NULL)
5835 break;
5836 }
5837 if (last != NULL)
5838 *last = m;
5839 return (len);
5840 }
5841
5842 /*
5843 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5844 * and clusters. If allocation fails and this cannot be completed, NULL will
5845 * be returned, but the passed in chain will be unchanged. Upon success,
5846 * the original chain will be freed, and the new chain will be returned.
5847 *
5848 * If an mbuf without a packet header is passed in, the original
5849 * mbuf chain will be returned unharmed.
5850 *
5851 * If offset is specified, the first mbuf in the chain will have a leading
5852 * space of the amount stated by the "off" parameter.
5853 *
5854 * This routine requires that the m_pkthdr.pkt_hdr field of the original
5855 * mbuf chain is cleared by the caller.
5856 */
5857 struct mbuf *
5858 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5859 {
5860 struct mbuf *m_new = NULL, *m_final = NULL;
5861 int progress = 0, length, pktlen;
5862
5863 if (!(m0->m_flags & M_PKTHDR))
5864 return (m0);
5865
5866 VERIFY(off < MHLEN);
5867 m_fixhdr(m0); /* Needed sanity check */
5868
5869 pktlen = m0->m_pkthdr.len + off;
5870 if (pktlen > MHLEN)
5871 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5872 else
5873 m_final = m_gethdr(how, MT_DATA);
5874
5875 if (m_final == NULL)
5876 goto nospace;
5877
5878 if (off > 0) {
5879 pktlen -= off;
5880 m_final->m_data += off;
5881 }
5882
5883 /*
5884 * Caller must have handled the contents pointed to by this
5885 * pointer before coming here, as otherwise it will point to
5886 * the original mbuf which will get freed upon success.
5887 */
5888 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
5889
5890 if (m_dup_pkthdr(m_final, m0, how) == 0)
5891 goto nospace;
5892
5893 m_new = m_final;
5894
5895 while (progress < pktlen) {
5896 length = pktlen - progress;
5897 if (length > MCLBYTES)
5898 length = MCLBYTES;
5899 length -= ((m_new == m_final) ? off : 0);
5900
5901 if (m_new == NULL) {
5902 if (length > MLEN)
5903 m_new = m_getcl(how, MT_DATA, 0);
5904 else
5905 m_new = m_get(how, MT_DATA);
5906 if (m_new == NULL)
5907 goto nospace;
5908 }
5909
5910 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5911 progress += length;
5912 m_new->m_len = length;
5913 if (m_new != m_final)
5914 m_cat(m_final, m_new);
5915 m_new = NULL;
5916 }
5917 m_freem(m0);
5918 m0 = m_final;
5919 return (m0);
5920 nospace:
5921 if (m_final)
5922 m_freem(m_final);
5923 return (NULL);
5924 }
5925
5926 struct mbuf *
5927 m_defrag(struct mbuf *m0, int how)
5928 {
5929 return (m_defrag_offset(m0, 0, how));
5930 }
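
/*
 * Illustrative sketch (not compiled in): compacting a long chain into the
 * fewest possible mbufs/clusters, e.g. before handing it to a driver that
 * prefers few segments (a hypothetical constraint).  On failure the
 * original chain is left intact.  example_compact() is not part of this
 * file.
 */
#if 0
static struct mbuf *
example_compact(struct mbuf *m)
{
	struct mbuf *d;

	m->m_pkthdr.pkt_hdr = NULL;	/* required by m_defrag_offset() */
	d = m_defrag(m, M_DONTWAIT);
	if (d == NULL)
		return (m);	/* "m" is unchanged and still owned by caller */

	/* "m" has been freed; use the compacted chain instead */
	return (d);
}
#endif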
5931
5932 void
5933 m_mchtype(struct mbuf *m, int t)
5934 {
5935 mtype_stat_inc(t);
5936 mtype_stat_dec(m->m_type);
5937 (m)->m_type = t;
5938 }
5939
5940 void *
5941 m_mtod(struct mbuf *m)
5942 {
5943 return (MTOD(m, void *));
5944 }
5945
5946 struct mbuf *
5947 m_dtom(void *x)
5948 {
5949 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5950 }
5951
5952 void
5953 m_mcheck(struct mbuf *m)
5954 {
5955 _MCHECK(m);
5956 }
5957
5958 /*
5959 * Return a pointer to mbuf/offset of location in mbuf chain.
5960 */
5961 struct mbuf *
5962 m_getptr(struct mbuf *m, int loc, int *off)
5963 {
5964
5965 while (loc >= 0) {
5966 /* Normal end of search. */
5967 if (m->m_len > loc) {
5968 *off = loc;
5969 return (m);
5970 } else {
5971 loc -= m->m_len;
5972 if (m->m_next == NULL) {
5973 if (loc == 0) {
5974 /* Point at the end of valid data. */
5975 *off = m->m_len;
5976 return (m);
5977 }
5978 return (NULL);
5979 }
5980 m = m->m_next;
5981 }
5982 }
5983 return (NULL);
5984 }
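
/*
 * Illustrative sketch (not compiled in): using m_getptr() to locate a
 * byte at a given offset within a chain.  The chain "m" and the offset
 * "loc" are hypothetical, and example_byte_at() is not part of this file.
 */
#if 0
static u_int8_t
example_byte_at(struct mbuf *m, int loc)
{
	struct mbuf *n;
	int off;

	n = m_getptr(m, loc, &off);
	if (n == NULL || off == n->m_len)
		return (0);	/* offset is at or past the end of the chain */

	/* "off" is relative to the start of n's data area */
	return (*(mtod(n, u_int8_t *) + off));
}
#endif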
5985
5986 /*
5987 * Inform the corresponding mcache(s) that there's a waiter below.
5988 */
5989 static void
5990 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5991 {
5992 mcache_waiter_inc(m_cache(class));
5993 if (comp) {
5994 if (class == MC_CL) {
5995 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5996 } else if (class == MC_BIGCL) {
5997 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5998 } else if (class == MC_16KCL) {
5999 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6000 } else {
6001 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6002 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6003 }
6004 }
6005 }
6006
6007 /*
6008 * Inform the corresponding mcache(s) that there's no more waiter below.
6009 */
6010 static void
6011 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6012 {
6013 mcache_waiter_dec(m_cache(class));
6014 if (comp) {
6015 if (class == MC_CL) {
6016 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6017 } else if (class == MC_BIGCL) {
6018 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6019 } else if (class == MC_16KCL) {
6020 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6021 } else {
6022 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6023 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6024 }
6025 }
6026 }
6027
6028 /*
6029 * Called during slab (blocking and non-blocking) allocation. If there
6030 * is at least one waiter, and the time since the first waiter was blocked
6031 * exceeds the watchdog timeout, panic the system.
6032 */
6033 static void
6034 mbuf_watchdog(void)
6035 {
6036 struct timeval now;
6037 unsigned int since;
6038
6039 if (mb_waiters == 0 || !mb_watchdog)
6040 return;
6041
6042 microuptime(&now);
6043 since = now.tv_sec - mb_wdtstart.tv_sec;
6044 if (since >= MB_WDT_MAXTIME) {
6045 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6046 mb_waiters, since, mbuf_dump());
6047 /* NOTREACHED */
6048 }
6049 }
6050
6051 /*
6052 * Called during blocking allocation. Returns TRUE if one or more objects
6053 * are available at the per-CPU cache layer and the allocation should be
6054 * retried at that level.
6055 */
6056 static boolean_t
6057 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6058 {
6059 boolean_t mcache_retry = FALSE;
6060
6061 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6062
6063 /* Check if there's anything at the cache layer */
6064 if (mbuf_cached_above(class, wait)) {
6065 mcache_retry = TRUE;
6066 goto done;
6067 }
6068
6069 /* Nothing? Then try hard to get it from somewhere */
6070 m_reclaim(class, num, (wait & MCR_COMP));
6071
6072 /* We tried hard and got something? */
6073 if (m_infree(class) > 0) {
6074 mbstat.m_wait++;
6075 goto done;
6076 } else if (mbuf_cached_above(class, wait)) {
6077 mbstat.m_wait++;
6078 mcache_retry = TRUE;
6079 goto done;
6080 } else if (wait & MCR_TRYHARD) {
6081 mcache_retry = TRUE;
6082 goto done;
6083 }
6084
6085 /*
6086 * There's really nothing for us right now; inform the
6087 * cache(s) that there is a waiter below and go to sleep.
6088 */
6089 mbuf_waiter_inc(class, (wait & MCR_COMP));
6090
6091 VERIFY(!(wait & MCR_NOSLEEP));
6092
6093 /*
6094 * If this is the first waiter, arm the watchdog timer. Otherwise
6095 * check if we need to panic the system due to watchdog timeout.
6096 */
6097 if (mb_waiters == 0)
6098 microuptime(&mb_wdtstart);
6099 else
6100 mbuf_watchdog();
6101
6102 mb_waiters++;
6103 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6104
6105 /* We are now up; stop getting notified until next round */
6106 mbuf_waiter_dec(class, (wait & MCR_COMP));
6107
6108 /* We waited and got something */
6109 if (m_infree(class) > 0) {
6110 mbstat.m_wait++;
6111 goto done;
6112 } else if (mbuf_cached_above(class, wait)) {
6113 mbstat.m_wait++;
6114 mcache_retry = TRUE;
6115 }
6116 done:
6117 return (mcache_retry);
6118 }
6119
6120 static void
6121 mbuf_worker_thread(void)
6122 {
6123 int mbuf_expand;
6124
6125 while (1) {
6126 lck_mtx_lock(mbuf_mlock);
6127
6128 mbuf_expand = 0;
6129 if (mbuf_expand_mcl) {
6130 int n;
6131
6132 /* Adjust to current number of 2 KB clusters in use */
6133 n = mbuf_expand_mcl -
6134 (m_total(MC_CL) - m_infree(MC_CL));
6135 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6136 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6137 mbuf_expand_mcl = 0;
6138
6139 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6140 mbuf_expand++;
6141 }
6142 if (mbuf_expand_big) {
6143 int n;
6144
6145 /* Adjust to current number of 4 KB clusters in use */
6146 n = mbuf_expand_big -
6147 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6148 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6149 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6150 mbuf_expand_big = 0;
6151
6152 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6153 mbuf_expand++;
6154 }
6155 if (mbuf_expand_16k) {
6156 int n;
6157
6158 /* Adjust to current number of 16 KB clusters in use */
6159 n = mbuf_expand_16k -
6160 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6161 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6162 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6163 mbuf_expand_16k = 0;
6164
6165 if (n > 0)
6166 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6167 }
6168
6169 /*
6170 * Because we can run out of memory before filling the mbuf
6171 * map, we should not allocate more clusters than there are
6172 * mbufs -- otherwise we could have a large number of useless
6173 * clusters allocated.
6174 */
6175 if (mbuf_expand) {
6176 while (m_total(MC_MBUF) <
6177 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6178 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6179 break;
6180 }
6181 }
6182
6183 lck_mtx_unlock(mbuf_mlock);
6184
6185 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6186 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6187 }
6188 }
6189
6190 static void
6191 mbuf_worker_thread_init(void)
6192 {
6193 mbuf_worker_ready++;
6194 mbuf_worker_thread();
6195 }
6196
6197 static mcl_slab_t *
6198 slab_get(void *buf)
6199 {
6200 mcl_slabg_t *slg;
6201 unsigned int ix, k;
6202
6203 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6204
6205 VERIFY(MBUF_IN_MAP(buf));
6206 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6207 VERIFY(ix < maxslabgrp);
6208
6209 if ((slg = slabstbl[ix]) == NULL) {
6210 /*
6211 * In the current implementation, we never shrink the memory
6212 * pool (hence the cluster map); if we attempt to reallocate
6213 * a cluster group when it's already allocated, panic since
6214 * this is a sign of memory corruption (slabstbl[ix] got
6215 * nullified). This also means that there shouldn't be any
6216 * hole in the kernel sub-map for the mbuf pool.
6217 */
6218 ++slabgrp;
6219 VERIFY(ix < slabgrp);
6220 /*
6221 * Slab expansion can only be done single-threaded; when
6222 * we get here, it must be as a result of m_clalloc() which
6223 * is serialized and therefore mb_clalloc_busy must be set.
6224 */
6225 VERIFY(mb_clalloc_busy);
6226 lck_mtx_unlock(mbuf_mlock);
6227
6228 /* This is a new buffer; create the slabs group for it */
6229 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6230 M_WAITOK | M_ZERO);
6231 VERIFY(slg != NULL);
6232
6233 lck_mtx_lock(mbuf_mlock);
6234 /*
6235 * No other thread could have gone into m_clalloc() after
6236 * we dropped the lock above, so verify that it's true.
6237 */
6238 VERIFY(mb_clalloc_busy);
6239
6240 slabstbl[ix] = slg;
6241
6242 /* Chain each slab in the group to its forward neighbor */
6243 for (k = 1; k < NSLABSPMB; k++)
6244 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6245 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6246
6247 /* And chain the last slab in the previous group to this */
6248 if (ix > 0) {
6249 VERIFY(slabstbl[ix - 1]->
6250 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6251 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6252 &slg->slg_slab[0];
6253 }
6254 }
6255
6256 ix = MTOBG(buf) % NSLABSPMB;
6257 VERIFY(ix < NSLABSPMB);
6258
6259 return (&slg->slg_slab[ix]);
6260 }
6261
6262 static void
6263 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6264 void *base, void *head, unsigned int len, int refcnt, int chunks)
6265 {
6266 sp->sl_class = class;
6267 sp->sl_flags = flags;
6268 sp->sl_base = base;
6269 sp->sl_head = head;
6270 sp->sl_len = len;
6271 sp->sl_refcnt = refcnt;
6272 sp->sl_chunks = chunks;
6273 slab_detach(sp);
6274 }
6275
6276 static void
6277 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6278 {
6279 VERIFY(slab_is_detached(sp));
6280 m_slab_cnt(class)++;
6281 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6282 sp->sl_flags &= ~SLF_DETACHED;
6283 if (class == MC_16KCL) {
6284 int k;
6285 for (k = 1; k < NSLABSP16KB; k++) {
6286 sp = sp->sl_next;
6287 /* Next slab must already be present */
6288 VERIFY(sp != NULL);
6289 VERIFY(slab_is_detached(sp));
6290 sp->sl_flags &= ~SLF_DETACHED;
6291 }
6292 }
6293 }
6294
6295 static void
6296 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6297 {
6298 VERIFY(!slab_is_detached(sp));
6299 VERIFY(m_slab_cnt(class) > 0);
6300 m_slab_cnt(class)--;
6301 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6302 slab_detach(sp);
6303 if (class == MC_16KCL) {
6304 int k;
6305 for (k = 1; k < NSLABSP16KB; k++) {
6306 sp = sp->sl_next;
6307 /* Next slab must already be present */
6308 VERIFY(sp != NULL);
6309 VERIFY(!slab_is_detached(sp));
6310 slab_detach(sp);
6311 }
6312 }
6313 }
6314
6315 static boolean_t
6316 slab_inrange(mcl_slab_t *sp, void *buf)
6317 {
6318 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6319 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6320 }
6321
6322 #undef panic
6323
6324 static void
6325 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6326 {
6327 int i;
6328 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6329 uintptr_t buf = (uintptr_t)sp->sl_base;
6330
6331 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6332 void *next = ((mcache_obj_t *)buf)->obj_next;
6333 if (next != addr)
6334 continue;
6335 if (!mclverify) {
6336 if (next != NULL && !MBUF_IN_MAP(next)) {
6337 mcache_t *cp = m_cache(sp->sl_class);
6338 panic("%s: %s buffer %p in slab %p modified "
6339 "after free at offset 0: %p out of range "
6340 "[%p-%p)\n", __func__, cp->mc_name,
6341 (void *)buf, sp, next, mbutl, embutl);
6342 /* NOTREACHED */
6343 }
6344 } else {
6345 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6346 (mcache_obj_t *)buf);
6347 mcl_audit_verify_nextptr(next, mca);
6348 }
6349 }
6350 }
6351
6352 static void
6353 slab_detach(mcl_slab_t *sp)
6354 {
6355 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6356 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6357 sp->sl_flags |= SLF_DETACHED;
6358 }
6359
6360 static boolean_t
6361 slab_is_detached(mcl_slab_t *sp)
6362 {
6363 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6364 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6365 (sp->sl_flags & SLF_DETACHED));
6366 }
6367
6368 static void
6369 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6370 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6371 {
6372 mcache_audit_t *mca, *mca_tail;
6373 mcache_obj_t *con = NULL;
6374 boolean_t save_contents = (con_list != NULL);
6375 unsigned int i, ix;
6376
6377 ASSERT(num <= NMBPBG);
6378 ASSERT(con_list == NULL || con_size != 0);
6379
6380 ix = MTOBG(buf);
6381 VERIFY(ix < maxclaudit);
6382
6383 /* Make sure we haven't been here before */
6384 for (i = 0; i < NMBPBG; i++)
6385 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6386
6387 mca = mca_tail = *mca_list;
6388 if (save_contents)
6389 con = *con_list;
6390
6391 for (i = 0; i < num; i++) {
6392 mcache_audit_t *next;
6393
6394 next = mca->mca_next;
6395 bzero(mca, sizeof (*mca));
6396 mca->mca_next = next;
6397 mclaudit[ix].cl_audit[i] = mca;
6398
6399 /* Attach the contents buffer if requested */
6400 if (save_contents) {
6401 mcl_saved_contents_t *msc =
6402 (mcl_saved_contents_t *)(void *)con;
6403
6404 VERIFY(msc != NULL);
6405 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6406 VERIFY(con_size == sizeof (*msc));
6407 mca->mca_contents_size = con_size;
6408 mca->mca_contents = msc;
6409 con = con->obj_next;
6410 bzero(mca->mca_contents, mca->mca_contents_size);
6411 }
6412
6413 mca_tail = mca;
6414 mca = mca->mca_next;
6415 }
6416
6417 if (save_contents)
6418 *con_list = con;
6419
6420 *mca_list = mca_tail->mca_next;
6421 mca_tail->mca_next = NULL;
6422 }
6423
6424 /*
6425 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6426 * the corresponding audit structure for that buffer.
6427 */
6428 static mcache_audit_t *
6429 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6430 {
6431 mcache_audit_t *mca = NULL;
6432 int ix = MTOBG(o);
6433
6434 VERIFY(ix < maxclaudit);
6435 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6436
6437 switch (class) {
6438 case MC_MBUF:
6439 /*
6440 * For the mbuf case, find the index of the page
6441 * used by the mbuf and use that index to locate the
6442 * base address of the page. Then find out the
6443 * mbuf index relative to the page base and use
6444 * it to locate the audit structure.
6445 */
6446 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6447 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6448 break;
6449
6450 case MC_CL:
6451 /*
6452 * Same thing as above, but for 2KB clusters in a page.
6453 */
6454 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6455 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6456 break;
6457
6458 case MC_BIGCL:
6459 case MC_16KCL:
6460 /*
6461 * Same as above, but only return the first element.
6462 */
6463 mca = mclaudit[ix].cl_audit[0];
6464 break;
6465
6466 default:
6467 VERIFY(0);
6468 /* NOTREACHED */
6469 }
6470
6471 return (mca);
6472 }
6473
6474 static void
6475 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6476 boolean_t alloc)
6477 {
6478 struct mbuf *m = addr;
6479 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6480
6481 VERIFY(mca->mca_contents != NULL &&
6482 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6483
6484 if (mclverify)
6485 mcl_audit_verify_nextptr(next, mca);
6486
6487 if (!alloc) {
6488 /* Save constructed mbuf fields */
6489 mcl_audit_save_mbuf(m, mca);
6490 if (mclverify) {
6491 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6492 m_maxsize(MC_MBUF));
6493 }
6494 ((mcache_obj_t *)m)->obj_next = next;
6495 return;
6496 }
6497
6498 /* Check if the buffer has been corrupted while in freelist */
6499 if (mclverify) {
6500 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6501 }
6502 /* Restore constructed mbuf fields */
6503 mcl_audit_restore_mbuf(m, mca, composite);
6504 }
6505
6506 static void
6507 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6508 {
6509 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6510
6511 if (composite) {
6512 struct mbuf *next = m->m_next;
6513 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6514 MBUF_IS_COMPOSITE(ms));
6515 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6516 /*
6517 * We could have hand-picked the mbuf fields and restore
6518 * them individually, but that will be a maintenance
6519 * headache. Instead, restore everything that was saved;
6520 * the mbuf layer will recheck and reinitialize anyway.
6521 */
6522 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6523 m->m_next = next;
6524 } else {
6525 /*
6526 * For a regular mbuf (no cluster attached) there's nothing
6527 * to restore other than the type field, which is expected
6528 * to be MT_FREE.
6529 */
6530 m->m_type = ms->m_type;
6531 }
6532 _MCHECK(m);
6533 }
6534
6535 static void
6536 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6537 {
6538 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6539 _MCHECK(m);
6540 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
6541 }
6542
6543 static void
6544 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6545 boolean_t save_next)
6546 {
6547 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6548
6549 if (!alloc) {
6550 if (mclverify) {
6551 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6552 }
6553 if (save_next) {
6554 mcl_audit_verify_nextptr(next, mca);
6555 ((mcache_obj_t *)addr)->obj_next = next;
6556 }
6557 } else if (mclverify) {
6558 /* Check if the buffer has been corrupted while in freelist */
6559 mcl_audit_verify_nextptr(next, mca);
6560 mcache_audit_free_verify_set(mca, addr, 0, size);
6561 }
6562 }
6563
6564 static void
6565 mcl_audit_scratch(mcache_audit_t *mca)
6566 {
6567 void *stack[MCACHE_STACK_DEPTH + 1];
6568 mcl_scratch_audit_t *msa;
6569 struct timeval now;
6570
6571 VERIFY(mca->mca_contents != NULL);
6572 msa = MCA_SAVED_SCRATCH_PTR(mca);
6573
6574 msa->msa_pthread = msa->msa_thread;
6575 msa->msa_thread = current_thread();
6576 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6577 msa->msa_pdepth = msa->msa_depth;
6578 bzero(stack, sizeof (stack));
6579 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
6580 bcopy(&stack[1], msa->msa_stack, sizeof (mca->mca_pstack));
6581
6582 msa->msa_ptstamp = msa->msa_tstamp;
6583 microuptime(&now);
6584 /* tstamp is in ms relative to mb_start */
6585 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6586 if ((now.tv_sec - mb_start.tv_sec) > 0)
6587 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6588 }
6589
6590 static void
6591 mcl_audit_mcheck_panic(struct mbuf *m)
6592 {
6593 mcache_audit_t *mca;
6594
6595 MRANGE(m);
6596 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6597
6598 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6599 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6600 /* NOTREACHED */
6601 }
6602
6603 static void
6604 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6605 {
6606 if (next != NULL && !MBUF_IN_MAP(next) &&
6607 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6608 panic("mcl_audit: buffer %p modified after free at offset 0: "
6609 "%p out of range [%p-%p)\n%s\n",
6610 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6611 /* NOTREACHED */
6612 }
6613 }
6614
6615 /* This function turns on mbuf leak detection */
6616 static void
6617 mleak_activate(void)
6618 {
6619 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6620 PE_parse_boot_argn("mleak_sample_factor",
6621 &mleak_table.mleak_sample_factor,
6622 sizeof (mleak_table.mleak_sample_factor));
6623
6624 if (mleak_table.mleak_sample_factor == 0)
6625 mclfindleak = 0;
6626
6627 if (mclfindleak == 0)
6628 return;
6629
6630 vm_size_t alloc_size =
6631 mleak_alloc_buckets * sizeof (struct mallocation);
6632 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6633
6634 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6635 M_TEMP, M_WAITOK | M_ZERO);
6636 VERIFY(mleak_allocations != NULL);
6637
6638 MALLOC(mleak_traces, struct mtrace *, trace_size,
6639 M_TEMP, M_WAITOK | M_ZERO);
6640 VERIFY(mleak_traces != NULL);
6641
6642 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6643 M_TEMP, M_WAITOK | M_ZERO);
6644 VERIFY(mleak_stat != NULL);
6645 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6646 #ifdef __LP64__
6647 mleak_stat->ml_isaddr64 = 1;
6648 #endif /* __LP64__ */
6649 }
6650
6651 static void
6652 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6653 {
6654 int temp;
6655
6656 if (mclfindleak == 0)
6657 return;
6658
6659 if (!alloc)
6660 return (mleak_free(addr));
6661
6662 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6663
6664 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6665 uintptr_t bt[MLEAK_STACK_DEPTH];
6666 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6667 mleak_log(bt, addr, logged, num);
6668 }
6669 }
6670
6671 /*
6672 * This function records the allocation in the mleak_allocations table
6673 * and the backtrace in the mleak_traces table; if the allocation slot is
6674 * in use, replace the old allocation with the new one. If the trace slot
6675 * is in use, return (or increment the refcount if it is the same trace).
6676 */
6677 static boolean_t
6678 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6679 {
6680 struct mallocation *allocation;
6681 struct mtrace *trace;
6682 uint32_t trace_index;
6683
6684 /* Quit if someone else modifying the tables */
6685 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6686 mleak_table.total_conflicts++;
6687 return (FALSE);
6688 }
6689
6690 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6691 mleak_alloc_buckets)];
6692 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6693 trace = &mleak_traces[trace_index];
6694
6695 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6696 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6697
6698 allocation->hitcount++;
6699 trace->hitcount++;
6700
6701 /*
6702 * If the allocation bucket we want is occupied
6703 * and the occupier has the same trace, just bail.
6704 */
6705 if (allocation->element != NULL &&
6706 trace_index == allocation->trace_index) {
6707 mleak_table.alloc_collisions++;
6708 lck_mtx_unlock(mleak_lock);
6709 return (TRUE);
6710 }
6711
6712 /*
6713 * Store the backtrace in the traces array;
6714 * Size of zero = trace bucket is free.
6715 */
6716 if (trace->allocs > 0 &&
6717 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6718 /* Different, unique trace, but the same hash! Bail out. */
6719 trace->collisions++;
6720 mleak_table.trace_collisions++;
6721 lck_mtx_unlock(mleak_lock);
6722 return (TRUE);
6723 } else if (trace->allocs > 0) {
6724 /* Same trace, already added, so increment refcount */
6725 trace->allocs++;
6726 } else {
6727 /* Found an unused trace bucket, so record the trace here */
6728 if (trace->depth != 0) {
6729 /* this slot previously used but not currently in use */
6730 mleak_table.trace_overwrites++;
6731 }
6732 mleak_table.trace_recorded++;
6733 trace->allocs = 1;
6734 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6735 trace->depth = depth;
6736 trace->collisions = 0;
6737 }
6738
6739 /* Step 2: Store the allocation record in the allocations array */
6740 if (allocation->element != NULL) {
6741 /*
6742 * Replace an existing allocation. No need to preserve
6743 * because only a subset of the allocations are being
6744 * recorded anyway.
6745 */
6746 mleak_table.alloc_collisions++;
6747 } else if (allocation->trace_index != 0) {
6748 mleak_table.alloc_overwrites++;
6749 }
6750 allocation->element = addr;
6751 allocation->trace_index = trace_index;
6752 allocation->count = num;
6753 mleak_table.alloc_recorded++;
6754 mleak_table.outstanding_allocs++;
6755
6756 lck_mtx_unlock(mleak_lock);
6757 return (TRUE);
6758 }
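
/*
 * Illustrative sketch (not compiled in): the leak tables above are
 * fixed-size hash tables -- one keyed by allocation address, one keyed
 * by the backtrace hash -- that simply tolerate collisions rather than
 * chaining.  A minimal version of the address-to-bucket lookup, using
 * the same hashaddr() and mleak_alloc_buckets symbols as above, would
 * look like this.  example_alloc_bucket() is not part of this file.
 */
#if 0
static struct mallocation *
example_alloc_bucket(mcache_obj_t *addr)
{
	/* Hash the address into the fixed-size allocations array */
	return (&mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)]);
}
#endif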
6759
6760 static void
6761 mleak_free(mcache_obj_t *addr)
6762 {
6763 while (addr != NULL) {
6764 struct mallocation *allocation = &mleak_allocations
6765 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6766
6767 if (allocation->element == addr &&
6768 allocation->trace_index < mleak_trace_buckets) {
6769 lck_mtx_lock_spin(mleak_lock);
6770 if (allocation->element == addr &&
6771 allocation->trace_index < mleak_trace_buckets) {
6772 struct mtrace *trace;
6773 trace = &mleak_traces[allocation->trace_index];
6774 /* allocs = 0 means trace bucket is unused */
6775 if (trace->allocs > 0)
6776 trace->allocs--;
6777 if (trace->allocs == 0)
6778 trace->depth = 0;
6779 /* NULL element means alloc bucket is unused */
6780 allocation->element = NULL;
6781 mleak_table.outstanding_allocs--;
6782 }
6783 lck_mtx_unlock(mleak_lock);
6784 }
6785 addr = addr->obj_next;
6786 }
6787 }
6788
6789 static void
6790 mleak_sort_traces()
6791 {
6792 int i, j, k;
6793 struct mtrace *swap;
6794
6795 for (i = 0; i < MLEAK_NUM_TRACES; i++)
6796 mleak_top_trace[i] = NULL;
6797
6798 for (i = 0, j = 0; j < MLEAK_NUM_TRACES &&
6799     i < mleak_trace_buckets; i++) {
6800 if (mleak_traces[i].allocs <= 0)
6801 continue;
6802
6803 mleak_top_trace[j] = &mleak_traces[i];
6804 for (k = j; k > 0; k--) {
6805 if (mleak_top_trace[k]->allocs <=
6806 mleak_top_trace[k-1]->allocs)
6807 break;
6808
6809 swap = mleak_top_trace[k-1];
6810 mleak_top_trace[k-1] = mleak_top_trace[k];
6811 mleak_top_trace[k] = swap;
6812 }
6813 j++;
6814 }
6815
6816 j--;
6817 for (; i < mleak_trace_buckets; i++) {
6818 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6819 continue;
6820
6821 mleak_top_trace[j] = &mleak_traces[i];
6822
6823 for (k = j; k > 0; k--) {
6824 if (mleak_top_trace[k]->allocs <=
6825 mleak_top_trace[k-1]->allocs)
6826 break;
6827
6828 swap = mleak_top_trace[k-1];
6829 mleak_top_trace[k-1] = mleak_top_trace[k];
6830 mleak_top_trace[k] = swap;
6831 }
6832 }
6833 }
6834
6835 static void
6836 mleak_update_stats()
6837 {
6838 mleak_trace_stat_t *mltr;
6839 int i;
6840
6841 VERIFY(mleak_stat != NULL);
6842 #ifdef __LP64__
6843 VERIFY(mleak_stat->ml_isaddr64);
6844 #else
6845 VERIFY(!mleak_stat->ml_isaddr64);
6846 #endif /* !__LP64__ */
6847 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6848
6849 mleak_sort_traces();
6850
6851 mltr = &mleak_stat->ml_trace[0];
6852 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6853 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6854 int j;
6855
6856 if (mleak_top_trace[i] == NULL ||
6857 mleak_top_trace[i]->allocs == 0)
6858 continue;
6859
6860 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6861 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6862 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6863 mltr->mltr_depth = mleak_top_trace[i]->depth;
6864
6865 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6866 for (j = 0; j < mltr->mltr_depth; j++)
6867 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6868
6869 mltr++;
6870 }
6871 }
6872
6873 static struct mbtypes {
6874 int mt_type;
6875 const char *mt_name;
6876 } mbtypes[] = {
6877 { MT_DATA, "data" },
6878 { MT_OOBDATA, "oob data" },
6879 { MT_CONTROL, "ancillary data" },
6880 { MT_HEADER, "packet headers" },
6881 { MT_SOCKET, "socket structures" },
6882 { MT_PCB, "protocol control blocks" },
6883 { MT_RTABLE, "routing table entries" },
6884 { MT_HTABLE, "IMP host table entries" },
6885 { MT_ATABLE, "address resolution tables" },
6886 { MT_FTABLE, "fragment reassembly queue headers" },
6887 { MT_SONAME, "socket names and addresses" },
6888 { MT_SOOPTS, "socket options" },
6889 { MT_RIGHTS, "access rights" },
6890 { MT_IFADDR, "interface addresses" },
6891 { MT_TAG, "packet tags" },
6892 { 0, NULL }
6893 };
6894
6895 #define MBUF_DUMP_BUF_CHK() { \
6896 clen -= k; \
6897 if (clen < 1) \
6898 goto done; \
6899 c += k; \
6900 }
6901
6902 static char *
6903 mbuf_dump(void)
6904 {
6905 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6906 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6907 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6908 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6909 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6910 uint8_t seen[256];
6911 struct mbtypes *mp;
6912 mb_class_stat_t *sp;
6913 mleak_trace_stat_t *mltr;
6914 char *c = mbuf_dump_buf;
6915 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6916
6917 mbuf_dump_buf[0] = '\0';
6918
6919 /* synchronize all statistics in the mbuf table */
6920 mbuf_stat_sync();
6921 mbuf_mtypes_sync(TRUE);
6922
6923 sp = &mb_stat->mbs_class[0];
6924 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6925 u_int32_t mem;
6926
6927 if (m_class(i) == MC_MBUF) {
6928 m_mbufs = sp->mbcl_active;
6929 } else if (m_class(i) == MC_CL) {
6930 m_clfree = sp->mbcl_total - sp->mbcl_active;
6931 } else if (m_class(i) == MC_BIGCL) {
6932 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6933 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6934 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6935 m_16kclusters = sp->mbcl_total;
6936 } else if (m_class(i) == MC_MBUF_CL) {
6937 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6938 } else if (m_class(i) == MC_MBUF_BIGCL) {
6939 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6940 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6941 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6942 }
6943
6944 mem = sp->mbcl_ctotal * sp->mbcl_size;
6945 totmem += mem;
6946 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6947 sp->mbcl_size;
6948
6949 }
6950
6951 /* adjust free counts to include composite caches */
6952 m_clfree += m_mbufclfree;
6953 m_bigclfree += m_mbufbigclfree;
6954 m_16kclfree += m_mbuf16kclfree;
6955
6956 totmbufs = 0;
6957 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6958 totmbufs += mbstat.m_mtypes[mp->mt_type];
6959 if (totmbufs > m_mbufs)
6960 totmbufs = m_mbufs;
6961 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6962 MBUF_DUMP_BUF_CHK();
6963
6964 bzero(&seen, sizeof (seen));
6965 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6966 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6967 seen[mp->mt_type] = 1;
6968 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6969 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6970 MBUF_DUMP_BUF_CHK();
6971 }
6972 }
6973 seen[MT_FREE] = 1;
6974 for (i = 0; i < nmbtypes; i++)
6975 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6976 k = snprintf(c, clen, "\t%u mbufs allocated to "
6977 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6978 MBUF_DUMP_BUF_CHK();
6979 }
6980 if ((m_mbufs - totmbufs) > 0) {
6981 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6982 m_mbufs - totmbufs);
6983 MBUF_DUMP_BUF_CHK();
6984 }
6985 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6986 "%u/%u mbuf 4KB clusters in use\n",
6987 (unsigned int)(mbstat.m_clusters - m_clfree),
6988 (unsigned int)mbstat.m_clusters,
6989 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6990 (unsigned int)mbstat.m_bigclusters);
6991 MBUF_DUMP_BUF_CHK();
6992
6993 if (njcl > 0) {
6994 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6995 m_16kclusters - m_16kclfree, m_16kclusters,
6996 njclbytes / 1024);
6997 MBUF_DUMP_BUF_CHK();
6998 }
6999 totused = totmem - totfree;
7000 if (totmem == 0) {
7001 totpct = 0;
7002 } else if (totused < (ULONG_MAX / 100)) {
7003 totpct = (totused * 100) / totmem;
7004 } else {
7005 u_long totmem1 = totmem / 100;
7006 u_long totused1 = totused / 100;
7007 totpct = (totused1 * 100) / totmem1;
7008 }
7009 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7010 "in use)\n", totmem / 1024, totpct);
7011 MBUF_DUMP_BUF_CHK();
7012
7013 /* mbuf leak detection statistics */
7014 mleak_update_stats();
7015
7016 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7017 MBUF_DUMP_BUF_CHK();
7018 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7019 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7020 mleak_table.mleak_sample_factor);
7021 MBUF_DUMP_BUF_CHK();
7022 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7023 mleak_table.outstanding_allocs);
7024 MBUF_DUMP_BUF_CHK();
7025 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7026 mleak_table.alloc_recorded, mleak_table.trace_recorded);
7027 MBUF_DUMP_BUF_CHK();
7028 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7029 mleak_table.alloc_collisions, mleak_table.trace_collisions);
7030 MBUF_DUMP_BUF_CHK();
7031 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7032 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7033 MBUF_DUMP_BUF_CHK();
7034 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7035 mleak_table.total_conflicts);
7036 MBUF_DUMP_BUF_CHK();
7037
7038 k = snprintf(c, clen, "top %d outstanding traces:\n",
7039 mleak_stat->ml_cnt);
7040 MBUF_DUMP_BUF_CHK();
7041 for (i = 0; i < mleak_stat->ml_cnt; i++) {
7042 mltr = &mleak_stat->ml_trace[i];
7043 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7044 "%llu hit(s), %llu collision(s)\n", (i + 1),
7045 mltr->mltr_allocs, mltr->mltr_hitcount,
7046 mltr->mltr_collisions);
7047 MBUF_DUMP_BUF_CHK();
7048 }
7049
7050 if (mleak_stat->ml_isaddr64)
7051 k = snprintf(c, clen, MB_LEAK_HDR_64);
7052 else
7053 k = snprintf(c, clen, MB_LEAK_HDR_32);
7054 MBUF_DUMP_BUF_CHK();
7055
7056 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7057 int j;
7058 k = snprintf(c, clen, "%2d: ", (i + 1));
7059 MBUF_DUMP_BUF_CHK();
7060 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7061 mltr = &mleak_stat->ml_trace[j];
7062 if (i < mltr->mltr_depth) {
7063 if (mleak_stat->ml_isaddr64) {
7064 k = snprintf(c, clen, "0x%0llx ",
7065 mltr->mltr_addr[i]);
7066 } else {
7067 k = snprintf(c, clen,
7068 "0x%08x ",
7069 (u_int32_t)mltr->mltr_addr[i]);
7070 }
7071 } else {
7072 if (mleak_stat->ml_isaddr64)
7073 k = snprintf(c, clen,
7074 MB_LEAK_SPACING_64);
7075 else
7076 k = snprintf(c, clen,
7077 MB_LEAK_SPACING_32);
7078 }
7079 MBUF_DUMP_BUF_CHK();
7080 }
7081 k = snprintf(c, clen, "\n");
7082 MBUF_DUMP_BUF_CHK();
7083 }
7084 done:
7085 return (mbuf_dump_buf);
7086 }
7087
7088 #undef MBUF_DUMP_BUF_CHK
7089
7090 /*
7091 * Convert between a regular and a packet header mbuf. The caller selects the
7092 * direction via 'hdr'; this routine sets or clears M_PKTHDR and does the rest.
7093 */
7094 int
7095 m_reinit(struct mbuf *m, int hdr)
7096 {
7097 int ret = 0;
7098
7099 if (hdr) {
7100 VERIFY(!(m->m_flags & M_PKTHDR));
7101 if (!(m->m_flags & M_EXT) &&
7102 (m->m_data != m->m_dat || m->m_len > 0)) {
7103 /*
7104 * If there's no external cluster attached and the
7105 * mbuf appears to contain user data, we cannot
7106 * safely convert this to a packet header mbuf,
7107 * as the packet header structure might overlap
7108 * with the data.
7109 */
7110 printf("%s: cannot set M_PKTHDR on altered mbuf %p, "
7111 "m_data %p (expected %p), m_len %d (expected 0)\n",
7112 __func__, m, m->m_data, m->m_dat, m->m_len);
7113 ret = EBUSY;
7114 } else {
7115 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7116 m->m_flags |= M_PKTHDR;
7117 MBUF_INIT_PKTHDR(m);
7118 }
7119 } else {
7120 /* Check for scratch area overflow */
7121 m_redzone_verify(m);
7122 /* Free the aux data and tags if there is any */
7123 m_tag_delete_chain(m, NULL);
7124 m->m_flags &= ~M_PKTHDR;
7125 }
7126
7127 return (ret);
7128 }
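
/*
 * Illustrative sketch (not compiled in): converting a freshly allocated
 * plain mbuf into a packet header mbuf with m_reinit().  The conversion
 * fails with EBUSY if the mbuf already appears to hold data and has no
 * external cluster attached.  example_make_pkthdr() is not part of this
 * file.
 */
#if 0
static struct mbuf *
example_make_pkthdr(struct mbuf *m)
{
	if (m_reinit(m, 1) != 0)
		return (NULL);	/* could not convert in place */

	/* m now carries M_PKTHDR and an initialized packet header */
	return (m);
}
#endif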
7129
7130 void
7131 m_scratch_init(struct mbuf *m)
7132 {
7133 VERIFY(m->m_flags & M_PKTHDR);
7134
7135 bzero(&m->m_pkthdr.pkt_mpriv, sizeof (m->m_pkthdr.pkt_mpriv));
7136 }
7137
7138 u_int32_t
7139 m_scratch_get(struct mbuf *m, u_int8_t **p)
7140 {
7141 VERIFY(m->m_flags & M_PKTHDR);
7142
7143 if (mcltrace) {
7144 mcache_audit_t *mca;
7145
7146 lck_mtx_lock(mbuf_mlock);
7147 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7148 if (mca->mca_uflags & MB_SCVALID)
7149 mcl_audit_scratch(mca);
7150 lck_mtx_unlock(mbuf_mlock);
7151 }
7152
7153 *p = (u_int8_t *)&m->m_pkthdr.pkt_mpriv;
7154 return (sizeof (m->m_pkthdr.pkt_mpriv));
7155 }
7156
7157 static void
7158 m_redzone_init(struct mbuf *m)
7159 {
7160 VERIFY(m->m_flags & M_PKTHDR);
7161 /*
7162 * Each mbuf has a unique red zone pattern, which is an XOR
7163 * of the red zone cookie and the address of the mbuf.
7164 */
7165 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7166 }
7167
7168 static void
7169 m_redzone_verify(struct mbuf *m)
7170 {
7171 u_int32_t mb_redzone;
7172
7173 VERIFY(m->m_flags & M_PKTHDR);
7174
7175 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7176 if (m->m_pkthdr.redzone != mb_redzone) {
7177 panic("mbuf %p redzone violation with value 0x%x "
7178 "(instead of 0x%x, using cookie 0x%x)\n",
7179 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7180 /* NOTREACHED */
7181 }
7182 }
7183
7184 SYSCTL_DECL(_kern_ipc);
7185 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7186 CTLFLAG_RD | CTLFLAG_LOCKED,
7187 0, 0, mbstat_sysctl, "S,mbstat", "");
7188 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7189 CTLFLAG_RD | CTLFLAG_LOCKED,
7190 0, 0, mb_stat_sysctl, "S,mb_stat", "");
7191 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7192 CTLFLAG_RD | CTLFLAG_LOCKED,
7193 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7194 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7195 CTLFLAG_RD | CTLFLAG_LOCKED,
7196 0, 0, mleak_table_sysctl, "S,mleak_table", "");
7197 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7198 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7199 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7200 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7201 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7202 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");