/*
 * Copyright (c) 2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <security/mac_framework.h>

#include <sys/mcache.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents an mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of NBPG in size; each
 *	object represents an mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *		|	^	+-----------------------+
 *		v	|				|
 *	mcache_alloc/mcache_alloc_ext()	     mbuf_slab_audit()
 *		|					|
 *		v					|
 *	[CPU cache] -------> (found?) -------+
 *		|					|
 *		v					|
 *	mbuf_slab_alloc()				|
 *		|					|
 *		v					|
 *	+---------> [freelist] -------> (found?) -------+
 *	|		|
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *		|	^	+------ (done) ---------+
 *		v	|				|
 *	mcache_alloc/mcache_alloc_ext()	     mbuf_cslab_audit()
 *		|					|
 *		v					|
 *	[CPU cache] -------> (found?) -------+
 *		|					|
 *		v					|
 *	mbuf_cslab_alloc()				|
 *		|					|
 *		v					|
 *	[freelist] -------> (found?) -------+
 *		|					|
 *		v					|
 *	(rudimentary object)				|
 *	mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller.  As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern.  It
 * will also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *		|	^	+------ (done) ---------+
 *		v	|				|
 *	mcache_free/mcache_free_ext()			|
 *		|					|
 *		v					|
 *	mbuf_slab_audit()				|
 *		|					|
 *		v					|
 *	[CPU cache] ---> (not purging?) -----+
 *		|				     |
 *		v				     |
 *	mbuf_slab_free()			     |
 *		|				     |
 *		v				     |
 *	[freelist] ----------->>------------+
 *	(objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *		|	^	+------ (done) ---------+
 *		v	|				|
 *	mcache_free/mcache_free_ext()			|
 *		|					|
 *		v					|
 *	mbuf_cslab_audit()				|
 *		|					|
 *		v					|
 *	[CPU cache] ---> (not purging?) -----+
 *		|				     |
 *		v				     |
 *	mbuf_cslab_free()			     |
 *		|				     |
 *		v				     |
 *	[freelist] ---> (not purging?) -----+
 *		|				     |
 *		v				     |
 *	(rudimentary object)			     |
 *	mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Note
 * that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOCL(addr)			+-------------+
 *	      |		+----------->	| cl_audit[1] | -----> mcache_audit_t
 *	b = CLTOM(i)	|		+-------------+
 *	      |		|		|     ...     |
 *	x = MCLIDX(b, addr)		+-------------+
 *	      |		|		| cl_audit[7] |
 *	      +---------+		+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a cluster
 * can be turned into NMBPCL mbufs, we preserve enough space for the mbufs so
 * that there is a 1-to-1 mapping between them.  A cluster that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For big clusters, only one entry is allocated
 * and used for the entire cluster pair.
 */
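/*
 * Illustrative sketch only (not used by the code below): assuming "addr"
 * lies within the mapped cluster range, the lookup described above amounts
 * to roughly the following, using the MTOCL(), CLTOM() and MCLIDX() macros
 * defined later in this file:
 *
 *	int i = MTOCL(addr);			// index of the owning cluster
 *	union mcluster *b = CLTOM(i);		// base address of that cluster
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[MCLIDX(b, addr)];
 */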
/* TODO: should be in header file */
/* kernel translater */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

static lck_mtx_t *mbuf_mlock;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;
/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16K cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized;	/* number of packets "normalized" */
static unsigned int mbuf_gscale;	/* Power-of-two growth scale for m_howmany */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	4	/* Threshold: 15/16 of total */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Regular (2K) cluster */
	MC_BIGCL,	/* Large (4K) cluster */
	MC_16KCL,	/* Jumbo (16K) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4K) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16K) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
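/*
 * For example, MBUF_CLASS_COMPOSITE(MC_MBUF_CL) and
 * MBUF_CLASS_COMPOSITE(MC_MBUF_16KCL) are both true, since those classes
 * follow MBUF_CLASS_LAST (MC_16KCL) in the enumeration above, while
 * MBUF_CLASS_COMPOSITE(MC_CL) is false.
 */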
/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1	/* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back into the class's slab list,
 * if it's not already done.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */
/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;
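/*
 * Worked example (assuming the usual 2K clusters, i.e. MCLSHIFT == 11 and
 * MBSHIFT == 20): NSLABSPMB == (1 << 20) >> 11 == 512, i.e. one slab group
 * covers 512 cluster-sized slabs, or exactly 1MB of mapped memory, which is
 * what the "512 slabs/grp" annotation above refers to.
 */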
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPCL];	/* array of audits */
} mcl_audit_t;

#if CONFIG_MBUF_NOEXPAND
static unsigned int maxmbufcl;
#endif /* CONFIG_MBUF_NOEXPAND */
/*
 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf get copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#define	AUDIT_CONTENTS_SIZE	((MSIZE - MHLEN) + sizeof (_m_ext_t))
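/*
 * A note on the arithmetic (sketch): MHLEN is the data space left in an
 * mbuf after the m_hdr and pkthdr, so (MSIZE - MHLEN) is exactly the size
 * of those two headers; adding sizeof (_m_ext_t) makes room for the m_ext
 * portion, giving the shadow area described above.
 */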
/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mcluster *mbutl;		/* first mapped cluster address */
union mcluster *embutl;		/* ending virtual address of mclusters */
int max_linkhdr;		/* largest link-level header */
int max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

extern u_int32_t high_sb_max;
/* TODO: should be in header file */

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MCL_LOWAT	MINCL
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist;	/* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;
#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of sleepers */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static unsigned int m_length(struct mbuf *);
static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
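/*
 * Illustrative sketch only: at free time, the effect of the flag is roughly
 *
 *	if (MBUF_IS_COMPOSITE(m))
 *		;	// return the intact mbuf + cluster pair to its
 *			// composite cache
 *	else
 *		;	// tear the pair apart and free each piece to its
 *			// own cache
 *
 * The actual logic lives in m_free()/m_freem_list() and in the cslab
 * routines below.
 */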
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain cluster index and base cluster address.
 */
#define	MTOCL(x)	(((char *)(x) - (char *)mbutl) >> MCLSHIFT)
#define	CLTOM(x)	((union mcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to the cluster base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)
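/*
 * Worked example (assuming MSIZE == 256, which is what the ">> 8" above
 * encodes): an mbuf carved out at byte offset 512 within its 2K cluster
 * yields MCLIDX(cl, m) == 2, i.e. its audit information is found in
 * cl_audit[2] of the owning mclaudit[] entry.
 */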
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		(m)->m_pkthdr.rcvif = NULL;				\
		(m)->m_pkthdr.len = 0;					\
		(m)->m_pkthdr.header = NULL;				\
		(m)->m_pkthdr.csum_flags = 0;				\
		(m)->m_pkthdr.csum_data = 0;				\
		(m)->m_pkthdr.tso_segsz = 0;				\
		(m)->m_pkthdr.vlan_tag = 0;				\
		(m)->m_pkthdr.socket_id = 0;				\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
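/*
 * Illustrative sketch only: an mbuf sitting on the MC_MBUF_CL composite
 * freelist (reference count 0, EXTF_COMPOSITE set, see MBUF_IS_COMPOSITE)
 * is essentially the result of
 *
 *	MBUF_INIT(m, 0, MT_FREE);
 *	MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
 *
 * where "cl" is the attached 2K cluster and "rfa" its reference structure
 * obtained from ref_cache.
 */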
/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
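/*
 * Both size macros compute the byte offset of mbs_class[n] within the
 * respective structure, i.e. the fixed header plus n per-class entries;
 * for example, MB_STAT_SIZE(NELEM(mbuf_table)) is the exact number of
 * bytes needed to export one mb_class_stat_t per mbuf class.
 */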
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
/* This should be in a header file */
#define	atomic_add_16(a, n)	((void) OSAddAtomic16(n, a))
#define	atomic_add_32(a, n)	((void) OSAddAtomic(n, a))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
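/*
 * Usage sketch: a routine that converts a free mbuf into an MT_DATA mbuf
 * would account for it with
 *
 *	mtype_stat_inc(MT_DATA);
 *	mtype_stat_dec(MT_FREE);
 *
 * Both updates go to the per-CPU cpu_mtypes[] counters (or atomically to
 * mbstat.m_mtypes[] for types >= MT_MAX) and are folded back into
 * mbstat.m_mtypes[] by mbstat_sysctl() below.
 */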
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int m, n;
	mtypes_cpu_t mtc;

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache and mbufs */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	unsigned int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters is going to be split in 2 to hold both the 2K
	 * and the 4K pools, so make sure each half is even.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);

	/*
	 * Each jumbo cluster takes 8 2K clusters, so make
	 * sure that the pool size is evenly divisible by 8.
	 */
	njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

#if CONFIG_MBUF_NOEXPAND
	/* Only use 4k clusters if we're setting aside more than 256k */
	if (nmbclusters <= 128) {
		maxmbufcl = nmbclusters / 4;
	} else {
		/* Half to big clusters, half to small */
		maxmbufcl = (nmbclusters / 4) * 3;
	}
#endif /* CONFIG_MBUF_NOEXPAND */
	/*
	 * 1/2 of the map is reserved for 2K clusters.  Out of this, 1/16th
	 * of the total number of 2K clusters allocated is reserved and cannot
	 * be turned into mbufs.  It can only be used for pure cluster objects.
	 */
	m_minlimit(MC_CL) = (nclusters >> 5);
	m_maxlimit(MC_CL) = (nclusters >> 1);
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * The remaining (15/16th) can be turned into mbufs.
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * The other 1/2 of the map is reserved for 4K clusters.
	 */
	m_minlimit(MC_BIGCL) = 0;
	m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = NBPG;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> 3);
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
__private_extern__ unsigned int
mbuf_default_ncl(int srv, uint64_t mem)
{
	unsigned int n;
#if !defined(__LP64__)
#pragma unused(srv)
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int i;
	ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
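/*
 * Worked example for the 64-bit path: a non-server configuration with 4 GB
 * of memory falls between the 1 GB and 8 GB rows of ncl_table[], so the
 * loop above stops after taking the 1 GB row's value and the mbuf pool
 * defaults to 64 MB; the same machine in server mode would get 128 MB from
 * ncl_table_srv[].
 */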
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	int initmcl = MINCL;
	void *buf;
	thread_t thread = THREAD_NULL;

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);

	/* Allocate cluster slabs table */
	maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/* Allocate audit structures if needed */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_AUDIT) {
		MALLOC(mclaudit, mcl_audit_t *,
		    nmbclusters * sizeof (*mclaudit), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mcluster *)
	    ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));

	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));

	lck_mtx_lock(mbuf_mlock);

	if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
		panic("mbinit: m_clalloc failed\n");

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc;
		u_int32_t flags = mbuf_debug;

		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	mbuf_gscale = MB_GROWTH_NORMAL;

	/*
	 * Set the max limit on sb_max to be 1/16 th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 M of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1M of mbuf pool, cap the size
			 * of max sock buf at 1M
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	printf("mbinit: done (%d MB memory set for mbuf pool)\n",
	    (nmbclusters << MCLSHIFT) >> MBSHIFT);
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having longer lifespan by using
	 * a slab from the reverse direction, in the hope that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunk (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if (class == MC_MBUF && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
	} else {
		sp->sl_head = NULL;
	}
	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
		/* NOTREACHED */
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mcl_slab_t *nsp = sp->sl_next;
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * Increment 2nd slab.  A 4K big cluster takes
		 * 2 slabs, each having at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
		/* Next slab must already be present */
		VERIFY(nsp != NULL);
		nsp->sl_refcnt++;
		VERIFY(!slab_is_detached(nsp));
		VERIFY(nsp->sl_class == MC_BIGCL &&
		    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
		    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
		    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
		    nsp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-8th slab.  A 16K big cluster takes
		 * 8 cluster slabs, each having at most 1 reference.
		 */
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		ASSERT(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 2K cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPCL at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 &&
		    (unsigned short)sp->sl_refcnt <= NMBPCL &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
		    sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
		slab_remove(sp, class);
	}

	return (buf);
}
/*
 * Place a slab of object(s) back into a class's slab list.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL || class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 2K cluster slab can have at most 1 reference
		 * which must be 0 at this point.
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		if (class == MC_BIGCL) {
			mcl_slab_t *nsp = sp->sl_next;
			VERIFY(IS_P2ALIGNED(buf, NBPG));
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			/* Decrement 2nd slab reference */
			nsp->sl_refcnt--;
			/*
			 * A 4K big cluster takes 2 slabs, both
			 * must now have 0 reference.
			 */
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_BIGCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16K cluster takes 8 cluster slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, NBPG));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * An mbuf slab has a total of NMBPCL reference counts.
		 * Since we have decremented the reference above, it
		 * must now be between 0 and NMBPCL-1.
		 */
		VERIFY(sp->sl_refcnt >= 0 &&
		    (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {	/* MC_MBUF */
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/* All mbufs are freed; return the cluster that we stole earlier */
	if (sp->sl_refcnt == 0 && class == MC_MBUF) {
		m_total(MC_MBUF) -= NMBPCL;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPCL;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPCL));

		/* Unchain the mbufs from the slab's freelist */
		while (sp->sl_head != NULL) {
			struct mbuf *m = sp->sl_head;
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		VERIFY(sp->sl_head == NULL);

		/* Remove the slab from the mbuf class's slab list */
		slab_remove(sp, class);

		/* Reinitialize it as a 2K cluster slab */
		slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
		    sp->sl_len, 0, 1);

		if (mclaudit != NULL)
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_head, m_maxsize(MC_CL));

		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);

		VERIFY(slab_is_detached(sp));
		/* And finally switch class */
		class = MC_CL;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp))
		slab_insert(sp, class);
}
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			/*
			 * If the number of elements in freelist has
			 * dropped below low watermark, asynchronously
			 * populate the freelist now rather than doing
			 * it later when we run out of elements.
			 */
			if (!mbuf_cached_above(class, wait) &&
			    m_infree(class) < m_total(class) >> 5) {
				(void) freelist_populate(class, 1,
				    M_DONTWAIT);
			}

			if (--need == 0)
				break;
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		mcache_buffer_log(mca, list, m_cache(class));
		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Obtain object(s) from the composite class's freelist.
 */
static unsigned int
cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
{
	unsigned int need = num;
	mcl_slab_t *sp, *clsp, *nsp;
	struct mbuf *m;
	mcache_obj_t **list = *plist;
	void *cl;

	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	/* Get what we can from the freelist */
	while ((*list = m_cobjlist(class)) != NULL) {
		m = (struct mbuf *)*list;
		cl = m->m_ext.ext_buf;
		clsp = slab_get(cl);
		VERIFY(m->m_flags == M_EXT && cl != NULL);
		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
		VERIFY(clsp->sl_refcnt == 1);
		if (class == MC_MBUF_BIGCL) {
			nsp = clsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			VERIFY(nsp->sl_refcnt == 1);
		} else if (class == MC_MBUF_16KCL) {
			int k;
			for (nsp = clsp, k = 1;
			    k < (M16KCLBYTES / MCLBYTES); k++) {
				nsp = nsp->sl_next;
				/* Next slab must already be present */
				VERIFY(nsp != NULL);
				VERIFY(nsp->sl_refcnt == 1);
			}
		}

		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
		    !MBUF_IN_MAP(m_cobjlist(class))) {
			slab_nextptr_panic(sp, m_cobjlist(class));
			/* NOTREACHED */
		}
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		if (--need == 0)
			break;
	}
	m_infree(class) -= (num - need);

	return (num - need);
}
/*
 * Place object(s) back into a composite class's freelist.
 */
static unsigned int
cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
{
	mcache_obj_t *o, *tail;
	unsigned int num = 0;
	struct mbuf *m, *ms;
	mcache_audit_t *mca = NULL;
	mcache_obj_t *ref_list = NULL;
	mcl_slab_t *clsp, *nsp;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1811 while ((m
= ms
= (struct mbuf
*)o
) != NULL
) {
1812 mcache_obj_t
*rfa
, *nexto
= o
->obj_next
;
1814 /* Do the mbuf sanity checks */
1815 if (mclaudit
!= NULL
) {
1816 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
1817 mcache_audit_free_verify(mca
, m
, 0, m_maxsize(MC_MBUF
));
1818 ms
= (struct mbuf
*)mca
->mca_contents
;
1821 /* Do the cluster sanity checks */
1822 cl
= ms
->m_ext
.ext_buf
;
1823 clsp
= slab_get(cl
);
1824 if (mclaudit
!= NULL
) {
1826 if (class == MC_MBUF_CL
)
1827 size
= m_maxsize(MC_CL
);
1828 else if (class == MC_MBUF_BIGCL
)
1829 size
= m_maxsize(MC_BIGCL
);
1831 size
= m_maxsize(MC_16KCL
);
1832 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL
,
1833 (mcache_obj_t
*)cl
), cl
, 0, size
);
1835 VERIFY(ms
->m_type
== MT_FREE
);
1836 VERIFY(ms
->m_flags
== M_EXT
);
1837 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
1838 VERIFY(clsp
->sl_refcnt
== 1);
1839 if (class == MC_MBUF_BIGCL
) {
1840 nsp
= clsp
->sl_next
;
1841 /* Next slab must already be present */
1842 VERIFY(nsp
!= NULL
);
1843 VERIFY(nsp
->sl_refcnt
== 1);
1844 } else if (class == MC_MBUF_16KCL
) {
1846 for (nsp
= clsp
, k
= 1;
1847 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
1849 /* Next slab must already be present */
1850 VERIFY(nsp
!= NULL
);
1851 VERIFY(nsp
->sl_refcnt
== 1);
		/*
		 * If we're asked to purge, restore the actual mbuf using
		 * contents of the shadow structure (if auditing is enabled)
		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
		 * about to free it and the attached cluster into their caches.
		 */
		if (purged) {
			/* Restore constructed mbuf fields */
			if (mclaudit != NULL)
				mcl_audit_restore_mbuf(m, mca, TRUE);
1869 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
1870 rfa
->obj_next
= ref_list
;
1874 m
->m_type
= MT_FREE
;
1875 m
->m_flags
= m
->m_len
= 0;
1876 m
->m_next
= m
->m_nextpkt
= NULL
;
1878 /* Save mbuf fields and make auditing happy */
1879 if (mclaudit
!= NULL
)
1880 mcl_audit_mbuf(mca
, o
, FALSE
, FALSE
);
1882 VERIFY(m_total(class) > 0);
1887 slab_free(MC_MBUF
, o
);
1889 /* And free the cluster */
1890 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
1891 if (class == MC_MBUF_CL
)
1892 slab_free(MC_CL
, cl
);
1893 else if (class == MC_MBUF_BIGCL
)
1894 slab_free(MC_BIGCL
, cl
);
1896 slab_free(MC_16KCL
, cl
);
1905 tail
->obj_next
= m_cobjlist(class);
1906 m_cobjlist(class) = list
;
1907 m_infree(class) += num
;
1908 } else if (ref_list
!= NULL
) {
1909 mcache_free_ext(ref_cache
, ref_list
);
/*
 * Common allocator for composite objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in
 * the bucket layer.  It returns one or more composite elements from the
 * appropriate global freelist.  If the freelist is empty, it will attempt
 * to obtain the rudimentary objects from their caches and construct them
 * into composite mbuf + cluster objects.
 */
static unsigned int
mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
    int wait)
{
1927 mbuf_class_t
class = (mbuf_class_t
)arg
;
1928 mcache_t
*cp
= NULL
;
1929 unsigned int num
= 0, cnum
= 0, want
= needed
;
1930 mcache_obj_t
*ref_list
= NULL
;
1931 mcache_obj_t
*mp_list
= NULL
;
1932 mcache_obj_t
*clp_list
= NULL
;
1933 mcache_obj_t
**list
;
1934 struct ext_ref
*rfa
;
1938 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1941 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1943 /* There should not be any slab for this class */
1944 VERIFY(m_slab_cnt(class) == 0 &&
1945 m_slablist(class).tqh_first
== NULL
&&
1946 m_slablist(class).tqh_last
== NULL
);
1948 lck_mtx_lock(mbuf_mlock
);
1950 /* Try using the freelist first */
1951 num
= cslab_alloc(class, plist
, needed
);
1953 if (num
== needed
) {
1954 m_alloc_cnt(class) += num
;
1955 lck_mtx_unlock(mbuf_mlock
);
1959 lck_mtx_unlock(mbuf_mlock
);
	/*
	 * We could not satisfy the request using the freelist alone;
	 * allocate from the appropriate rudimentary caches and use
	 * whatever we can get to construct the composite objects.
	 */

	/*
	 * Mark these allocation requests as coming from a composite cache.
	 * Also, if the caller is willing to be blocked, mark the request
	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
	 * slab layer waiting for the individual object when one or more
	 * of the already-constructed composite objects are available.
	 */
1976 if (!(wait
& MCR_NOSLEEP
))
1979 needed
= mcache_alloc_ext(m_cache(MC_MBUF
), &mp_list
, needed
, wait
);
1981 ASSERT(mp_list
== NULL
);
1984 if (class == MC_MBUF_CL
)
1985 cp
= m_cache(MC_CL
);
1986 else if (class == MC_MBUF_BIGCL
)
1987 cp
= m_cache(MC_BIGCL
);
1989 cp
= m_cache(MC_16KCL
);
1990 needed
= mcache_alloc_ext(cp
, &clp_list
, needed
, wait
);
1992 ASSERT(clp_list
== NULL
);
1995 needed
= mcache_alloc_ext(ref_cache
, &ref_list
, needed
, wait
);
1997 ASSERT(ref_list
== NULL
);
	/*
	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
	 * overs will get freed accordingly before we return to caller.
	 */
	for (cnum = 0; cnum < needed; cnum++) {
2008 m
= ms
= (struct mbuf
*)mp_list
;
2009 mp_list
= mp_list
->obj_next
;
2012 clp_list
= clp_list
->obj_next
;
2013 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
2015 rfa
= (struct ext_ref
*)ref_list
;
2016 ref_list
= ref_list
->obj_next
;
2017 ((mcache_obj_t
*)rfa
)->obj_next
= NULL
;
		/*
		 * If auditing is enabled, construct the shadow mbuf
		 * in the audit structure instead of in the actual one.
		 * mbuf_cslab_audit() will take care of restoring the
		 * contents after the integrity check.
		 */
		if (mclaudit != NULL) {
			mcache_audit_t *mca, *cl_mca;
2029 lck_mtx_lock(mbuf_mlock
);
2030 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2031 ms
= ((struct mbuf
*)mca
->mca_contents
);
2032 cl_mca
= mcl_audit_buf2mca(MC_CL
, (mcache_obj_t
*)cl
);
			/*
			 * Pair them up.  Note that this is done at the time
			 * the mbuf+cluster objects are constructed.  This
			 * information should be treated as "best effort"
			 * debugging hint since more than one mbufs can refer
			 * to a cluster.  In that case, the cluster might not
			 * be freed along with the mbuf it was paired with.
			 */
			mca->mca_uptr = cl_mca;
			cl_mca->mca_uptr = mca;
2045 ASSERT(mca
->mca_uflags
& MB_SCVALID
);
2046 ASSERT(!(cl_mca
->mca_uflags
& MB_SCVALID
));
2047 lck_mtx_unlock(mbuf_mlock
);
2049 /* Technically, they are in the freelist */
2050 mcache_set_pattern(MCACHE_FREE_PATTERN
, m
,
2051 m_maxsize(MC_MBUF
));
2052 if (class == MC_MBUF_CL
)
2053 size
= m_maxsize(MC_CL
);
2054 else if (class == MC_MBUF_BIGCL
)
2055 size
= m_maxsize(MC_BIGCL
);
2057 size
= m_maxsize(MC_16KCL
);
2058 mcache_set_pattern(MCACHE_FREE_PATTERN
, cl
, size
);
2061 MBUF_INIT(ms
, 0, MT_FREE
);
2062 if (class == MC_MBUF_16KCL
) {
2063 MBUF_16KCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2064 } else if (class == MC_MBUF_BIGCL
) {
2065 MBUF_BIGCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2067 MBUF_CL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2069 VERIFY(ms
->m_flags
== M_EXT
);
2070 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2072 *list
= (mcache_obj_t
*)m
;
2073 (*list
)->obj_next
= NULL
;
2074 list
= *plist
= &(*list
)->obj_next
;
	/*
	 * Free up what's left of the above.
	 */
	if (mp_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	if (clp_list != NULL)
		mcache_free_ext(cp, clp_list);
	if (ref_list != NULL)
		mcache_free_ext(ref_cache, ref_list);
2088 lck_mtx_lock(mbuf_mlock
);
2089 if (num
> 0 || cnum
> 0) {
2090 m_total(class) += cnum
;
2091 VERIFY(m_total(class) <= m_maxlimit(class));
2092 m_alloc_cnt(class) += num
+ cnum
;
2094 if ((num
+ cnum
) < want
)
2095 m_fail_cnt(class) += (want
- (num
+ cnum
));
2096 lck_mtx_unlock(mbuf_mlock
);
2098 return (num
+ cnum
);
/*
 * Common de-allocator for composite objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * global freelist.
 */
static void
mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int num;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	num = cslab_free(class, list, purged);
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for composite objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;
	struct mbuf *m, *ms;
	mcl_slab_t *clsp, *nsp;
	size_t size;
	void *cl;

	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2149 while ((m
= ms
= (struct mbuf
*)list
) != NULL
) {
2150 lck_mtx_lock(mbuf_mlock
);
2151 /* Do the mbuf sanity checks and record its transaction */
2152 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2153 mcl_audit_mbuf(mca
, m
, TRUE
, alloc
);
2154 mcache_buffer_log(mca
, m
, m_cache(class));
2156 mca
->mca_uflags
|= MB_COMP_INUSE
;
2158 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
		/*
		 * Use the shadow mbuf in the audit structure if we are
		 * freeing, since the contents of the actual mbuf has been
		 * pattern-filled by the above call to mcl_audit_mbuf().
		 */
		if (!alloc)
			ms = (struct mbuf *)mca->mca_contents;
2168 /* Do the cluster sanity checks and record its transaction */
2169 cl
= ms
->m_ext
.ext_buf
;
2170 clsp
= slab_get(cl
);
2171 VERIFY(ms
->m_flags
== M_EXT
&& cl
!= NULL
);
2172 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2173 VERIFY(clsp
->sl_refcnt
== 1);
2174 if (class == MC_MBUF_BIGCL
) {
2175 nsp
= clsp
->sl_next
;
2176 /* Next slab must already be present */
2177 VERIFY(nsp
!= NULL
);
2178 VERIFY(nsp
->sl_refcnt
== 1);
2179 } else if (class == MC_MBUF_16KCL
) {
2181 for (nsp
= clsp
, k
= 1;
2182 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2184 /* Next slab must already be present */
2185 VERIFY(nsp
!= NULL
);
2186 VERIFY(nsp
->sl_refcnt
== 1);
2190 mca
= mcl_audit_buf2mca(MC_CL
, cl
);
2191 if (class == MC_MBUF_CL
)
2192 size
= m_maxsize(MC_CL
);
2193 else if (class == MC_MBUF_BIGCL
)
2194 size
= m_maxsize(MC_BIGCL
);
2196 size
= m_maxsize(MC_16KCL
);
2197 mcl_audit_cluster(mca
, cl
, size
, alloc
, FALSE
);
2198 mcache_buffer_log(mca
, cl
, m_cache(class));
2200 mca
->mca_uflags
|= MB_COMP_INUSE
;
2202 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2203 lck_mtx_unlock(mbuf_mlock
);
2205 list
= list
->obj_next
;
/*
 * Allocate some number of mbuf clusters and place on cluster freelist.
 */
static int
m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
{
	int i;
	int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
	vm_offset_t page = 0;
	mcache_audit_t *mca_list = NULL;
	mcache_obj_t *con_list = NULL;
	mcl_slab_t *sp;

	VERIFY(bufsize == m_maxsize(MC_CL) ||
	    bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	/*
	 * Multiple threads may attempt to populate the cluster map one
	 * after another.  Since we drop the lock below prior to acquiring
	 * the physical page(s), our view of the cluster map may no longer
	 * be accurate, and we could end up over-committing the pages beyond
	 * the maximum allowed for each class.  To prevent it, this entire
	 * operation (including the page mapping) is serialized.
	 */
	while (mb_clalloc_busy) {
		mb_clalloc_waiters++;
		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
		    (PZERO-1), "m_clalloc", NULL);
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
	}

	/* We are busy now; tell everyone else to go away */
	mb_clalloc_busy = TRUE;
2247 * Honor the caller's wish to block or not block. We have a way
2248 * to grow the pool asynchronously using the mbuf worker thread.
2250 i
= m_howmany(num
, bufsize
);
2251 if (i
== 0 || (wait
& M_DONTWAIT
))
2254 lck_mtx_unlock(mbuf_mlock
);
2256 size
= round_page(i
* bufsize
);
2257 page
= kmem_mb_alloc(mb_map
, size
, large_buffer
);
2260 * If we did ask for "n" 16K physically contiguous chunks
2261 * and didn't get them, then please try again without this
2264 if (large_buffer
&& page
== 0)
2265 page
= kmem_mb_alloc(mb_map
, size
, 0);
2268 if (bufsize
<= m_maxsize(MC_BIGCL
)) {
2269 /* Try for 1 page if failed, only for 2KB/4KB request */
2271 page
= kmem_mb_alloc(mb_map
, size
, 0);
2275 lck_mtx_lock(mbuf_mlock
);
2280 VERIFY(IS_P2ALIGNED(page
, NBPG
));
2281 numpages
= size
/ NBPG
;
	/* If auditing is enabled, allocate the audit structures now */
	if (mclaudit != NULL) {
		int needed;

		/*
		 * Yes, I realize this is a waste of memory for clusters
		 * that never get transformed into mbufs, as we may end
		 * up with NMBPCL-1 unused audit structures per cluster.
		 * But doing so tremendously simplifies the allocation
		 * strategy, since at this point we are not holding the
		 * mbuf lock and the caller is okay to be blocked.  For
		 * the case of big clusters, we allocate one structure
		 * for each as we never turn them into mbufs.
		 */
2297 if (bufsize
== m_maxsize(MC_CL
)) {
2298 needed
= numpages
* 2 * NMBPCL
;
2300 i
= mcache_alloc_ext(mcl_audit_con_cache
,
2301 &con_list
, needed
, MCR_SLEEP
);
2303 VERIFY(con_list
!= NULL
&& i
== needed
);
2304 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2307 needed
= numpages
/ (M16KCLBYTES
/ NBPG
);
2310 i
= mcache_alloc_ext(mcache_audit_cache
,
2311 (mcache_obj_t
**)&mca_list
, needed
, MCR_SLEEP
);
2313 VERIFY(mca_list
!= NULL
&& i
== needed
);
2316 lck_mtx_lock(mbuf_mlock
);
2318 for (i
= 0; i
< numpages
; i
++, page
+= NBPG
) {
2319 ppnum_t offset
= ((char *)page
- (char *)mbutl
) / NBPG
;
2320 ppnum_t new_page
= pmap_find_phys(kernel_pmap
,
		/*
		 * In the case of no mapper being available the following
		 * code noops and returns the input page; if there is a
		 * mapper the appropriate I/O page is returned.
		 */
		VERIFY(offset < mcl_pages);
2329 new_page
= IOMapperInsertPage(mcl_paddr_base
, offset
, new_page
);
2330 mcl_paddr
[offset
] = new_page
<< PGSHIFT
;
2332 /* Pattern-fill this fresh page */
2333 if (mclaudit
!= NULL
)
2334 mcache_set_pattern(MCACHE_FREE_PATTERN
,
2335 (caddr_t
)page
, NBPG
);
2337 if (bufsize
== m_maxsize(MC_CL
)) {
2338 union mcluster
*mcl
= (union mcluster
*)page
;
2340 /* 1st cluster in the page */
2342 if (mclaudit
!= NULL
)
2343 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2344 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2346 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2347 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2348 mcl
, mcl
, bufsize
, 0, 1);
2350 /* Insert this slab */
2351 slab_insert(sp
, MC_CL
);
2353 /* Update stats now since slab_get() drops the lock */
2354 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2355 m_infree(MC_MBUF_CL
);
2356 mbstat
.m_clusters
= ++m_total(MC_CL
);
2357 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2359 /* 2nd cluster in the page */
2360 sp
= slab_get(++mcl
);
2361 if (mclaudit
!= NULL
)
2362 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2363 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2365 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2366 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2367 mcl
, mcl
, bufsize
, 0, 1);
2369 /* Insert this slab */
2370 slab_insert(sp
, MC_CL
);
2372 /* Update stats now since slab_get() drops the lock */
2373 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2374 m_infree(MC_MBUF_CL
);
2375 mbstat
.m_clusters
= ++m_total(MC_CL
);
2376 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2377 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2378 union mbigcluster
*mbc
= (union mbigcluster
*)page
;
2381 /* One for the entire page */
2383 if (mclaudit
!= NULL
)
2384 mcl_audit_init(mbc
, &mca_list
, NULL
, 0, 1);
2386 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2387 slab_init(sp
, MC_BIGCL
, SLF_MAPPED
,
2388 mbc
, mbc
, bufsize
, 0, 1);
2390 /* 2nd cluster's slab is part of the previous one */
2391 nsp
= slab_get(((union mcluster
*)page
) + 1);
2392 slab_init(nsp
, MC_BIGCL
, SLF_MAPPED
| SLF_PARTIAL
,
2393 mbc
, NULL
, 0, 0, 0);
2395 /* Insert this slab */
2396 slab_insert(sp
, MC_BIGCL
);
2398 /* Update stats now since slab_get() drops the lock */
2399 mbstat
.m_bigclfree
= ++m_infree(MC_BIGCL
) +
2400 m_infree(MC_MBUF_BIGCL
);
2401 mbstat
.m_bigclusters
= ++m_total(MC_BIGCL
);
2402 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2403 } else if ((i
% (M16KCLBYTES
/ NBPG
)) == 0) {
2404 union m16kcluster
*m16kcl
= (union m16kcluster
*)page
;
2409 /* One for the entire 16KB */
2410 sp
= slab_get(m16kcl
);
2411 if (mclaudit
!= NULL
)
2412 mcl_audit_init(m16kcl
, &mca_list
, NULL
, 0, 1);
2414 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2415 slab_init(sp
, MC_16KCL
, SLF_MAPPED
,
2416 m16kcl
, m16kcl
, bufsize
, 0, 1);
2418 /* 2nd-8th cluster's slab is part of the first one */
2419 for (k
= 1; k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2420 nsp
= slab_get(((union mcluster
*)page
) + k
);
2421 VERIFY(nsp
->sl_refcnt
== 0 &&
2422 nsp
->sl_flags
== 0);
2423 slab_init(nsp
, MC_16KCL
,
2424 SLF_MAPPED
| SLF_PARTIAL
,
2425 m16kcl
, NULL
, 0, 0, 0);
2428 /* Insert this slab */
2429 slab_insert(sp
, MC_16KCL
);
2431 /* Update stats now since slab_get() drops the lock */
2432 m_infree(MC_16KCL
)++;
2433 m_total(MC_16KCL
)++;
2434 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2437 VERIFY(mca_list
== NULL
&& con_list
== NULL
);
	/* We're done; let others enter */
	mb_clalloc_busy = FALSE;
	if (mb_clalloc_waiters > 0) {
		mb_clalloc_waiters = 0;
		wakeup(mb_clalloc_waitchan);
	}
2446 if (bufsize
== m_maxsize(MC_CL
))
2447 return (numpages
<< 1);
2448 else if (bufsize
== m_maxsize(MC_BIGCL
))
2451 VERIFY(bufsize
== m_maxsize(MC_16KCL
));
2452 return (numpages
/ (M16KCLBYTES
/ NBPG
));
2455 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2457 /* We're done; let others enter */
2458 mb_clalloc_busy
= FALSE
;
2459 if (mb_clalloc_waiters
> 0) {
2460 mb_clalloc_waiters
= 0;
2461 wakeup(mb_clalloc_waitchan
);
	/*
	 * When non-blocking we kick a thread if we have to grow the
	 * pool or if the number of free clusters is less than requested.
	 */
	if (bufsize == m_maxsize(MC_CL)) {
2471 * Remember total number of clusters needed
2474 i
+= m_total(MC_CL
);
2475 if (i
> mbuf_expand_mcl
) {
2476 mbuf_expand_mcl
= i
;
2477 if (mbuf_worker_ready
)
2478 wakeup((caddr_t
)&mbuf_worker_run
);
2482 if (m_infree(MC_CL
) >= num
)
2484 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2487 * Remember total number of 4KB clusters needed
2490 i
+= m_total(MC_BIGCL
);
2491 if (i
> mbuf_expand_big
) {
2492 mbuf_expand_big
= i
;
2493 if (mbuf_worker_ready
)
2494 wakeup((caddr_t
)&mbuf_worker_run
);
2498 if (m_infree(MC_BIGCL
) >= num
)
2503 * Remember total number of 16KB clusters needed
2506 i
+= m_total(MC_16KCL
);
2507 if (i
> mbuf_expand_16k
) {
2508 mbuf_expand_16k
= i
;
2509 if (mbuf_worker_ready
)
2510 wakeup((caddr_t
)&mbuf_worker_run
);
2514 if (m_infree(MC_16KCL
) >= num
)
/*
 * Populate the global freelist of the corresponding buffer class.
 */
static int
freelist_populate(mbuf_class_t class, unsigned int num, int wait)
{
2526 mcache_obj_t
*o
= NULL
;
2529 VERIFY(class == MC_MBUF
|| class == MC_CL
|| class == MC_BIGCL
||
2532 #if CONFIG_MBUF_NOEXPAND
2533 if ((mbstat
.m_mbufs
/ NMBPCL
) >= maxmbufcl
) {
2535 static int printonce
= 1;
2536 if (printonce
== 1) {
2538 printf("m_expand failed, allocated %ld out of %d "
2539 "clusters\n", mbstat
.m_mbufs
/ NMBPCL
,
2545 #endif /* CONFIG_MBUF_NOEXPAND */
2547 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2552 i
= m_clalloc(num
, wait
, m_maxsize(MC_CL
));
2554 /* Respect the 2K clusters minimum limit */
2555 if (m_total(MC_CL
) == m_maxlimit(MC_CL
) &&
2556 m_infree(MC_CL
) <= m_minlimit(MC_CL
)) {
2557 if (class != MC_CL
|| (wait
& MCR_COMP
))
2566 return (m_clalloc(num
, wait
, m_maxsize(class)) != 0);
2574 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2575 if ((o
= slab_alloc(MC_CL
, wait
)) != NULL
) {
2576 struct mbuf
*m
= (struct mbuf
*)o
;
2577 mcache_audit_t
*mca
= NULL
;
2578 mcl_slab_t
*sp
= slab_get(o
);
2580 VERIFY(slab_is_detached(sp
) &&
2581 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
2583 /* Make sure that the cluster is unmolested while in freelist */
2584 if (mclaudit
!= NULL
) {
2585 mca
= mcl_audit_buf2mca(MC_CL
, o
);
2586 mcache_audit_free_verify(mca
, o
, 0, m_maxsize(MC_CL
));
2589 /* Reinitialize it as an mbuf slab */
2590 slab_init(sp
, MC_MBUF
, sp
->sl_flags
, sp
->sl_base
, NULL
,
2591 sp
->sl_len
, 0, NMBPCL
);
2593 VERIFY(m
== (struct mbuf
*)sp
->sl_base
);
2594 VERIFY(sp
->sl_head
== NULL
);
2596 m_total(MC_MBUF
) += NMBPCL
;
2597 mbstat
.m_mbufs
= m_total(MC_MBUF
);
2598 m_infree(MC_MBUF
) += NMBPCL
;
2599 mtype_stat_add(MT_FREE
, NMBPCL
);
2604 * If auditing is enabled, construct the shadow mbuf
2605 * in the audit structure instead of the actual one.
2606 * mbuf_slab_audit() will take care of restoring the
2607 * contents after the integrity check.
2609 if (mclaudit
!= NULL
) {
2611 mca
= mcl_audit_buf2mca(MC_MBUF
,
2613 ms
= ((struct mbuf
*)mca
->mca_contents
);
2614 ms
->m_type
= MT_FREE
;
2616 m
->m_type
= MT_FREE
;
2618 m
->m_next
= sp
->sl_head
;
2619 sp
->sl_head
= (void *)m
++;
2622 /* Insert it into the mbuf class's slab list */
2623 slab_insert(sp
, MC_MBUF
);
2625 if ((i
= mb_waiters
) > 0)
2628 wakeup(mb_waitchan
);
/*
 * (Inaccurately) check if it might be worth a trip back to the
 * mcache layer due to the availability of objects there.  We'll
 * end up back here if there's nothing up there.
 */
static boolean_t
mbuf_cached_above(mbuf_class_t class, int wait)
{
2646 if (wait
& MCR_COMP
)
2647 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)) ||
2648 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2652 if (wait
& MCR_COMP
)
2653 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)));
2657 if (wait
& MCR_COMP
)
2658 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2662 if (wait
& MCR_COMP
)
2663 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL
)));
2676 return (!mcache_bkt_isempty(m_cache(class)));
/*
 * If possible, convert constructed objects to raw ones.
 */
static boolean_t
mbuf_steal(mbuf_class_t class, unsigned int num)
{
	mcache_obj_t *top = NULL;
	mcache_obj_t **list = &top;
	unsigned int tot = 0;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2701 /* Get the required number of constructed objects if possible */
2702 if (m_infree(class) > m_minlimit(class)) {
2703 tot
= cslab_alloc(class, &list
,
2704 MIN(num
, m_infree(class)));
2707 /* And destroy them to get back the raw objects */
2709 (void) cslab_free(class, top
, 1);
2717 return (tot
== num
);
2721 m_reclaim(mbuf_class_t
class, unsigned int num
, boolean_t comp
)
2725 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2727 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2728 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2729 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
	/*
	 * This logic can be made smarter; for now, simply mark
	 * all other related classes as potential victims.
	 */
2737 m_wantpurge(MC_CL
)++;
2738 m_wantpurge(MC_MBUF_CL
)++;
2739 m_wantpurge(MC_MBUF_BIGCL
)++;
2743 m_wantpurge(MC_MBUF
)++;
2745 m_wantpurge(MC_MBUF_CL
)++;
2750 m_wantpurge(MC_MBUF_BIGCL
)++;
2755 m_wantpurge(MC_MBUF_16KCL
)++;
	/*
	 * Run through each marked class and check if we really need to
	 * purge (and therefore temporarily disable) the per-CPU caches
	 * layer used by the class.  If so, remember the classes since
	 * we are going to drop the lock below prior to purging.
	 */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		if (m_wantpurge(m) > 0) {
			/*
			 * Try hard to steal the required number of objects
			 * from the freelist of other mbuf classes.  Only
			 * purge and disable the per-CPU caches layer when
			 * we don't have enough; it's the last resort.
			 */
			if (!mbuf_steal(m, num))
				bmap |= (1 << m);
2783 lck_mtx_unlock(mbuf_mlock
);
2786 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2789 /* Sigh; we have no other choices but to ask mcache to purge */
2790 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2791 if ((bmap
& (1 << m
)) &&
2792 mcache_purge_cache(m_cache(m
))) {
2793 lck_mtx_lock(mbuf_mlock
);
2796 lck_mtx_unlock(mbuf_mlock
);
	/*
	 * Request mcache to reap extra elements from all of its caches;
	 * note that all reaps are serialized and happen only at a fixed
	 * interval.
	 */

	lck_mtx_lock(mbuf_mlock);
static inline struct mbuf *
m_get_common(int wait, short type, int hdr)
{
	struct mbuf *m;
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
	if (m != NULL) {
		MBUF_INIT(m, hdr, type);
		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
#if CONFIG_MACF_NET
		if (hdr && mac_init_mbuf(m, wait) != 0) {
			m_free(m);
			return (NULL);
		}
#endif /* MAC_NET */
	}
	return (m);
}
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 */
#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
#define	_M_RETRY(wait, type)	_M_GET(wait, type)
#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
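
/*
 * Usage note (illustrative; the expansion simply follows the macro
 * definitions above): a caller that writes
 *
 *	_MGETHDR(m, M_DONTWAIT, MT_DATA);
 *
 * ends up with
 *
 *	(m) = m_get_common(M_DONTWAIT, MT_DATA, 1);
 *
 * i.e. a packet-header mbuf allocated from the MC_MBUF cache without
 * blocking.
 */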
struct mbuf *
m_get(int wait, int type)
{
	return (_M_GET(wait, type));
}

struct mbuf *
m_gethdr(int wait, int type)
{
	return (_M_GETHDR(wait, type));
}

struct mbuf *
m_retry(int wait, int type)
{
	return (_M_RETRY(wait, type));
}

struct mbuf *
m_retryhdr(int wait, int type)
{
	return (_M_RETRYHDR(wait, type));
}

struct mbuf *
m_getclr(int wait, int type)
{
	struct mbuf *m;

	_MGET(m, wait, type);
	if (m != NULL)
		bzero(MTOD(m, caddr_t), MLEN);
	return (m);
}
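
/*
 * Illustrative caller (hypothetical; only m_gethdr() and m_freem() are
 * defined in this file):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m->m_len = m->m_pkthdr.len = 0;
 *		... fill in data or hand off, then eventually ...
 *		m_freem(m);
 *	}
 */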
struct mbuf *
m_free(struct mbuf *m)
{
	struct mbuf *n = m->m_next;

	if (m->m_type == MT_FREE)
		panic("m_free: freeing an already freed mbuf");

	/* Free the aux data and tags if there is any */
	if (m->m_flags & M_PKTHDR) {
		m_tag_delete_chain(m, NULL);
	}
2894 if (m
->m_flags
& M_EXT
) {
2898 refcnt
= m_decref(m
);
2899 flags
= MEXT_FLAGS(m
);
2900 if (refcnt
== 0 && flags
== 0) {
2901 if (m
->m_ext
.ext_free
== NULL
) {
2902 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2903 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2904 mcache_free(m_cache(MC_BIGCL
),
2906 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2907 mcache_free(m_cache(MC_16KCL
),
2910 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2911 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2913 mcache_free(ref_cache
, MEXT_RFA(m
));
2915 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2916 VERIFY(m
->m_type
!= MT_FREE
);
2918 mtype_stat_dec(m
->m_type
);
2919 mtype_stat_inc(MT_FREE
);
2921 m
->m_type
= MT_FREE
;
2924 m
->m_next
= m
->m_nextpkt
= NULL
;
2926 /* "Free" into the intermediate cache */
2927 if (m
->m_ext
.ext_free
== NULL
) {
2928 mcache_free(m_cache(MC_MBUF_CL
), m
);
2929 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2930 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
2932 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
2933 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
	if (m->m_type != MT_FREE) {
		mtype_stat_dec(m->m_type);
		mtype_stat_inc(MT_FREE);
	}

	m->m_type = MT_FREE;
	m->m_flags = m->m_len = 0;
	m->m_next = m->m_nextpkt = NULL;

	mcache_free(m_cache(MC_MBUF), m);

	return (n);
}
2953 __private_extern__
struct mbuf
*
2954 m_clattach(struct mbuf
*m
, int type
, caddr_t extbuf
,
2955 void (*extfree
)(caddr_t
, u_int
, caddr_t
), u_int extsize
, caddr_t extarg
,
2958 struct ext_ref
*rfa
= NULL
;
2960 if (m
== NULL
&& (m
= _M_GETHDR(wait
, type
)) == NULL
)
2963 if (m
->m_flags
& M_EXT
) {
2967 refcnt
= m_decref(m
);
2968 flags
= MEXT_FLAGS(m
);
2969 if (refcnt
== 0 && flags
== 0) {
2970 if (m
->m_ext
.ext_free
== NULL
) {
2971 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2972 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2973 mcache_free(m_cache(MC_BIGCL
),
2975 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2976 mcache_free(m_cache(MC_16KCL
),
2979 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2980 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2982 /* Re-use the reference structure */
2984 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2985 VERIFY(m
->m_type
!= MT_FREE
);
2987 mtype_stat_dec(m
->m_type
);
2988 mtype_stat_inc(MT_FREE
);
2990 m
->m_type
= MT_FREE
;
2993 m
->m_next
= m
->m_nextpkt
= NULL
;
2994 /* "Free" into the intermediate cache */
2995 if (m
->m_ext
.ext_free
== NULL
) {
2996 mcache_free(m_cache(MC_MBUF_CL
), m
);
2997 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2998 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
3000 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3001 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
3004 * Allocate a new mbuf, since we didn't divorce
3005 * the composite mbuf + cluster pair above.
3007 if ((m
= _M_GETHDR(wait
, type
)) == NULL
)
3013 (rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
) {
3018 MEXT_INIT(m
, extbuf
, extsize
, extfree
, extarg
, rfa
, 1, 0);
/*
 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
 * clusters.  (If the cache is empty, new clusters are allocated en-masse.)
 */
struct mbuf *
m_getcl(int wait, int type, int flags)
{
	struct mbuf *m;
	int mcflags = MSLEEPF(wait);
	int hdr = (flags & M_PKTHDR);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
	if (m != NULL) {
		MBUF_INIT(m, hdr, type);
		mtype_stat_inc(type);
		mtype_stat_dec(MT_FREE);
#if CONFIG_MACF_NET
		if (hdr && mac_init_mbuf(m, wait) != 0) {
			m_freem(m);
			return (NULL);
		}
#endif /* MAC_NET */
	}
	return (m);
}
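
/*
 * Illustrative driver-style caller (hypothetical): grab a cluster-backed
 * packet-header mbuf without blocking and bail out gracefully when the
 * caches are exhausted.
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = 0;
 */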
/* m_mclget() add an mbuf cluster to a normal mbuf */
struct mbuf *
m_mclget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_mclalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
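
/*
 * Classic two-step allocation that this helper backs (caller code shown is
 * illustrative only; MGETHDR and MCLGET are the macro forms from
 * <sys/mbuf.h>):
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		MCLGET(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 */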
/* Allocate an mbuf cluster */
caddr_t
m_mclalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_CL), mcflags));
}

/* Free an mbuf cluster */
void
m_mclfree(caddr_t p)
{
	mcache_free(m_cache(MC_CL), p);
}
/*
 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
 * another mbuf.
 */
int
m_mclhasreference(struct mbuf *m)
{
	if (!(m->m_flags & M_EXT))
		return (0);

	ASSERT(MEXT_RFA(m) != NULL);

	return (MEXT_REF(m) > 1);
}
__private_extern__ caddr_t
m_bigalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
}

__private_extern__ void
m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_BIGCL), p);
}
/* m_mbigget() add a 4KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_mbigget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_bigalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
__private_extern__ caddr_t
m_16kalloc(int wait)
{
	int mcflags = MSLEEPF(wait);

	/* Is this due to a non-blocking retry?  If so, then try harder */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;

	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
}

__private_extern__ void
m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
{
	mcache_free(m_cache(MC_16KCL), p);
}
/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
__private_extern__ struct mbuf *
m_m16kget(struct mbuf *m, int wait)
{
	struct ext_ref *rfa;

	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
		return (m);

	m->m_ext.ext_buf = m_16kalloc(wait);
	if (m->m_ext.ext_buf != NULL) {
		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
	} else {
		mcache_free(ref_cache, rfa);
	}
	return (m);
}
3179 * "Move" mbuf pkthdr from "from" to "to".
3180 * "from" must have M_PKTHDR set, and "to" must be empty.
3183 m_copy_pkthdr(struct mbuf
*to
, struct mbuf
*from
)
3185 /* We will be taking over the tags of 'to' */
3186 if (to
->m_flags
& M_PKTHDR
)
3187 m_tag_delete_chain(to
, NULL
);
3188 to
->m_pkthdr
= from
->m_pkthdr
; /* especially tags */
3189 m_tag_init(from
); /* purge tags from src */
3190 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3191 if ((to
->m_flags
& M_EXT
) == 0)
3192 to
->m_data
= to
->m_pktdat
;
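
/*
 * Illustrative use (hypothetical caller): when a packet is re-headed onto
 * a freshly allocated mbuf, the packet header and its tags travel with it:
 *
 *	struct mbuf *n = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (n != NULL)
 *		m_copy_pkthdr(n, m);	// n now owns m's pkthdr and tags
 */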
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 */
int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
	if (to->m_flags & M_PKTHDR)
		m_tag_delete_chain(to, NULL);
	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
	if ((to->m_flags & M_EXT) == 0)
		to->m_data = to->m_pktdat;
	to->m_pkthdr = from->m_pkthdr;
	m_tag_init(to);
	return (m_tag_copy_chain(to, from, how));
}
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if wantall is not set, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
 * are chained on the m_nextpkt field.  Any packets requested beyond this
 * are chained onto the last packet header's m_next field.  The size of
 * the cluster is controlled by the parameter bufsize.
 */
__private_extern__ struct mbuf *
m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
    int wait, int wantall, size_t bufsize)
{
3226 struct mbuf
**np
, *top
;
3227 unsigned int pnum
, needed
= *num_needed
;
3228 mcache_obj_t
*mp_list
= NULL
;
3229 int mcflags
= MSLEEPF(wait
);
3231 struct ext_ref
*rfa
;
3235 ASSERT(bufsize
== m_maxsize(MC_CL
) ||
3236 bufsize
== m_maxsize(MC_BIGCL
) ||
3237 bufsize
== m_maxsize(MC_16KCL
));
	/*
	 * Caller must first check for njcl because this
	 * routine is internal and not exposed/used via KPI.
	 */
	VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);

	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP))
		mcflags |= MCR_TRYHARD;
	/* Allocate the composite mbuf + cluster elements from the cache */
	if (bufsize == m_maxsize(MC_CL))
		cp = m_cache(MC_MBUF_CL);
	else if (bufsize == m_maxsize(MC_BIGCL))
		cp = m_cache(MC_MBUF_BIGCL);
	else
		cp = m_cache(MC_MBUF_16KCL);
	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3267 for (pnum
= 0; pnum
< needed
; pnum
++) {
3268 m
= (struct mbuf
*)mp_list
;
3269 mp_list
= mp_list
->obj_next
;
3271 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3272 cl
= m
->m_ext
.ext_buf
;
3275 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3276 VERIFY(MBUF_IS_COMPOSITE(m
));
3278 flag
= MEXT_FLAGS(m
);
3280 MBUF_INIT(m
, num_with_pkthdrs
, MT_DATA
);
3281 if (bufsize
== m_maxsize(MC_16KCL
)) {
3282 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3283 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3284 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3286 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3289 if (num_with_pkthdrs
> 0) {
3292 if (mac_mbuf_label_init(m
, wait
) != 0) {
3296 #endif /* MAC_NET */
3300 if (num_with_pkthdrs
> 0)
3305 ASSERT(pnum
!= *num_needed
|| mp_list
== NULL
);
3306 if (mp_list
!= NULL
)
3307 mcache_free_ext(cp
, mp_list
);
3310 mtype_stat_add(MT_DATA
, pnum
);
3311 mtype_stat_sub(MT_FREE
, pnum
);
3314 if (wantall
&& (pnum
!= *num_needed
)) {
/*
 * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not null and the
 * value pointed to is not null, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is zero or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of a mbuf chain is
 * returned in the value pointed to by maxsegments.
 */
3336 __private_extern__
struct mbuf
*
3337 m_allocpacket_internal(unsigned int *numlist
, size_t packetlen
,
3338 unsigned int *maxsegments
, int wait
, int wantall
, size_t wantsize
)
3340 struct mbuf
**np
, *top
, *first
= NULL
;
3341 size_t bufsize
, r_bufsize
;
3342 unsigned int num
= 0;
3343 unsigned int nsegs
= 0;
3344 unsigned int needed
, resid
;
3345 int mcflags
= MSLEEPF(wait
);
3346 mcache_obj_t
*mp_list
= NULL
, *rmp_list
= NULL
;
3347 mcache_t
*cp
= NULL
, *rcp
= NULL
;
3355 if (wantsize
== 0) {
3356 if (packetlen
<= MINCLSIZE
) {
3357 bufsize
= packetlen
;
3358 } else if (packetlen
> m_maxsize(MC_CL
)) {
3359 /* Use 4KB if jumbo cluster pool isn't available */
3360 if (packetlen
<= m_maxsize(MC_BIGCL
) || njcl
== 0)
3361 bufsize
= m_maxsize(MC_BIGCL
);
3363 bufsize
= m_maxsize(MC_16KCL
);
3365 bufsize
= m_maxsize(MC_CL
);
3367 } else if (wantsize
== m_maxsize(MC_CL
) ||
3368 wantsize
== m_maxsize(MC_BIGCL
) ||
3369 (wantsize
== m_maxsize(MC_16KCL
) && njcl
> 0)) {
3375 if (bufsize
<= MHLEN
) {
3377 } else if (bufsize
<= MINCLSIZE
) {
3378 if (maxsegments
!= NULL
&& *maxsegments
== 1) {
3379 bufsize
= m_maxsize(MC_CL
);
3384 } else if (bufsize
== m_maxsize(MC_16KCL
)) {
3386 nsegs
= ((packetlen
- 1) >> (PGSHIFT
+ 2)) + 1;
3387 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3388 nsegs
= ((packetlen
- 1) >> PGSHIFT
) + 1;
3390 nsegs
= ((packetlen
- 1) >> MCLSHIFT
) + 1;
3392 if (maxsegments
!= NULL
) {
3393 if (*maxsegments
&& nsegs
> *maxsegments
) {
3394 *maxsegments
= nsegs
;
3397 *maxsegments
= nsegs
;
	/*
	 * The caller doesn't want all the requested buffers; only some.
	 * Try hard to get what we can, but don't block.  This effectively
	 * overrides MCR_SLEEP, since this thread will not go to sleep
	 * if we can't get all the buffers.
	 */
	if (!wantall || (mcflags & MCR_NOSLEEP))
		mcflags |= MCR_TRYHARD;
	/*
	 * Simple case where all elements in the lists/chains are mbufs.
	 * Unless bufsize is greater than MHLEN, each segment chain is made
	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
	 * of 2 mbufs; the second one is used for the residual data, i.e.
	 * the remaining data that cannot fit into the first mbuf.
	 */
	if (bufsize <= MINCLSIZE) {
3417 /* Allocate the elements in one shot from the mbuf cache */
3418 ASSERT(bufsize
<= MHLEN
|| nsegs
== 2);
3419 cp
= m_cache(MC_MBUF
);
3420 needed
= mcache_alloc_ext(cp
, &mp_list
,
3421 (*numlist
) * nsegs
, mcflags
);
		/*
		 * The number of elements must be even if we are to use an
		 * mbuf (instead of a cluster) to store the residual data.
		 * If we couldn't allocate the requested number of mbufs,
		 * trim the number down (if it's odd) in order to avoid
		 * creating a partial segment chain.
		 */
		if (bufsize > MHLEN && (needed & 0x1))
			needed--;
3433 while (num
< needed
) {
3436 m
= (struct mbuf
*)mp_list
;
3437 mp_list
= mp_list
->obj_next
;
3440 MBUF_INIT(m
, 1, MT_DATA
);
3442 if (mac_init_mbuf(m
, wait
) != 0) {
3446 #endif /* MAC_NET */
3448 if (bufsize
> MHLEN
) {
3449 /* A second mbuf for this segment chain */
3450 m
->m_next
= (struct mbuf
*)mp_list
;
3451 mp_list
= mp_list
->obj_next
;
3452 ASSERT(m
->m_next
!= NULL
);
3454 MBUF_INIT(m
->m_next
, 0, MT_DATA
);
3460 ASSERT(num
!= *numlist
|| mp_list
== NULL
);
3463 mtype_stat_add(MT_DATA
, num
);
3464 mtype_stat_sub(MT_FREE
, num
);
3468 /* We've got them all; return to caller */
3469 if (num
== *numlist
)
	/*
	 * Complex cases where elements are made up of one or more composite
	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
	 * be illustrated as follows:
	 *
	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
	 *
	 * Every composite mbuf + cluster element comes from the intermediate
	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
	 * the last composite element will come from the MC_MBUF_CL cache,
	 * unless the residual data is larger than 2KB where we use the
	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
	 * data is defined as extra data beyond the first element that cannot
	 * fit into the previous element, i.e. there is no residual data if
	 * the chain only has 1 segment.
	 */
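	/*
	 * Worked example (numbers are illustrative only): for packetlen =
	 * 5000 and bufsize = 2048 (MC_CL), nsegs = ((5000 - 1) >> MCLSHIFT)
	 * + 1 = 3 and resid = 5000 % 2048 = 904.  Each chain is therefore
	 * two full 2KB composite elements plus one residual element, and
	 * since 904 <= 2048 the residual element also comes from the
	 * MC_MBUF_CL cache.
	 */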
3491 r_bufsize
= bufsize
;
3492 resid
= packetlen
> bufsize
? packetlen
% bufsize
: 0;
3494 /* There is residual data; figure out the cluster size */
3495 if (wantsize
== 0 && packetlen
> MINCLSIZE
) {
3497 * Caller didn't request that all of the segments
3498 * in the chain use the same cluster size; use the
3499 * smaller of the cluster sizes.
3501 if (njcl
> 0 && resid
> m_maxsize(MC_BIGCL
))
3502 r_bufsize
= m_maxsize(MC_16KCL
);
3503 else if (resid
> m_maxsize(MC_CL
))
3504 r_bufsize
= m_maxsize(MC_BIGCL
);
3506 r_bufsize
= m_maxsize(MC_CL
);
3508 /* Use the same cluster size as the other segments */
		/*
		 * Attempt to allocate composite mbuf + cluster elements for
		 * the residual data in each chain; record the number of such
		 * elements that can be allocated so that we know how many
		 * segment chains we can afford to create.
		 */
		if (r_bufsize <= m_maxsize(MC_CL))
			rcp = m_cache(MC_MBUF_CL);
		else if (r_bufsize <= m_maxsize(MC_BIGCL))
			rcp = m_cache(MC_MBUF_BIGCL);
		else
			rcp = m_cache(MC_MBUF_16KCL);
		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3532 /* This is temporarily reduced for calculation */
	/*
	 * Attempt to allocate the rest of the composite mbuf + cluster
	 * elements for the number of segment chains that we need.
	 */
	if (bufsize <= m_maxsize(MC_CL))
		cp = m_cache(MC_MBUF_CL);
	else if (bufsize <= m_maxsize(MC_BIGCL))
		cp = m_cache(MC_MBUF_BIGCL);
	else
		cp = m_cache(MC_MBUF_16KCL);
	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
	/* Round it down to avoid creating a partial segment chain */
	needed = (needed / nsegs) * nsegs;

	/*
	 * We're about to construct the chain(s); take into account
	 * the number of segments we have created above to hold the
	 * residual data for each chain, as well as restore the
	 * original count of segments per chain.
	 */
	needed += needed / nsegs;
3569 struct ext_ref
*rfa
;
3574 if (nsegs
== 1 || (num
% nsegs
) != 0 || resid
== 0) {
3575 m
= (struct mbuf
*)mp_list
;
3576 mp_list
= mp_list
->obj_next
;
3578 m
= (struct mbuf
*)rmp_list
;
3579 rmp_list
= rmp_list
->obj_next
;
3582 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3583 VERIFY(m
->m_ext
.ext_free
== NULL
||
3584 m
->m_ext
.ext_free
== m_bigfree
||
3585 m
->m_ext
.ext_free
== m_16kfree
);
3587 cl
= m
->m_ext
.ext_buf
;
3590 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3591 VERIFY(MBUF_IS_COMPOSITE(m
));
3593 flag
= MEXT_FLAGS(m
);
3595 pkthdr
= (nsegs
== 1 || (num
% nsegs
) == 1);
3598 MBUF_INIT(m
, pkthdr
, MT_DATA
);
3599 if (m
->m_ext
.ext_free
== m_16kfree
) {
3600 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3601 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3602 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3604 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3607 if (pkthdr
&& mac_init_mbuf(m
, wait
) != 0) {
3612 #endif /* MAC_NET */
3615 if ((num
% nsegs
) == 0)
3616 np
= &first
->m_nextpkt
;
3625 mtype_stat_add(MT_DATA
, num
);
3626 mtype_stat_sub(MT_FREE
, num
);
3631 /* We've got them all; return to caller */
3632 if (num
== *numlist
) {
3633 ASSERT(mp_list
== NULL
&& rmp_list
== NULL
);
3638 /* Free up what's left of the above */
3639 if (mp_list
!= NULL
)
3640 mcache_free_ext(cp
, mp_list
);
3641 if (rmp_list
!= NULL
)
3642 mcache_free_ext(rcp
, rmp_list
);
3643 if (wantall
&& top
!= NULL
) {
/*
 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on receive ring.
 */
__private_extern__ struct mbuf *
m_getpacket_how(int wait)
{
	unsigned int num_needed = 1;

	return (m_getpackets_internal(&num_needed, 1, wait, 1,
	    m_maxsize(MC_CL)));
}
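
/*
 * Illustrative RX-refill loop in a driver (hypothetical; only
 * m_getpacket_how() is defined here, the ring helpers are placeholders
 * for driver-specific code):
 *
 *	while (ring_has_empty_slots(ring)) {
 *		struct mbuf *m = m_getpacket_how(M_DONTWAIT);
 *		if (m == NULL)
 *			break;		// retry on the next interrupt
 *		ring_attach_buffer(ring, m);
 *	}
 */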
/*
 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocate
 * packets on receive ring.
 */
struct mbuf *
m_getpacket(void)
{
	unsigned int num_needed = 1;

	return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
	    m_maxsize(MC_CL)));
}
/*
 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
 * if this can't be met, return whatever number were available.  Set up the
 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
 * are chained on the m_nextpkt field.  Any packets requested beyond this are
 * chained onto the last packet header's m_next field.
 */
struct mbuf *
m_getpackets(int num_needed, int num_with_pkthdrs, int how)
{
	unsigned int n = num_needed;

	return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
	    m_maxsize(MC_CL)));
}
3694 * Return a list of mbuf hdrs set up as packet hdrs chained together
3695 * on the m_nextpkt field
3698 m_getpackethdrs(int num_needed
, int how
)
3701 struct mbuf
**np
, *top
;
3706 while (num_needed
--) {
3707 m
= _M_RETRYHDR(how
, MT_DATA
);
/*
 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
 * of mbuf packets freed.  Used by the drivers.
 */
int
m_freem_list(struct mbuf *m)
{
3725 struct mbuf
*nextpkt
;
3726 mcache_obj_t
*mp_list
= NULL
;
3727 mcache_obj_t
*mcl_list
= NULL
;
3728 mcache_obj_t
*mbc_list
= NULL
;
3729 mcache_obj_t
*m16k_list
= NULL
;
3730 mcache_obj_t
*m_mcl_list
= NULL
;
3731 mcache_obj_t
*m_mbc_list
= NULL
;
3732 mcache_obj_t
*m_m16k_list
= NULL
;
3733 mcache_obj_t
*ref_list
= NULL
;
3735 int mt_free
= 0, mt_data
= 0, mt_header
= 0, mt_soname
= 0, mt_tag
= 0;
3740 nextpkt
= m
->m_nextpkt
;
3741 m
->m_nextpkt
= NULL
;
3744 struct mbuf
*next
= m
->m_next
;
3745 mcache_obj_t
*o
, *rfa
;
3746 u_int32_t refcnt
, flags
;
3748 if (m
->m_type
== MT_FREE
)
3749 panic("m_free: freeing an already freed mbuf");
3751 if (m
->m_type
!= MT_FREE
)
3754 if (m
->m_flags
& M_PKTHDR
) {
3755 m_tag_delete_chain(m
, NULL
);
3758 if (!(m
->m_flags
& M_EXT
))
3761 o
= (mcache_obj_t
*)m
->m_ext
.ext_buf
;
3762 refcnt
= m_decref(m
);
3763 flags
= MEXT_FLAGS(m
);
3764 if (refcnt
== 0 && flags
== 0) {
3765 if (m
->m_ext
.ext_free
== NULL
) {
3766 o
->obj_next
= mcl_list
;
3768 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3769 o
->obj_next
= mbc_list
;
3771 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3772 o
->obj_next
= m16k_list
;
3775 (*(m
->m_ext
.ext_free
))((caddr_t
)o
,
3779 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
3780 rfa
->obj_next
= ref_list
;
3783 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
3784 VERIFY(m
->m_type
!= MT_FREE
);
3786 * Amortize the costs of atomic operations
3787 * by doing them at the end, if possible.
3789 if (m
->m_type
== MT_DATA
)
3791 else if (m
->m_type
== MT_HEADER
)
3793 else if (m
->m_type
== MT_SONAME
)
3795 else if (m
->m_type
== MT_TAG
)
3798 mtype_stat_dec(m
->m_type
);
3800 m
->m_type
= MT_FREE
;
3803 m
->m_next
= m
->m_nextpkt
= NULL
;
3805 /* "Free" into the intermediate cache */
3806 o
= (mcache_obj_t
*)m
;
3807 if (m
->m_ext
.ext_free
== NULL
) {
3808 o
->obj_next
= m_mcl_list
;
3810 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3811 o
->obj_next
= m_mbc_list
;
3814 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3815 o
->obj_next
= m_m16k_list
;
3823 * Amortize the costs of atomic operations
3824 * by doing them at the end, if possible.
3826 if (m
->m_type
== MT_DATA
)
3828 else if (m
->m_type
== MT_HEADER
)
3830 else if (m
->m_type
== MT_SONAME
)
3832 else if (m
->m_type
== MT_TAG
)
3834 else if (m
->m_type
!= MT_FREE
)
3835 mtype_stat_dec(m
->m_type
);
3837 m
->m_type
= MT_FREE
;
3838 m
->m_flags
= m
->m_len
= 0;
3839 m
->m_next
= m
->m_nextpkt
= NULL
;
3841 ((mcache_obj_t
*)m
)->obj_next
= mp_list
;
3842 mp_list
= (mcache_obj_t
*)m
;
3851 mtype_stat_add(MT_FREE
, mt_free
);
3853 mtype_stat_sub(MT_DATA
, mt_data
);
3855 mtype_stat_sub(MT_HEADER
, mt_header
);
3857 mtype_stat_sub(MT_SONAME
, mt_soname
);
3859 mtype_stat_sub(MT_TAG
, mt_tag
);
	if (mp_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF), mp_list);
	if (mcl_list != NULL)
		mcache_free_ext(m_cache(MC_CL), mcl_list);
	if (mbc_list != NULL)
		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
	if (m16k_list != NULL)
		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
	if (m_mcl_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
	if (m_mbc_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
	if (m_m16k_list != NULL)
		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
	if (ref_list != NULL)
		mcache_free_ext(ref_cache, ref_list);
void
m_freem(struct mbuf *m)
{
	while (m != NULL)
		m = m_free(m);
}
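
/*
 * Illustrative TX-completion path in a driver (hypothetical): packets
 * that were queued via m_nextpkt can be released in one call, which
 * batches the per-class frees done above.
 *
 *	struct mbuf *done_chain = ring_collect_completed(ring);
 *	if (done_chain != NULL)
 *		(void) m_freem_list(done_chain);
 *
 * ring_collect_completed() is a placeholder for driver-specific code.
 */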
/*
 * Mbuffer utility routines.
 */

/*
 * Compute the amount of space available before the current start
 * of data in an mbuf.
 */
int
m_leadingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_data - m->m_ext.ext_buf);
	}
	if (m->m_flags & M_PKTHDR)
		return (m->m_data - m->m_pktdat);
	return (m->m_data - m->m_dat);
}
/*
 * Compute the amount of space available after the end of data in an mbuf.
 */
int
m_trailingspace(struct mbuf *m)
{
	if (m->m_flags & M_EXT) {
		if (MCLHASREFERENCE(m))
			return (0);
		return (m->m_ext.ext_buf + m->m_ext.ext_size -
		    (m->m_data + m->m_len));
	}
	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
}
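
/*
 * Illustrative use (hypothetical caller): before appending 'len' bytes in
 * place, check that the tail of the mbuf can actually hold them.
 *
 *	if (m_trailingspace(m) >= len) {
 *		bcopy(src, MTOD(m, caddr_t) + m->m_len, len);
 *		m->m_len += len;
 *	}
 */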
/*
 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
 * copy junk along.  Does not adjust packet header length.
 */
struct mbuf *
m_prepend(struct mbuf *m, int len, int how)
{
	struct mbuf *mn;

	_MGET(mn, how, m->m_type);
	if (mn == NULL) {
		m_freem(m);
		return (NULL);
	}
	if (m->m_flags & M_PKTHDR) {
		M_COPY_PKTHDR(mn, m);
		m->m_flags &= ~M_PKTHDR;
	}
/*
 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
 * chain, copy junk along, and adjust length.
 */
struct mbuf *
m_prepend_2(struct mbuf *m, int len, int how)
{
	if (M_LEADINGSPACE(m) >= len) {
		m->m_data -= len;
		m->m_len += len;
	} else {
		m = m_prepend(m, len, how);
	}
	if ((m) && (m->m_flags & M_PKTHDR))
		m->m_pkthdr.len += len;
	return (m);
}
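
/*
 * Typical protocol-layer pattern this mirrors (illustrative; M_PREPEND is
 * the macro form defined in <sys/mbuf.h>):
 *
 *	M_PREPEND(m, sizeof (struct ip), M_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * i.e. the chain pointer must be reloaded and checked after prepending,
 * since the operation can fail and consume the original chain.
 */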
/*
 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
 */
struct mbuf *
m_copym(struct mbuf *m, int off0, int len, int wait)
{
	struct mbuf *n, *mhdr = NULL, **np;
	int off = off0;
3983 if (off
< 0 || len
< 0)
3984 panic("m_copym: invalid offset %d or len %d", off
, len
);
3986 if (off
== 0 && (m
->m_flags
& M_PKTHDR
)) {
3991 while (off
>= m
->m_len
) {
3992 if (m
->m_next
== NULL
)
3993 panic("m_copym: invalid mbuf chain");
4002 if (len
!= M_COPYALL
)
4003 panic("m_copym: len != M_COPYALL");
4007 n
= _M_RETRY(wait
, m
->m_type
);
4014 M_COPY_PKTHDR(n
, mhdr
);
4015 if (len
== M_COPYALL
)
4016 n
->m_pkthdr
.len
-= off0
;
4018 n
->m_pkthdr
.len
= len
;
4021 if (len
== M_COPYALL
) {
4022 if (MIN(len
, (m
->m_len
- off
)) == len
) {
4023 printf("m->m_len %d - off %d = %d, %d\n",
4024 m
->m_len
, off
, m
->m_len
- off
,
4025 MIN(len
, (m
->m_len
- off
)));
4028 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4029 if (n
->m_len
== M_COPYALL
) {
4030 printf("n->m_len == M_COPYALL, fixing\n");
4033 if (m
->m_flags
& M_EXT
) {
4034 n
->m_ext
= m
->m_ext
;
4036 n
->m_data
= m
->m_data
+ off
;
4037 n
->m_flags
|= M_EXT
;
4039 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4040 (unsigned)n
->m_len
);
4042 if (len
!= M_COPYALL
)
/*
 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
 * within this routine.  Also, the last mbuf and offset accessed are passed
 * out and can be passed back in to avoid having to rescan the entire mbuf
 * list (normally hung off of the socket).
 */
4067 m_copym_with_hdrs(struct mbuf
*m
, int off0
, int len0
, int wait
,
4068 struct mbuf
**m_last
, int *m_off
)
4070 struct mbuf
*n
, **np
= NULL
;
4071 int off
= off0
, len
= len0
;
4072 struct mbuf
*top
= NULL
;
4073 int mcflags
= MSLEEPF(wait
);
4076 mcache_obj_t
*list
= NULL
;
4079 if (off
== 0 && (m
->m_flags
& M_PKTHDR
))
4082 if (*m_last
!= NULL
) {
4086 while (off
>= m
->m_len
) {
4096 len
-= MIN(len
, (n
->m_len
- ((needed
== 1) ? off
: 0)));
	/*
	 * If the caller doesn't want to be put to sleep, mark it with
	 * MCR_TRYHARD so that we may reclaim buffers from other places
	 * before giving up.
	 */
	if (mcflags & MCR_NOSLEEP)
		mcflags |= MCR_TRYHARD;
4110 if (mcache_alloc_ext(m_cache(MC_MBUF
), &list
, needed
,
4116 n
= (struct mbuf
*)list
;
4117 list
= list
->obj_next
;
4118 ASSERT(n
!= NULL
&& m
!= NULL
);
4120 type
= (top
== NULL
) ? MT_HEADER
: m
->m_type
;
4121 MBUF_INIT(n
, (top
== NULL
), type
);
4123 if (top
== NULL
&& mac_mbuf_label_init(n
, wait
) != 0) {
4124 mtype_stat_inc(MT_HEADER
);
4125 mtype_stat_dec(MT_FREE
);
4129 #endif /* MAC_NET */
4141 M_COPY_PKTHDR(n
, m
);
4142 n
->m_pkthdr
.len
= len
;
4145 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4147 if (m
->m_flags
& M_EXT
) {
4148 n
->m_ext
= m
->m_ext
;
4150 n
->m_data
= m
->m_data
+ off
;
4151 n
->m_flags
|= M_EXT
;
4153 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4154 (unsigned)n
->m_len
);
4159 if ((off
+ n
->m_len
) == m
->m_len
) {
4160 *m_last
= m
->m_next
;
4164 *m_off
= off
+ n
->m_len
;
4173 mtype_stat_inc(MT_HEADER
);
4174 mtype_stat_add(type
, needed
);
4175 mtype_stat_sub(MT_FREE
, needed
+ 1);
4177 ASSERT(list
== NULL
);
4182 mcache_free_ext(m_cache(MC_MBUF
), list
);
/*
 * Copy data from an mbuf chain starting "off" bytes from the beginning,
 * continuing for "len" bytes, into the indicated buffer.
 */
void
m_copydata(struct mbuf *m, int off, int len, void *vp)
{
	unsigned count;
	char *cp = vp;

	if (off < 0 || len < 0)
		panic("m_copydata: invalid offset %d or len %d", off, len);
4204 panic("m_copydata: invalid mbuf chain");
4212 panic("m_copydata: invalid mbuf chain");
4213 count
= MIN(m
->m_len
- off
, len
);
4214 bcopy(MTOD(m
, caddr_t
) + off
, cp
, count
);
/*
 * Concatenate mbuf chain n to m.  Both chains must be of the same type
 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
 */
void
m_cat(struct mbuf *m, struct mbuf *n)
{
4232 if ((m
->m_flags
& M_EXT
) ||
4233 m
->m_data
+ m
->m_len
+ n
->m_len
>= &m
->m_dat
[MLEN
]) {
4234 /* just join the two chains */
4238 /* splat the data from one into the other */
4239 bcopy(MTOD(n
, caddr_t
), MTOD(m
, caddr_t
) + m
->m_len
,
4241 m
->m_len
+= n
->m_len
;
4247 m_adj(struct mbuf
*mp
, int req_len
)
4253 if ((m
= mp
) == NULL
)
4259 while (m
!= NULL
&& len
> 0) {
4260 if (m
->m_len
<= len
) {
4271 if (m
->m_flags
& M_PKTHDR
)
4272 m
->m_pkthdr
.len
-= (req_len
- len
);
		/*
		 * Trim from tail.  Scan the mbuf chain,
		 * calculating its length and finding the last mbuf.
		 * If the adjustment only affects this mbuf, then just
		 * adjust and return.  Otherwise, rescan and truncate
		 * after the remaining size.
		 */
4285 if (m
->m_next
== (struct mbuf
*)0)
4289 if (m
->m_len
>= len
) {
4292 if (m
->m_flags
& M_PKTHDR
)
4293 m
->m_pkthdr
.len
-= len
;
4300 * Correct length for chain is "count".
4301 * Find the mbuf with last data, adjust its length,
4302 * and toss data from remaining mbufs on chain.
4305 if (m
->m_flags
& M_PKTHDR
)
4306 m
->m_pkthdr
.len
= count
;
4307 for (; m
; m
= m
->m_next
) {
4308 if (m
->m_len
>= count
) {
4314 while ((m
= m
->m_next
))
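/*
 * Usage sketch (illustrative only): strip a link-layer header from the
 * front and a trailer from the end of a packet; a negative length trims
 * from the tail.  The sizes are hypothetical.
 */
#if 0
static void
example_strip(struct mbuf *m, int hdrlen, int trailerlen)
{
	m_adj(m, hdrlen);	/* drop hdrlen bytes from the head */
	m_adj(m, -trailerlen);	/* drop trailerlen bytes from the tail */
}
#endif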
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
struct mbuf *
m_pullup(struct mbuf *n, int len)
{
    struct mbuf *m;
    int count;
    int space;

    /*
     * If first mbuf has no cluster, and has room for len bytes
     * without shifting current data, pullup into it,
     * otherwise allocate a new mbuf to prepend to the chain.
     */
    if ((n->m_flags & M_EXT) == 0 &&
        n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
        if (n->m_len >= len)
            return (n);
        m = n;
        n = n->m_next;
        len -= m->m_len;
    } else {
        if (len > MHLEN)
            goto bad;
        _MGET(m, M_DONTWAIT, n->m_type);
        if (m == 0)
            goto bad;
        m->m_len = 0;
        if (n->m_flags & M_PKTHDR) {
            M_COPY_PKTHDR(m, n);
            n->m_flags &= ~M_PKTHDR;
        }
    }
    space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
    do {
        count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
        bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
            (unsigned)count);
        len -= count;
        m->m_len += count;
        n->m_len -= count;
        space -= count;
        if (n->m_len)
            n->m_data += count;
        else
            n = m_free(n);
    } while (len > 0 && n);
    if (len > 0) {
        (void) m_free(m);
        goto bad;
    }
    m->m_next = n;
    return (m);
bad:
    m_freem(n);
    return (0);
}
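/*
 * Usage sketch (illustrative only): the classic pattern in protocol input
 * paths, making sure a fixed-size header is contiguous before casting.
 * "struct example_hdr" is hypothetical.
 */
#if 0
struct example_hdr { u_int32_t word0, word1; };

static struct mbuf *
example_pullup(struct mbuf *m)
{
	if (m->m_len < (int)sizeof (struct example_hdr) &&
	    (m = m_pullup(m, sizeof (struct example_hdr))) == NULL)
		return (NULL);	/* the chain was freed by m_pullup */
	/* mtod(m, struct example_hdr *) is now safe to dereference */
	return (m);
}
#endif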
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
    return (m_split0(m0, len0, wait, 1));
}

static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
{
    struct mbuf *m, *n;
    unsigned len = len0, remain;

    for (m = m0; m && len > m->m_len; m = m->m_next)
        len -= m->m_len;
    if (m == NULL)
        return (NULL);
    remain = m->m_len - len;
    if (copyhdr && (m0->m_flags & M_PKTHDR)) {
        _MGETHDR(n, wait, m0->m_type);
        if (n == NULL)
            return (NULL);
        n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
        n->m_pkthdr.len = m0->m_pkthdr.len - len0;
        m0->m_pkthdr.len = len0;
        if (m->m_flags & M_EXT)
            goto extpacket;
        if (remain > MHLEN) {
            /* m can't be the lead packet */
            MH_ALIGN(n, 0);
            n->m_next = m_split(m, len, wait);
            if (n->m_next == NULL) {
                (void) m_free(n);
                return (NULL);
            } else
                return (n);
        } else
            MH_ALIGN(n, remain);
    } else if (remain == 0) {
        n = m->m_next;
        m->m_next = NULL;
        return (n);
    } else {
        _MGET(n, wait, m->m_type);
        if (n == NULL)
            return (NULL);
        /* ... */
    }
extpacket:
    if (m->m_flags & M_EXT) {
        n->m_flags |= M_EXT;
        n->m_ext = m->m_ext;
        n->m_data = m->m_data + len;
    } else {
        bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
    }
    n->m_len = remain;
    m->m_len = len;
    n->m_next = m->m_next;
    m->m_next = NULL;
    return (n);
}
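/*
 * Usage sketch (illustrative only): split a packet after "fraglen" bytes;
 * on success the original chain keeps the head and the returned chain
 * holds the tail, each with its m_pkthdr.len adjusted.
 */
#if 0
static struct mbuf *
example_split(struct mbuf *m, int fraglen)
{
	struct mbuf *tail;

	if ((tail = m_split(m, fraglen, M_DONTWAIT)) == NULL)
		return (NULL);	/* "m" is left unchanged on failure */
	return (tail);
}
#endif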
/*
 * Routine to copy from device local memory into mbufs.
 */
struct mbuf *
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
{
    struct mbuf *m;
    struct mbuf *top = NULL, **mp = &top;
    int off = off0, len;
    char *cp;
    char *epkt;

    cp = buf;
    epkt = cp + totlen;
    if (off) {
        /*
         * If 'off' is non-zero, packet is trailer-encapsulated,
         * so we have to skip the type and length fields.
         */
        cp += off + 2 * sizeof (u_int16_t);
        totlen -= 2 * sizeof (u_int16_t);
    }
    _MGETHDR(m, M_DONTWAIT, MT_DATA);
    if (m == NULL)
        return (NULL);
    m->m_pkthdr.rcvif = ifp;
    m->m_pkthdr.len = totlen;
    m->m_len = MHLEN;

    while (totlen > 0) {
        if (top != NULL) {
            _MGET(m, M_DONTWAIT, MT_DATA);
            if (m == NULL) {
                m_freem(top);
                return (NULL);
            }
            m->m_len = MLEN;
        }
        len = MIN(totlen, epkt - cp);
        if (len >= MINCLSIZE) {
            MCLGET(m, M_DONTWAIT);
            if (m->m_flags & M_EXT) {
                m->m_len = len = MIN(len, m_maxsize(MC_CL));
            } else {
                /* give up when it's out of cluster mbufs */
                m_freem(m);
                m_freem(top);
                return (NULL);
            }
        } else {
            /*
             * Place initial small packet/header at end of mbuf.
             */
            if (len < m->m_len) {
                if (top == NULL &&
                    len + max_linkhdr <= m->m_len)
                    m->m_data += max_linkhdr;
                m->m_len = len;
            }
        }
        if (copy)
            copy(cp, MTOD(m, caddr_t), (unsigned)len);
        else
            bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
        cp += len;
        *mp = m;
        mp = &m->m_next;
        totlen -= len;
        if (cp == epkt)
            cp = buf;
    }
    return (top);
}
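/*
 * Usage sketch (illustrative only): a driver receive path handing a
 * contiguous device buffer to the stack.  "rxbuf", "rxlen" and "ifp" are
 * hypothetical; passing a NULL copy routine makes m_devget() fall back
 * to bcopy().
 */
#if 0
static struct mbuf *
example_rx_to_mbuf(char *rxbuf, int rxlen, struct ifnet *ifp)
{
	return (m_devget(rxbuf, rxlen, 0, ifp, NULL));
}
#endif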
void
mbuf_growth_aggressive(void)
{
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until we are at least
     * 1/2 (50%) of current total capacity.
     */
    mbuf_gscale = MB_GROWTH_AGGRESSIVE;
    lck_mtx_unlock(mbuf_mlock);
}

void
mbuf_growth_normal(void)
{
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until we are at least
     * 15/16 (93.75%) of current total capacity.
     */
    mbuf_gscale = MB_GROWTH_NORMAL;
    lck_mtx_unlock(mbuf_mlock);
}
/*
 * Cluster freelist allocation check.
 */
static int
m_howmany(int num, size_t bufsize)
{
    int i = 0, j = 0;
    u_int32_t m_clusters, m_bigclusters, m_16kclusters;
    u_int32_t m_clfree, m_bigclfree, m_16kclfree;
    u_int32_t s = mbuf_gscale;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    m_clusters = m_total(MC_CL);
    m_bigclusters = m_total(MC_BIGCL);
    m_16kclusters = m_total(MC_16KCL);
    m_clfree = m_infree(MC_CL);
    m_bigclfree = m_infree(MC_BIGCL);
    m_16kclfree = m_infree(MC_16KCL);

    /* Bail if we've maxed out the mbuf memory map */
    if ((bufsize != m_maxsize(MC_16KCL) &&
        (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
        (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
        (m_16kclusters << 3) >= njcl)) {
        if (bufsize == MCLBYTES && num > m_clfree) {
            printf("m_howmany - out of small clusters, "
                "%d short\n", num - mbstat.m_clfree);
        }
        return (0);
    }

    if (bufsize == m_maxsize(MC_CL)) {
        /* Under minimum */
        if (m_clusters < MINCL)
            return (MINCL - m_clusters);
        /* Too few (free < threshold) and not over maximum */
        if (m_clusters < m_maxlimit(MC_CL)) {
            if (m_clfree >= MCL_LOWAT)
                return (0);
            if (num >= m_clfree)
                i = num - m_clfree;
            if (((m_clusters + num) >> s) > m_clfree)
                j = ((m_clusters + num) >> s) - m_clfree;
            i = MAX(i, j);
            if (i + m_clusters >= m_maxlimit(MC_CL))
                i = m_maxlimit(MC_CL) - m_clusters;
        }
        VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
    } else if (bufsize == m_maxsize(MC_BIGCL)) {
        /* Under minimum */
        if (m_bigclusters < MINBIGCL)
            return (MINBIGCL - m_bigclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
            if (m_bigclfree >= MBIGCL_LOWAT)
                return (0);
            if (num >= m_bigclfree)
                i = num - m_bigclfree;
            if (((m_bigclusters + num) >> 4) > m_bigclfree)
                j = ((m_bigclusters + num) >> 4) - m_bigclfree;
            i = MAX(i, j);
            if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
                i = m_maxlimit(MC_BIGCL) - m_bigclusters;
        }
        VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
    } else {
        /* 16K clusters: under minimum */
        if (m_16kclusters < MIN16KCL)
            return (MIN16KCL - m_16kclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_16kclusters < m_maxlimit(MC_16KCL)) {
            if (m_16kclfree >= M16KCL_LOWAT)
                return (0);
            if (num >= m_16kclfree)
                i = num - m_16kclfree;
            if (((m_16kclusters + num) >> 4) > m_16kclfree)
                j = ((m_16kclusters + num) >> 4) - m_16kclfree;
            i = MAX(i, j);
            if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
                i = m_maxlimit(MC_16KCL) - m_16kclusters;
        }
        VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
    }

    return (i);
}
/*
 * Return the number of bytes in the mbuf chain, m.
 */
unsigned int
m_length(struct mbuf *m)
{
    struct mbuf *m0;
    unsigned int pktlen;

    if (m->m_flags & M_PKTHDR)
        return (m->m_pkthdr.len);

    pktlen = 0;
    for (m0 = m; m0 != NULL; m0 = m0->m_next)
        pktlen += m0->m_len;
    return (pktlen);
}
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
void
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
{
    int error;
    struct mbuf *origm = m0;

    if (m0 == NULL)
        return;

    error = m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);

    if (error != 0 || (m0 != NULL && origm != m0))
        panic("m_copyback");
}
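/*
 * Usage sketch (illustrative only): overwrite a 2-byte field at a given
 * offset in a packet; the chain is extended if it is too short.  The
 * offset is hypothetical.
 */
#if 0
static void
example_poke_field(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof (val), &val);
}
#endif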
struct mbuf *
m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
{
    int error;

    /* don't support chain expansion */
    VERIFY(off + len <= m_length(m0));

    error = m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
    if (error) {
        /*
         * no way to recover from partial success.
         * just free the chain.
         */
        m_freem(m0);
        return (NULL);
    }
    return (m0);
}
/*
 * m_makewritable: ensure the specified range writable.
 */
int
m_makewritable(struct mbuf **mp, int off, int len, int how)
{
    int error;
#if DIAGNOSTIC
    struct mbuf *n;
    int origlen, reslen;

    origlen = m_length(*mp);
#endif /* DIAGNOSTIC */

#if 0 /* M_COPYALL is large enough */
    if (len == M_COPYALL)
        len = m_length(*mp) - off; /* XXX */
#endif

    error = m_copyback0(mp, off, len, NULL,
        M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

#if DIAGNOSTIC
    reslen = 0;
    for (n = *mp; n; n = n->m_next)
        reslen += n->m_len;
    if (origlen != reslen)
        panic("m_makewritable: length changed");
    if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
        panic("m_makewritable: inconsist");
#endif /* DIAGNOSTIC */

    return (error);
}
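/*
 * Usage sketch (illustrative only): before modifying a header in place,
 * make sure the bytes are not shared with another chain (copy-on-write).
 * The mbuf pointer may change, which is why a pointer to the pointer is
 * passed in.
 */
#if 0
static int
example_make_hdr_writable(struct mbuf **mp, int hdrlen)
{
	return (m_makewritable(mp, 0, hdrlen, M_DONTWAIT));
}
#endif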
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
    int mlen;
    struct mbuf *m, *n;
    struct mbuf **mp;
    int totlen = 0;
    const char *cp = vp;

    VERIFY(mp0 != NULL);
    VERIFY(*mp0 != NULL);
    VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
    VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

    /*
     * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
     * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
     */
    VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

    mp = mp0;
    m = *mp;
    while (off > (mlen = m->m_len)) {
        off -= mlen;
        totlen += mlen;
        if (m->m_next == NULL) {
            int tspace;

            if (!(flags & M_COPYBACK0_EXTEND))
                goto out;

            /*
             * try to make some space at the end of "m".
             */
            if (off + len >= MINCLSIZE &&
                !(m->m_flags & M_EXT) && m->m_len == 0) {
                MCLGET(m, how);
            }
            tspace = M_TRAILINGSPACE(m);
            if (tspace > 0) {
                tspace = MIN(tspace, off + len);
                bzero(mtod(m, char *) + m->m_len,
                    MIN(off, tspace));
                /* ... */
                continue;
            }

            /*
             * need to allocate an mbuf.
             */
            if (off + len >= MINCLSIZE) {
                n = m_getcl(how, m->m_type, 0);
            } else {
                n = _M_GET(how, m->m_type);
            }
            if (n == NULL)
                goto out;
            n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
            bzero(mtod(n, char *), MIN(n->m_len, off));
            m->m_next = n;
        }
        mp = &m->m_next;
        m = m->m_next;
    }
    while (len > 0) {
        mlen = m->m_len - off;
        if (mlen != 0 && m_mclhasreference(m)) {
            char *datap;
            int eatlen;

            /*
             * this mbuf is read-only.
             * allocate a new writable mbuf and try again.
             */
#if defined(DIAGNOSTIC)
            if (!(flags & M_COPYBACK0_COW))
                panic("m_copyback0: read-only");
#endif /* defined(DIAGNOSTIC) */

            /*
             * if we're going to write into the middle of
             * a mbuf, split it first.
             */
            if (off > 0 && len < mlen) {
                n = m_split0(m, off, how, 0);
                /* ... */
            }

            /*
             * XXX TODO coalesce into the trailingspace of
             * the previous mbuf when possible.
             */

            /*
             * allocate a new mbuf.  copy packet header if needed.
             */
            n = _M_GET(how, m->m_type);
            /* ... */
            if (off == 0 && (m->m_flags & M_PKTHDR)) {
                M_COPY_PKTHDR(n, m);
                /* ... */
            } else {
                if (len >= MINCLSIZE)
                    MCLGET(n, M_DONTWAIT);
                n->m_len =
                    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
            }

            /*
             * free the region which has been overwritten.
             * copying data from old mbufs if requested.
             */
            if (flags & M_COPYBACK0_PRESERVE)
                datap = mtod(n, char *);
            else
                datap = NULL;
            eatlen = n->m_len;
            VERIFY(off == 0 || eatlen >= mlen);
            if (off > 0) {
                VERIFY(len >= mlen);
                if (datap != NULL) {
                    m_copydata(m, off, mlen, datap);
                    datap += mlen;
                }
                /* ... */
            }
            while (m != NULL && m_mclhasreference(m) &&
                n->m_type == m->m_type && eatlen > 0) {
                mlen = MIN(eatlen, m->m_len);
                if (datap != NULL) {
                    m_copydata(m, 0, mlen, datap);
                    datap += mlen;
                }
                m->m_data += mlen;
                m->m_len -= mlen;
                eatlen -= mlen;
                if (m->m_len == 0)
                    *mp = m = m_free(m);
            }
            /* ... splice "n" into the chain before "m" ... */
            continue;
        }
        mlen = MIN(mlen, len);
        if (flags & M_COPYBACK0_COPYBACK) {
            bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
            cp += mlen;
        }
        len -= mlen;
        mlen += off;
        off = 0;
        totlen += mlen;
        if (len == 0)
            break;
        if (m->m_next == NULL) {
            /* ... extend the chain, as above ... */
        }
        mp = &m->m_next;
        m = m->m_next;
    }
out:
    if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
        VERIFY(flags & M_COPYBACK0_EXTEND);
        m->m_pkthdr.len = totlen;
    }

    return (0);
}
char *
mcl_to_paddr(char *addr)
{
    vm_offset_t base_phys;

    if (!MBUF_IN_MAP(addr))
        return (NULL);
    base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];

    if (base_phys == 0)
        return (NULL);
    return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
}
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
struct mbuf *
m_dup(struct mbuf *m, int how)
{
    struct mbuf *n, **np;
    struct mbuf *top;
    int copyhdr = 0;

    np = &top;
    top = NULL;
    if (m->m_flags & M_PKTHDR)
        copyhdr = 1;

    /*
     * Quick check: if we have one mbuf and its data fits in an
     * mbuf with packet header, just copy and go.
     */
    if (m->m_next == NULL) {
        /* Then just move the data into an mbuf and be done... */
        if (copyhdr) {
            if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
                if ((n = _M_GETHDR(how, m->m_type)) == NULL)
                    return (NULL);
                n->m_len = m->m_len;
                m_dup_pkthdr(n, m, how);
                bcopy(m->m_data, n->m_data, m->m_len);
                return (n);
            }
        } else if (m->m_len <= MLEN) {
            if ((n = _M_GET(how, m->m_type)) == NULL)
                return (NULL);
            bcopy(m->m_data, n->m_data, m->m_len);
            n->m_len = m->m_len;
            return (n);
        }
    }
    while (m != NULL) {
#if BLUE_DEBUG
        kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
            m->m_data);
#endif
        if (copyhdr)
            n = _M_GETHDR(how, m->m_type);
        else
            n = _M_GET(how, m->m_type);
        if (n == NULL)
            goto nospace;
        if (m->m_flags & M_EXT) {
            if (m->m_len <= m_maxsize(MC_CL))
                n = m_mclget(n, how);
            else if (m->m_len <= m_maxsize(MC_BIGCL))
                n = m_mbigget(n, how);
            else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
                n = m_m16kget(n, how);
            if (!(n->m_flags & M_EXT)) {
                (void) m_free(n);
                goto nospace;
            }
        }
        *np = n;
        if (copyhdr) {
            /* Don't use M_COPY_PKTHDR: preserve m_data */
            m_dup_pkthdr(n, m, how);
            copyhdr = 0;
            if (!(n->m_flags & M_EXT))
                n->m_data = n->m_pktdat;
        }
        n->m_len = m->m_len;
        /*
         * Get the dup on the same boundary as the original.
         * Assume that the two mbufs have the same offset to the
         * data area (up to word boundaries).
         */
        bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
        m = m->m_next;
        np = &n->m_next;
#if BLUE_DEBUG
        kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
            n->m_data);
#endif
    }
    return (top);

nospace:
    m_freem(top);
    return (NULL);
}
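/*
 * Usage sketch (illustrative only): take a deep copy of a packet so the
 * original can be modified or freed independently; unlike m_copym(),
 * m_dup() copies the data rather than sharing clusters.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *m)
{
	return (m_dup(m, M_DONTWAIT));
}
#endif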
#define MBUF_MULTIPAGES(m)						\
	(((m)->m_flags & M_EXT) &&					\
	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
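/*
 * Worked example (illustrative only, assuming NBPG == 4096): a cluster
 * mbuf (M_EXT set) whose m_data begins 0x100 bytes into a page and whose
 * m_len is 0x4000 satisfies P2ROUNDUP(m_data, NBPG) < (m_data + m_len),
 * so MBUF_MULTIPAGES() is true and m_expand() below must break the data
 * into per-page segments.
 */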
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    uintptr_t data0, data;
    unsigned int len0, len;

    VERIFY(MBUF_MULTIPAGES(m));
    VERIFY(m->m_next == NULL);
    data0 = (uintptr_t)m->m_data;
    len0 = m->m_len;
    *last = top;

    for (;;) {
        struct mbuf *n;

        data = data0;
        if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
            len = NBPG;
        else if (!IS_P2ALIGNED(data, NBPG) &&
            P2ROUNDUP(data, NBPG) < (data + len0))
            len = P2ROUNDUP(data, NBPG) - data;
        else
            len = len0;

        VERIFY(m->m_flags & M_EXT);
        m->m_data = (void *)data;
        m->m_len = len;

        *nm = *last = m;
        nm = &m->m_next;
        m->m_next = NULL;

        data0 += len;
        len0 -= len;
        if (len0 == 0)
            break;

        n = _M_RETRY(M_DONTWAIT, MT_DATA);
        if (n == NULL) {
            m_freem(top);
            top = *last = NULL;
            break;
        }
        n->m_ext = m->m_ext;
        /* ... add a reference to the cluster ... */
        n->m_flags |= M_EXT;
        m = n;
    }
    return (top);
}
struct mbuf *
m_normalize(struct mbuf *m)
{
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    boolean_t expanded = FALSE;

    while (m != NULL) {
        struct mbuf *n, *last;

        n = m->m_next;
        m->m_next = NULL;

        /* Does the data cross one or more page boundaries? */
        if (MBUF_MULTIPAGES(m)) {
            if ((m = m_expand(m, &last)) == NULL) {
                m_freem(n);
                m_freem(top);
                top = NULL;
                break;
            }
            *nm = m;
            nm = &last->m_next;
            expanded = TRUE;
        } else {
            *nm = m;
            nm = &m->m_next;
        }
        m = n;
    }
    if (expanded)
        atomic_add_32(&mb_normalized, 1);
    return (top);
}
void
m_mchtype(struct mbuf *m, int t)
{
    mtype_stat_inc(t);
    mtype_stat_dec(m->m_type);
    (m)->m_type = t;
}

void *
m_mtod(struct mbuf *m)
{
    return (MTOD(m, void *));
}

struct mbuf *
m_dtom(void *x)
{
    return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
}

void
m_mcheck(struct mbuf *m)
{
    _MCHECK(m);
}
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
static void
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
{
    mcache_waiter_inc(m_cache(class));
    if (comp) {
        if (class == MC_CL) {
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
        } else {
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
        }
    }
}

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
static void
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
{
    mcache_waiter_dec(m_cache(class));
    if (comp) {
        if (class == MC_CL) {
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
        } else {
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
        }
    }
}
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
    boolean_t mcache_retry = FALSE;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    /* Check if there's anything at the cache layer */
    if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
        goto done;
    }

    /* Nothing?  Then try hard to get it from somewhere */
    m_reclaim(class, num, (wait & MCR_COMP));

    /* We tried hard and got something? */
    if (m_infree(class) > 0) {
        goto done;
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
        goto done;
    } else if (wait & MCR_TRYHARD) {
        mcache_retry = TRUE;
        goto done;
    }

    /*
     * There's really nothing for us right now; inform the
     * cache(s) that there is a waiter below and go to sleep.
     */
    mbuf_waiter_inc(class, (wait & MCR_COMP));

    VERIFY(!(wait & MCR_NOSLEEP));
    (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

    /* We are now up; stop getting notified until next round */
    mbuf_waiter_dec(class, (wait & MCR_COMP));

    /* We waited and got something */
    if (m_infree(class) > 0) {
        goto done;
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
    }
done:
    return (mcache_retry);
}
static void
mbuf_worker_thread(void)
{
    int mbuf_expand;

    while (1) {
        lck_mtx_lock(mbuf_mlock);

        mbuf_expand = 0;
        if (mbuf_expand_mcl) {
            int n;

            /* Adjust to current number of clusters in use */
            n = mbuf_expand_mcl -
                (m_total(MC_CL) - m_infree(MC_CL));
            if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
                n = m_maxlimit(MC_CL) - m_total(MC_CL);
            mbuf_expand_mcl = 0;

            if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
                mbuf_expand++;
        }
        if (mbuf_expand_big) {
            int n;

            /* Adjust to current number of 4 KB clusters in use */
            n = mbuf_expand_big -
                (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
            if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
                n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
            mbuf_expand_big = 0;

            if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
                mbuf_expand++;
        }
        if (mbuf_expand_16k) {
            int n;

            /* Adjust to current number of 16 KB clusters in use */
            n = mbuf_expand_16k -
                (m_total(MC_16KCL) - m_infree(MC_16KCL));
            if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
                n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
            mbuf_expand_16k = 0;

            if (n > 0)
                (void) freelist_populate(MC_16KCL, n, M_WAIT);
        }

        /*
         * Because we can run out of memory before filling the mbuf
         * map, we should not allocate more clusters than there are
         * mbufs -- otherwise we could have a large number of useless
         * clusters allocated.
         */
        if (mbuf_expand) {
            while (m_total(MC_MBUF) <
                (m_total(MC_BIGCL) + m_total(MC_CL))) {
                if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
                    break;
            }
        }

        lck_mtx_unlock(mbuf_mlock);

        assert_wait(&mbuf_worker_run, THREAD_UNINT);
        (void) thread_block((thread_continue_t)mbuf_worker_thread);
    }
}

static void
mbuf_worker_thread_init(void)
{
    mbuf_worker_ready++;
    mbuf_worker_thread();
}
/*
 * Given the address of a buffer, return the slab that covers it, creating
 * the slab group on first use.
 */
static mcl_slab_t *
slab_get(void *buf)
{
    mcl_slabg_t *slg;
    unsigned int ix, k;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    VERIFY(MBUF_IN_MAP(buf));
    ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
    VERIFY(ix < maxslabgrp);

    if ((slg = slabstbl[ix]) == NULL) {
        /*
         * In the current implementation, we never shrink the memory
         * pool (hence the cluster map); if we attempt to reallocate
         * a cluster group when it's already allocated, panic since
         * this is a sign of a memory corruption (slabstbl[ix] got
         * nullified).  This also means that there shouldn't be any
         * hole in the kernel sub-map for the mbuf pool.
         */
        ++slabgrp;
        VERIFY(ix < slabgrp);
        /*
         * Slabs expansion can only be done single threaded; when
         * we get here, it must be as a result of m_clalloc() which
         * is serialized and therefore mb_clalloc_busy must be set.
         */
        VERIFY(mb_clalloc_busy);
        lck_mtx_unlock(mbuf_mlock);

        /* This is a new buffer; create the slabs group for it */
        MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
            M_WAITOK | M_ZERO);
        VERIFY(slg != NULL);

        lck_mtx_lock(mbuf_mlock);
        /*
         * No other thread could have gone into m_clalloc() after
         * we dropped the lock above, so verify that it's true.
         */
        VERIFY(mb_clalloc_busy);

        slabstbl[ix] = slg;

        /* Chain each slab in the group to its forward neighbor */
        for (k = 1; k < NSLABSPMB; k++)
            slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
        VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

        /* And chain the last slab in the previous group to this */
        if (ix > 0) {
            VERIFY(slabstbl[ix - 1]->
                slg_slab[NSLABSPMB - 1].sl_next == NULL);
            slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
                &slg->slg_slab[0];
        }
    }

    ix = MTOCL(buf) % NSLABSPMB;
    VERIFY(ix < NSLABSPMB);

    return (&slg->slg_slab[ix]);
}
static void
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
{
    sp->sl_class = class;
    sp->sl_flags = flags;
    sp->sl_base = base;
    sp->sl_head = head;
    sp->sl_len = len;
    sp->sl_refcnt = refcnt;
    sp->sl_chunks = chunks;
    slab_detach(sp);
}

static void
slab_insert(mcl_slab_t *sp, mbuf_class_t class)
{
    VERIFY(slab_is_detached(sp));
    m_slab_cnt(class)++;
    TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
    sp->sl_flags &= ~SLF_DETACHED;
    if (class == MC_BIGCL) {
        sp = sp->sl_next;
        /* Next slab must already be present */
        VERIFY(sp != NULL);
        VERIFY(slab_is_detached(sp));
        sp->sl_flags &= ~SLF_DETACHED;
    } else if (class == MC_16KCL) {
        int k;
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            sp = sp->sl_next;
            /* Next slab must already be present */
            VERIFY(sp != NULL);
            VERIFY(slab_is_detached(sp));
            sp->sl_flags &= ~SLF_DETACHED;
        }
    }
}

static void
slab_remove(mcl_slab_t *sp, mbuf_class_t class)
{
    VERIFY(!slab_is_detached(sp));
    VERIFY(m_slab_cnt(class) > 0);
    m_slab_cnt(class)--;
    TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
    slab_detach(sp);
    if (class == MC_BIGCL) {
        sp = sp->sl_next;
        /* Next slab must already be present */
        VERIFY(sp != NULL);
        VERIFY(!slab_is_detached(sp));
        slab_detach(sp);
    } else if (class == MC_16KCL) {
        int k;
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            sp = sp->sl_next;
            /* Next slab must already be present */
            VERIFY(sp != NULL);
            VERIFY(!slab_is_detached(sp));
            slab_detach(sp);
        }
    }
}

static boolean_t
slab_inrange(mcl_slab_t *sp, void *buf)
{
    return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
        (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
}

static void
slab_nextptr_panic(mcl_slab_t *sp, void *addr)
{
    int i;
    unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
    uintptr_t buf = (uintptr_t)sp->sl_base;

    for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
        void *next = ((mcache_obj_t *)buf)->obj_next;

        /* ... */
        if (mclaudit == NULL) {
            if (next != NULL && !MBUF_IN_MAP(next)) {
                mcache_t *cp = m_cache(sp->sl_class);
                panic("%s: %s buffer %p in slab %p modified "
                    "after free at offset 0: %p out of range "
                    "[%p-%p)\n", __func__, cp->mc_name,
                    (void *)buf, sp, next, mbutl, embutl);
                /* NOTREACHED */
            }
        } else {
            mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
                (mcache_obj_t *)buf);
            mcl_audit_verify_nextptr(next, mca);
        }
    }
}

static void
slab_detach(mcl_slab_t *sp)
{
    sp->sl_link.tqe_next = (mcl_slab_t *)-1;
    sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
    sp->sl_flags |= SLF_DETACHED;
}

static boolean_t
slab_is_detached(mcl_slab_t *sp)
{
    return ((intptr_t)sp->sl_link.tqe_next == -1 &&
        (intptr_t)sp->sl_link.tqe_prev == -1 &&
        (sp->sl_flags & SLF_DETACHED));
}
static void
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
{
    mcache_audit_t *mca, *mca_tail;
    mcache_obj_t *con = NULL;
    boolean_t save_contents = (con_list != NULL);
    unsigned int i, ix;

    ASSERT(num <= NMBPCL);
    ASSERT(con_list == NULL || con_size != 0);

    ix = MTOCL(buf);
    /* Make sure we haven't been here before */
    for (i = 0; i < NMBPCL; i++)
        VERIFY(mclaudit[ix].cl_audit[i] == NULL);

    mca = mca_tail = *mca_list;
    if (save_contents)
        con = *con_list;

    for (i = 0; i < num; i++) {
        mcache_audit_t *next;

        next = mca->mca_next;
        bzero(mca, sizeof (*mca));
        mca->mca_next = next;
        mclaudit[ix].cl_audit[i] = mca;

        /* Attach the contents buffer if requested */
        if (save_contents) {
            VERIFY(con != NULL);
            mca->mca_contents_size = con_size;
            mca->mca_contents = con;
            con = con->obj_next;
            bzero(mca->mca_contents, mca->mca_contents_size);
        }

        mca_tail = mca;
        mca = mca->mca_next;
    }

    if (save_contents)
        *con_list = con;

    *mca_list = mca_tail->mca_next;
    mca_tail->mca_next = NULL;
}

/*
 * Given an address of a buffer (mbuf/cluster/big cluster), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
{
    mcache_audit_t *mca = NULL;
    int ix = MTOCL(o);

    VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

    switch (class) {
    case MC_MBUF:
        /*
         * For the mbuf case, find the index of the cluster
         * used by the mbuf and use that index to locate the
         * base address of the cluster.  Then find out the
         * mbuf index relative to the cluster base and use
         * it to locate the audit structure.
         */
        VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
        mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
        break;

    case MC_CL:
    case MC_BIGCL:
    case MC_16KCL:
        /*
         * Same as above, but only return the first element.
         */
        mca = mclaudit[ix].cl_audit[0];
        break;

    default:
        VERIFY(0);
        /* NOTREACHED */
    }

    return (mca);
}
static void
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    boolean_t alloc)
{
    struct mbuf *m = addr;
    mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

    VERIFY(mca->mca_contents != NULL &&
        mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

    if (!alloc) {
        mcl_audit_verify_nextptr(next, mca);
        /* Save constructed mbuf fields */
        mcl_audit_save_mbuf(m, mca);
        mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
        ((mcache_obj_t *)m)->obj_next = next;
        return;
    }

    /* Check if the buffer has been corrupted while in freelist */
    mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));

    /* Restore constructed mbuf fields */
    mcl_audit_restore_mbuf(m, mca, composite);
}

static void
mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
{
    struct mbuf *ms = (struct mbuf *)mca->mca_contents;

    if (composite) {
        struct mbuf *next = m->m_next;
        VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
            MBUF_IS_COMPOSITE(ms));
        /*
         * We could have hand-picked the mbuf fields and restore
         * them individually, but that will be a maintenance
         * headache.  Instead, restore everything that was saved;
         * the mbuf layer will recheck and reinitialize anyway.
         */
        bcopy(ms, m, mca->mca_contents_size);
        m->m_next = next;
    } else {
        /*
         * For a regular mbuf (no cluster attached) there's nothing
         * to restore other than the type field, which is expected
         * to be MT_FREE.
         */
        m->m_type = ms->m_type;
    }
    _MCHECK(m);
}

static void
mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
{
    _MCHECK(m);
    bcopy(m, mca->mca_contents, mca->mca_contents_size);
}

static void
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
{
    mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

    if (!alloc) {
        mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
        if (save_next) {
            mcl_audit_verify_nextptr(next, mca);
            ((mcache_obj_t *)addr)->obj_next = next;
        }
    } else {
        /* Check if the buffer has been corrupted while in freelist */
        mcl_audit_verify_nextptr(next, mca);
        mcache_audit_free_verify_set(mca, addr, 0, size);
    }
}

static void
mcl_audit_mcheck_panic(struct mbuf *m)
{
    mcache_audit_t *mca;

    mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

    panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
        m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
    /* NOTREACHED */
}

static void
mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
{
    if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
        !MBUF_IN_MAP(next)) {
        panic("mcl_audit: buffer %p modified after free at offset 0: "
            "%p out of range [%p-%p)\n%s\n",
            mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
        /* NOTREACHED */
    }
}
SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mb_normalized, 0, "");