/*
 * Copyright (c) 2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <sys/mcache.h>
/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents a mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of NBPG in size; each
 *	object represents a mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *
 *		|				^
 *		|				|
 *		|		+-----------------------+
 *		v				|
 *	  mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
 *		|				^
 *		v				|
 *	    [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	  mbuf_slab_alloc()			|
 *		|				|
 *		v				|
 *	+---------> [freelist] -------> (found?) -------+
 *	|		|
 *	|		v
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *
 *		|				^
 *		|				|
 *		|	+------ (done) ---------+
 *		v	|			|
 *	  mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
 *		|				^
 *		v				|
 *	    [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	  mbuf_cslab_alloc()			|
 *		|				|
 *		v				|
 *	    [freelist] -------> (found?) -------+
 *		|				|
 *		v				|
 *	    (rudimentary object)		|
 *	  mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller.  As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern.  It
 * will also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *
 *		|				^
 *		|				|
 *		|	+------ (done) ---------+
 *		v	|
 *	  mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	  mbuf_slab_audit()			|
 *		|				|
 *		v				|
 *	    [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	  mbuf_slab_free()			|
 *		|				|
 *		v				|
 *	    [freelist] ----------->>------------+
 *	    (objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *
 *		|				^
 *		|				|
 *		|	+------ (done) ---------+
 *		v	|
 *	  mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	  mbuf_cslab_audit()			|
 *		|				|
 *		v				|
 *	    [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	  mbuf_cslab_free()			|
 *		|				|
 *		v				|
 *	    [freelist] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	    (rudimentary object)		|
 *	  mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Note
 * that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOCL(addr)			+-------------+
 *	      |		+---------->	| cl_audit[1] | -----> mcache_audit_t
 *	b = CLTOM(i)	|		+-------------+
 *	      |		|		|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	      |		|		| cl_audit[7] |
 *	      +-----------------+	+-------------+
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a cluster
 * can be turned into NMBPCL number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A cluster that never
 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For big clusters, only one entry is allocated
 * and used for the entire cluster pair.
 */
/* TODO: should be in a header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

static lck_mtx_t *mbuf_mlock;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;
/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16K cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */
static unsigned int mbuf_gscale; /* Power-of-two growth scale for m_howmany */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	4	/* Threshold: 15/16 of total */
typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4K) cluster */
	MC_16KCL,	/* Jumbo (16K) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4K) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16K) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)
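
/*
 * For illustration: with the enum ordering above, MBUF_CLASS_COMPOSITE()
 * is true exactly for MC_MBUF_CL, MC_MBUF_BIGCL and MC_MBUF_16KCL -- the
 * classes that pair an mbuf with an attached cluster -- since they all
 * follow MBUF_CLASS_LAST (MC_16KCL) in mbuf_class_t.
 */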
/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back to the class's slab list,
 * if it's not already done.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */
/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> MCLSHIFT)	/* 512 slabs/grp */
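
/*
 * Worked example (not part of the original comment): with MBSHIFT == 20 and
 * MCLSHIFT == 11 (2K clusters), NSLABSPMB is (1MB / 2KB) == 512, which is
 * where the "512 slabs/grp" figure above comes from.
 */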
typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPCL];	/* array of audits */
} mcl_audit_t;

#if CONFIG_MBUF_NOEXPAND
static unsigned int maxmbufcl;
#endif /* CONFIG_MBUF_NOEXPAND */
/*
 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf gets copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#define	AUDIT_CONTENTS_SIZE	((MSIZE - MHLEN) + sizeof (_m_ext_t))

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */
/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mcluster *mbutl;		/* first mapped cluster address */
union mcluster *embutl;		/* ending virtual address of mclusters */
int max_linkhdr;		/* largest link-level header */
int max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

extern u_int32_t high_sb_max;
/* TODO: should be in a header file */

/* The minimum number of objects that are allocated, to start. */
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MCL_LOWAT	MINCL
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL
typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist;	/* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;
#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
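
/*
 * Note (for clarity): NELEM(mbuf_table) evaluates to the number of classes
 * initialized above (7 entries, one per mbuf_class_t value), and is used
 * below to size mb_stat and omb_stat.
 */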
static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of sleepers */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static unsigned int m_length(struct mbuf *);
static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */
/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m) \
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
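
/*
 * In other words, an mbuf whose external ref count has dropped to zero but
 * which still carries EXTF_COMPOSITE is a cached composite object: it keeps
 * its cluster attached and is recycled through the composite cache instead
 * of being dismantled into separate mbuf and cluster frees.
 */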
/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain cluster index and base cluster address.
 */
#define	MTOCL(x)	(((char *)(x) - (char *)mbutl) >> MCLSHIFT)
#define	CLTOM(x)	((union mcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to the cluster base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> 8)
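
/*
 * Illustrative example (assuming MCLSHIFT == 11, i.e. 2K clusters, and
 * 256-byte mbufs as implied by the ">> 8" above): for an mbuf at offset
 * 0x1300 from mbutl, MTOCL() yields cluster index 2, CLTOM(2) returns the
 * cluster base at offset 0x1000, and MCLIDX() yields mbuf index 3 within
 * that cluster -- i.e. mclaudit[2].cl_audit[3] in the scheme shown earlier.
 */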
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		(m)->m_pkthdr.rcvif = NULL;				\
		(m)->m_pkthdr.len = 0;					\
		(m)->m_pkthdr.header = NULL;				\
		(m)->m_pkthdr.csum_flags = 0;				\
		(m)->m_pkthdr.csum_data = 0;				\
		(m)->m_pkthdr.tso_segsz = 0;				\
		(m)->m_pkthdr.vlan_tag = 0;				\
		(m)->m_pkthdr.socket_id = 0;				\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's.
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
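
/*
 * That is, a caller passing M_WAIT (M_DONTWAIT not set) maps to MCR_SLEEP,
 * while M_DONTWAIT maps to MCR_NOSLEEP at the mcache layer.
 */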
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to the mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
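
/*
 * Both macros above are an offsetof-style computation: they yield the byte
 * size of a stats structure whose trailing mbs_class[] array holds n
 * entries, so the buffers can be sized for exactly NELEM(mbuf_table)
 * classes.
 */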
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to the outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During a sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))

/* This should be in a header file */
#define	atomic_add_16(a, n)	((void) OSAddAtomic16(n, a))
#define	atomic_add_32(a, n)	((void) OSAddAtomic(n, a))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
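
/*
 * Usage note (for clarity): mtype_stat_inc()/mtype_stat_dec() are the
 * intended entry points when an mbuf changes m_type.  Common types below
 * MT_MAX go to the calling CPU's counters via MTYPES_CPU(); anything else
 * falls back to an atomic update of the legacy mbstat.m_mtypes[] array.
 */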
static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mtypes_cpu_t mtc;
	int m, n;

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}
static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache and mbufs */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}
static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
	} while (!OSCompareAndSwap(old, new, addr));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
static void
mbuf_table_init(void)
{
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters is going to be split in 2 to hold both the 2K
	 * and the 4K pools, so make sure each half is even.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2K clusters, so make
		 * sure that the pool size is evenly divisible by 8.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
	}

#if CONFIG_MBUF_NOEXPAND
	/* Only use 4k clusters if we're setting aside more than 256k */
	if (nmbclusters <= 128) {
		maxmbufcl = nmbclusters / 4;
	} else {
		/* Half to big clusters, half to small */
		maxmbufcl = (nmbclusters / 4) * 3;
	}
#endif /* CONFIG_MBUF_NOEXPAND */
	/*
	 * 1/2 of the map is reserved for 2K clusters.  Out of this, 1/16th
	 * of the total number of 2K clusters allocated is reserved and cannot
	 * be turned into mbufs.  It can only be used for pure cluster objects.
	 */
	m_minlimit(MC_CL) = (nclusters >> 5);
	m_maxlimit(MC_CL) = (nclusters >> 1);
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * The remaining (15/16th) can be turned into mbufs.
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * The other 1/2 of the map is reserved for 4K clusters.
	 */
	m_minlimit(MC_BIGCL) = 0;
	m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = NBPG;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> 3);
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (64 << MBSHIFT)  /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */, (96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */, (128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */, (160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */
__private_extern__ unsigned int
mbuf_default_ncl(int srv, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(srv)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (srv ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
__private_extern__ void
mbinit(void)
{
	unsigned int m;
	int initmcl = MINCL;
	void *buf;
	thread_t thread = THREAD_NULL;

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);

	/* Allocate cluster slabs table */
	maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/* Allocate audit structures if needed */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_AUDIT) {
		MALLOC(mclaudit, mcl_audit_t *,
		    nmbclusters * sizeof (*mclaudit), M_TEMP,
		    M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mcluster *)
	    ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));

	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));

	lck_mtx_lock(mbuf_mlock);

	if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
		panic("mbinit: m_clalloc failed\n");

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	mbuf_gscale = MB_GROWTH_NORMAL;

	/*
	 * Set the max limit on sb_max to be 1/16th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 MB of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1 MB of mbuf pool, cap the
			 * size of the max sock buf at 1 MB.
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	printf("mbinit: done (%d MB memory set for mbuf pool)\n",
	    (nmbclusters << MCLSHIFT) >> MBSHIFT);
}
/*
 * Obtain a slab of object(s) from the class's freelist.
 */
static mcache_obj_t *
slab_alloc(mbuf_class_t class, int wait)
{
	mcl_slab_t *sp;
	mcache_obj_t *buf;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);

	/* This should always be NULL for us */
	VERIFY(m_cobjlist(class) == NULL);

	/*
	 * Treat composite objects as having a longer lifespan by using
	 * a slab from the reverse direction, in the hope that this could
	 * reduce the probability of fragmentation for slabs that hold
	 * more than one buffer chunk (e.g. mbuf slabs).  For other
	 * slabs, this probably doesn't make much of a difference.
	 */
	if (class == MC_MBUF && (wait & MCR_COMP))
		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
	else
		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));

	if (sp == NULL) {
		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
		/* The slab list for this class is empty */
		return (NULL);
	}

	VERIFY(m_infree(class) > 0);
	VERIFY(!slab_is_detached(sp));
	VERIFY(sp->sl_class == class &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
	buf = sp->sl_head;
	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));

	if (class == MC_MBUF) {
		sp->sl_head = buf->obj_next;
		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
	} else {
		sp->sl_head = NULL;
	}
	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
		slab_nextptr_panic(sp, sp->sl_head);
		/* In case sl_head is in the map but not in the slab */
		VERIFY(slab_inrange(sp, sp->sl_head));
	}

	/* Increment slab reference */
	sp->sl_refcnt++;

	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mca->mca_uflags = 0;
		/* Save contents on mbuf objects only */
		if (class == MC_MBUF)
			mca->mca_uflags |= MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
		/*
		 * A 2K cluster slab can have at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
	} else if (class == MC_BIGCL) {
		mcl_slab_t *nsp = sp->sl_next;
		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
		/*
		 * Increment 2nd slab.  A 4K big cluster takes
		 * 2 slabs, each having at most 1 reference.
		 */
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
		/* Next slab must already be present */
		VERIFY(nsp != NULL);
		nsp->sl_refcnt++;
		VERIFY(!slab_is_detached(nsp));
		VERIFY(nsp->sl_class == MC_BIGCL &&
		    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
		    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
		    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
		    nsp->sl_head == NULL);
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;

		--m_infree(MC_16KCL);
		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		/*
		 * Increment 2nd-8th slab.  A 16K big cluster takes
		 * 8 cluster slabs, each having at most 1 reference.
		 */
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt++;
			VERIFY(!slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		ASSERT(class == MC_MBUF);
		--m_infree(MC_MBUF);
		/*
		 * If auditing is turned on, this check is
		 * deferred until later in mbuf_slab_audit().
		 */
		if (mclaudit == NULL)
			_MCHECK((struct mbuf *)buf);
		/*
		 * Since we have incremented the reference count above,
		 * an mbuf slab (formerly a 2K cluster slab that was cut
		 * up into mbufs) must have a reference count between 1
		 * and NMBPCL at this point.
		 */
		VERIFY(sp->sl_refcnt >= 1 &&
		    (unsigned short)sp->sl_refcnt <= NMBPCL &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
		    sp->sl_head == NULL);
	}

	/* If empty, remove this slab from the class's freelist */
	if (sp->sl_head == NULL) {
		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
		slab_remove(sp, class);
	}

	return (buf);
}
/*
 * Place a slab of object(s) back into a class's slab list.
 */
static void
slab_free(mbuf_class_t class, mcache_obj_t *buf)
{
	mcl_slab_t *sp;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	VERIFY(class != MC_16KCL || njcl > 0);
	VERIFY(buf->obj_next == NULL);
	sp = slab_get(buf);
	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);

	/* Decrement slab reference */
	sp->sl_refcnt--;

	if (class == MC_CL || class == MC_BIGCL) {
		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
		/*
		 * A 2K cluster slab can have at most 1 reference
		 * which must be 0 at this point.
		 */
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		if (class == MC_BIGCL) {
			mcl_slab_t *nsp = sp->sl_next;
			VERIFY(IS_P2ALIGNED(buf, NBPG));
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			/* Decrement 2nd slab reference */
			nsp->sl_refcnt--;
			/*
			 * A 4K big cluster takes 2 slabs, both
			 * must now have 0 reference.
			 */
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_BIGCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else if (class == MC_16KCL) {
		mcl_slab_t *nsp;
		int k;
		/*
		 * A 16K cluster takes 8 cluster slabs, all must
		 * now have 0 reference.
		 */
		VERIFY(IS_P2ALIGNED(buf, NBPG));
		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
		    sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
		VERIFY(slab_is_detached(sp));
		for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
			nsp = nsp->sl_next;
			/* Next slab must already be present */
			VERIFY(nsp != NULL);
			nsp->sl_refcnt--;
			VERIFY(slab_is_detached(nsp));
			VERIFY(nsp->sl_class == MC_16KCL &&
			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
			    nsp->sl_head == NULL);
		}
	} else {
		/*
		 * An mbuf slab has a total of NMBPCL reference counts.
		 * Since we have decremented the reference above, it
		 * must now be between 0 and NMBPCL-1.
		 */
		VERIFY(sp->sl_refcnt >= 0 &&
		    (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
		    sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
		VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
		    (slab_is_detached(sp) && sp->sl_head == NULL));
	}

	/*
	 * When auditing is enabled, ensure that the buffer still
	 * contains the free pattern.  Otherwise it got corrupted
	 * while at the CPU cache layer.
	 */
	if (mclaudit != NULL) {
		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
		mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
		mca->mca_uflags &= ~MB_SCVALID;
	}

	if (class == MC_CL) {
		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
	} else if (class == MC_BIGCL) {
		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
		    m_infree(MC_MBUF_BIGCL);
	} else if (class == MC_16KCL) {
		++m_infree(MC_16KCL);
	} else {
		++m_infree(MC_MBUF);
		buf->obj_next = sp->sl_head;
	}
	sp->sl_head = buf;

	/* All mbufs are freed; return the cluster that we stole earlier */
	if (sp->sl_refcnt == 0 && class == MC_MBUF) {
		int i = NMBPCL;

		m_total(MC_MBUF) -= NMBPCL;
		mbstat.m_mbufs = m_total(MC_MBUF);
		m_infree(MC_MBUF) -= NMBPCL;
		mtype_stat_add(MT_FREE, -((unsigned)NMBPCL));

		while (i--) {
			struct mbuf *m = sp->sl_head;
			VERIFY(m != NULL);
			sp->sl_head = m->m_next;
			m->m_next = NULL;
		}
		VERIFY(sp->sl_head == NULL);

		/* Remove the slab from the mbuf class's slab list */
		slab_remove(sp, class);

		/* Reinitialize it as a 2K cluster slab */
		slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
		    sp->sl_len, 0, 1);

		if (mclaudit != NULL)
			mcache_set_pattern(MCACHE_FREE_PATTERN,
			    (caddr_t)sp->sl_head, m_maxsize(MC_CL));

		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);

		VERIFY(slab_is_detached(sp));
		/* And finally switch class */
		class = MC_CL;
	}

	/* Reinsert the slab to the class's slab list */
	if (slab_is_detached(sp))
		slab_insert(sp, class);
}
/*
 * Common allocator for rudimentary objects called by the CPU cache layer
 * during an allocation request whenever there is no available element in the
 * bucket layer.  It returns one or more elements from the appropriate global
 * freelist.  If the freelist is empty, it will attempt to populate it and
 * retry the allocation.
 */
static unsigned int
mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	unsigned int need = num;
	mcache_obj_t **list = *plist;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		if ((*list = slab_alloc(class, wait)) != NULL) {
			(*list)->obj_next = NULL;
			list = *plist = &(*list)->obj_next;

			if (--need == 0) {
				/*
				 * If the number of elements in freelist has
				 * dropped below low watermark, asynchronously
				 * populate the freelist now rather than doing
				 * it later when we run out of elements.
				 */
				if (!mbuf_cached_above(class, wait) &&
				    m_infree(class) < m_total(class) >> 5) {
					(void) freelist_populate(class, 1,
					    M_DONTWAIT);
				}
				break;
			}
		} else {
			VERIFY(m_infree(class) == 0 || class == MC_CL);

			(void) freelist_populate(class, 1,
			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);

			if (m_infree(class) > 0)
				continue;

			/* Check if there's anything at the cache layer */
			if (mbuf_cached_above(class, wait))
				break;

			/* We have nothing and cannot block; give up */
			if (wait & MCR_NOSLEEP) {
				if (!(wait & MCR_TRYHARD)) {
					m_fail_cnt(class)++;
					break;
				}
			}

			/*
			 * If the freelist is still empty and the caller is
			 * willing to be blocked, sleep on the wait channel
			 * until an element is available.  Otherwise, if
			 * MCR_TRYHARD is set, do our best to satisfy the
			 * request without having to go to sleep.
			 */
			if (mbuf_worker_ready &&
			    mbuf_sleep(class, need, wait))
				break;

			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
		}
	}

	m_alloc_cnt(class) += num - need;
	lck_mtx_unlock(mbuf_mlock);

	return (num - need);
}
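
/*
 * Note (for clarity): like the other slab-layer allocators in this file,
 * the routine above returns the number of objects actually placed on
 * *plist, which may be fewer than "num" when the freelist cannot be
 * replenished; the caller treats a short count as a partial allocation.
 */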
/*
 * Common de-allocator for rudimentary objects called by the CPU cache
 * layer when one or more elements need to be returned to the appropriate
 * class's freelist.
 */
static void
mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_obj_t *nlist;
	unsigned int num = 0;
	int w;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	lck_mtx_lock(mbuf_mlock);

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;
		slab_free(class, list);
		++num;
		if ((list = nlist) == NULL)
			break;
	}
	m_free_cnt(class) += num;

	if ((w = mb_waiters) > 0)
		mb_waiters = 0;

	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
/*
 * Common auditor for rudimentary objects called by the CPU cache layer
 * during an allocation or free request.  For the former, this is called
 * after the objects are obtained from either the bucket or slab layer
 * and before they are returned to the caller.  For the latter, this is
 * called immediately during free and before placing the objects into
 * the bucket or slab layer.
 */
static void
mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	mcache_audit_t *mca;

	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));

	while (list != NULL) {
		lck_mtx_lock(mbuf_mlock);
		mca = mcl_audit_buf2mca(class, list);

		/* Do the sanity checks */
		if (class == MC_MBUF) {
			mcl_audit_mbuf(mca, list, FALSE, alloc);
			ASSERT(mca->mca_uflags & MB_SCVALID);
		} else {
			mcl_audit_cluster(mca, list, m_maxsize(class),
			    alloc, TRUE);
			ASSERT(!(mca->mca_uflags & MB_SCVALID));
		}
		/* Record this transaction */
		mcache_buffer_log(mca, list, m_cache(class));
		if (alloc)
			mca->mca_uflags |= MB_INUSE;
		else
			mca->mca_uflags &= ~MB_INUSE;
		/* Unpair the object (unconditionally) */
		mca->mca_uptr = NULL;
		lck_mtx_unlock(mbuf_mlock);

		list = list->obj_next;
	}
}
/*
 * Common notify routine for all caches.  It is called by mcache when
 * one or more objects get freed.  We use this indication to trigger
 * the wakeup of any sleeping threads so that they can retry their
 * allocation requests.
 */
static void
mbuf_slab_notify(void *arg, u_int32_t reason)
{
	mbuf_class_t class = (mbuf_class_t)arg;
	int w;

	ASSERT(MBUF_CLASS_VALID(class));

	if (reason != MCN_RETRYALLOC)
		return;

	lck_mtx_lock(mbuf_mlock);
	if ((w = mb_waiters) > 0) {
		m_notified(class)++;
		mb_waiters = 0;
	}
	lck_mtx_unlock(mbuf_mlock);

	if (w != 0)
		wakeup(mb_waitchan);
}
1734 * Obtain object(s) from the composite class's freelist.
1737 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1739 unsigned int need = num;
1740 mcl_slab_t *sp, *clsp, *nsp;
1742 mcache_obj_t **list = *plist;
1746 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1747 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1749 /* Get what we can from the freelist */
1750 while ((*list = m_cobjlist(class)) != NULL) {
1753 m = (struct mbuf *)*list;
1755 cl = m->m_ext.ext_buf;
1756 clsp = slab_get(cl);
1757 VERIFY(m->m_flags == M_EXT && cl != NULL);
1758 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1759 VERIFY(clsp->sl_refcnt == 1);
1760 if (class == MC_MBUF_BIGCL) {
1761 nsp = clsp->sl_next;
1762 /* Next slab must already be present */
1763 VERIFY(nsp != NULL);
1764 VERIFY(nsp->sl_refcnt == 1);
1765 } else if (class == MC_MBUF_16KCL) {
1767 for (nsp = clsp, k = 1;
1768 k < (M16KCLBYTES / MCLBYTES); k++) {
1770 /* Next slab must already be present */
1771 VERIFY(nsp != NULL);
1772 VERIFY(nsp->sl_refcnt == 1);
1776 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1777 !MBUF_IN_MAP(m_cobjlist(class))) {
1778 slab_nextptr_panic(sp, m_cobjlist(class));
1781 (*list)->obj_next = NULL;
1782 list = *plist = &(*list)->obj_next;
1787 m_infree(class) -= (num - need);
1789 return (num - need);
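/*
 * Editor's note: a minimal sketch, not from the original source, of the
 * "*plist" convention used by cslab_alloc() above: the caller passes the
 * address of its list tail, each object handed back is linked through
 * obj_next, and the tail pointer is advanced so objects append in order.
 * The function name is hypothetical.
 */
#if 0	/* illustrative only; mcache_obj_t as declared in the mcache headers */
static unsigned int
example_append_objects(mcache_obj_t ***plist, mcache_obj_t *objs,
    unsigned int num)
{
	mcache_obj_t **list = *plist;
	unsigned int cnt = 0;

	while (objs != NULL && cnt < num) {
		*list = objs;			/* hand object to the caller */
		objs = objs->obj_next;
		(*list)->obj_next = NULL;	/* detach from the source list */
		list = *plist = &(*list)->obj_next; /* advance the tail */
		cnt++;
	}
	return (cnt);
}
#endif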
1793 * Place object(s) back into a composite class's freelist.
1796 cslab_free(mbuf_class_t
class, mcache_obj_t
*list
, int purged
)
1798 mcache_obj_t
*o
, *tail
;
1799 unsigned int num
= 0;
1800 struct mbuf
*m
, *ms
;
1801 mcache_audit_t
*mca
= NULL
;
1802 mcache_obj_t
*ref_list
= NULL
;
1803 mcl_slab_t
*clsp
, *nsp
;
1806 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1807 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1808 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
1812 while ((m
= ms
= (struct mbuf
*)o
) != NULL
) {
1813 mcache_obj_t
*rfa
, *nexto
= o
->obj_next
;
1815 /* Do the mbuf sanity checks */
1816 if (mclaudit
!= NULL
) {
1817 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
1818 mcache_audit_free_verify(mca
, m
, 0, m_maxsize(MC_MBUF
));
1819 ms
= (struct mbuf
*)mca
->mca_contents
;
1822 /* Do the cluster sanity checks */
1823 cl
= ms
->m_ext
.ext_buf
;
1824 clsp
= slab_get(cl
);
1825 if (mclaudit
!= NULL
) {
1827 if (class == MC_MBUF_CL
)
1828 size
= m_maxsize(MC_CL
);
1829 else if (class == MC_MBUF_BIGCL
)
1830 size
= m_maxsize(MC_BIGCL
);
1832 size
= m_maxsize(MC_16KCL
);
1833 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL
,
1834 (mcache_obj_t
*)cl
), cl
, 0, size
);
1836 VERIFY(ms
->m_type
== MT_FREE
);
1837 VERIFY(ms
->m_flags
== M_EXT
);
1838 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
1839 VERIFY(clsp
->sl_refcnt
== 1);
1840 if (class == MC_MBUF_BIGCL
) {
1841 nsp
= clsp
->sl_next
;
1842 /* Next slab must already be present */
1843 VERIFY(nsp
!= NULL
);
1844 VERIFY(nsp
->sl_refcnt
== 1);
1845 } else if (class == MC_MBUF_16KCL
) {
1847 for (nsp
= clsp
, k
= 1;
1848 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
1850 /* Next slab must already be present */
1851 VERIFY(nsp
!= NULL
);
1852 VERIFY(nsp
->sl_refcnt
== 1);
1857 * If we're asked to purge, restore the actual mbuf using
1858 * contents of the shadow structure (if auditing is enabled)
1859 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
1860 * about to free it and the attached cluster into their caches.
1863 /* Restore constructed mbuf fields */
1864 if (mclaudit
!= NULL
)
1865 mcl_audit_restore_mbuf(m
, mca
, TRUE
);
1870 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
1871 rfa
->obj_next
= ref_list
;
1875 m
->m_type
= MT_FREE
;
1876 m
->m_flags
= m
->m_len
= 0;
1877 m
->m_next
= m
->m_nextpkt
= NULL
;
1879 /* Save mbuf fields and make auditing happy */
1880 if (mclaudit
!= NULL
)
1881 mcl_audit_mbuf(mca
, o
, FALSE
, FALSE
);
1883 VERIFY(m_total(class) > 0);
1888 slab_free(MC_MBUF
, o
);
1890 /* And free the cluster */
1891 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
1892 if (class == MC_MBUF_CL
)
1893 slab_free(MC_CL
, cl
);
1894 else if (class == MC_MBUF_BIGCL
)
1895 slab_free(MC_BIGCL
, cl
);
1897 slab_free(MC_16KCL
, cl
);
1906 tail
->obj_next
= m_cobjlist(class);
1907 m_cobjlist(class) = list
;
1908 m_infree(class) += num
;
1909 } else if (ref_list
!= NULL
) {
1910 mcache_free_ext(ref_cache
, ref_list
);
1917 * Common allocator for composite objects called by the CPU cache layer
1918 * during an allocation request whenever there is no available element in
1919 * the bucket layer. It returns one or more composite elements from the
1920 * appropriate global freelist. If the freelist is empty, it will attempt
1921 * to obtain the rudimentary objects from their caches and construct them
1922 * into composite mbuf + cluster objects.
1925 mbuf_cslab_alloc(void *arg
, mcache_obj_t
***plist
, unsigned int needed
,
1928 mbuf_class_t
class = (mbuf_class_t
)arg
;
1929 mcache_t
*cp
= NULL
;
1930 unsigned int num
= 0, cnum
= 0, want
= needed
;
1931 mcache_obj_t
*ref_list
= NULL
;
1932 mcache_obj_t
*mp_list
= NULL
;
1933 mcache_obj_t
*clp_list
= NULL
;
1934 mcache_obj_t
**list
;
1935 struct ext_ref
*rfa
;
1939 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1942 VERIFY(class != MC_MBUF_16KCL
|| njcl
> 0);
1944 /* There should not be any slab for this class */
1945 VERIFY(m_slab_cnt(class) == 0 &&
1946 m_slablist(class).tqh_first
== NULL
&&
1947 m_slablist(class).tqh_last
== NULL
);
1949 lck_mtx_lock(mbuf_mlock
);
1951 /* Try using the freelist first */
1952 num
= cslab_alloc(class, plist
, needed
);
1954 if (num
== needed
) {
1955 m_alloc_cnt(class) += num
;
1956 lck_mtx_unlock(mbuf_mlock
);
1960 lck_mtx_unlock(mbuf_mlock
);
1963 * We could not satisfy the request using the freelist alone;
1964 * allocate from the appropriate rudimentary caches and use
1965 * whatever we can get to construct the composite objects.
1970 * Mark these allocation requests as coming from a composite cache.
1971 * Also, if the caller is willing to be blocked, mark the request
1972 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1973 * slab layer waiting for the individual object when one or more
1974 * of the already-constructed composite objects are available.
1977 if (!(wait
& MCR_NOSLEEP
))
1980 needed
= mcache_alloc_ext(m_cache(MC_MBUF
), &mp_list
, needed
, wait
);
1982 ASSERT(mp_list
== NULL
);
1985 if (class == MC_MBUF_CL
)
1986 cp
= m_cache(MC_CL
);
1987 else if (class == MC_MBUF_BIGCL
)
1988 cp
= m_cache(MC_BIGCL
);
1990 cp
= m_cache(MC_16KCL
);
1991 needed
= mcache_alloc_ext(cp
, &clp_list
, needed
, wait
);
1993 ASSERT(clp_list
== NULL
);
1996 needed
= mcache_alloc_ext(ref_cache
, &ref_list
, needed
, wait
);
1998 ASSERT(ref_list
== NULL
);
2003 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
2004 * overs will get freed accordingly before we return to caller.
2006 for (cnum
= 0; cnum
< needed
; cnum
++) {
2009 m
= ms
= (struct mbuf
*)mp_list
;
2010 mp_list
= mp_list
->obj_next
;
2013 clp_list
= clp_list
->obj_next
;
2014 ((mcache_obj_t
*)cl
)->obj_next
= NULL
;
2016 rfa
= (struct ext_ref
*)ref_list
;
2017 ref_list
= ref_list
->obj_next
;
2018 ((mcache_obj_t
*)rfa
)->obj_next
= NULL
;
2021 * If auditing is enabled, construct the shadow mbuf
2022 * in the audit structure instead of in the actual one.
2023 * mbuf_cslab_audit() will take care of restoring the
2024 * contents after the integrity check.
2026 if (mclaudit
!= NULL
) {
2027 mcache_audit_t
*mca
, *cl_mca
;
2030 lck_mtx_lock(mbuf_mlock
);
2031 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2032 ms
= ((struct mbuf
*)mca
->mca_contents
);
2033 cl_mca
= mcl_audit_buf2mca(MC_CL
, (mcache_obj_t
*)cl
);
2036 * Pair them up. Note that this is done at the time
2037 * the mbuf+cluster objects are constructed. This
2038 * information should be treated as "best effort"
2039 * debugging hint since more than one mbuf can refer
2040 * to a cluster. In that case, the cluster might not
2041 * be freed along with the mbuf it was paired with.
2043 mca
->mca_uptr
= cl_mca
;
2044 cl_mca
->mca_uptr
= mca
;
2046 ASSERT(mca
->mca_uflags
& MB_SCVALID
);
2047 ASSERT(!(cl_mca
->mca_uflags
& MB_SCVALID
));
2048 lck_mtx_unlock(mbuf_mlock
);
2050 /* Technically, they are in the freelist */
2051 mcache_set_pattern(MCACHE_FREE_PATTERN
, m
,
2052 m_maxsize(MC_MBUF
));
2053 if (class == MC_MBUF_CL
)
2054 size
= m_maxsize(MC_CL
);
2055 else if (class == MC_MBUF_BIGCL
)
2056 size
= m_maxsize(MC_BIGCL
);
2058 size
= m_maxsize(MC_16KCL
);
2059 mcache_set_pattern(MCACHE_FREE_PATTERN
, cl
, size
);
2062 MBUF_INIT(ms
, 0, MT_FREE
);
2063 if (class == MC_MBUF_16KCL
) {
2064 MBUF_16KCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2065 } else if (class == MC_MBUF_BIGCL
) {
2066 MBUF_BIGCL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2068 MBUF_CL_INIT(ms
, cl
, rfa
, 0, EXTF_COMPOSITE
);
2070 VERIFY(ms
->m_flags
== M_EXT
);
2071 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2073 *list
= (mcache_obj_t
*)m
;
2074 (*list
)->obj_next
= NULL
;
2075 list
= *plist
= &(*list
)->obj_next
;
2080 * Free up what's left of the above.
2082 if (mp_list
!= NULL
)
2083 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
2084 if (clp_list
!= NULL
)
2085 mcache_free_ext(cp
, clp_list
);
2086 if (ref_list
!= NULL
)
2087 mcache_free_ext(ref_cache
, ref_list
);
2089 lck_mtx_lock(mbuf_mlock
);
2090 if (num
> 0 || cnum
> 0) {
2091 m_total(class) += cnum
;
2092 VERIFY(m_total(class) <= m_maxlimit(class));
2093 m_alloc_cnt(class) += num
+ cnum
;
2095 if ((num
+ cnum
) < want
)
2096 m_fail_cnt(class) += (want
- (num
+ cnum
));
2097 lck_mtx_unlock(mbuf_mlock
);
2099 return (num
+ cnum
);
2103 * Common de-allocator for composite objects called by the CPU cache
2104 * layer when one or more elements need to be returned to the appropriate
2108 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2110 mbuf_class_t class = (mbuf_class_t)arg;
2114 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2116 lck_mtx_lock(mbuf_mlock);
2118 num = cslab_free(class, list, purged);
2119 m_free_cnt(class) += num;
2121 if ((w = mb_waiters) > 0)
2124 lck_mtx_unlock(mbuf_mlock);
2127 wakeup(mb_waitchan);
2131 * Common auditor for composite objects called by the CPU cache layer
2132 * during an allocation or free request. For the former, this is called
2133 * after the objects are obtained from either the bucket or slab layer
2134 * and before they are returned to the caller. For the latter, this is
2135 * called immediately during free and before placing the objects into
2136 * the bucket or slab layer.
2139 mbuf_cslab_audit(void *arg
, mcache_obj_t
*list
, boolean_t alloc
)
2141 mbuf_class_t
class = (mbuf_class_t
)arg
;
2142 mcache_audit_t
*mca
;
2143 struct mbuf
*m
, *ms
;
2144 mcl_slab_t
*clsp
, *nsp
;
2148 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2150 while ((m
= ms
= (struct mbuf
*)list
) != NULL
) {
2151 lck_mtx_lock(mbuf_mlock
);
2152 /* Do the mbuf sanity checks and record its transaction */
2153 mca
= mcl_audit_buf2mca(MC_MBUF
, (mcache_obj_t
*)m
);
2154 mcl_audit_mbuf(mca
, m
, TRUE
, alloc
);
2155 mcache_buffer_log(mca
, m
, m_cache(class));
2157 mca
->mca_uflags
|= MB_COMP_INUSE
;
2159 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2162 * Use the shadow mbuf in the audit structure if we are
2163 * freeing, since the contents of the actual mbuf has been
2164 * pattern-filled by the above call to mcl_audit_mbuf().
2167 ms
= (struct mbuf
*)mca
->mca_contents
;
2169 /* Do the cluster sanity checks and record its transaction */
2170 cl
= ms
->m_ext
.ext_buf
;
2171 clsp
= slab_get(cl
);
2172 VERIFY(ms
->m_flags
== M_EXT
&& cl
!= NULL
);
2173 VERIFY(MEXT_RFA(ms
) != NULL
&& MBUF_IS_COMPOSITE(ms
));
2174 VERIFY(clsp
->sl_refcnt
== 1);
2175 if (class == MC_MBUF_BIGCL
) {
2176 nsp
= clsp
->sl_next
;
2177 /* Next slab must already be present */
2178 VERIFY(nsp
!= NULL
);
2179 VERIFY(nsp
->sl_refcnt
== 1);
2180 } else if (class == MC_MBUF_16KCL
) {
2182 for (nsp
= clsp
, k
= 1;
2183 k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2185 /* Next slab must already be present */
2186 VERIFY(nsp
!= NULL
);
2187 VERIFY(nsp
->sl_refcnt
== 1);
2191 mca
= mcl_audit_buf2mca(MC_CL
, cl
);
2192 if (class == MC_MBUF_CL
)
2193 size
= m_maxsize(MC_CL
);
2194 else if (class == MC_MBUF_BIGCL
)
2195 size
= m_maxsize(MC_BIGCL
);
2197 size
= m_maxsize(MC_16KCL
);
2198 mcl_audit_cluster(mca
, cl
, size
, alloc
, FALSE
);
2199 mcache_buffer_log(mca
, cl
, m_cache(class));
2201 mca
->mca_uflags
|= MB_COMP_INUSE
;
2203 mca
->mca_uflags
&= ~MB_COMP_INUSE
;
2204 lck_mtx_unlock(mbuf_mlock
);
2206 list
= list
->obj_next
;
2211 * Allocate some number of mbuf clusters and place on cluster freelist.
2214 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2218 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2219 vm_offset_t page = 0;
2220 mcache_audit_t *mca_list = NULL;
2221 mcache_obj_t *con_list = NULL;
2224 VERIFY(bufsize == m_maxsize(MC_CL) ||
2225 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2227 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2230 * Multiple threads may attempt to populate the cluster map one
2231 * after another. Since we drop the lock below prior to acquiring
2232 * the physical page(s), our view of the cluster map may no longer
2233 * be accurate, and we could end up over-committing the pages beyond
2234 * the maximum allowed for each class. To prevent it, this entire
2235 * operation (including the page mapping) is serialized.
2237 while (mb_clalloc_busy) {
2238 mb_clalloc_waiters++;
2239 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2240 (PZERO-1), "m_clalloc", NULL);
2241 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2244 /* We are busy now; tell everyone else to go away */
2245 mb_clalloc_busy = TRUE;
2248 * Honor the caller's wish to block or not block. We have a way
2249 * to grow the pool asynchronously using the mbuf worker thread.
2251 i
= m_howmany(num
, bufsize
);
2252 if (i
== 0 || (wait
& M_DONTWAIT
))
2255 lck_mtx_unlock(mbuf_mlock
);
2257 size
= round_page(i
* bufsize
);
2258 page
= kmem_mb_alloc(mb_map
, size
, large_buffer
);
2261 * If we did ask for "n" 16K physically contiguous chunks
2262 * and didn't get them, then please try again without this
2265 if (large_buffer
&& page
== 0)
2266 page
= kmem_mb_alloc(mb_map
, size
, 0);
2269 if (bufsize
<= m_maxsize(MC_BIGCL
)) {
2270 /* Try for 1 page if failed, only for 2KB/4KB request */
2272 page
= kmem_mb_alloc(mb_map
, size
, 0);
2276 lck_mtx_lock(mbuf_mlock
);
2281 VERIFY(IS_P2ALIGNED(page
, NBPG
));
2282 numpages
= size
/ NBPG
;
2284 /* If auditing is enabled, allocate the audit structures now */
2285 if (mclaudit
!= NULL
) {
2289 * Yes, I realize this is a waste of memory for clusters
2290 * that never get transformed into mbufs, as we may end
2291 * up with NMBPCL-1 unused audit structures per cluster.
2292 * But doing so tremendously simplifies the allocation
2293 * strategy, since at this point we are not holding the
2294 * mbuf lock and the caller is okay to be blocked. For
2295 * the case of big clusters, we allocate one structure
2296 * for each as we never turn them into mbufs.
2298 if (bufsize
== m_maxsize(MC_CL
)) {
2299 needed
= numpages
* 2 * NMBPCL
;
2301 i
= mcache_alloc_ext(mcl_audit_con_cache
,
2302 &con_list
, needed
, MCR_SLEEP
);
2304 VERIFY(con_list
!= NULL
&& i
== needed
);
2305 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2308 needed
= numpages
/ (M16KCLBYTES
/ NBPG
);
2311 i
= mcache_alloc_ext(mcache_audit_cache
,
2312 (mcache_obj_t
**)&mca_list
, needed
, MCR_SLEEP
);
2314 VERIFY(mca_list
!= NULL
&& i
== needed
);
2317 lck_mtx_lock(mbuf_mlock
);
2319 for (i
= 0; i
< numpages
; i
++, page
+= NBPG
) {
2320 ppnum_t offset
= ((char *)page
- (char *)mbutl
) / NBPG
;
2321 ppnum_t new_page
= pmap_find_phys(kernel_pmap
,
2325 * In the case of no mapper being available the following
2326 * code noops and returns the input page; if there is a
2327 * mapper the appropriate I/O page is returned.
2329 VERIFY(offset
< mcl_pages
);
2330 new_page
= IOMapperInsertPage(mcl_paddr_base
, offset
, new_page
);
2331 mcl_paddr
[offset
] = new_page
<< PGSHIFT
;
2333 /* Pattern-fill this fresh page */
2334 if (mclaudit
!= NULL
)
2335 mcache_set_pattern(MCACHE_FREE_PATTERN
,
2336 (caddr_t
)page
, NBPG
);
2338 if (bufsize
== m_maxsize(MC_CL
)) {
2339 union mcluster
*mcl
= (union mcluster
*)page
;
2341 /* 1st cluster in the page */
2343 if (mclaudit
!= NULL
)
2344 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2345 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2347 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2348 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2349 mcl
, mcl
, bufsize
, 0, 1);
2351 /* Insert this slab */
2352 slab_insert(sp
, MC_CL
);
2354 /* Update stats now since slab_get() drops the lock */
2355 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2356 m_infree(MC_MBUF_CL
);
2357 mbstat
.m_clusters
= ++m_total(MC_CL
);
2358 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2360 /* 2nd cluster in the page */
2361 sp
= slab_get(++mcl
);
2362 if (mclaudit
!= NULL
)
2363 mcl_audit_init(mcl
, &mca_list
, &con_list
,
2364 AUDIT_CONTENTS_SIZE
, NMBPCL
);
2366 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2367 slab_init(sp
, MC_CL
, SLF_MAPPED
,
2368 mcl
, mcl
, bufsize
, 0, 1);
2370 /* Insert this slab */
2371 slab_insert(sp
, MC_CL
);
2373 /* Update stats now since slab_get() drops the lock */
2374 mbstat
.m_clfree
= ++m_infree(MC_CL
) +
2375 m_infree(MC_MBUF_CL
);
2376 mbstat
.m_clusters
= ++m_total(MC_CL
);
2377 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2378 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2379 union mbigcluster
*mbc
= (union mbigcluster
*)page
;
2382 /* One for the entire page */
2384 if (mclaudit
!= NULL
)
2385 mcl_audit_init(mbc
, &mca_list
, NULL
, 0, 1);
2387 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2388 slab_init(sp
, MC_BIGCL
, SLF_MAPPED
,
2389 mbc
, mbc
, bufsize
, 0, 1);
2391 /* 2nd cluster's slab is part of the previous one */
2392 nsp
= slab_get(((union mcluster
*)page
) + 1);
2393 slab_init(nsp
, MC_BIGCL
, SLF_MAPPED
| SLF_PARTIAL
,
2394 mbc
, NULL
, 0, 0, 0);
2396 /* Insert this slab */
2397 slab_insert(sp
, MC_BIGCL
);
2399 /* Update stats now since slab_get() drops the lock */
2400 mbstat
.m_bigclfree
= ++m_infree(MC_BIGCL
) +
2401 m_infree(MC_MBUF_BIGCL
);
2402 mbstat
.m_bigclusters
= ++m_total(MC_BIGCL
);
2403 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2404 } else if ((i
% (M16KCLBYTES
/ NBPG
)) == 0) {
2405 union m16kcluster
*m16kcl
= (union m16kcluster
*)page
;
2410 /* One for the entire 16KB */
2411 sp
= slab_get(m16kcl
);
2412 if (mclaudit
!= NULL
)
2413 mcl_audit_init(m16kcl
, &mca_list
, NULL
, 0, 1);
2415 VERIFY(sp
->sl_refcnt
== 0 && sp
->sl_flags
== 0);
2416 slab_init(sp
, MC_16KCL
, SLF_MAPPED
,
2417 m16kcl
, m16kcl
, bufsize
, 0, 1);
2419 /* 2nd-8th cluster's slab is part of the first one */
2420 for (k
= 1; k
< (M16KCLBYTES
/ MCLBYTES
); k
++) {
2421 nsp
= slab_get(((union mcluster
*)page
) + k
);
2422 VERIFY(nsp
->sl_refcnt
== 0 &&
2423 nsp
->sl_flags
== 0);
2424 slab_init(nsp
, MC_16KCL
,
2425 SLF_MAPPED
| SLF_PARTIAL
,
2426 m16kcl
, NULL
, 0, 0, 0);
2429 /* Insert this slab */
2430 slab_insert(sp
, MC_16KCL
);
2432 /* Update stats now since slab_get() drops the lock */
2433 m_infree(MC_16KCL
)++;
2434 m_total(MC_16KCL
)++;
2435 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2438 VERIFY(mca_list
== NULL
&& con_list
== NULL
);
2440 /* We're done; let others enter */
2441 mb_clalloc_busy
= FALSE
;
2442 if (mb_clalloc_waiters
> 0) {
2443 mb_clalloc_waiters
= 0;
2444 wakeup(mb_clalloc_waitchan
);
2447 if (bufsize
== m_maxsize(MC_CL
))
2448 return (numpages
<< 1);
2449 else if (bufsize
== m_maxsize(MC_BIGCL
))
2452 VERIFY(bufsize
== m_maxsize(MC_16KCL
));
2453 return (numpages
/ (M16KCLBYTES
/ NBPG
));
2456 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2458 /* We're done; let others enter */
2459 mb_clalloc_busy
= FALSE
;
2460 if (mb_clalloc_waiters
> 0) {
2461 mb_clalloc_waiters
= 0;
2462 wakeup(mb_clalloc_waitchan
);
2466 * When non-blocking we kick a thread if we have to grow the
2467 * pool or if the number of free clusters is less than requested.
2469 if (bufsize
== m_maxsize(MC_CL
)) {
2472 * Remember total number of clusters needed
2475 i
+= m_total(MC_CL
);
2476 if (i
> mbuf_expand_mcl
) {
2477 mbuf_expand_mcl
= i
;
2478 if (mbuf_worker_ready
)
2479 wakeup((caddr_t
)&mbuf_worker_run
);
2483 if (m_infree(MC_CL
) >= num
)
2485 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
2488 * Remember total number of 4KB clusters needed
2491 i
+= m_total(MC_BIGCL
);
2492 if (i
> mbuf_expand_big
) {
2493 mbuf_expand_big
= i
;
2494 if (mbuf_worker_ready
)
2495 wakeup((caddr_t
)&mbuf_worker_run
);
2499 if (m_infree(MC_BIGCL
) >= num
)
2504 * Remember total number of 16KB clusters needed
2507 i
+= m_total(MC_16KCL
);
2508 if (i
> mbuf_expand_16k
) {
2509 mbuf_expand_16k
= i
;
2510 if (mbuf_worker_ready
)
2511 wakeup((caddr_t
)&mbuf_worker_run
);
2515 if (m_infree(MC_16KCL
) >= num
)
2522 * Populate the global freelist of the corresponding buffer class.
2525 freelist_populate(mbuf_class_t
class, unsigned int num
, int wait
)
2527 mcache_obj_t
*o
= NULL
;
2530 VERIFY(class == MC_MBUF
|| class == MC_CL
|| class == MC_BIGCL
||
2533 #if CONFIG_MBUF_NOEXPAND
2534 if ((mbstat
.m_mbufs
/ NMBPCL
) >= maxmbufcl
) {
2536 static int printonce
= 1;
2537 if (printonce
== 1) {
2539 printf("m_expand failed, allocated %ld out of %d "
2540 "clusters\n", mbstat
.m_mbufs
/ NMBPCL
,
2546 #endif /* CONFIG_MBUF_NOEXPAND */
2548 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2553 i
= m_clalloc(num
, wait
, m_maxsize(MC_CL
));
2555 /* Respect the 2K clusters minimum limit */
2556 if (m_total(MC_CL
) == m_maxlimit(MC_CL
) &&
2557 m_infree(MC_CL
) <= m_minlimit(MC_CL
)) {
2558 if (class != MC_CL
|| (wait
& MCR_COMP
))
2567 return (m_clalloc(num
, wait
, m_maxsize(class)) != 0);
2575 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2576 if ((o
= slab_alloc(MC_CL
, wait
)) != NULL
) {
2577 struct mbuf
*m
= (struct mbuf
*)o
;
2578 mcache_audit_t
*mca
= NULL
;
2579 mcl_slab_t
*sp
= slab_get(o
);
2581 VERIFY(slab_is_detached(sp
) &&
2582 (sp
->sl_flags
& (SLF_MAPPED
| SLF_PARTIAL
)) == SLF_MAPPED
);
2584 /* Make sure that the cluster is unmolested while in freelist */
2585 if (mclaudit
!= NULL
) {
2586 mca
= mcl_audit_buf2mca(MC_CL
, o
);
2587 mcache_audit_free_verify(mca
, o
, 0, m_maxsize(MC_CL
));
2590 /* Reinitialize it as an mbuf slab */
2591 slab_init(sp
, MC_MBUF
, sp
->sl_flags
, sp
->sl_base
, NULL
,
2592 sp
->sl_len
, 0, NMBPCL
);
2594 VERIFY(m
== (struct mbuf
*)sp
->sl_base
);
2595 VERIFY(sp
->sl_head
== NULL
);
2597 m_total(MC_MBUF
) += NMBPCL
;
2598 mbstat
.m_mbufs
= m_total(MC_MBUF
);
2599 m_infree(MC_MBUF
) += NMBPCL
;
2600 mtype_stat_add(MT_FREE
, NMBPCL
);
2605 * If auditing is enabled, construct the shadow mbuf
2606 * in the audit structure instead of the actual one.
2607 * mbuf_slab_audit() will take care of restoring the
2608 * contents after the integrity check.
2610 if (mclaudit
!= NULL
) {
2612 mca
= mcl_audit_buf2mca(MC_MBUF
,
2614 ms
= ((struct mbuf
*)mca
->mca_contents
);
2615 ms
->m_type
= MT_FREE
;
2617 m
->m_type
= MT_FREE
;
2619 m
->m_next
= sp
->sl_head
;
2620 sp
->sl_head
= (void *)m
++;
2623 /* Insert it into the mbuf class's slab list */
2624 slab_insert(sp
, MC_MBUF
);
2626 if ((i
= mb_waiters
) > 0)
2629 wakeup(mb_waitchan
);
2638 * (Inaccurately) check if it might be worth a trip back to the
2639 * mcache layer due the availability of objects there. We'll
2640 * end up back here if there's nothing up there.
2643 mbuf_cached_above(mbuf_class_t
class, int wait
)
2647 if (wait
& MCR_COMP
)
2648 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)) ||
2649 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2653 if (wait
& MCR_COMP
)
2654 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL
)));
2658 if (wait
& MCR_COMP
)
2659 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL
)));
2663 if (wait
& MCR_COMP
)
2664 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL
)));
2677 return (!mcache_bkt_isempty(m_cache(class)));
2681 * If possible, convert constructed objects to raw ones.
2684 mbuf_steal(mbuf_class_t
class, unsigned int num
)
2686 mcache_obj_t
*top
= NULL
;
2687 mcache_obj_t
**list
= &top
;
2688 unsigned int tot
= 0;
2690 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2702 /* Get the required number of constructed objects if possible */
2703 if (m_infree(class) > m_minlimit(class)) {
2704 tot
= cslab_alloc(class, &list
,
2705 MIN(num
, m_infree(class)));
2708 /* And destroy them to get back the raw objects */
2710 (void) cslab_free(class, top
, 1);
2718 return (tot
== num
);
2722 m_reclaim(mbuf_class_t
class, unsigned int num
, boolean_t comp
)
2726 lck_mtx_assert(mbuf_mlock
, LCK_MTX_ASSERT_OWNED
);
2728 VERIFY(m_total(MC_CL
) <= m_maxlimit(MC_CL
));
2729 VERIFY(m_total(MC_BIGCL
) <= m_maxlimit(MC_BIGCL
));
2730 VERIFY(m_total(MC_16KCL
) <= m_maxlimit(MC_16KCL
));
2733 * This logic can be made smarter; for now, simply mark
2734 * all other related classes as potential victims.
2738 m_wantpurge(MC_CL
)++;
2739 m_wantpurge(MC_MBUF_CL
)++;
2740 m_wantpurge(MC_MBUF_BIGCL
)++;
2744 m_wantpurge(MC_MBUF
)++;
2746 m_wantpurge(MC_MBUF_CL
)++;
2751 m_wantpurge(MC_MBUF_BIGCL
)++;
2756 m_wantpurge(MC_MBUF_16KCL
)++;
2765 * Run through each marked class and check if we really need to
2766 * purge (and therefore temporarily disable) the per-CPU caches
2767 * layer used by the class. If so, remember the classes since
2768 * we are going to drop the lock below prior to purging.
2770 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2771 if (m_wantpurge(m
) > 0) {
2774 * Try hard to steal the required number of objects
2775 * from the freelist of other mbuf classes. Only
2776 * purge and disable the per-CPU caches layer when
2777 * we don't have enough; it's the last resort.
2779 if (!mbuf_steal(m
, num
))
2784 lck_mtx_unlock(mbuf_mlock
);
2787 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2790 /* Sigh; we have no other choices but to ask mcache to purge */
2791 for (m
= 0; m
< NELEM(mbuf_table
); m
++) {
2792 if ((bmap
& (1 << m
)) &&
2793 mcache_purge_cache(m_cache(m
))) {
2794 lck_mtx_lock(mbuf_mlock
);
2797 lck_mtx_unlock(mbuf_mlock
);
2802 * Request mcache to reap extra elements from all of its caches;
2803 * note that all reaps are serialized and happen only at a fixed
2808 lck_mtx_lock(mbuf_mlock
);
2811 static inline struct mbuf *
2812 m_get_common(int wait, short type, int hdr)
2815 int mcflags = MSLEEPF(wait);
2817 /* Is this due to a non-blocking retry? If so, then try harder */
2818 if (mcflags & MCR_NOSLEEP)
2819 mcflags |= MCR_TRYHARD;
2821 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2823 MBUF_INIT(m, hdr, type);
2824 mtype_stat_inc(type);
2825 mtype_stat_dec(MT_FREE);
2827 if (hdr && mac_init_mbuf(m, wait) != 0) {
2831 #endif /* MAC_NET */
2837 * Space allocation routines; these are also available as macros
2838 * for critical paths.
2840 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2841 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2842 #define _M_RETRY(wait, type) _M_GET(wait, type)
2843 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2844 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2845 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
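/*
 * Editor's note: illustrative usage of the allocators defined above, not
 * part of the original file.  m_gethdr()/m_get() return NULL when the wait
 * flag is M_DONTWAIT and no mbuf is available, so callers must check the
 * result.  The function name is hypothetical.
 */
#if 0	/* illustrative only */
static struct mbuf *
example_get_and_release(void)
{
	struct mbuf *m;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (NULL);		/* allocation may fail without blocking */

	m->m_pkthdr.len = m->m_len = 0;	/* start with an empty packet */
	/* ... fill in data ... */
	(void) m_free(m);		/* returns m->m_next, NULL here */
	return (NULL);
}
#endif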
2848 m_get(int wait, int type)
2850 return (_M_GET(wait, type));
2854 m_gethdr(int wait, int type)
2856 return (_M_GETHDR(wait, type));
2860 m_retry(int wait, int type)
2862 return (_M_RETRY(wait, type));
2866 m_retryhdr(int wait, int type)
2868 return (_M_RETRYHDR(wait, type));
2872 m_getclr(int wait, int type)
2876 _MGET(m, wait, type);
2878 bzero(MTOD(m, caddr_t), MLEN);
2883 m_free(struct mbuf *m)
2885 struct mbuf *n = m->m_next;
2887 if (m->m_type == MT_FREE)
2888 panic("m_free: freeing an already freed mbuf");
2890 /* Free the aux data and tags if there is any */
2891 if (m->m_flags & M_PKTHDR) {
2892 m_tag_delete_chain(m, NULL);
2895 if (m->m_flags & M_EXT) {
2899 refcnt = m_decref(m);
2900 flags = MEXT_FLAGS(m);
2901 if (refcnt == 0 && flags == 0) {
2902 if (m->m_ext.ext_free == NULL) {
2903 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2904 } else if (m->m_ext.ext_free == m_bigfree) {
2905 mcache_free(m_cache(MC_BIGCL),
2907 } else if (m->m_ext.ext_free == m_16kfree) {
2908 mcache_free(m_cache(MC_16KCL),
2911 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2912 m->m_ext.ext_size, m->m_ext.ext_arg);
2914 mcache_free(ref_cache, MEXT_RFA(m));
2916 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2917 VERIFY(m->m_type != MT_FREE);
2919 mtype_stat_dec(m->m_type);
2920 mtype_stat_inc(MT_FREE);
2922 m->m_type = MT_FREE;
2925 m->m_next = m->m_nextpkt = NULL;
2927 /* "Free" into the intermediate cache */
2928 if (m->m_ext.ext_free == NULL) {
2929 mcache_free(m_cache(MC_MBUF_CL), m);
2930 } else if (m->m_ext.ext_free == m_bigfree) {
2931 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2933 VERIFY(m->m_ext.ext_free == m_16kfree);
2934 mcache_free(m_cache(MC_MBUF_16KCL), m);
2940 if (m->m_type != MT_FREE) {
2941 mtype_stat_dec(m->m_type);
2942 mtype_stat_inc(MT_FREE);
2945 m->m_type = MT_FREE;
2946 m->m_flags = m->m_len = 0;
2947 m->m_next = m->m_nextpkt = NULL;
2949 mcache_free(m_cache(MC_MBUF), m);
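/*
 * Editor's note: illustrative sketch, not part of the original file.
 * Because m_free() returns the next mbuf in the chain, an entire chain can
 * be released with a simple loop; m_freem(), later in this file, is
 * effectively this loop.  The function name is hypothetical.
 */
#if 0	/* illustrative only */
static void
example_free_chain(struct mbuf *m)
{
	while (m != NULL)
		m = m_free(m);	/* m_free() hands back m->m_next */
}
#endif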
2954 __private_extern__
struct mbuf
*
2955 m_clattach(struct mbuf
*m
, int type
, caddr_t extbuf
,
2956 void (*extfree
)(caddr_t
, u_int
, caddr_t
), u_int extsize
, caddr_t extarg
,
2959 struct ext_ref
*rfa
= NULL
;
2961 if (m
== NULL
&& (m
= _M_GETHDR(wait
, type
)) == NULL
)
2964 if (m
->m_flags
& M_EXT
) {
2968 refcnt
= m_decref(m
);
2969 flags
= MEXT_FLAGS(m
);
2970 if (refcnt
== 0 && flags
== 0) {
2971 if (m
->m_ext
.ext_free
== NULL
) {
2972 mcache_free(m_cache(MC_CL
), m
->m_ext
.ext_buf
);
2973 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2974 mcache_free(m_cache(MC_BIGCL
),
2976 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
2977 mcache_free(m_cache(MC_16KCL
),
2980 (*(m
->m_ext
.ext_free
))(m
->m_ext
.ext_buf
,
2981 m
->m_ext
.ext_size
, m
->m_ext
.ext_arg
);
2983 /* Re-use the reference structure */
2985 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
2986 VERIFY(m
->m_type
!= MT_FREE
);
2988 mtype_stat_dec(m
->m_type
);
2989 mtype_stat_inc(MT_FREE
);
2991 m
->m_type
= MT_FREE
;
2994 m
->m_next
= m
->m_nextpkt
= NULL
;
2995 /* "Free" into the intermediate cache */
2996 if (m
->m_ext
.ext_free
== NULL
) {
2997 mcache_free(m_cache(MC_MBUF_CL
), m
);
2998 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
2999 mcache_free(m_cache(MC_MBUF_BIGCL
), m
);
3001 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3002 mcache_free(m_cache(MC_MBUF_16KCL
), m
);
3005 * Allocate a new mbuf, since we didn't divorce
3006 * the composite mbuf + cluster pair above.
3008 if ((m
= _M_GETHDR(wait
, type
)) == NULL
)
3014 (rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
) {
3019 MEXT_INIT(m
, extbuf
, extsize
, extfree
, extarg
, rfa
, 1, 0);
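/*
 * Editor's note: hedged sketch, not part of the original file, of how a
 * caller might attach its own external buffer with m_clattach().  The
 * buffer, size, free routine, and argument names below are hypothetical,
 * and the final (wait-flag) parameter is assumed here because it is elided
 * from this extract.
 */
#if 0	/* illustrative only; my_buf_free/my_buf/my_arg are hypothetical */
static void
my_buf_free(caddr_t buf, u_int size, caddr_t arg)
{
	/* return the buffer to wherever it came from */
}

static struct mbuf *
example_attach_ext(caddr_t my_buf, u_int my_size, caddr_t my_arg)
{
	struct mbuf *m;

	/* last argument assumed to be the wait flag (elided above) */
	m = m_clattach(NULL, MT_DATA, my_buf, my_buf_free, my_size, my_arg,
	    M_DONTWAIT);
	return (m);	/* NULL on allocation failure */
}
#endif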
3025 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3026 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3029 m_getcl(int wait
, int type
, int flags
)
3032 int mcflags
= MSLEEPF(wait
);
3033 int hdr
= (flags
& M_PKTHDR
);
3035 /* Is this due to a non-blocking retry? If so, then try harder */
3036 if (mcflags
& MCR_NOSLEEP
)
3037 mcflags
|= MCR_TRYHARD
;
3039 m
= mcache_alloc(m_cache(MC_MBUF_CL
), mcflags
);
3041 MBUF_INIT(m
, hdr
, type
);
3042 mtype_stat_inc(type
);
3043 mtype_stat_dec(MT_FREE
);
3045 if (hdr
&& mac_init_mbuf(m
, wait
) != 0) {
3049 #endif /* MAC_NET */
3054 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3056 m_mclget(struct mbuf *m, int wait)
3058 struct ext_ref *rfa;
3060 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3063 m->m_ext.ext_buf = m_mclalloc(wait);
3064 if (m->m_ext.ext_buf != NULL) {
3065 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3067 mcache_free(ref_cache, rfa);
3072 /* Allocate an mbuf cluster */
3074 m_mclalloc(int wait)
3076 int mcflags = MSLEEPF(wait);
3078 /* Is this due to a non-blocking retry? If so, then try harder */
3079 if (mcflags & MCR_NOSLEEP)
3080 mcflags |= MCR_TRYHARD;
3082 return (mcache_alloc(m_cache(MC_CL), mcflags));
3085 /* Free an mbuf cluster */
3087 m_mclfree(caddr_t p)
3089 mcache_free(m_cache(MC_CL), p);
3093 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3097 m_mclhasreference(struct mbuf *m)
3099 if (!(m->m_flags & M_EXT))
3102 ASSERT(MEXT_RFA(m) != NULL);
3104 return (MEXT_REF(m) > 1);
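/*
 * Editor's note: illustrative sketch, not part of the original file, of the
 * usual pattern around m_mclget(): allocate an mbuf, try to attach a 2KB
 * cluster, and check M_EXT to see whether the attach succeeded;
 * m_mclhasreference() then reports whether the cluster is shared.  The
 * function name is hypothetical.
 */
#if 0	/* illustrative only */
static struct mbuf *
example_get_cluster_mbuf(void)
{
	struct mbuf *m;

	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
		return (NULL);

	m = m_mclget(m, M_DONTWAIT);
	if (!(m->m_flags & M_EXT)) {	/* cluster attach failed */
		(void) m_free(m);
		return (NULL);
	}
	VERIFY(!m_mclhasreference(m));	/* freshly attached, not shared */
	return (m);
}
#endif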
3107 __private_extern__ caddr_t
3108 m_bigalloc(int wait
)
3110 int mcflags
= MSLEEPF(wait
);
3112 /* Is this due to a non-blocking retry? If so, then try harder */
3113 if (mcflags
& MCR_NOSLEEP
)
3114 mcflags
|= MCR_TRYHARD
;
3116 return (mcache_alloc(m_cache(MC_BIGCL
), mcflags
));
3119 __private_extern__
void
3120 m_bigfree(caddr_t p
, __unused u_int size
, __unused caddr_t arg
)
3122 mcache_free(m_cache(MC_BIGCL
), p
);
3125 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3126 __private_extern__
struct mbuf
*
3127 m_mbigget(struct mbuf
*m
, int wait
)
3129 struct ext_ref
*rfa
;
3131 if ((rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
)
3134 m
->m_ext
.ext_buf
= m_bigalloc(wait
);
3135 if (m
->m_ext
.ext_buf
!= NULL
) {
3136 MBUF_BIGCL_INIT(m
, m
->m_ext
.ext_buf
, rfa
, 1, 0);
3138 mcache_free(ref_cache
, rfa
);
3143 __private_extern__ caddr_t
3144 m_16kalloc(int wait
)
3146 int mcflags
= MSLEEPF(wait
);
3148 /* Is this due to a non-blocking retry? If so, then try harder */
3149 if (mcflags
& MCR_NOSLEEP
)
3150 mcflags
|= MCR_TRYHARD
;
3152 return (mcache_alloc(m_cache(MC_16KCL
), mcflags
));
3155 __private_extern__
void
3156 m_16kfree(caddr_t p
, __unused u_int size
, __unused caddr_t arg
)
3158 mcache_free(m_cache(MC_16KCL
), p
);
3161 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3162 __private_extern__
struct mbuf
*
3163 m_m16kget(struct mbuf
*m
, int wait
)
3165 struct ext_ref
*rfa
;
3167 if ((rfa
= mcache_alloc(ref_cache
, MSLEEPF(wait
))) == NULL
)
3170 m
->m_ext
.ext_buf
= m_16kalloc(wait
);
3171 if (m
->m_ext
.ext_buf
!= NULL
) {
3172 MBUF_16KCL_INIT(m
, m
->m_ext
.ext_buf
, rfa
, 1, 0);
3174 mcache_free(ref_cache
, rfa
);
3180 * "Move" mbuf pkthdr from "from" to "to".
3181 * "from" must have M_PKTHDR set, and "to" must be empty.
3184 m_copy_pkthdr(struct mbuf
*to
, struct mbuf
*from
)
3186 /* We will be taking over the tags of 'to' */
3187 if (to
->m_flags
& M_PKTHDR
)
3188 m_tag_delete_chain(to
, NULL
);
3189 to
->m_pkthdr
= from
->m_pkthdr
; /* especially tags */
3190 m_tag_init(from
); /* purge tags from src */
3191 m_prio_init(from
); /* reset priority from src */
3192 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3193 if ((to
->m_flags
& M_EXT
) == 0)
3194 to
->m_data
= to
->m_pktdat
;
3198 * Duplicate "from"'s mbuf pkthdr in "to".
3199 * "from" must have M_PKTHDR set, and "to" must be empty.
3200 * In particular, this does a deep copy of the packet tags.
3203 m_dup_pkthdr(struct mbuf
*to
, struct mbuf
*from
, int how
)
3205 if (to
->m_flags
& M_PKTHDR
)
3206 m_tag_delete_chain(to
, NULL
);
3207 to
->m_flags
= (from
->m_flags
& M_COPYFLAGS
) | (to
->m_flags
& M_EXT
);
3208 if ((to
->m_flags
& M_EXT
) == 0)
3209 to
->m_data
= to
->m_pktdat
;
3210 to
->m_pkthdr
= from
->m_pkthdr
;
3212 return (m_tag_copy_chain(to
, from
, how
));
3216 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3217 * if wantall is not set, return whatever number were available. Set up the
3218 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3219 * are chained on the m_nextpkt field. Any packets requested beyond this
3220 * are chained onto the last packet header's m_next field. The size of
3221 * the cluster is controlled by the parameter bufsize.
3223 __private_extern__
struct mbuf
*
3224 m_getpackets_internal(unsigned int *num_needed
, int num_with_pkthdrs
,
3225 int wait
, int wantall
, size_t bufsize
)
3228 struct mbuf
**np
, *top
;
3229 unsigned int pnum
, needed
= *num_needed
;
3230 mcache_obj_t
*mp_list
= NULL
;
3231 int mcflags
= MSLEEPF(wait
);
3233 struct ext_ref
*rfa
;
3237 ASSERT(bufsize
== m_maxsize(MC_CL
) ||
3238 bufsize
== m_maxsize(MC_BIGCL
) ||
3239 bufsize
== m_maxsize(MC_16KCL
));
3242 * Caller must first check for njcl because this
3243 * routine is internal and not exposed/used via KPI.
3245 VERIFY(bufsize
!= m_maxsize(MC_16KCL
) || njcl
> 0);
3252 * The caller doesn't want all the requested buffers; only some.
3253 * Try hard to get what we can, but don't block. This effectively
3254 * overrides MCR_SLEEP, since this thread will not go to sleep
3255 * if we can't get all the buffers.
3257 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3258 mcflags
|= MCR_TRYHARD
;
3260 /* Allocate the composite mbuf + cluster elements from the cache */
3261 if (bufsize
== m_maxsize(MC_CL
))
3262 cp
= m_cache(MC_MBUF_CL
);
3263 else if (bufsize
== m_maxsize(MC_BIGCL
))
3264 cp
= m_cache(MC_MBUF_BIGCL
);
3266 cp
= m_cache(MC_MBUF_16KCL
);
3267 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
, mcflags
);
3269 for (pnum
= 0; pnum
< needed
; pnum
++) {
3270 m
= (struct mbuf
*)mp_list
;
3271 mp_list
= mp_list
->obj_next
;
3273 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3274 cl
= m
->m_ext
.ext_buf
;
3277 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3278 VERIFY(MBUF_IS_COMPOSITE(m
));
3280 flag
= MEXT_FLAGS(m
);
3282 MBUF_INIT(m
, num_with_pkthdrs
, MT_DATA
);
3283 if (bufsize
== m_maxsize(MC_16KCL
)) {
3284 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3285 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3286 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3288 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3291 if (num_with_pkthdrs
> 0) {
3294 if (mac_mbuf_label_init(m
, wait
) != 0) {
3298 #endif /* MAC_NET */
3302 if (num_with_pkthdrs
> 0)
3307 ASSERT(pnum
!= *num_needed
|| mp_list
== NULL
);
3308 if (mp_list
!= NULL
)
3309 mcache_free_ext(cp
, mp_list
);
3312 mtype_stat_add(MT_DATA
, pnum
);
3313 mtype_stat_sub(MT_FREE
, pnum
);
3316 if (wantall
&& (pnum
!= *num_needed
)) {
3327 * Return list of mbuf linked by m_nextpkt. Try for numlist, and if
3328 * wantall is not set, return whatever number were available. The size of
3329 * each mbuf in the list is controlled by the parameter packetlen. Each
3330 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
3331 * in the chain is called a segment. If maxsegments is not null and the
3332 * value pointed to is not null, this specifies the maximum number of segments
3333 * for a chain of mbufs. If maxsegments is zero or the value pointed to
3334 * is zero the caller does not have any restriction on the number of segments.
3335 * The actual number of segments of an mbuf chain is returned in the value
3336 * pointed to by maxsegments.
3338 __private_extern__
struct mbuf
*
3339 m_allocpacket_internal(unsigned int *numlist
, size_t packetlen
,
3340 unsigned int *maxsegments
, int wait
, int wantall
, size_t wantsize
)
3342 struct mbuf
**np
, *top
, *first
= NULL
;
3343 size_t bufsize
, r_bufsize
;
3344 unsigned int num
= 0;
3345 unsigned int nsegs
= 0;
3346 unsigned int needed
, resid
;
3347 int mcflags
= MSLEEPF(wait
);
3348 mcache_obj_t
*mp_list
= NULL
, *rmp_list
= NULL
;
3349 mcache_t
*cp
= NULL
, *rcp
= NULL
;
3357 if (wantsize
== 0) {
3358 if (packetlen
<= MINCLSIZE
) {
3359 bufsize
= packetlen
;
3360 } else if (packetlen
> m_maxsize(MC_CL
)) {
3361 /* Use 4KB if jumbo cluster pool isn't available */
3362 if (packetlen
<= m_maxsize(MC_BIGCL
) || njcl
== 0)
3363 bufsize
= m_maxsize(MC_BIGCL
);
3365 bufsize
= m_maxsize(MC_16KCL
);
3367 bufsize
= m_maxsize(MC_CL
);
3369 } else if (wantsize
== m_maxsize(MC_CL
) ||
3370 wantsize
== m_maxsize(MC_BIGCL
) ||
3371 (wantsize
== m_maxsize(MC_16KCL
) && njcl
> 0)) {
3377 if (bufsize
<= MHLEN
) {
3379 } else if (bufsize
<= MINCLSIZE
) {
3380 if (maxsegments
!= NULL
&& *maxsegments
== 1) {
3381 bufsize
= m_maxsize(MC_CL
);
3386 } else if (bufsize
== m_maxsize(MC_16KCL
)) {
3388 nsegs
= ((packetlen
- 1) >> (PGSHIFT
+ 2)) + 1;
3389 } else if (bufsize
== m_maxsize(MC_BIGCL
)) {
3390 nsegs
= ((packetlen
- 1) >> PGSHIFT
) + 1;
3392 nsegs
= ((packetlen
- 1) >> MCLSHIFT
) + 1;
3394 if (maxsegments
!= NULL
) {
3395 if (*maxsegments
&& nsegs
> *maxsegments
) {
3396 *maxsegments
= nsegs
;
3399 *maxsegments
= nsegs
;
3403 * The caller doesn't want all the requested buffers; only some.
3404 * Try hard to get what we can, but don't block. This effectively
3405 * overrides MCR_SLEEP, since this thread will not go to sleep
3406 * if we can't get all the buffers.
3408 if (!wantall
|| (mcflags
& MCR_NOSLEEP
))
3409 mcflags
|= MCR_TRYHARD
;
3412 * Simple case where all elements in the lists/chains are mbufs.
3413 * Unless bufsize is greater than MHLEN, each segment chain is made
3414 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3415 * of 2 mbufs; the second one is used for the residual data, i.e.
3416 * the remaining data that cannot fit into the first mbuf.
3418 if (bufsize
<= MINCLSIZE
) {
3419 /* Allocate the elements in one shot from the mbuf cache */
3420 ASSERT(bufsize
<= MHLEN
|| nsegs
== 2);
3421 cp
= m_cache(MC_MBUF
);
3422 needed
= mcache_alloc_ext(cp
, &mp_list
,
3423 (*numlist
) * nsegs
, mcflags
);
3426 * The number of elements must be even if we are to use an
3427 * mbuf (instead of a cluster) to store the residual data.
3428 * If we couldn't allocate the requested number of mbufs,
3429 * trim the number down (if it's odd) in order to avoid
3430 * creating a partial segment chain.
3432 if (bufsize
> MHLEN
&& (needed
& 0x1))
3435 while (num
< needed
) {
3438 m
= (struct mbuf
*)mp_list
;
3439 mp_list
= mp_list
->obj_next
;
3442 MBUF_INIT(m
, 1, MT_DATA
);
3444 if (mac_init_mbuf(m
, wait
) != 0) {
3448 #endif /* MAC_NET */
3450 if (bufsize
> MHLEN
) {
3451 /* A second mbuf for this segment chain */
3452 m
->m_next
= (struct mbuf
*)mp_list
;
3453 mp_list
= mp_list
->obj_next
;
3454 ASSERT(m
->m_next
!= NULL
);
3456 MBUF_INIT(m
->m_next
, 0, MT_DATA
);
3462 ASSERT(num
!= *numlist
|| mp_list
== NULL
);
3465 mtype_stat_add(MT_DATA
, num
);
3466 mtype_stat_sub(MT_FREE
, num
);
3470 /* We've got them all; return to caller */
3471 if (num
== *numlist
)
3478 * Complex cases where elements are made up of one or more composite
3479 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3480 * be illustrated as follows:
3482 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3484 * Every composite mbuf + cluster element comes from the intermediate
3485 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3486 * the last composite element will come from the MC_MBUF_CL cache,
3487 * unless the residual data is larger than 2KB where we use the
3488 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3489 * data is defined as extra data beyond the first element that cannot
3490 * fit into the previous element, i.e. there is no residual data if
3491 * the chain only has 1 segment.
3493 r_bufsize
= bufsize
;
3494 resid
= packetlen
> bufsize
? packetlen
% bufsize
: 0;
3496 /* There is residual data; figure out the cluster size */
3497 if (wantsize
== 0 && packetlen
> MINCLSIZE
) {
3499 * Caller didn't request that all of the segments
3500 * in the chain use the same cluster size; use the
3501 * smaller of the cluster sizes.
3503 if (njcl
> 0 && resid
> m_maxsize(MC_BIGCL
))
3504 r_bufsize
= m_maxsize(MC_16KCL
);
3505 else if (resid
> m_maxsize(MC_CL
))
3506 r_bufsize
= m_maxsize(MC_BIGCL
);
3508 r_bufsize
= m_maxsize(MC_CL
);
3510 /* Use the same cluster size as the other segments */
3518 * Attempt to allocate composite mbuf + cluster elements for
3519 * the residual data in each chain; record the number of such
3520 * elements that can be allocated so that we know how many
3521 * segment chains we can afford to create.
3523 if (r_bufsize
<= m_maxsize(MC_CL
))
3524 rcp
= m_cache(MC_MBUF_CL
);
3525 else if (r_bufsize
<= m_maxsize(MC_BIGCL
))
3526 rcp
= m_cache(MC_MBUF_BIGCL
);
3528 rcp
= m_cache(MC_MBUF_16KCL
);
3529 needed
= mcache_alloc_ext(rcp
, &rmp_list
, *numlist
, mcflags
);
3534 /* This is temporarily reduced for calculation */
3540 * Attempt to allocate the rest of the composite mbuf + cluster
3541 * elements for the number of segment chains that we need.
3543 if (bufsize
<= m_maxsize(MC_CL
))
3544 cp
= m_cache(MC_MBUF_CL
);
3545 else if (bufsize
<= m_maxsize(MC_BIGCL
))
3546 cp
= m_cache(MC_MBUF_BIGCL
);
3548 cp
= m_cache(MC_MBUF_16KCL
);
3549 needed
= mcache_alloc_ext(cp
, &mp_list
, needed
* nsegs
, mcflags
);
3551 /* Round it down to avoid creating a partial segment chain */
3552 needed
= (needed
/ nsegs
) * nsegs
;
3558 * We're about to construct the chain(s); take into account
3559 * the number of segments we have created above to hold the
3560 * residual data for each chain, as well as restore the
3561 * original count of segments per chain.
3564 needed
+= needed
/ nsegs
;
3571 struct ext_ref
*rfa
;
3576 if (nsegs
== 1 || (num
% nsegs
) != 0 || resid
== 0) {
3577 m
= (struct mbuf
*)mp_list
;
3578 mp_list
= mp_list
->obj_next
;
3580 m
= (struct mbuf
*)rmp_list
;
3581 rmp_list
= rmp_list
->obj_next
;
3584 VERIFY(m
->m_type
== MT_FREE
&& m
->m_flags
== M_EXT
);
3585 VERIFY(m
->m_ext
.ext_free
== NULL
||
3586 m
->m_ext
.ext_free
== m_bigfree
||
3587 m
->m_ext
.ext_free
== m_16kfree
);
3589 cl
= m
->m_ext
.ext_buf
;
3592 ASSERT(cl
!= NULL
&& rfa
!= NULL
);
3593 VERIFY(MBUF_IS_COMPOSITE(m
));
3595 flag
= MEXT_FLAGS(m
);
3597 pkthdr
= (nsegs
== 1 || (num
% nsegs
) == 1);
3600 MBUF_INIT(m
, pkthdr
, MT_DATA
);
3601 if (m
->m_ext
.ext_free
== m_16kfree
) {
3602 MBUF_16KCL_INIT(m
, cl
, rfa
, 1, flag
);
3603 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3604 MBUF_BIGCL_INIT(m
, cl
, rfa
, 1, flag
);
3606 MBUF_CL_INIT(m
, cl
, rfa
, 1, flag
);
3609 if (pkthdr
&& mac_init_mbuf(m
, wait
) != 0) {
3614 #endif /* MAC_NET */
3617 if ((num
% nsegs
) == 0)
3618 np
= &first
->m_nextpkt
;
3627 mtype_stat_add(MT_DATA
, num
);
3628 mtype_stat_sub(MT_FREE
, num
);
3633 /* We've got them all; return to caller */
3634 if (num
== *numlist
) {
3635 ASSERT(mp_list
== NULL
&& rmp_list
== NULL
);
3640 /* Free up what's left of the above */
3641 if (mp_list
!= NULL
)
3642 mcache_free_ext(cp
, mp_list
);
3643 if (rmp_list
!= NULL
)
3644 mcache_free_ext(rcp
, rmp_list
);
3645 if (wantall
&& top
!= NULL
) {
3654 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3655 * packets on receive ring.
3657 __private_extern__
struct mbuf
*
3658 m_getpacket_how(int wait
)
3660 unsigned int num_needed
= 1;
3662 return (m_getpackets_internal(&num_needed
, 1, wait
, 1,
3667 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3668 * packets on receive ring.
3673 unsigned int num_needed
= 1;
3675 return (m_getpackets_internal(&num_needed
, 1, M_WAIT
, 1,
3680 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3681 * if this can't be met, return whatever number were available. Set up the
3682 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3683 * are chained on the m_nextpkt field. Any packets requested beyond this are
3684 * chained onto the last packet header's m_next field.
3687 m_getpackets(int num_needed
, int num_with_pkthdrs
, int how
)
3689 unsigned int n
= num_needed
;
3691 return (m_getpackets_internal(&n
, num_with_pkthdrs
, how
, 0,
3696 * Return a list of mbuf hdrs set up as packet hdrs chained together
3697 * on the m_nextpkt field
3700 m_getpackethdrs(int num_needed
, int how
)
3703 struct mbuf
**np
, *top
;
3708 while (num_needed
--) {
3709 m
= _M_RETRYHDR(how
, MT_DATA
);
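/*
 * Editor's note: illustrative sketch, not part of the original file, of how
 * a driver might use m_getpackets() as documented above: ask for a batch of
 * cluster-backed packet headers chained on m_nextpkt, and accept fewer than
 * requested.  The ring-fill details and function name are hypothetical.
 */
#if 0	/* illustrative only */
static unsigned int
example_fill_rx_ring(unsigned int slots)
{
	struct mbuf *list, *m;
	unsigned int filled = 0;

	/* all packets get pkthdrs; may return a shorter list than asked */
	list = m_getpackets((int)slots, (int)slots, M_DONTWAIT);
	while (list != NULL) {
		m = list;
		list = list->m_nextpkt;
		m->m_nextpkt = NULL;
		/* ... hand m to a hardware receive descriptor ... */
		filled++;
	}
	return (filled);
}
#endif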
3721 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3722 * for mbufs packets freed. Used by the drivers.
3725 m_freem_list(struct mbuf
*m
)
3727 struct mbuf
*nextpkt
;
3728 mcache_obj_t
*mp_list
= NULL
;
3729 mcache_obj_t
*mcl_list
= NULL
;
3730 mcache_obj_t
*mbc_list
= NULL
;
3731 mcache_obj_t
*m16k_list
= NULL
;
3732 mcache_obj_t
*m_mcl_list
= NULL
;
3733 mcache_obj_t
*m_mbc_list
= NULL
;
3734 mcache_obj_t
*m_m16k_list
= NULL
;
3735 mcache_obj_t
*ref_list
= NULL
;
3737 int mt_free
= 0, mt_data
= 0, mt_header
= 0, mt_soname
= 0, mt_tag
= 0;
3742 nextpkt
= m
->m_nextpkt
;
3743 m
->m_nextpkt
= NULL
;
3746 struct mbuf
*next
= m
->m_next
;
3747 mcache_obj_t
*o
, *rfa
;
3748 u_int32_t refcnt
, flags
;
3750 if (m
->m_type
== MT_FREE
)
3751 panic("m_free: freeing an already freed mbuf");
3753 if (m
->m_type
!= MT_FREE
)
3756 if (m
->m_flags
& M_PKTHDR
) {
3757 m_tag_delete_chain(m
, NULL
);
3760 if (!(m
->m_flags
& M_EXT
))
3763 o
= (mcache_obj_t
*)m
->m_ext
.ext_buf
;
3764 refcnt
= m_decref(m
);
3765 flags
= MEXT_FLAGS(m
);
3766 if (refcnt
== 0 && flags
== 0) {
3767 if (m
->m_ext
.ext_free
== NULL
) {
3768 o
->obj_next
= mcl_list
;
3770 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3771 o
->obj_next
= mbc_list
;
3773 } else if (m
->m_ext
.ext_free
== m_16kfree
) {
3774 o
->obj_next
= m16k_list
;
3777 (*(m
->m_ext
.ext_free
))((caddr_t
)o
,
3781 rfa
= (mcache_obj_t
*)MEXT_RFA(m
);
3782 rfa
->obj_next
= ref_list
;
3785 } else if (refcnt
== 0 && (flags
& EXTF_COMPOSITE
)) {
3786 VERIFY(m
->m_type
!= MT_FREE
);
3788 * Amortize the costs of atomic operations
3789 * by doing them at the end, if possible.
3791 if (m
->m_type
== MT_DATA
)
3793 else if (m
->m_type
== MT_HEADER
)
3795 else if (m
->m_type
== MT_SONAME
)
3797 else if (m
->m_type
== MT_TAG
)
3800 mtype_stat_dec(m
->m_type
);
3802 m
->m_type
= MT_FREE
;
3805 m
->m_next
= m
->m_nextpkt
= NULL
;
3807 /* "Free" into the intermediate cache */
3808 o
= (mcache_obj_t
*)m
;
3809 if (m
->m_ext
.ext_free
== NULL
) {
3810 o
->obj_next
= m_mcl_list
;
3812 } else if (m
->m_ext
.ext_free
== m_bigfree
) {
3813 o
->obj_next
= m_mbc_list
;
3816 VERIFY(m
->m_ext
.ext_free
== m_16kfree
);
3817 o
->obj_next
= m_m16k_list
;
3825 * Amortize the costs of atomic operations
3826 * by doing them at the end, if possible.
3828 if (m
->m_type
== MT_DATA
)
3830 else if (m
->m_type
== MT_HEADER
)
3832 else if (m
->m_type
== MT_SONAME
)
3834 else if (m
->m_type
== MT_TAG
)
3836 else if (m
->m_type
!= MT_FREE
)
3837 mtype_stat_dec(m
->m_type
);
3839 m
->m_type
= MT_FREE
;
3840 m
->m_flags
= m
->m_len
= 0;
3841 m
->m_next
= m
->m_nextpkt
= NULL
;
3843 ((mcache_obj_t
*)m
)->obj_next
= mp_list
;
3844 mp_list
= (mcache_obj_t
*)m
;
3853 mtype_stat_add(MT_FREE
, mt_free
);
3855 mtype_stat_sub(MT_DATA
, mt_data
);
3857 mtype_stat_sub(MT_HEADER
, mt_header
);
3859 mtype_stat_sub(MT_SONAME
, mt_soname
);
3861 mtype_stat_sub(MT_TAG
, mt_tag
);
3863 if (mp_list
!= NULL
)
3864 mcache_free_ext(m_cache(MC_MBUF
), mp_list
);
3865 if (mcl_list
!= NULL
)
3866 mcache_free_ext(m_cache(MC_CL
), mcl_list
);
3867 if (mbc_list
!= NULL
)
3868 mcache_free_ext(m_cache(MC_BIGCL
), mbc_list
);
3869 if (m16k_list
!= NULL
)
3870 mcache_free_ext(m_cache(MC_16KCL
), m16k_list
);
3871 if (m_mcl_list
!= NULL
)
3872 mcache_free_ext(m_cache(MC_MBUF_CL
), m_mcl_list
);
3873 if (m_mbc_list
!= NULL
)
3874 mcache_free_ext(m_cache(MC_MBUF_BIGCL
), m_mbc_list
);
3875 if (m_m16k_list
!= NULL
)
3876 mcache_free_ext(m_cache(MC_MBUF_16KCL
), m_m16k_list
);
3877 if (ref_list
!= NULL
)
3878 mcache_free_ext(ref_cache
, ref_list
);
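/*
 * Editor's note: illustrative sketch, not part of the original file.
 * m_freem_list() frees a batch of packets linked through m_nextpkt in one
 * pass, which is why drivers accumulate completed transmit packets into such
 * a list instead of calling m_freem() once per packet.  The function name
 * is hypothetical.
 */
#if 0	/* illustrative only */
static void
example_complete_tx(struct mbuf *done_pkts[], int count)
{
	struct mbuf *head = NULL, **tail = &head;
	int i;

	for (i = 0; i < count; i++) {
		*tail = done_pkts[i];		/* chain on m_nextpkt */
		tail = &done_pkts[i]->m_nextpkt;
	}
	*tail = NULL;
	if (head != NULL)
		(void) m_freem_list(head);	/* count of mbufs freed */
}
#endif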
3884 m_freem(struct mbuf *m)
3891 * Mbuffer utility routines.
3895 * Compute the amount of space available before the current start
3896 * of data in an mbuf.
3899 m_leadingspace(struct mbuf *m)
3901 if (m->m_flags & M_EXT) {
3902 if (MCLHASREFERENCE(m))
3904 return (m->m_data - m->m_ext.ext_buf);
3906 if (m->m_flags & M_PKTHDR)
3907 return (m->m_data - m->m_pktdat);
3908 return (m->m_data - m->m_dat);
3912 * Compute the amount of space available after the end of data in an mbuf.
3915 m_trailingspace(struct mbuf *m)
3917 if (m->m_flags & M_EXT) {
3918 if (MCLHASREFERENCE(m))
3920 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3921 (m->m_data + m->m_len));
3923 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
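/*
 * Editor's note: illustrative sketch, not part of the original file, of how
 * m_trailingspace() gates in-place edits: append into existing room at the
 * end of the mbuf when there is enough, otherwise tell the caller to grow
 * the chain.  The function name is hypothetical.
 */
#if 0	/* illustrative only */
static int
example_append_bytes(struct mbuf *m, caddr_t src, int len)
{
	if (m_trailingspace(m) < len)
		return (0);		/* caller must grow the chain */
	bcopy(src, MTOD(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	return (1);
}
#endif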
3927 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3928 * copy junk along. Does not adjust packet header length.
3931 m_prepend(struct mbuf *m, int len, int how)
3935 _MGET(mn, how, m->m_type);
3940 if (m->m_flags & M_PKTHDR) {
3941 M_COPY_PKTHDR(mn, m);
3942 m->m_flags &= ~M_PKTHDR;
3953 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3954 * chain, copy junk along, and adjust length.
3957 m_prepend_2(struct mbuf *m, int len, int how)
3959 if (M_LEADINGSPACE(m) >= len) {
3963 m = m_prepend(m, len, how);
3965 if ((m) && (m->m_flags & M_PKTHDR))
3966 m->m_pkthdr.len += len;
3971 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3972 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
3973 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3978 m_copym(struct mbuf
*m
, int off0
, int len
, int wait
)
3980 struct mbuf
*n
, *mhdr
= NULL
, **np
;
3985 if (off
< 0 || len
< 0)
3986 panic("m_copym: invalid offset %d or len %d", off
, len
);
3988 if (off
== 0 && (m
->m_flags
& M_PKTHDR
)) {
3993 while (off
>= m
->m_len
) {
3994 if (m
->m_next
== NULL
)
3995 panic("m_copym: invalid mbuf chain");
4004 if (len
!= M_COPYALL
)
4005 panic("m_copym: len != M_COPYALL");
4009 n
= _M_RETRY(wait
, m
->m_type
);
4016 M_COPY_PKTHDR(n
, mhdr
);
4017 if (len
== M_COPYALL
)
4018 n
->m_pkthdr
.len
-= off0
;
4020 n
->m_pkthdr
.len
= len
;
4023 if (len
== M_COPYALL
) {
4024 if (MIN(len
, (m
->m_len
- off
)) == len
) {
4025 printf("m->m_len %d - off %d = %d, %d\n",
4026 m
->m_len
, off
, m
->m_len
- off
,
4027 MIN(len
, (m
->m_len
- off
)));
4030 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4031 if (n
->m_len
== M_COPYALL
) {
4032 printf("n->m_len == M_COPYALL, fixing\n");
4035 if (m
->m_flags
& M_EXT
) {
4036 n
->m_ext
= m
->m_ext
;
4038 n
->m_data
= m
->m_data
+ off
;
4039 n
->m_flags
|= M_EXT
;
4041 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4042 (unsigned)n
->m_len
);
4044 if (len
!= M_COPYALL
)
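/*
 * Editor's note: illustrative sketch, not part of the original file.
 * m_copym() produces a copy that shares cluster storage with the source
 * (note the M_EXT branch above), so it is the cheap way to retransmit or
 * inspect part of a chain without duplicating the data.  The function name
 * is hypothetical.
 */
#if 0	/* illustrative only */
static struct mbuf *
example_copy_prefix(struct mbuf *m, int len)
{
	/* copy the first len bytes; M_COPYALL would copy the whole chain */
	return (m_copym(m, 0, len, M_DONTWAIT));	/* NULL if out of mbufs */
}
#endif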
4063 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4064 * within this routine also, the last mbuf and offset accessed are passed
4065 * out and can be passed back in to avoid having to rescan the entire mbuf
4066 * list (normally hung off of the socket)
4069 m_copym_with_hdrs(struct mbuf
*m
, int off0
, int len0
, int wait
,
4070 struct mbuf
**m_last
, int *m_off
)
4072 struct mbuf
*n
, **np
= NULL
;
4073 int off
= off0
, len
= len0
;
4074 struct mbuf
*top
= NULL
;
4075 int mcflags
= MSLEEPF(wait
);
4078 mcache_obj_t
*list
= NULL
;
4081 if (off
== 0 && (m
->m_flags
& M_PKTHDR
))
4084 if (*m_last
!= NULL
) {
4088 while (off
>= m
->m_len
) {
4098 len
-= MIN(len
, (n
->m_len
- ((needed
== 1) ? off
: 0)));
4105 * If the caller doesn't want to be put to sleep, mark it with
4106 * MCR_TRYHARD so that we may reclaim buffers from other places
4109 if (mcflags
& MCR_NOSLEEP
)
4110 mcflags
|= MCR_TRYHARD
;
4112 if (mcache_alloc_ext(m_cache(MC_MBUF
), &list
, needed
,
4118 n
= (struct mbuf
*)list
;
4119 list
= list
->obj_next
;
4120 ASSERT(n
!= NULL
&& m
!= NULL
);
4122 type
= (top
== NULL
) ? MT_HEADER
: m
->m_type
;
4123 MBUF_INIT(n
, (top
== NULL
), type
);
4125 if (top
== NULL
&& mac_mbuf_label_init(n
, wait
) != 0) {
4126 mtype_stat_inc(MT_HEADER
);
4127 mtype_stat_dec(MT_FREE
);
4131 #endif /* MAC_NET */
4143 M_COPY_PKTHDR(n
, m
);
4144 n
->m_pkthdr
.len
= len
;
4147 n
->m_len
= MIN(len
, (m
->m_len
- off
));
4149 if (m
->m_flags
& M_EXT
) {
4150 n
->m_ext
= m
->m_ext
;
4152 n
->m_data
= m
->m_data
+ off
;
4153 n
->m_flags
|= M_EXT
;
4155 bcopy(MTOD(m
, caddr_t
)+off
, MTOD(n
, caddr_t
),
4156 (unsigned)n
->m_len
);
4161 if ((off
+ n
->m_len
) == m
->m_len
) {
4162 *m_last
= m
->m_next
;
4166 *m_off
= off
+ n
->m_len
;
4175 mtype_stat_inc(MT_HEADER
);
4176 mtype_stat_add(type
, needed
);
4177 mtype_stat_sub(MT_FREE
, needed
+ 1);
4179 ASSERT(list
== NULL
);
4184 mcache_free_ext(m_cache(MC_MBUF
), list
);
4192 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4193 * continuing for "len" bytes, into the indicated buffer.
4196 m_copydata(struct mbuf *m, int off, int len, void *vp)
4201 if (off < 0 || len < 0)
4202 panic("m_copydata: invalid offset %d or len %d", off, len);
4206 panic("m_copydata: invalid mbuf chain");
4214 panic("m_copydata: invalid mbuf chain");
4215 count = MIN(m->m_len - off, len);
4216 bcopy(MTOD(m, caddr_t) + off, cp, count);
4225 * Concatenate mbuf chain n to m. Both chains must be of the same type
4226 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4229 m_cat(struct mbuf *m, struct mbuf *n)
4234 if ((m->m_flags & M_EXT) ||
4235 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4236 /* just join the two chains */
4240 /* splat the data from one into the other */
4241 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4243 m->m_len += n->m_len;
m_adj(struct mbuf *mp, int req_len)
    if ((m = mp) == NULL)
        while (m != NULL && len > 0) {
            if (m->m_len <= len) {
        if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len -= (req_len - len);
        /*
         * Trim from tail.  Scan the mbuf chain,
         * calculating its length and finding the last mbuf.
         * If the adjustment only affects this mbuf, then just
         * adjust and return.  Otherwise, rescan and truncate
         * after the remaining size.
         */
        if (m->m_next == (struct mbuf *)0)
        if (m->m_len >= len) {
            if (m->m_flags & M_PKTHDR)
                m->m_pkthdr.len -= len;
        /*
         * Correct length for chain is "count".
         * Find the mbuf with last data, adjust its length,
         * and toss data from remaining mbufs on chain.
         */
        if (m->m_flags & M_PKTHDR)
            m->m_pkthdr.len = count;
        for (; m; m = m->m_next) {
            if (m->m_len >= count) {
        while ((m = m->m_next))
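/*
 * Illustrative sketch only (not part of the original file): strip a
 * hypothetical 14-byte link-layer header from the front of a packet and a
 * 4-byte trailer from the end using m_adj().  Positive lengths trim from
 * the head, negative lengths trim from the tail; m_pkthdr.len is kept in
 * sync by m_adj() itself when M_PKTHDR is set.
 */
#if 0
static void
adj_example(struct mbuf *m)
{
    m_adj(m, 14);       /* drop 14 bytes from the front */
    m_adj(m, -4);       /* drop 4 bytes from the back */
}
#endif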
/*
 * Rearrange an mbuf chain so that len bytes are contiguous
 * and in the data area of an mbuf (so that mtod and dtom
 * will work for a structure of size len).  Returns the resulting
 * mbuf chain on success, frees it and returns null on failure.
 * If there is room, it will add up to max_protohdr-len extra bytes to the
 * contiguous region in an attempt to avoid being called next time.
 */
m_pullup(struct mbuf *n, int len)
    /*
     * If first mbuf has no cluster, and has room for len bytes
     * without shifting current data, pullup into it,
     * otherwise allocate a new mbuf to prepend to the chain.
     */
    if ((n->m_flags & M_EXT) == 0 &&
        n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
        if (n->m_len >= len)
        _MGET(m, M_DONTWAIT, n->m_type);
        if (n->m_flags & M_PKTHDR) {
            M_COPY_PKTHDR(m, n);
            n->m_flags &= ~M_PKTHDR;
    space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
        count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
        bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
    } while (len > 0 && n);
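/*
 * Illustrative sketch only (not part of the original file): the classic
 * pullup-before-cast pattern.  "struct example_hdr" is a hypothetical
 * protocol header; the point is that mtod() may only be used to read a
 * structure when that many bytes are contiguous in the first mbuf.
 */
#if 0
struct example_hdr {
    u_int16_t eh_type;
    u_int16_t eh_len;
};

static struct mbuf *
pullup_example(struct mbuf *m)
{
    struct example_hdr *eh;

    if (m->m_len < (int)sizeof (*eh) &&
        (m = m_pullup(m, sizeof (*eh))) == NULL)
        return (NULL);  /* m_pullup() freed the chain on failure */

    eh = mtod(m, struct example_hdr *);
    /* ... use eh->eh_type, eh->eh_len ... */
    return (m);
}
#endif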
/*
 * Partition an mbuf chain in two pieces, returning the tail --
 * all but the first len0 bytes.  In case of failure, it returns NULL and
 * attempts to restore the chain to its original state.
 */
m_split(struct mbuf *m0, int len0, int wait)
    return (m_split0(m0, len0, wait, 1));

static struct mbuf *
m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
    unsigned len = len0, remain;

    for (m = m0; m && len > m->m_len; m = m->m_next)
    remain = m->m_len - len;
    if (copyhdr && (m0->m_flags & M_PKTHDR)) {
        _MGETHDR(n, wait, m0->m_type);
        n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
        n->m_pkthdr.len = m0->m_pkthdr.len - len0;
        m0->m_pkthdr.len = len0;
        if (m->m_flags & M_EXT)
        if (remain > MHLEN) {
            /* m can't be the lead packet */
            n->m_next = m_split(m, len, wait);
            if (n->m_next == NULL) {
            MH_ALIGN(n, remain);
    } else if (remain == 0) {
        _MGET(n, wait, m->m_type);
    if (m->m_flags & M_EXT) {
        n->m_flags |= M_EXT;
        n->m_ext = m->m_ext;
        n->m_data = m->m_data + len;
        bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
    n->m_next = m->m_next;
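/*
 * Illustrative sketch only (not part of the original file): split a packet
 * into a 128-byte head and a tail holding the remainder.  On success the
 * original chain is truncated to 128 bytes (its m_pkthdr.len adjusted) and
 * the returned chain holds the rest; on failure NULL is returned and, per
 * the comment above, the routine attempts to restore the original chain,
 * so the caller still owns it.  The 128-byte split point is arbitrary.
 */
#if 0
static void
split_example(struct mbuf *m)
{
    struct mbuf *tail;

    tail = m_split(m, 128, M_DONTWAIT);
    if (tail == NULL) {
        m_freem(m);
        return;
    }
    /* ... m now holds bytes [0, 128), tail holds the remainder ... */
    m_freem(m);
    m_freem(tail);
}
#endif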
/*
 * Routine to copy from device local memory into mbufs.
 */
m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
    void (*copy)(const void *, void *, size_t))
    struct mbuf *top = NULL, **mp = &top;
    int off = off0, len;

        /*
         * If 'off' is non-zero, packet is trailer-encapsulated,
         * so we have to skip the type and length fields.
         */
        cp += off + 2 * sizeof (u_int16_t);
        totlen -= 2 * sizeof (u_int16_t);
    _MGETHDR(m, M_DONTWAIT, MT_DATA);
    m->m_pkthdr.rcvif = ifp;
    m->m_pkthdr.len = totlen;
    while (totlen > 0) {
            _MGET(m, M_DONTWAIT, MT_DATA);
        len = MIN(totlen, epkt - cp);
        if (len >= MINCLSIZE) {
            MCLGET(m, M_DONTWAIT);
            if (m->m_flags & M_EXT) {
                m->m_len = len = MIN(len, m_maxsize(MC_CL));
                /* give up when it's out of cluster mbufs */
            /*
             * Place initial small packet/header at end of mbuf.
             */
            if (len < m->m_len) {
                    len + max_linkhdr <= m->m_len)
                    m->m_data += max_linkhdr;
            copy(cp, MTOD(m, caddr_t), (unsigned)len);
            bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
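/*
 * Illustrative sketch only (not part of the original file): a hypothetical
 * driver receive path that uses m_devget() to copy a just-received frame
 * out of device-local memory into a fresh mbuf chain.  "rxbuf", "rxlen"
 * and "ifp" are assumed names; passing a NULL copy routine makes
 * m_devget() fall back to bcopy(), per the branches above.
 */
#if 0
static struct mbuf *
devget_example(char *rxbuf, int rxlen, struct ifnet *ifp)
{
    /* off0 == 0: the frame is not trailer-encapsulated */
    return (m_devget(rxbuf, rxlen, 0, ifp, NULL));
}
#endif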
mbuf_growth_aggressive(void)
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until at least 1/2 (50%)
     * of the current total capacity is in use.
     */
    mbuf_gscale = MB_GROWTH_AGGRESSIVE;
    lck_mtx_unlock(mbuf_mlock);

mbuf_growth_normal(void)
    lck_mtx_lock(mbuf_mlock);
    /*
     * Don't start to grow the pool until at least 15/16 (93.75%)
     * of the current total capacity is in use.
     */
    mbuf_gscale = MB_GROWTH_NORMAL;
    lck_mtx_unlock(mbuf_mlock);
/*
 * Cluster freelist allocation check.
 */
m_howmany(int num, size_t bufsize)
    u_int32_t m_clusters, m_bigclusters, m_16kclusters;
    u_int32_t m_clfree, m_bigclfree, m_16kclfree;
    u_int32_t s = mbuf_gscale;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    m_clusters = m_total(MC_CL);
    m_bigclusters = m_total(MC_BIGCL);
    m_16kclusters = m_total(MC_16KCL);
    m_clfree = m_infree(MC_CL);
    m_bigclfree = m_infree(MC_BIGCL);
    m_16kclfree = m_infree(MC_16KCL);

    /* Bail if we've maxed out the mbuf memory map */
    if ((bufsize != m_maxsize(MC_16KCL) &&
        (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
        (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
        (m_16kclusters << 3) >= njcl)) {
        if (bufsize == MCLBYTES && num > m_clfree) {
            printf("m_howmany - out of small clusters, "
                "%d short\n", num - mbstat.m_clfree);
    if (bufsize == m_maxsize(MC_CL)) {
        if (m_clusters < MINCL)
            return (MINCL - m_clusters);
        /* Too few (free < threshold) and not over maximum */
        if (m_clusters < m_maxlimit(MC_CL)) {
            if (m_clfree >= MCL_LOWAT)
            if (num >= m_clfree)
            if (((m_clusters + num) >> s) > m_clfree)
                j = ((m_clusters + num) >> s) - m_clfree;
            if (i + m_clusters >= m_maxlimit(MC_CL))
                i = m_maxlimit(MC_CL) - m_clusters;
        VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
    } else if (bufsize == m_maxsize(MC_BIGCL)) {
        if (m_bigclusters < MINBIGCL)
            return (MINBIGCL - m_bigclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
            if (m_bigclfree >= MBIGCL_LOWAT)
            if (num >= m_bigclfree)
                i = num - m_bigclfree;
            if (((m_bigclusters + num) >> 4) > m_bigclfree)
                j = ((m_bigclusters + num) >> 4) - m_bigclfree;
            if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
                i = m_maxlimit(MC_BIGCL) - m_bigclusters;
        VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
        if (m_16kclusters < MIN16KCL)
            return (MIN16KCL - m_16kclusters);
        /* Too few (free < 1/16 total) and not over maximum */
        if (m_16kclusters < m_maxlimit(MC_16KCL)) {
            if (m_16kclfree >= M16KCL_LOWAT)
            if (num >= m_16kclfree)
                i = num - m_16kclfree;
            if (((m_16kclusters + num) >> 4) > m_16kclfree)
                j = ((m_16kclusters + num) >> 4) - m_16kclfree;
            if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
                i = m_maxlimit(MC_16KCL) - m_16kclusters;
        VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
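/*
 * Illustrative arithmetic only (not part of the original file), using
 * hypothetical numbers.  The big-cluster branch above grows the pool when
 * ((m_bigclusters + num) >> 4) > m_bigclfree: with 1024 big clusters in
 * total and a request for 32 more, the threshold is (1024 + 32) >> 4 = 66,
 * so additional clusters are requested only if fewer than 66 are free, and
 * then by roughly the shortfall (clamped to m_maxlimit(MC_BIGCL)).
 */
#if 0
static u_int32_t
howmany_threshold_example(u_int32_t total, u_int32_t num)
{
    /* e.g. (1024 + 32) >> 4 == 66 */
    return ((total + num) >> 4);
}
#endif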
/*
 * Return the number of bytes in the mbuf chain, m.
 */
m_length(struct mbuf *m)
    unsigned int pktlen;

    if (m->m_flags & M_PKTHDR)
        return (m->m_pkthdr.len);
    for (m0 = m; m0 != NULL; m0 = m0->m_next)
        pktlen += m0->m_len;
/*
 * Copy data from a buffer back into the indicated mbuf chain,
 * starting "off" bytes from the beginning, extending the mbuf
 * chain if necessary.
 */
m_copyback(struct mbuf *m0, int off, int len, const void *cp)
    struct mbuf *origm = m0;

    m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
    if (error != 0 || (m0 != NULL && origm != m0))
        panic("m_copyback");

m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
    /* don't support chain expansion */
    VERIFY(off + len <= m_length(m0));

    error = m_copyback0(&m0, off, len, cp,
        M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
        /*
         * no way to recover from partial success.
         * just free the chain.
         */
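/*
 * Illustrative sketch only (not part of the original file): overwrite two
 * bytes at a fixed offset in a chain with m_copyback(), which walks the
 * chain and extends it if the range lies past the current end.  The offset
 * (20) and the value are arbitrary example numbers.
 */
#if 0
static void
copyback_example(struct mbuf *m)
{
    u_int16_t val = 0;

    m_copyback(m, 20, sizeof (val), &val);
}
#endif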
/*
 * m_makewritable: ensure the specified range is writable.
 */
m_makewritable(struct mbuf **mp, int off, int len, int how)
    int origlen, reslen;

    origlen = m_length(*mp);

#if 0 /* M_COPYALL is large enough */
    if (len == M_COPYALL)
        len = m_length(*mp) - off; /* XXX */

    error = m_copyback0(mp, off, len, NULL,
        M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);

    for (n = *mp; n; n = n->m_next)
    if (origlen != reslen)
        panic("m_makewritable: length changed");
    if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
        panic("m_makewritable: inconsist");
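/*
 * Illustrative sketch only (not part of the original file): make the first
 * 40 bytes of a chain safe to modify in place.  If any of those bytes live
 * in shared (read-only) cluster storage, m_makewritable() replaces them
 * with writable storage via the copy-on-write path in m_copyback0(); the
 * head of the chain may change, hence the struct mbuf ** argument.  The
 * 40-byte length is an arbitrary example value.
 */
#if 0
static int
makewritable_example(struct mbuf **mp)
{
    int error;

    error = m_makewritable(mp, 0, 40, M_DONTWAIT);
    if (error != 0)
        return (error);     /* propagate failure to the caller */
    /* ... safe to scribble on the first 40 bytes of *mp ... */
    return (0);
}
#endif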
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    const char *cp = vp;

    VERIFY(mp0 != NULL);
    VERIFY(*mp0 != NULL);
    VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
    VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

    /*
     * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
     * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
     */
    VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

    while (off > (mlen = m->m_len)) {
        if (m->m_next == NULL) {
            if (!(flags & M_COPYBACK0_EXTEND))
            /*
             * try to make some space at the end of "m".
             */
            if (off + len >= MINCLSIZE &&
                !(m->m_flags & M_EXT) && m->m_len == 0) {
            tspace = M_TRAILINGSPACE(m);
                tspace = MIN(tspace, off + len);
                bzero(mtod(m, char *) + m->m_len,
            /*
             * need to allocate an mbuf.
             */
            if (off + len >= MINCLSIZE) {
                n = m_getcl(how, m->m_type, 0);
                n = _M_GET(how, m->m_type);
            n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
            bzero(mtod(n, char *), MIN(n->m_len, off));
        mlen = m->m_len - off;
        if (mlen != 0 && m_mclhasreference(m)) {
            /*
             * this mbuf is read-only.
             * allocate a new writable mbuf and try again.
             */
#if defined(DIAGNOSTIC)
            if (!(flags & M_COPYBACK0_COW))
                panic("m_copyback0: read-only");
#endif /* defined(DIAGNOSTIC) */
            /*
             * if we're going to write into the middle of
             * an mbuf, split it first.
             */
            if (off > 0 && len < mlen) {
                n = m_split0(m, off, how, 0);
            /*
             * XXX TODO coalesce into the trailingspace of
             * the previous mbuf when possible.
             */
            /*
             * allocate a new mbuf.  copy packet header if needed.
             */
            n = _M_GET(how, m->m_type);
            if (off == 0 && (m->m_flags & M_PKTHDR)) {
                M_COPY_PKTHDR(n, m);
            if (len >= MINCLSIZE)
                MCLGET(n, M_DONTWAIT);
                (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
            /*
             * free the region which has been overwritten,
             * copying data from the old mbufs if requested.
             */
            if (flags & M_COPYBACK0_PRESERVE)
                datap = mtod(n, char *);
            VERIFY(off == 0 || eatlen >= mlen);
                VERIFY(len >= mlen);
                m_copydata(m, off, mlen, datap);
            while (m != NULL && m_mclhasreference(m) &&
                n->m_type == m->m_type && eatlen > 0) {
                mlen = MIN(eatlen, m->m_len);
                    m_copydata(m, 0, mlen, datap);
                *mp = m = m_free(m);
        mlen = MIN(mlen, len);
        if (flags & M_COPYBACK0_COPYBACK) {
            bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
        if (m->m_next == NULL) {
    if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
        VERIFY(flags & M_COPYBACK0_EXTEND);
        m->m_pkthdr.len = totlen;
mcl_to_paddr(char *addr)
    vm_offset_t base_phys;

    if (!MBUF_IN_MAP(addr))
    base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
    return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 */
m_dup(struct mbuf *m, int how)
    struct mbuf *n, **np;

    if (m->m_flags & M_PKTHDR)
    /*
     * Quick check: if we have one mbuf and its data fits in an
     * mbuf with packet header, just copy and go.
     */
    if (m->m_next == NULL) {
        /* Then just move the data into an mbuf and be done... */
        if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
            if ((n = _M_GETHDR(how, m->m_type)) == NULL)
            n->m_len = m->m_len;
            m_dup_pkthdr(n, m, how);
            bcopy(m->m_data, n->m_data, m->m_len);
        } else if (m->m_len <= MLEN) {
            if ((n = _M_GET(how, m->m_type)) == NULL)
            bcopy(m->m_data, n->m_data, m->m_len);
            n->m_len = m->m_len;
        kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
            n = _M_GETHDR(how, m->m_type);
            n = _M_GET(how, m->m_type);
        if (m->m_flags & M_EXT) {
            if (m->m_len <= m_maxsize(MC_CL))
            else if (m->m_len <= m_maxsize(MC_BIGCL))
                n = m_mbigget(n, how);
            else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
                n = m_m16kget(n, how);
            if (!(n->m_flags & M_EXT)) {
            /* Don't use M_COPY_PKTHDR: preserve m_data */
            m_dup_pkthdr(n, m, how);
            if (!(n->m_flags & M_EXT))
                n->m_data = n->m_pktdat;
        n->m_len = m->m_len;
        /*
         * Get the dup on the same bdry as the original.
         * Assume that the two mbufs have the same offset to data area
         * (up to word boundaries).
         */
        bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
        kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
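/*
 * Illustrative sketch only (not part of the original file): take a private,
 * fully deep copy of a packet with m_dup() so it can be modified without
 * touching cluster storage shared with the original (as opposed to
 * m_copym(), which, as shown earlier in this file, shares clusters by
 * reference via n->m_ext = m->m_ext).
 */
#if 0
static struct mbuf *
dup_example(struct mbuf *m)
{
    struct mbuf *copy;

    copy = m_dup(m, M_DONTWAIT);
    if (copy == NULL)
        return (NULL);      /* allocation failed; original untouched */
    return (copy);
}
#endif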
#define MBUF_MULTIPAGES(m)                                              \
    (((m)->m_flags & M_EXT) &&                                          \
    ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||          \
    (!IS_P2ALIGNED((m)->m_data, NBPG) &&                                \
    P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))

static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    uintptr_t data0, data;
    unsigned int len0, len;

    VERIFY(MBUF_MULTIPAGES(m));
    VERIFY(m->m_next == NULL);
    data0 = (uintptr_t)m->m_data;

    if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
    else if (!IS_P2ALIGNED(data, NBPG) &&
        P2ROUNDUP(data, NBPG) < (data + len0))
        len = P2ROUNDUP(data, NBPG) - data;

    VERIFY(m->m_flags & M_EXT);
    m->m_data = (void *)data;

    n = _M_RETRY(M_DONTWAIT, MT_DATA);

    n->m_ext = m->m_ext;
    n->m_flags |= M_EXT;
m_normalize(struct mbuf *m)
    struct mbuf *top = NULL;
    struct mbuf **nm = &top;
    boolean_t expanded = FALSE;

        /* Does the data cross one or more page boundaries? */
        if (MBUF_MULTIPAGES(m)) {
            if ((m = m_expand(m, &last)) == NULL) {
        atomic_add_32(&mb_normalized, 1);
m_mchtype(struct mbuf *m, int t)
    mtype_stat_dec(m->m_type);

m_mtod(struct mbuf *m)
    return (MTOD(m, void *));

    return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));

m_mcheck(struct mbuf *m)
/*
 * Inform the corresponding mcache(s) that there's a waiter below.
 */
mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
    mcache_waiter_inc(m_cache(class));
        if (class == MC_CL) {
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
            mcache_waiter_inc(m_cache(MC_MBUF_CL));
            mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));

/*
 * Inform the corresponding mcache(s) that there's no more waiter below.
 */
mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
    mcache_waiter_dec(m_cache(class));
        if (class == MC_CL) {
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
        } else if (class == MC_BIGCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
        } else if (class == MC_16KCL) {
            mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
            mcache_waiter_dec(m_cache(MC_MBUF_CL));
            mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU cache layer and the allocation should be
 * retried at that level.
 */
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
    boolean_t mcache_retry = FALSE;

    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    /* Check if there's anything at the cache layer */
    if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;

    /* Nothing?  Then try hard to get it from somewhere */
    m_reclaim(class, num, (wait & MCR_COMP));

    /* We tried hard and got something? */
    if (m_infree(class) > 0) {
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;
    } else if (wait & MCR_TRYHARD) {
        mcache_retry = TRUE;

    /*
     * There's really nothing for us right now; inform the
     * cache(s) that there is a waiter below and go to sleep.
     */
    mbuf_waiter_inc(class, (wait & MCR_COMP));

    VERIFY(!(wait & MCR_NOSLEEP));
    (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);

    /* We are now up; stop getting notified until next round */
    mbuf_waiter_dec(class, (wait & MCR_COMP));

    /* We waited and got something */
    if (m_infree(class) > 0) {
    } else if (mbuf_cached_above(class, wait)) {
        mcache_retry = TRUE;

    return (mcache_retry);
mbuf_worker_thread(void)
    lck_mtx_lock(mbuf_mlock);

    if (mbuf_expand_mcl) {
        /* Adjust to the current number of clusters in use */
        n = mbuf_expand_mcl -
            (m_total(MC_CL) - m_infree(MC_CL));
        if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
            n = m_maxlimit(MC_CL) - m_total(MC_CL);
        mbuf_expand_mcl = 0;
        if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
    if (mbuf_expand_big) {
        /* Adjust to the current number of 4 KB clusters in use */
        n = mbuf_expand_big -
            (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
        if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
            n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
        mbuf_expand_big = 0;
        if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
    if (mbuf_expand_16k) {
        /* Adjust to the current number of 16 KB clusters in use */
        n = mbuf_expand_16k -
            (m_total(MC_16KCL) - m_infree(MC_16KCL));
        if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
            n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
        mbuf_expand_16k = 0;
        (void) freelist_populate(MC_16KCL, n, M_WAIT);

    /*
     * Because we can run out of memory before filling the mbuf
     * map, we should not allocate more clusters than there are
     * mbufs -- otherwise we could have a large number of useless
     * clusters allocated.
     */
    while (m_total(MC_MBUF) <
        (m_total(MC_BIGCL) + m_total(MC_CL))) {
        if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)

    lck_mtx_unlock(mbuf_mlock);

    assert_wait(&mbuf_worker_run, THREAD_UNINT);
    (void) thread_block((thread_continue_t)mbuf_worker_thread);

mbuf_worker_thread_init(void)
    mbuf_worker_ready++;
    mbuf_worker_thread();
    lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

    VERIFY(MBUF_IN_MAP(buf));
    ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
    VERIFY(ix < maxslabgrp);

    if ((slg = slabstbl[ix]) == NULL) {
        /*
         * In the current implementation, we never shrink the memory
         * pool (hence the cluster map); if we attempt to reallocate
         * a cluster group when it's already allocated, panic since
         * this is a sign of memory corruption (slabstbl[ix] got
         * nullified).  This also means that there shouldn't be any
         * hole in the kernel sub-map for the mbuf pool.
         */
        VERIFY(ix < slabgrp);
        /*
         * Slab expansion can only be done single threaded; when
         * we get here, it must be as a result of m_clalloc() which
         * is serialized and therefore mb_clalloc_busy must be set.
         */
        VERIFY(mb_clalloc_busy);
        lck_mtx_unlock(mbuf_mlock);

        /* This is a new buffer; create the slabs group for it */
        MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
        VERIFY(slg != NULL);

        lck_mtx_lock(mbuf_mlock);
        /*
         * No other thread could have gone into m_clalloc() after
         * we dropped the lock above, so verify that it's true.
         */
        VERIFY(mb_clalloc_busy);

        /* Chain each slab in the group to its forward neighbor */
        for (k = 1; k < NSLABSPMB; k++)
            slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
        VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);

        /* And chain the last slab in the previous group to this */
            VERIFY(slabstbl[ix - 1]->
                slg_slab[NSLABSPMB - 1].sl_next == NULL);
            slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =

    ix = MTOCL(buf) % NSLABSPMB;
    VERIFY(ix < NSLABSPMB);

    return (&slg->slg_slab[ix]);
slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
    void *base, void *head, unsigned int len, int refcnt, int chunks)
    sp->sl_class = class;
    sp->sl_flags = flags;
    sp->sl_refcnt = refcnt;
    sp->sl_chunks = chunks;

slab_insert(mcl_slab_t *sp, mbuf_class_t class)
    VERIFY(slab_is_detached(sp));
    m_slab_cnt(class)++;
    TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
    sp->sl_flags &= ~SLF_DETACHED;
    if (class == MC_BIGCL) {
        /* Next slab must already be present */
        VERIFY(slab_is_detached(sp));
        sp->sl_flags &= ~SLF_DETACHED;
    } else if (class == MC_16KCL) {
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            /* Next slab must already be present */
            VERIFY(slab_is_detached(sp));
            sp->sl_flags &= ~SLF_DETACHED;

slab_remove(mcl_slab_t *sp, mbuf_class_t class)
    VERIFY(!slab_is_detached(sp));
    VERIFY(m_slab_cnt(class) > 0);
    m_slab_cnt(class)--;
    TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
    if (class == MC_BIGCL) {
        /* Next slab must already be present */
        VERIFY(!slab_is_detached(sp));
    } else if (class == MC_16KCL) {
        for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
            /* Next slab must already be present */
            VERIFY(!slab_is_detached(sp));
slab_inrange(mcl_slab_t *sp, void *buf)
    return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
        (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));

slab_nextptr_panic(mcl_slab_t *sp, void *addr)
    unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
    uintptr_t buf = (uintptr_t)sp->sl_base;

    for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
        void *next = ((mcache_obj_t *)buf)->obj_next;
        if (mclaudit == NULL) {
            if (next != NULL && !MBUF_IN_MAP(next)) {
                mcache_t *cp = m_cache(sp->sl_class);
                panic("%s: %s buffer %p in slab %p modified "
                    "after free at offset 0: %p out of range "
                    "[%p-%p)\n", __func__, cp->mc_name,
                    (void *)buf, sp, next, mbutl, embutl);
            mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
                (mcache_obj_t *)buf);
            mcl_audit_verify_nextptr(next, mca);

slab_detach(mcl_slab_t *sp)
    sp->sl_link.tqe_next = (mcl_slab_t *)-1;
    sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
    sp->sl_flags |= SLF_DETACHED;

slab_is_detached(mcl_slab_t *sp)
    return ((intptr_t)sp->sl_link.tqe_next == -1 &&
        (intptr_t)sp->sl_link.tqe_prev == -1 &&
        (sp->sl_flags & SLF_DETACHED));
mcl_audit_init(void *buf, mcache_audit_t **mca_list,
    mcache_obj_t **con_list, size_t con_size, unsigned int num)
    mcache_audit_t *mca, *mca_tail;
    mcache_obj_t *con = NULL;
    boolean_t save_contents = (con_list != NULL);

    ASSERT(num <= NMBPCL);
    ASSERT(con_list == NULL || con_size != 0);

    /* Make sure we haven't been here before */
    for (i = 0; i < NMBPCL; i++)
        VERIFY(mclaudit[ix].cl_audit[i] == NULL);

    mca = mca_tail = *mca_list;

    for (i = 0; i < num; i++) {
        mcache_audit_t *next;

        next = mca->mca_next;
        bzero(mca, sizeof (*mca));
        mca->mca_next = next;
        mclaudit[ix].cl_audit[i] = mca;

        /* Attach the contents buffer if requested */
        if (save_contents) {
            VERIFY(con != NULL);
            mca->mca_contents_size = con_size;
            mca->mca_contents = con;
            con = con->obj_next;
            bzero(mca->mca_contents, mca->mca_contents_size);

        mca = mca->mca_next;

    *mca_list = mca_tail->mca_next;
    mca_tail->mca_next = NULL;
/*
 * Given an address of a buffer (mbuf/cluster/big cluster), return
 * the corresponding audit structure for that buffer.
 */
static mcache_audit_t *
mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
    mcache_audit_t *mca = NULL;

    VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));

        /*
         * For the mbuf case, find the index of the cluster
         * used by the mbuf and use that index to locate the
         * base address of the cluster.  Then find out the
         * mbuf index relative to the cluster base and use
         * it to locate the audit structure.
         */
        VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
        mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];

        /*
         * Same as above, but only return the first element.
         */
        mca = mclaudit[ix].cl_audit[0];
mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
    struct mbuf *m = addr;
    mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;

    VERIFY(mca->mca_contents != NULL &&
        mca->mca_contents_size == AUDIT_CONTENTS_SIZE);

    mcl_audit_verify_nextptr(next, mca);

        /* Save constructed mbuf fields */
        mcl_audit_save_mbuf(m, mca);
        mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
        ((mcache_obj_t *)m)->obj_next = next;

    /* Check if the buffer has been corrupted while in freelist */
    mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));

    /* Restore constructed mbuf fields */
    mcl_audit_restore_mbuf(m, mca, composite);

mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
    struct mbuf *ms = (struct mbuf *)mca->mca_contents;

        struct mbuf *next = m->m_next;
        VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
            MBUF_IS_COMPOSITE(ms));
        /*
         * We could have hand-picked the mbuf fields and restored
         * them individually, but that would be a maintenance
         * headache.  Instead, restore everything that was saved;
         * the mbuf layer will recheck and reinitialize anyway.
         */
        bcopy(ms, m, mca->mca_contents_size);

        /*
         * For a regular mbuf (no cluster attached) there's nothing
         * to restore other than the type field, which is expected
         */
        m->m_type = ms->m_type;

mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
    bcopy(m, mca->mca_contents, mca->mca_contents_size);
mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
    boolean_t save_next)
    mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;

        mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
            mcl_audit_verify_nextptr(next, mca);
            ((mcache_obj_t *)addr)->obj_next = next;

        /* Check if the buffer has been corrupted while in freelist */
        mcl_audit_verify_nextptr(next, mca);
        mcache_audit_free_verify_set(mca, addr, 0, size);

mcl_audit_mcheck_panic(struct mbuf *m)
    mcache_audit_t *mca;

    mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);

    panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
        m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));

mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
    if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
        !MBUF_IN_MAP(next)) {
        panic("mcl_audit: buffer %p modified after free at offset 0: "
            "%p out of range [%p-%p)\n%s\n",
            mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
SYSCTL_DECL(_kern_ipc);
SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mbstat_sysctl, "S,mbstat", "");
SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mb_stat_sysctl, "S,mb_stat", "");
SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
    &mb_normalized, 0, "");