1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/simple_lock.h>
83 #include <kern/queue.h>
84 #include <kern/sched_prim.h>
85 #include <kern/cpu_number.h>
86
87 #include <libkern/OSAtomic.h>
88 #include <libkern/libkern.h>
89
90 #include <IOKit/IOMapper.h>
91
92 #include <machine/limits.h>
93 #include <machine/machine_routines.h>
94
95 #if CONFIG_MACF_NET
96 #include <security/mac_framework.h>
97 #endif /* CONFIG_MACF_NET */
98
99 #include <sys/mcache.h>
100
101 /*
102 * MBUF IMPLEMENTATION NOTES.
103 *
104 * There is a total of 5 per-CPU caches:
105 *
106 * MC_MBUF:
107 * This is a cache of rudimentary objects of MSIZE in size; each
108 * object represents an mbuf structure. This cache preserves only
109 * the m_type field of the mbuf during its transactions.
110 *
111 * MC_CL:
112 * This is a cache of rudimentary objects of MCLBYTES in size; each
113 * object represents an mcluster structure. This cache does not
114 * preserve the contents of the objects during its transactions.
115 *
116 * MC_BIGCL:
117 * This is a cache of rudimentary objects of NBPG in size; each
118 * object represents an mbigcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_MBUF_CL:
122 * This is a cache of mbufs each having a cluster attached to it.
123 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
124 * fields of the mbuf related to the external cluster are preserved
125 * during transactions.
126 *
127 * MC_MBUF_BIGCL:
128 * This is a cache of mbufs each having a big cluster attached to it.
129 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
130 * fields of the mbuf related to the external cluster are preserved
131 * during transactions.
132 *
133 * OBJECT ALLOCATION:
134 *
135 * Allocation requests are handled first at the per-CPU (mcache) layer
136 * before falling back to the slab layer. Performance is optimal when
137 * the request is satisfied at the CPU layer because global data/lock
138 * never gets accessed. When the slab layer is entered for allocation,
139 * the slab freelist will be checked first for available objects before
140 * the VM backing store is invoked. Slab layer operations are serialized
141 * for all of the caches as the mbuf global lock is held most of the time.
142 * Allocation paths are different depending on the class of objects:
143 *
144 * a. Rudimentary object:
145 *
146 * { m_get_common(), m_clattach(), m_mclget(),
147 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
148 * composite object allocation }
149 * | ^
150 * | |
151 * | +-----------------------+
152 * v |
153 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
154 * | ^
155 * v |
156 * [CPU cache] -------> (found?) -------+
157 * | |
158 * v |
159 * mbuf_slab_alloc() |
160 * | |
161 * v |
162 * +---------> [freelist] -------> (found?) -------+
163 * | |
164 * | v
165 * | m_clalloc()
166 * | |
167 * | v
168 * +---<<---- kmem_mb_alloc()
169 *
170 * b. Composite object:
171 *
172 * { m_getpackets_internal(), m_allocpacket_internal() }
173 * | ^
174 * | |
175 * | +------ (done) ---------+
176 * v |
177 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
178 * | ^
179 * v |
180 * [CPU cache] -------> (found?) -------+
181 * | |
182 * v |
183 * mbuf_cslab_alloc() |
184 * | |
185 * v |
186 * [freelist] -------> (found?) -------+
187 * | |
188 * v |
189 * (rudimentary object) |
190 * mcache_alloc/mcache_alloc_ext() ------>>-----+
191 *
192 * Auditing notes: If auditing is enabled, buffers will be subjected to
193 * integrity checks by the audit routine. This is done by verifying their
194 * contents against DEADBEEF (free) pattern before returning them to caller.
195 * As part of this step, the routine will also record the transaction and
196 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
197 * also restore any constructed data structure fields if necessary.
198 *
199 * OBJECT DEALLOCATION:
200 *
201 * Freeing an object simply involves placing it into the CPU cache; this
202 * pollutes the cache to benefit subsequent allocations. The slab layer
203 * will only be entered if the object is to be purged out of the cache.
204 * During normal operations, this happens only when the CPU layer resizes
205 * its bucket while it's adjusting to the allocation load. Deallocation
206 * paths are different depending on the class of objects:
207 *
208 * a. Rudimentary object:
209 *
210 * { m_free(), m_freem_list(), composite object deallocation }
211 * | ^
212 * | |
213 * | +------ (done) ---------+
214 * v |
215 * mcache_free/mcache_free_ext() |
216 * | |
217 * v |
218 * mbuf_slab_audit() |
219 * | |
220 * v |
221 * [CPU cache] ---> (not purging?) -----+
222 * | |
223 * v |
224 * mbuf_slab_free() |
225 * | |
226 * v |
227 * [freelist] ----------->>------------+
228 * (objects never get purged to VM)
229 *
230 * b. Composite object:
231 *
232 * { m_free(), m_freem_list() }
233 * | ^
234 * | |
235 * | +------ (done) ---------+
236 * v |
237 * mcache_free/mcache_free_ext() |
238 * | |
239 * v |
240 * mbuf_cslab_audit() |
241 * | |
242 * v |
243 * [CPU cache] ---> (not purging?) -----+
244 * | |
245 * v |
246 * mbuf_cslab_free() |
247 * | |
248 * v |
249 * [freelist] ---> (not purging?) -----+
250 * | |
251 * v |
252 * (rudimentary object) |
253 * mcache_free/mcache_free_ext() ------->>------+
254 *
255 * Auditing notes: If auditing is enabled, the audit routine will save
256 * any constructed data structure fields (if necessary) before filling the
257 * contents of the buffers with DEADBEEF (free) pattern and recording the
258 * transaction. Buffers that are freed (whether at CPU or slab layer) are
259 * expected to contain the free pattern.
260 *
261 * DEBUGGING:
262 *
263 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
264 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
265 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
266 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note
267 * that debugging consumes more CPU and memory.
268 *
269 * Each object is associated with exactly one mcache_audit_t structure that
270 * contains the information related to its last buffer transaction. Given
271 * an address of an object, the audit structure can be retrieved by finding
272 * the position of the object relative to the base address of the cluster:
273 *
274 * +------------+ +=============+
275 * | mbuf addr | | mclaudit[i] |
276 * +------------+ +=============+
277 * | | cl_audit[0] |
278 * i = MTOCL(addr) +-------------+
279 * | +-----> | cl_audit[1] | -----> mcache_audit_t
280 * b = CLTOM(i) | +-------------+
281 * | | | ... |
282 * x = MCLIDX(b, addr) | +-------------+
283 * | | | cl_audit[7] |
284 * +-----------------+ +-------------+
285 * (e.g. x == 1)
286 *
287 * The mclaudit[] array is allocated at initialization time, but its contents
288 * get populated when the corresponding cluster is created. Because a cluster
289 * can be turned into NMBPCL mbufs, we reserve enough space for the mbufs
290 * so that there is a 1-to-1 mapping between them. A cluster that never gets
291 * (or has not yet been) turned into mbufs will use only cl_audit[0], with the
292 * remaining entries unused. For big clusters, only one entry is allocated
293 * and used for the entire cluster pair.
294 */
295
296 /* TODO: should be in header file */
297 /* kernel translator */
298 extern vm_offset_t kmem_mb_alloc(vm_map_t, int);
299 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
300 extern vm_map_t mb_map; /* special map */
301
302 /* Global lock */
303 static lck_mtx_t *mbuf_mlock;
304 static lck_attr_t *mbuf_mlock_attr;
305 static lck_grp_t *mbuf_mlock_grp;
306 static lck_grp_attr_t *mbuf_mlock_grp_attr;
307
308 /* Back-end (common) layer */
309 static void *mbuf_worker_run; /* wait channel for worker thread */
310 static int mbuf_worker_ready; /* worker thread is runnable */
311 static int mbuf_expand_mcl; /* number of cluster creation requests */
312 static int mbuf_expand_big; /* number of big cluster creation requests */
313 static int mbuf_expand_16k; /* number of 16K cluster creation requests */
314 static int ncpu; /* number of CPUs */
315 static int *mcl_paddr; /* Array of cluster physical addresses */
316 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
317 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
318 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
319 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
320 static unsigned int mb_normalized; /* number of packets "normalized" */
321
322 typedef enum {
323 MC_MBUF = 0, /* Regular mbuf */
324 MC_CL, /* Cluster */
325 MC_BIGCL, /* Large (4K) cluster */
326 MC_16KCL, /* Jumbo (16K) cluster */
327 MC_MBUF_CL, /* mbuf + cluster */
328 MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */
329 MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */
330 } mbuf_class_t;
331
332 #define MBUF_CLASS_MIN MC_MBUF
333 #define MBUF_CLASS_MAX MC_MBUF_16KCL
334 #define MBUF_CLASS_LAST MC_16KCL
335 #define MBUF_CLASS_VALID(c) \
336 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
337 #define MBUF_CLASS_COMPOSITE(c) \
338 ((int)(c) > MBUF_CLASS_LAST)
339
340
341 /*
342 * mbuf specific mcache allocation request flags.
343 */
344 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
345
346 /*
347 * Per-cluster slab structure.
348 *
349 * A slab is a cluster control structure that contains one or more object
350 * chunks; the available chunks are chained in the slab's freelist (sl_head).
351 * Each time a chunk is taken out of the slab, the slab's reference count
352 * gets incremented. When all chunks have been taken out, the empty slab
353 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
354 * returned to a slab causes the slab's reference count to be decremented;
355 * it also causes the slab to be reinserted into the class's slab list, if
356 * it is not already there.
357 *
358 * Compartmentalizing the object chunks into slabs allows us to easily
359 * merge one or more slabs together when the adjacent slabs are idle, as
360 * well as to convert or move a slab from one class to another; e.g. the
361 * mbuf cluster slab can be converted to a regular cluster slab when all
362 * mbufs in the slab have been freed.
363 *
364 * A slab may also span multiple clusters for chunks larger than
365 * a cluster's size. In this case, only the slab of the first cluster is
366 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
367 * that they are part of the larger slab.
368 */
369 typedef struct mcl_slab {
370 struct mcl_slab *sl_next; /* neighboring slab */
371 u_int8_t sl_class; /* controlling mbuf class */
372 int8_t sl_refcnt; /* outstanding allocations */
373 int8_t sl_chunks; /* chunks (bufs) in this slab */
374 u_int16_t sl_flags; /* slab flags (see below) */
375 u_int16_t sl_len; /* slab length */
376 void *sl_base; /* base of allocated memory */
377 void *sl_head; /* first free buffer */
378 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
379 } mcl_slab_t;
380
381 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
382 #define SLF_PARTIAL 0x0002 /* part of another slab */
383 #define SLF_DETACHED 0x0004 /* not in slab freelist */
384
385 /*
386 * The array of slabs is broken into groups of arrays per 1MB of kernel
387 * memory to reduce the footprint. Each group is allocated on demand
388 * whenever a new piece of memory mapped in from the VM crosses the 1MB
389 * boundary.
390 */
391 #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */
392
393 typedef struct mcl_slabg {
394 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
395 } mcl_slabg_t;
396
397 /*
398 * Per-cluster audit structure.
399 */
400 typedef struct {
401 mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */
402 } mcl_audit_t;
403
404 #if CONFIG_MBUF_NOEXPAND
405 static unsigned int maxmbufcl;
406 #endif /* CONFIG_MBUF_NOEXPAND */
407
408 /*
409 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
410 * and m_ext structures. If auditing is enabled, we allocate a shadow
411 * mbuf structure of this size inside each audit structure, and the
412 * contents of the real mbuf gets copied into it when the mbuf is freed.
413 * This allows us to pattern-fill the mbuf for integrity check, and to
414 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
415 * Note that we don't save the contents of clusters when they are freed;
416 * we simply pattern-fill them.
417 */
418 #if defined(__LP64__)
419 #define AUDIT_CONTENTS_SIZE 160
420 #else
421 #define AUDIT_CONTENTS_SIZE 80
422 #endif /* __LP64__ */
423
424 /*
425 * mbuf specific mcache audit flags
426 */
427 #define MB_INUSE 0x01 /* object has not been returned to slab */
428 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
429 #define MB_SCVALID 0x04 /* object has valid saved contents */
430
431 /*
432 * Each of the following two arrays holds up to nmbclusters elements.
433 */
434 static mcl_audit_t *mclaudit; /* array of cluster audit information */
435 static mcl_slabg_t **slabstbl; /* cluster slabs table */
436 static unsigned int maxslabgrp; /* max # of entries in slabs table */
437 static unsigned int slabgrp; /* # of entries in slabs table */
438
439 /* Globals */
440 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
441 int njcl; /* # of clusters for jumbo sizes */
442 int njclbytes; /* size of a jumbo cluster */
443 union mcluster *mbutl; /* first mapped cluster address */
444 union mcluster *embutl; /* ending virtual address of mclusters */
445 int max_linkhdr; /* largest link-level header */
446 int max_protohdr; /* largest protocol header */
447 int max_hdr; /* largest link+protocol header */
448 int max_datalen; /* MHLEN - max_hdr */
449
450 /* TODO: should be in header file */
451 int do_reclaim = 0;
452
453 /* The minimum number of objects that are allocated, to start. */
454 #define MINCL 32
455 #define MINBIGCL (MINCL >> 1)
456 #define MIN16KCL (MINCL >> 2)
457
458 /* Low watermarks (only map in pages once free counts go below) */
459 #define MCL_LOWAT MINCL
460 #define MBIGCL_LOWAT MINBIGCL
461 #define M16KCL_LOWAT MIN16KCL
462
463 typedef struct {
464 mbuf_class_t mtbl_class; /* class type */
465 mcache_t *mtbl_cache; /* mcache for this buffer class */
466 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
467 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
468 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
469 u_int32_t mtbl_maxsize; /* maximum buffer size */
470 int mtbl_minlimit; /* minimum allowed */
471 int mtbl_maxlimit; /* maximum allowed */
472 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
473 } mbuf_table_t;
474
475 #define m_class(c) mbuf_table[c].mtbl_class
476 #define m_cache(c) mbuf_table[c].mtbl_cache
477 #define m_slablist(c) mbuf_table[c].mtbl_slablist
478 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
479 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
480 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
481 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
482 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
483 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
484 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
485 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
486 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
487 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
488 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
489 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
490 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
491 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
492 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
493 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
494 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
495
496 static mbuf_table_t mbuf_table[] = {
497 /*
498 * The caches for mbufs, regular clusters and big clusters.
499 */
500 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
501 NULL, NULL, 0, 0, 0, 0 },
502 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
503 NULL, NULL, 0, 0, 0, 0 },
504 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
505 NULL, NULL, 0, 0, 0, 0 },
506 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
507 NULL, NULL, 0, 0, 0, 0 },
508 /*
509 * The following are special caches; they serve as intermediate
510 * caches backed by the above rudimentary caches. Each object
511 * in the cache is an mbuf with a cluster attached to it. Unlike
512 * the above caches, these intermediate caches do not directly
513 * deal with the slab structures; instead, the constructed
514 * cached elements are simply stored in the freelists.
515 */
516 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
517 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
518 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
519 };
520
521 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
522
523 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
524 static int mb_waiters; /* number of sleepers */
525
526 /* The following are used to serialize m_clalloc() */
527 static boolean_t mb_clalloc_busy;
528 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
529 static int mb_clalloc_waiters;
530
531 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
532 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
533 static void mbuf_table_init(void);
534 static inline void m_incref(struct mbuf *);
535 static inline u_int32_t m_decref(struct mbuf *);
536 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
537 static void mbuf_worker_thread_init(void);
538 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
539 static void slab_free(mbuf_class_t, mcache_obj_t *);
540 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
541 unsigned int, int);
542 static void mbuf_slab_free(void *, mcache_obj_t *, int);
543 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
544 static void mbuf_slab_notify(void *, u_int32_t);
545 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
546 unsigned int);
547 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
548 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
549 unsigned int, int);
550 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
551 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
552 static int freelist_populate(mbuf_class_t, unsigned int, int);
553 static boolean_t mbuf_cached_above(mbuf_class_t, int);
554 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
555 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
556 static int m_howmany(int, size_t);
557 static void mbuf_worker_thread(void);
558 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
559
560 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
561 size_t, unsigned int);
562 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
563 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
564 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
565 boolean_t);
566 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
567 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
568 static void mcl_audit_mcheck_panic(struct mbuf *);
569 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
570
571 static mcl_slab_t *slab_get(void *);
572 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
573 void *, void *, unsigned int, int, int);
574 static void slab_insert(mcl_slab_t *, mbuf_class_t);
575 static void slab_remove(mcl_slab_t *, mbuf_class_t);
576 static boolean_t slab_inrange(mcl_slab_t *, void *);
577 static void slab_nextptr_panic(mcl_slab_t *, void *);
578 static void slab_detach(mcl_slab_t *);
579 static boolean_t slab_is_detached(mcl_slab_t *);
580
581 /*
582 * This flag is set for all mbufs that come out of and into the composite
583 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
584 * are marked with such a flag have clusters attached to them, and will be
585 * treated differently when they are freed; instead of being placed back
586 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
587 * are placed back into the appropriate composite cache's freelist, and the
588 * actual freeing is deferred until the composite objects are purged. At
589 * such a time, this flag will be cleared from the mbufs and the objects
590 * will be freed into their own separate freelists.
591 */
592 #define EXTF_COMPOSITE 0x1
593
594 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
595 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
596 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
597 #define MBUF_IS_COMPOSITE(m) \
598 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
599
600 /*
601 * Macros used to verify the integrity of the mbuf.
602 */
603 #define _MCHECK(m) { \
604 if ((m)->m_type != MT_FREE) { \
605 if (mclaudit == NULL) \
606 panic("MCHECK: m_type=%d m=%p", \
607 (u_int16_t)(m)->m_type, m); \
608 else \
609 mcl_audit_mcheck_panic(m); \
610 } \
611 }
612
613 #define MBUF_IN_MAP(addr) \
614 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
615
616 #define MRANGE(addr) { \
617 if (!MBUF_IN_MAP(addr)) \
618 panic("MRANGE: address out of range 0x%p", addr); \
619 }
620
621 /*
622 * Macro version of mtod.
623 */
624 #define MTOD(m, t) ((t)((m)->m_data))
625
626 /*
627 * Macros to obtain cluster index and base cluster address.
628 */
629 #define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT)
630 #define CLTOM(x) ((union mcluster *)(mbutl + (x)))
631
632 /*
633 * Macro to find the mbuf index relative to the cluster base.
634 */
635 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8)
636
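/*
 * Illustrative sketch (not part of the original file): locating the
 * audit structure for a given mbuf address with the macros above, as
 * described in the implementation notes at the top of this file.  The
 * real lookup is performed by mcl_audit_buf2mca() further below; this
 * helper only walks through the index math and assumes auditing is
 * enabled (mclaudit != NULL).
 */
static inline mcache_audit_t *
mcl_audit_lookup_sketch(void *addr)
{
	int i = MTOCL(addr);		/* index of the owning 2K cluster */
	union mcluster *b = CLTOM(i);	/* base address of that cluster */
	int x = MCLIDX(b, addr);	/* mbuf slot within that cluster */

	return (mclaudit[i].cl_audit[x]);
}
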
637 /*
638 * Macros used during mbuf and cluster initialization.
639 */
640 #define MBUF_INIT(m, pkthdr, type) { \
641 _MCHECK(m); \
642 (m)->m_next = (m)->m_nextpkt = NULL; \
643 (m)->m_len = 0; \
644 (m)->m_type = type; \
645 if ((pkthdr) == 0) { \
646 (m)->m_data = (m)->m_dat; \
647 (m)->m_flags = 0; \
648 } else { \
649 (m)->m_data = (m)->m_pktdat; \
650 (m)->m_flags = M_PKTHDR; \
651 (m)->m_pkthdr.rcvif = NULL; \
652 (m)->m_pkthdr.len = 0; \
653 (m)->m_pkthdr.header = NULL; \
654 (m)->m_pkthdr.csum_flags = 0; \
655 (m)->m_pkthdr.csum_data = 0; \
656 (m)->m_pkthdr.reserved0 = NULL; \
657 (m)->m_pkthdr.vlan_tag = 0; \
658 (m)->m_pkthdr.socket_id = 0; \
659 m_tag_init(m); \
660 } \
661 }
662
663 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
664 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
665 (m)->m_flags |= M_EXT; \
666 (m)->m_ext.ext_size = (size); \
667 (m)->m_ext.ext_free = (free); \
668 (m)->m_ext.ext_arg = (arg); \
669 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
670 &(m)->m_ext.ext_refs; \
671 MEXT_RFA(m) = (rfa); \
672 MEXT_REF(m) = (ref); \
673 MEXT_FLAGS(m) = (flag); \
674 }
675
676 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
677 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
678
679 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
680 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
681
682 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
683 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
684
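/*
 * Illustrative sketch (not part of the original file): how MBUF_INIT
 * and MBUF_CL_INIT above combine to construct an mbuf with a 2K
 * cluster attached.  It assumes the caller has already obtained the
 * mbuf (still MT_FREE, as preserved by the MC_MBUF cache), the cluster
 * buffer and the ext_ref from their respective caches; the composite
 * cache constructor further below performs the equivalent steps.
 */
static inline void
mbuf_cl_construct_sketch(struct mbuf *m, void *buf, struct ext_ref *rfa)
{
	MBUF_INIT(m, 1, MT_DATA);		/* set up as a packet-header mbuf */
	MBUF_CL_INIT(m, buf, rfa, 1, 0);	/* attach the 2K cluster, refcnt 1 */
}
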
685 /*
686 * Macro to convert BSD malloc sleep flag to mcache's
687 */
688 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
689
690 /*
691 * The structure that holds all mbuf class statistics exportable via sysctl.
692 * Similar to mbstat structure, the mb_stat structure is protected by the
693 * global mbuf lock. It contains additional information about the classes
694 * that allows for a more accurate view of the state of the allocator.
695 */
696 struct mb_stat *mb_stat;
697
698 #define MB_STAT_SIZE(n) \
699 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
700
701 /*
702 * The legacy structure holding all of the mbuf allocation statistics.
703 * The actual statistics used by the kernel are stored in the mbuf_table
704 * instead, and are updated atomically while the global mbuf lock is held.
705 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
706 * Unlike before, the kernel no longer relies on the contents of mbstat for
707 * its operations (e.g. cluster expansion) because the structure is exposed
708 * to the outside and could possibly be modified, making it unsafe to rely on.
709 * With the exception of the mbstat.m_mtypes array (see below), all of the
710 * statistics are updated as they change.
711 */
712 struct mbstat mbstat;
713
714 #define MBSTAT_MTYPES_MAX \
715 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
716
717 /*
718 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
719 * atomically and stored in a per-CPU structure which is lock-free; this is
720 * done in order to avoid writing to the global mbstat data structure which
721 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
722 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
723 * array and returned to the application. Any updates for types greater than
724 * or equal to MT_MAX are done atomically on mbstat; this slows down
725 * performance but is okay since the kernel uses only up to MT_MAX-1 while
726 * anything beyond that (up to type 255) is considered a corner case.
727 */
728 typedef struct {
729 unsigned int cpu_mtypes[MT_MAX];
730 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
731
732 typedef struct {
733 mtypes_cpu_t mbs_cpu[1];
734 } mbuf_mtypes_t;
735
736 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
737
738 #define MBUF_MTYPES_SIZE(n) \
739 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
740
741 #define MTYPES_CPU(p) \
742 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
743
744 /* This should be in a header file */
745 #define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a))
746
747 #define mtype_stat_add(type, n) { \
748 if ((unsigned)(type) < MT_MAX) { \
749 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
750 atomic_add_32(&mbs->cpu_mtypes[type], n); \
751 } else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \
752 atomic_add_32(&mbstat.m_mtypes[type], n); \
753 } \
754 }
755
756 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
757 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
758 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
759
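/*
 * Usage sketch (illustrative, not part of the original file): a typical
 * mbuf type change updates the lock-free per-CPU counters through the
 * macros above, e.g. when a freshly allocated MT_FREE mbuf is handed
 * out as another type.  The allocation and free paths below do this
 * same kind of bookkeeping; mbuf_mtypes must have been set up by
 * mbinit() before these macros are used.
 */
static inline void
mtype_stat_retype_sketch(struct mbuf *m, short newtype)
{
	mtype_stat_dec(m->m_type);	/* one fewer object of the old type */
	mtype_stat_inc(newtype);	/* one more object of the new type */
	m->m_type = newtype;
}
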
760 static int
761 mbstat_sysctl SYSCTL_HANDLER_ARGS
762 {
763 #pragma unused(oidp, arg1, arg2)
764 int m, n;
765 mtypes_cpu_t mtc;
766
767 bzero(&mtc, sizeof (mtc));
768 for (m = 0; m < ncpu; m++) {
769 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
770 mtypes_cpu_t temp;
771
772 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
773 sizeof (temp.cpu_mtypes));
774
775 for (n = 0; n < MT_MAX; n++)
776 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
777 }
778 lck_mtx_lock(mbuf_mlock);
779 for (n = 0; n < MT_MAX; n++)
780 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
781 lck_mtx_unlock(mbuf_mlock);
782
783 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
784 }
785
786 static int
787 mb_stat_sysctl SYSCTL_HANDLER_ARGS
788 {
789 #pragma unused(oidp, arg1, arg2)
790 mcache_t *cp;
791 mcache_cpu_t *ccp;
792 mb_class_stat_t *sp;
793 int k, m, bktsize;
794
795 lck_mtx_lock(mbuf_mlock);
796 for (k = 0; k < NELEM(mbuf_table); k++) {
797 cp = m_cache(k);
798 ccp = &cp->mc_cpu[0];
799 bktsize = ccp->cc_bktsize;
800 sp = mbuf_table[k].mtbl_stats;
801
802 if (cp->mc_flags & MCF_NOCPUCACHE)
803 sp->mbcl_mc_state = MCS_DISABLED;
804 else if (cp->mc_purge_cnt > 0)
805 sp->mbcl_mc_state = MCS_PURGING;
806 else if (bktsize == 0)
807 sp->mbcl_mc_state = MCS_OFFLINE;
808 else
809 sp->mbcl_mc_state = MCS_ONLINE;
810
811 sp->mbcl_mc_cached = 0;
812 for (m = 0; m < ncpu; m++) {
813 ccp = &cp->mc_cpu[m];
814 if (ccp->cc_objs > 0)
815 sp->mbcl_mc_cached += ccp->cc_objs;
816 if (ccp->cc_pobjs > 0)
817 sp->mbcl_mc_cached += ccp->cc_pobjs;
818 }
819 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
820 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
821 sp->mbcl_infree;
822
823 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
824 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
825 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
826
827 /* Calculate total count specific to each class */
828 sp->mbcl_ctotal = sp->mbcl_total;
829 switch (m_class(k)) {
830 case MC_MBUF:
831 /* Deduct mbufs used in composite caches */
832 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
833 m_total(MC_MBUF_BIGCL));
834 break;
835
836 case MC_CL:
837 /* Deduct clusters used in composite cache and mbufs */
838 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
839 (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
840 break;
841
842 case MC_BIGCL:
843 /* Deduct clusters used in composite cache */
844 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
845 break;
846
847 case MC_16KCL:
848 /* Deduct clusters used in composite cache */
849 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
850 break;
851
852 default:
853 break;
854 }
855 }
856 lck_mtx_unlock(mbuf_mlock);
857
858 return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table))));
859 }
860
861 static inline void
862 m_incref(struct mbuf *m)
863 {
864 UInt32 old, new;
865 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
866
867 do {
868 old = *addr;
869 new = old + 1;
870 ASSERT(new != 0);
871 } while (!OSCompareAndSwap(old, new, addr));
872 }
873
874 static inline u_int32_t
875 m_decref(struct mbuf *m)
876 {
877 UInt32 old, new;
878 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
879
880 do {
881 old = *addr;
882 new = old - 1;
883 ASSERT(old != 0);
884 } while (!OSCompareAndSwap(old, new, addr));
885
886 return (new);
887 }
888
889 static void
890 mbuf_table_init(void)
891 {
892 int m;
893
894 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
895 M_TEMP, M_WAITOK | M_ZERO);
896 VERIFY(mb_stat != NULL);
897
898 mb_stat->mbs_cnt = NELEM(mbuf_table);
899 for (m = 0; m < NELEM(mbuf_table); m++)
900 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
901
902 #if CONFIG_MBUF_JUMBO
903 /*
904 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
905 * this only on platforms where the jumbo cluster pool is enabled.
906 */
907 njcl = nmbclusters / 3;
908 njclbytes = M16KCLBYTES;
909 #endif /* CONFIG_MBUF_JUMBO */
910
911 /*
912 * nclusters is going to be split in 2 to hold both the 2K
913 * and the 4K pools, so make sure each half is even.
914 */
915 nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
916 if (njcl > 0) {
917 /*
918 * Each jumbo cluster takes 8 2K clusters, so make
919 * sure that the pool size is evenly divisible by 8.
920 */
921 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
922 }
923
924 #if CONFIG_MBUF_NOEXPAND
925 /* Only use 4k clusters if we're setting aside more than 256k */
926 if (nmbclusters <= 128) {
927 maxmbufcl = nmbclusters / 4;
928 } else {
929 /* Half to big clusters, half to small */
930 maxmbufcl = (nmbclusters / 4) * 3;
931 }
932 #endif /* CONFIG_MBUF_NOEXPAND */
933
934 /*
935 * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th
936 * of the total number of 2K clusters allocated is reserved and cannot
937 * be turned into mbufs. It can only be used for pure cluster objects.
938 */
939 m_minlimit(MC_CL) = (nclusters >> 5);
940 m_maxlimit(MC_CL) = (nclusters >> 1);
941 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
942 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
943
944 /*
945 * The remaining (15/16th) can be turned into mbufs.
946 */
947 m_minlimit(MC_MBUF) = 0;
948 m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
949 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
950 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
951
952 /*
953 * The other 1/2 of the map is reserved for 4K clusters.
954 */
955 m_minlimit(MC_BIGCL) = 0;
956 m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
957 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
958 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
959
960 /*
961 * Set limits for the composite classes.
962 */
963 m_minlimit(MC_MBUF_CL) = 0;
964 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
965 m_maxsize(MC_MBUF_CL) = MCLBYTES;
966 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
967 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
968
969 m_minlimit(MC_MBUF_BIGCL) = 0;
970 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
971 m_maxsize(MC_MBUF_BIGCL) = NBPG;
972 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
973 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
974
975 /*
976 * And for jumbo classes.
977 */
978 m_minlimit(MC_16KCL) = 0;
979 m_maxlimit(MC_16KCL) = (njcl >> 3);
980 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
981 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
982
983 m_minlimit(MC_MBUF_16KCL) = 0;
984 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
985 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
986 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
987 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
988
989 /*
990 * Initialize the legacy mbstat structure.
991 */
992 bzero(&mbstat, sizeof (mbstat));
993 mbstat.m_msize = m_maxsize(MC_MBUF);
994 mbstat.m_mclbytes = m_maxsize(MC_CL);
995 mbstat.m_minclsize = MINCLSIZE;
996 mbstat.m_mlen = MLEN;
997 mbstat.m_mhlen = MHLEN;
998 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
999 }
1000
1001 __private_extern__ void
1002 mbinit(void)
1003 {
1004 unsigned int m;
1005 int initmcl = MINCL;
1006 int mcl_pages;
1007 void *buf;
1008
1009 if (nmbclusters == 0)
1010 nmbclusters = NMBCLUSTERS;
1011
1012 /* Setup the mbuf table */
1013 mbuf_table_init();
1014
1015 /* Global lock for common layer */
1016 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1017 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1018 mbuf_mlock_attr = lck_attr_alloc_init();
1019 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1020
1021 /* Allocate cluster slabs table */
1022 maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
1023 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1024 M_TEMP, M_WAITOK | M_ZERO);
1025 VERIFY(slabstbl != NULL);
1026
1027 /* Allocate audit structures if needed */
1028 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1029 mbuf_debug |= mcache_getflags();
1030 if (mbuf_debug & MCF_AUDIT) {
1031 MALLOC(mclaudit, mcl_audit_t *,
1032 nmbclusters * sizeof (*mclaudit), M_TEMP,
1033 M_WAITOK | M_ZERO);
1034 VERIFY(mclaudit != NULL);
1035
1036 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1037 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1038 VERIFY(mcl_audit_con_cache != NULL);
1039 }
1040
1041 /* Calculate the number of pages assigned to the cluster pool */
1042 mcl_pages = nmbclusters/(NBPG/CLBYTES);
1043 MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK);
1044 VERIFY(mcl_paddr != NULL);
1045
1046 /* Register with the I/O Bus mapper */
1047 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1048 bzero((char *)mcl_paddr, mcl_pages * sizeof (int));
1049
1050 embutl = (union mcluster *)
1051 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1052
1053 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1054
1055 lck_mtx_lock(mbuf_mlock);
1056
1057 if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
1058 panic("mbinit: m_clalloc failed\n");
1059
1060 lck_mtx_unlock(mbuf_mlock);
1061
1062 (void) kernel_thread(kernel_task, mbuf_worker_thread_init);
1063
1064 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1065 0, 0, MCR_SLEEP);
1066
1067 /* Create the cache for each class */
1068 for (m = 0; m < NELEM(mbuf_table); m++) {
1069 void *allocfunc, *freefunc, *auditfunc;
1070 u_int32_t flags;
1071
1072 flags = mbuf_debug;
1073 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1074 m_class(m) == MC_MBUF_16KCL) {
1075 allocfunc = mbuf_cslab_alloc;
1076 freefunc = mbuf_cslab_free;
1077 auditfunc = mbuf_cslab_audit;
1078 } else {
1079 allocfunc = mbuf_slab_alloc;
1080 freefunc = mbuf_slab_free;
1081 auditfunc = mbuf_slab_audit;
1082 }
1083
1084 /*
1085 * Disable per-CPU caches for jumbo classes if there
1086 * is no jumbo cluster pool available in the system.
1087 * The cache itself is still created (but will never
1088 * be populated) since it simplifies the code.
1089 */
1090 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1091 njcl == 0)
1092 flags |= MCF_NOCPUCACHE;
1093
1094 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1095 allocfunc, freefunc, auditfunc, mbuf_slab_notify,
1096 (void *)m, flags, MCR_SLEEP);
1097 }
1098
1099 /*
1100 * Allocate structure for per-CPU statistics that's aligned
1101 * on the CPU cache boundary; this code assumes that we never
1102 * uninitialize this framework, since the original address
1103 * before alignment is not saved.
1104 */
1105 ncpu = ml_get_max_cpus();
1106 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1107 M_TEMP, M_WAITOK);
1108 VERIFY(buf != NULL);
1109
1110 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1111 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1112
1113 printf("mbinit: done\n");
1114 }
1115
1116 /*
1117 * Obtain a slab of object(s) from the class's freelist.
1118 */
1119 static mcache_obj_t *
1120 slab_alloc(mbuf_class_t class, int wait)
1121 {
1122 mcl_slab_t *sp;
1123 mcache_obj_t *buf;
1124
1125 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1126
1127 VERIFY(class != MC_16KCL || njcl > 0);
1128
1129 /* This should always be NULL for us */
1130 VERIFY(m_cobjlist(class) == NULL);
1131
1132 /*
1133 * Treat composite objects as having a longer lifespan by using
1134 * a slab from the reverse direction, in the hope that this could
1135 * reduce the probability of fragmentation for slabs that hold
1136 * more than one buffer chunk (e.g. mbuf slabs). For other
1137 * slabs, this probably doesn't make much of a difference.
1138 */
1139 if (class == MC_MBUF && (wait & MCR_COMP))
1140 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1141 else
1142 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1143
1144 if (sp == NULL) {
1145 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1146 /* The slab list for this class is empty */
1147 return (NULL);
1148 }
1149
1150 VERIFY(m_infree(class) > 0);
1151 VERIFY(!slab_is_detached(sp));
1152 VERIFY(sp->sl_class == class &&
1153 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1154 buf = sp->sl_head;
1155 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1156
1157 if (class == MC_MBUF) {
1158 sp->sl_head = buf->obj_next;
1159 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
1160 } else {
1161 sp->sl_head = NULL;
1162 }
1163 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1164 slab_nextptr_panic(sp, sp->sl_head);
1165 /* In case sl_head is in the map but not in the slab */
1166 VERIFY(slab_inrange(sp, sp->sl_head));
1167 /* NOTREACHED */
1168 }
1169
1170 /* Increment slab reference */
1171 sp->sl_refcnt++;
1172
1173 if (mclaudit != NULL) {
1174 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1175 mca->mca_uflags = 0;
1176 /* Save contents on mbuf objects only */
1177 if (class == MC_MBUF)
1178 mca->mca_uflags |= MB_SCVALID;
1179 }
1180
1181 if (class == MC_CL) {
1182 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1183 /*
1184 * A 2K cluster slab can have at most 1 reference.
1185 */
1186 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1187 sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
1188 } else if (class == MC_BIGCL) {
1189 mcl_slab_t *nsp = sp->sl_next;
1190 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1191 m_infree(MC_MBUF_BIGCL);
1192 /*
1193 * Increment 2nd slab. A 4K big cluster takes
1194 * 2 slabs, each having at most 1 reference.
1195 */
1196 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1197 sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
1198 /* Next slab must already be present */
1199 VERIFY(nsp != NULL);
1200 nsp->sl_refcnt++;
1201 VERIFY(!slab_is_detached(nsp));
1202 VERIFY(nsp->sl_class == MC_BIGCL &&
1203 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1204 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1205 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1206 nsp->sl_head == NULL);
1207 } else if (class == MC_16KCL) {
1208 mcl_slab_t *nsp;
1209 int k;
1210
1211 --m_infree(MC_16KCL);
1212 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1213 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1214 /*
1215 * Increment 2nd-8th slab. A 16K big cluster takes
1216 * 8 cluster slabs, each having at most 1 reference.
1217 */
1218 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1219 nsp = nsp->sl_next;
1220 /* Next slab must already be present */
1221 VERIFY(nsp != NULL);
1222 nsp->sl_refcnt++;
1223 VERIFY(!slab_is_detached(nsp));
1224 VERIFY(nsp->sl_class == MC_16KCL &&
1225 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1226 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1227 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1228 nsp->sl_head == NULL);
1229 }
1230 } else {
1231 ASSERT(class == MC_MBUF);
1232 --m_infree(MC_MBUF);
1233 /*
1234 * If auditing is turned on, this check is
1235 * deferred until later in mbuf_slab_audit().
1236 */
1237 if (mclaudit == NULL)
1238 _MCHECK((struct mbuf *)buf);
1239 /*
1240 * Since we have incremented the reference count above,
1241 * an mbuf slab (formerly a 2K cluster slab that was cut
1242 * up into mbufs) must have a reference count between 1
1243 * and NMBPCL at this point.
1244 */
1245 VERIFY(sp->sl_refcnt >= 1 &&
1246 (unsigned short)sp->sl_refcnt <= NMBPCL &&
1247 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1248 VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
1249 sp->sl_head == NULL);
1250 }
1251
1252 /* If empty, remove this slab from the class's freelist */
1253 if (sp->sl_head == NULL) {
1254 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
1255 slab_remove(sp, class);
1256 }
1257
1258 return (buf);
1259 }
1260
1261 /*
1262 * Place a slab of object(s) back into a class's slab list.
1263 */
1264 static void
1265 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1266 {
1267 mcl_slab_t *sp;
1268
1269 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1270
1271 VERIFY(class != MC_16KCL || njcl > 0);
1272 VERIFY(buf->obj_next == NULL);
1273 sp = slab_get(buf);
1274 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1275 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1276
1277 /* Decrement slab reference */
1278 sp->sl_refcnt--;
1279
1280 if (class == MC_CL || class == MC_BIGCL) {
1281 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1282 /*
1283 * A 2K cluster slab can have at most 1 reference
1284 * which must be 0 at this point.
1285 */
1286 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1287 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1288 VERIFY(slab_is_detached(sp));
1289 if (class == MC_BIGCL) {
1290 mcl_slab_t *nsp = sp->sl_next;
1291 VERIFY(IS_P2ALIGNED(buf, NBPG));
1292 /* Next slab must already be present */
1293 VERIFY(nsp != NULL);
1294 /* Decrement 2nd slab reference */
1295 nsp->sl_refcnt--;
1296 /*
1297 * A 4K big cluster takes 2 slabs, both
1298 * must now have 0 reference.
1299 */
1300 VERIFY(slab_is_detached(nsp));
1301 VERIFY(nsp->sl_class == MC_BIGCL &&
1302 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1303 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1304 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1305 nsp->sl_head == NULL);
1306 }
1307 } else if (class == MC_16KCL) {
1308 mcl_slab_t *nsp;
1309 int k;
1310 /*
1311 * A 16K cluster takes 8 cluster slabs, all must
1312 * now have 0 reference.
1313 */
1314 VERIFY(IS_P2ALIGNED(buf, NBPG));
1315 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1316 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1317 VERIFY(slab_is_detached(sp));
1318 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1319 nsp = nsp->sl_next;
1320 /* Next slab must already be present */
1321 VERIFY(nsp != NULL);
1322 nsp->sl_refcnt--;
1323 VERIFY(slab_is_detached(nsp));
1324 VERIFY(nsp->sl_class == MC_16KCL &&
1325 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1326 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1327 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1328 nsp->sl_head == NULL);
1329 }
1330 } else {
1331 /*
1332 * An mbuf slab has a total of NMBPCL reference counts.
1333 * Since we have decremented the reference above, it
1334 * must now be between 0 and NMBPCL-1.
1335 */
1336 VERIFY(sp->sl_refcnt >= 0 &&
1337 (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
1338 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1339 VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
1340 (slab_is_detached(sp) && sp->sl_head == NULL));
1341 }
1342
1343 /*
1344 * When auditing is enabled, ensure that the buffer still
1345 * contains the free pattern. Otherwise it got corrupted
1346 * while at the CPU cache layer.
1347 */
1348 if (mclaudit != NULL) {
1349 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1350 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1351 mca->mca_uflags &= ~MB_SCVALID;
1352 }
1353
1354 if (class == MC_CL) {
1355 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1356 } else if (class == MC_BIGCL) {
1357 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1358 m_infree(MC_MBUF_BIGCL);
1359 } else if (class == MC_16KCL) {
1360 ++m_infree(MC_16KCL);
1361 } else {
1362 ++m_infree(MC_MBUF);
1363 buf->obj_next = sp->sl_head;
1364 }
1365 sp->sl_head = buf;
1366
1367 /* All mbufs are freed; return the cluster that we stole earlier */
1368 if (sp->sl_refcnt == 0 && class == MC_MBUF) {
1369 int i = NMBPCL;
1370
1371 m_total(MC_MBUF) -= NMBPCL;
1372 mbstat.m_mbufs = m_total(MC_MBUF);
1373 m_infree(MC_MBUF) -= NMBPCL;
1374 mtype_stat_add(MT_FREE, -NMBPCL);
1375
1376 while (i--) {
1377 struct mbuf *m = sp->sl_head;
1378 VERIFY(m != NULL);
1379 sp->sl_head = m->m_next;
1380 m->m_next = NULL;
1381 }
1382 VERIFY(sp->sl_head == NULL);
1383
1384 /* Remove the slab from the mbuf class's slab list */
1385 slab_remove(sp, class);
1386
1387 /* Reinitialize it as a 2K cluster slab */
1388 slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
1389 sp->sl_len, 0, 1);
1390
1391 if (mclaudit != NULL)
1392 mcache_set_pattern(MCACHE_FREE_PATTERN,
1393 (caddr_t)sp->sl_head, m_maxsize(MC_CL));
1394
1395 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1396
1397 VERIFY(slab_is_detached(sp));
1398 /* And finally switch class */
1399 class = MC_CL;
1400 }
1401
1402 /* Reinsert the slab to the class's slab list */
1403 if (slab_is_detached(sp))
1404 slab_insert(sp, class);
1405 }
1406
1407 /*
1408 * Common allocator for rudimentary objects called by the CPU cache layer
1409 * during an allocation request whenever there is no available element in the
1410 * bucket layer. It returns one or more elements from the appropriate global
1411 * freelist. If the freelist is empty, it will attempt to populate it and
1412 * retry the allocation.
1413 */
1414 static unsigned int
1415 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1416 {
1417 mbuf_class_t class = (mbuf_class_t)arg;
1418 unsigned int need = num;
1419 mcache_obj_t **list = *plist;
1420
1421 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1422 ASSERT(need > 0);
1423
1424 lck_mtx_lock(mbuf_mlock);
1425
1426 for (;;) {
1427 if ((*list = slab_alloc(class, wait)) != NULL) {
1428 (*list)->obj_next = NULL;
1429 list = *plist = &(*list)->obj_next;
1430
1431 if (--need == 0) {
1432 /*
1433 * If the number of elements in freelist has
1434 * dropped below low watermark, asynchronously
1435 * populate the freelist now rather than doing
1436 * it later when we run out of elements.
1437 */
1438 if (!mbuf_cached_above(class, wait) &&
1439 m_infree(class) < m_total(class) >> 5) {
1440 (void) freelist_populate(class, 1,
1441 M_DONTWAIT);
1442 }
1443 break;
1444 }
1445 } else {
1446 VERIFY(m_infree(class) == 0 || class == MC_CL);
1447
1448 (void) freelist_populate(class, 1,
1449 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1450
1451 if (m_infree(class) > 0)
1452 continue;
1453
1454 /* Check if there's anything at the cache layer */
1455 if (mbuf_cached_above(class, wait))
1456 break;
1457
1458 /* We have nothing and cannot block; give up */
1459 if (wait & MCR_NOSLEEP) {
1460 if (!(wait & MCR_TRYHARD)) {
1461 m_fail_cnt(class)++;
1462 mbstat.m_drops++;
1463 break;
1464 }
1465 }
1466
1467 /*
1468 * If the freelist is still empty and the caller is
1469 * willing to be blocked, sleep on the wait channel
1470 * until an element is available. Otherwise, if
1471 * MCR_TRYHARD is set, do our best to satisfy the
1472 * request without having to go to sleep.
1473 */
1474 if (mbuf_worker_ready &&
1475 mbuf_sleep(class, need, wait))
1476 break;
1477
1478 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1479 }
1480 }
1481
1482 m_alloc_cnt(class) += num - need;
1483 lck_mtx_unlock(mbuf_mlock);
1484
1485 return (num - need);
1486 }
1487
1488 /*
1489 * Common de-allocator for rudimentary objects called by the CPU cache
1490 * layer when one or more elements need to be returned to the appropriate
1491 * global freelist.
1492 */
1493 static void
1494 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1495 {
1496 mbuf_class_t class = (mbuf_class_t)arg;
1497 mcache_obj_t *nlist;
1498 unsigned int num = 0;
1499 int w;
1500
1501 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1502
1503 lck_mtx_lock(mbuf_mlock);
1504
1505 for (;;) {
1506 nlist = list->obj_next;
1507 list->obj_next = NULL;
1508 slab_free(class, list);
1509 ++num;
1510 if ((list = nlist) == NULL)
1511 break;
1512 }
1513 m_free_cnt(class) += num;
1514
1515 if ((w = mb_waiters) > 0)
1516 mb_waiters = 0;
1517
1518 lck_mtx_unlock(mbuf_mlock);
1519
1520 if (w != 0)
1521 wakeup(mb_waitchan);
1522 }
1523
1524 /*
1525 * Common auditor for rudimentary objects called by the CPU cache layer
1526 * during an allocation or free request. For the former, this is called
1527 * after the objects are obtained from either the bucket or slab layer
1528 * and before they are returned to the caller. For the latter, this is
1529 * called immediately during free and before placing the objects into
1530 * the bucket or slab layer.
1531 */
1532 static void
1533 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1534 {
1535 mbuf_class_t class = (mbuf_class_t)arg;
1536 mcache_audit_t *mca;
1537
1538 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1539
1540 while (list != NULL) {
1541 lck_mtx_lock(mbuf_mlock);
1542 mca = mcl_audit_buf2mca(class, list);
1543
1544 /* Do the sanity checks */
1545 if (class == MC_MBUF) {
1546 mcl_audit_mbuf(mca, list, FALSE, alloc);
1547 ASSERT(mca->mca_uflags & MB_SCVALID);
1548 } else {
1549 mcl_audit_cluster(mca, list, m_maxsize(class),
1550 alloc, TRUE);
1551 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1552 }
1553 /* Record this transaction */
1554 mcache_buffer_log(mca, list, m_cache(class));
1555 if (alloc)
1556 mca->mca_uflags |= MB_INUSE;
1557 else
1558 mca->mca_uflags &= ~MB_INUSE;
1559 /* Unpair the object (unconditionally) */
1560 mca->mca_uptr = NULL;
1561 lck_mtx_unlock(mbuf_mlock);
1562
1563 list = list->obj_next;
1564 }
1565 }
1566
1567 /*
1568 * Common notify routine for all caches. It is called by mcache when
1569 * one or more objects get freed. We use this indication to trigger
1570 * the wakeup of any sleeping threads so that they can retry their
1571 * allocation requests.
1572 */
1573 static void
1574 mbuf_slab_notify(void *arg, u_int32_t reason)
1575 {
1576 mbuf_class_t class = (mbuf_class_t)arg;
1577 int w;
1578
1579 ASSERT(MBUF_CLASS_VALID(class));
1580
1581 if (reason != MCN_RETRYALLOC)
1582 return;
1583
1584 lck_mtx_lock(mbuf_mlock);
1585 if ((w = mb_waiters) > 0) {
1586 m_notified(class)++;
1587 mb_waiters = 0;
1588 }
1589 lck_mtx_unlock(mbuf_mlock);
1590
1591 if (w != 0)
1592 wakeup(mb_waitchan);
1593 }
1594
1595 /*
1596 * Obtain object(s) from the composite class's freelist.
1597 */
1598 static unsigned int
1599 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1600 {
1601 unsigned int need = num;
1602 mcl_slab_t *sp, *clsp, *nsp;
1603 struct mbuf *m;
1604 mcache_obj_t **list = *plist;
1605 void *cl;
1606
1607 VERIFY(need > 0);
1608 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1609 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1610
1611 /* Get what we can from the freelist */
1612 while ((*list = m_cobjlist(class)) != NULL) {
1613 MRANGE(*list);
1614
1615 m = (struct mbuf *)*list;
1616 sp = slab_get(m);
1617 cl = m->m_ext.ext_buf;
1618 clsp = slab_get(cl);
1619 VERIFY(m->m_flags == M_EXT && cl != NULL);
1620 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1621 VERIFY(clsp->sl_refcnt == 1);
1622 if (class == MC_MBUF_BIGCL) {
1623 nsp = clsp->sl_next;
1624 /* Next slab must already be present */
1625 VERIFY(nsp != NULL);
1626 VERIFY(nsp->sl_refcnt == 1);
1627 } else if (class == MC_MBUF_16KCL) {
1628 int k;
1629 for (nsp = clsp, k = 1;
1630 k < (M16KCLBYTES / MCLBYTES); k++) {
1631 nsp = nsp->sl_next;
1632 /* Next slab must already be present */
1633 VERIFY(nsp != NULL);
1634 VERIFY(nsp->sl_refcnt == 1);
1635 }
1636 }
1637
1638 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1639 !MBUF_IN_MAP(m_cobjlist(class))) {
1640 slab_nextptr_panic(sp, m_cobjlist(class));
1641 /* NOTREACHED */
1642 }
1643 (*list)->obj_next = NULL;
1644 list = *plist = &(*list)->obj_next;
1645
1646 if (--need == 0)
1647 break;
1648 }
1649 m_infree(class) -= (num - need);
1650
1651 return (num - need);
1652 }
1653
1654 /*
1655 * Place object(s) back into a composite class's freelist.
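 * When the objects are being purged, they are instead torn apart: the mbuf
 * and its cluster go back to their rudimentary slab freelists, and the
 * ext_ref structures are returned to ref_cache.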
1656 */
1657 static unsigned int
1658 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
1659 {
1660 mcache_obj_t *o, *tail;
1661 unsigned int num = 0;
1662 struct mbuf *m, *ms;
1663 mcache_audit_t *mca = NULL;
1664 mcache_obj_t *ref_list = NULL;
1665 mcl_slab_t *clsp, *nsp;
1666 void *cl;
1667
1668 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1669 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1670 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1671
1672 o = tail = list;
1673
1674 while ((m = ms = (struct mbuf *)o) != NULL) {
1675 mcache_obj_t *rfa, *nexto = o->obj_next;
1676
1677 /* Do the mbuf sanity checks */
1678 if (mclaudit != NULL) {
1679 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1680 mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
1681 ms = (struct mbuf *)mca->mca_contents;
1682 }
1683
1684 /* Do the cluster sanity checks */
1685 cl = ms->m_ext.ext_buf;
1686 clsp = slab_get(cl);
1687 if (mclaudit != NULL) {
1688 size_t size;
1689 if (class == MC_MBUF_CL)
1690 size = m_maxsize(MC_CL);
1691 else if (class == MC_MBUF_BIGCL)
1692 size = m_maxsize(MC_BIGCL);
1693 else
1694 size = m_maxsize(MC_16KCL);
1695 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
1696 (mcache_obj_t *)cl), cl, 0, size);
1697 }
1698 VERIFY(ms->m_type == MT_FREE);
1699 VERIFY(ms->m_flags == M_EXT);
1700 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1701 VERIFY(clsp->sl_refcnt == 1);
1702 if (class == MC_MBUF_BIGCL) {
1703 nsp = clsp->sl_next;
1704 /* Next slab must already be present */
1705 VERIFY(nsp != NULL);
1706 VERIFY(nsp->sl_refcnt == 1);
1707 } else if (class == MC_MBUF_16KCL) {
1708 int k;
1709 for (nsp = clsp, k = 1;
1710 k < (M16KCLBYTES / MCLBYTES); k++) {
1711 nsp = nsp->sl_next;
1712 /* Next slab must already be present */
1713 VERIFY(nsp != NULL);
1714 VERIFY(nsp->sl_refcnt == 1);
1715 }
1716 }
1717
1718 /*
1719 * If we're asked to purge, restore the actual mbuf using the
1720 * contents of the shadow structure (if auditing is enabled)
1721 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
1722 * about to free it and the attached cluster into their caches.
1723 */
1724 if (purged) {
1725 /* Restore constructed mbuf fields */
1726 if (mclaudit != NULL)
1727 mcl_audit_restore_mbuf(m, mca, TRUE);
1728
1729 MEXT_REF(m) = 0;
1730 MEXT_FLAGS(m) = 0;
1731
1732 rfa = (mcache_obj_t *)MEXT_RFA(m);
1733 rfa->obj_next = ref_list;
1734 ref_list = rfa;
1735 MEXT_RFA(m) = NULL;
1736
1737 m->m_type = MT_FREE;
1738 m->m_flags = m->m_len = 0;
1739 m->m_next = m->m_nextpkt = NULL;
1740
1741 /* Save mbuf fields and make auditing happy */
1742 if (mclaudit != NULL)
1743 mcl_audit_mbuf(mca, o, FALSE, FALSE);
1744
1745 VERIFY(m_total(class) > 0);
1746 m_total(class)--;
1747
1748 /* Free the mbuf */
1749 o->obj_next = NULL;
1750 slab_free(MC_MBUF, o);
1751
1752 /* And free the cluster */
1753 ((mcache_obj_t *)cl)->obj_next = NULL;
1754 if (class == MC_MBUF_CL)
1755 slab_free(MC_CL, cl);
1756 else if (class == MC_MBUF_BIGCL)
1757 slab_free(MC_BIGCL, cl);
1758 else
1759 slab_free(MC_16KCL, cl);
1760 }
1761
1762 ++num;
1763 tail = o;
1764 o = nexto;
1765 }
1766
1767 if (!purged) {
1768 tail->obj_next = m_cobjlist(class);
1769 m_cobjlist(class) = list;
1770 m_infree(class) += num;
1771 } else if (ref_list != NULL) {
1772 mcache_free_ext(ref_cache, ref_list);
1773 }
1774
1775 return (num);
1776 }
1777
1778 /*
1779 * Common allocator for composite objects called by the CPU cache layer
1780 * during an allocation request whenever there is no available element in
1781 * the bucket layer. It returns one or more composite elements from the
1782 * appropriate global freelist. If the freelist is empty, it will attempt
1783 * to obtain the rudimentary objects from their caches and construct them
1784 * into composite mbuf + cluster objects.
1785 */
1786 static unsigned int
1787 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
1788 int wait)
1789 {
1790 mbuf_class_t class = (mbuf_class_t)arg;
1791 mcache_t *cp = NULL;
1792 unsigned int num = 0, cnum = 0, want = needed;
1793 mcache_obj_t *ref_list = NULL;
1794 mcache_obj_t *mp_list = NULL;
1795 mcache_obj_t *clp_list = NULL;
1796 mcache_obj_t **list;
1797 struct ext_ref *rfa;
1798 struct mbuf *m;
1799 void *cl;
1800
1801 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1802 ASSERT(needed > 0);
1803
1804 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1805
1806 /* There should not be any slab for this class */
1807 VERIFY(m_slab_cnt(class) == 0 &&
1808 m_slablist(class).tqh_first == NULL &&
1809 m_slablist(class).tqh_last == NULL);
1810
1811 lck_mtx_lock(mbuf_mlock);
1812
1813 /* Try using the freelist first */
1814 num = cslab_alloc(class, plist, needed);
1815 list = *plist;
1816 if (num == needed) {
1817 m_alloc_cnt(class) += num;
1818 lck_mtx_unlock(mbuf_mlock);
1819 return (needed);
1820 }
1821
1822 lck_mtx_unlock(mbuf_mlock);
1823
1824 /*
1825 * We could not satisfy the request using the freelist alone;
1826 * allocate from the appropriate rudimentary caches and use
1827 * whatever we can get to construct the composite objects.
1828 */
1829 needed -= num;
1830
1831 /*
1832 * Mark these allocation requests as coming from a composite cache.
1833 * Also, if the caller is willing to be blocked, mark the request
1834 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1835 * slab layer waiting for the individual object when one or more
1836 * of the already-constructed composite objects are available.
1837 */
1838 wait |= MCR_COMP;
1839 if (!(wait & MCR_NOSLEEP))
1840 wait |= MCR_FAILOK;
1841
1842 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
1843 if (needed == 0) {
1844 ASSERT(mp_list == NULL);
1845 goto fail;
1846 }
1847 if (class == MC_MBUF_CL)
1848 cp = m_cache(MC_CL);
1849 else if (class == MC_MBUF_BIGCL)
1850 cp = m_cache(MC_BIGCL);
1851 else
1852 cp = m_cache(MC_16KCL);
1853 needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
1854 if (needed == 0) {
1855 ASSERT(clp_list == NULL);
1856 goto fail;
1857 }
1858 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
1859 if (needed == 0) {
1860 ASSERT(ref_list == NULL);
1861 goto fail;
1862 }
1863
1864 /*
1865 * By this time "needed" is MIN(mbuf, cluster, ref). Any left
1866 * overs will get freed accordingly before we return to caller.
1867 */
1868 for (cnum = 0; cnum < needed; cnum++) {
1869 struct mbuf *ms;
1870
1871 m = ms = (struct mbuf *)mp_list;
1872 mp_list = mp_list->obj_next;
1873
1874 cl = clp_list;
1875 clp_list = clp_list->obj_next;
1876 ((mcache_obj_t *)cl)->obj_next = NULL;
1877
1878 rfa = (struct ext_ref *)ref_list;
1879 ref_list = ref_list->obj_next;
1880 ((mcache_obj_t *)rfa)->obj_next = NULL;
1881
1882 /*
1883 * If auditing is enabled, construct the shadow mbuf
1884 * in the audit structure instead of in the actual one.
1885 * mbuf_cslab_audit() will take care of restoring the
1886 * contents after the integrity check.
1887 */
1888 if (mclaudit != NULL) {
1889 mcache_audit_t *mca, *cl_mca;
1890 size_t size;
1891
1892 lck_mtx_lock(mbuf_mlock);
1893 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1894 ms = ((struct mbuf *)mca->mca_contents);
1895 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
1896
1897 /*
1898 * Pair them up. Note that this is done at the time
1899 * the mbuf+cluster objects are constructed. This
1900 * information should be treated as a "best effort"
1901 * debugging hint, since more than one mbuf can refer
1902 * to a cluster. In that case, the cluster might not
1903 * be freed along with the mbuf it was paired with.
1904 */
1905 mca->mca_uptr = cl_mca;
1906 cl_mca->mca_uptr = mca;
1907
1908 ASSERT(mca->mca_uflags & MB_SCVALID);
1909 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
1910 lck_mtx_unlock(mbuf_mlock);
1911
1912 /* Technically, they are in the freelist */
1913 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
1914 m_maxsize(MC_MBUF));
1915 if (class == MC_MBUF_CL)
1916 size = m_maxsize(MC_CL);
1917 else if (class == MC_MBUF_BIGCL)
1918 size = m_maxsize(MC_BIGCL);
1919 else
1920 size = m_maxsize(MC_16KCL);
1921 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
1922 }
1923
1924 MBUF_INIT(ms, 0, MT_FREE);
1925 if (class == MC_MBUF_16KCL) {
1926 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1927 } else if (class == MC_MBUF_BIGCL) {
1928 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1929 } else {
1930 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1931 }
1932 VERIFY(ms->m_flags == M_EXT);
1933 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1934
1935 *list = (mcache_obj_t *)m;
1936 (*list)->obj_next = NULL;
1937 list = *plist = &(*list)->obj_next;
1938 }
1939
1940 fail:
1941 /*
1942 * Free up what's left of the above.
1943 */
1944 if (mp_list != NULL)
1945 mcache_free_ext(m_cache(MC_MBUF), mp_list);
1946 if (clp_list != NULL)
1947 mcache_free_ext(cp, clp_list);
1948 if (ref_list != NULL)
1949 mcache_free_ext(ref_cache, ref_list);
1950
1951 lck_mtx_lock(mbuf_mlock);
1952 if (num > 0 || cnum > 0) {
1953 m_total(class) += cnum;
1954 VERIFY(m_total(class) <= m_maxlimit(class));
1955 m_alloc_cnt(class) += num + cnum;
1956 }
1957 if ((num + cnum) < want)
1958 m_fail_cnt(class) += (want - (num + cnum));
1959 lck_mtx_unlock(mbuf_mlock);
1960
1961 return (num + cnum);
1962 }
1963
1964 /*
1965 * Common de-allocator for composite objects called by the CPU cache
1966 * layer when one or more elements need to be returned to the appropriate
1967 * global freelist.
1968 */
1969 static void
1970 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
1971 {
1972 mbuf_class_t class = (mbuf_class_t)arg;
1973 unsigned int num;
1974 int w;
1975
1976 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1977
1978 lck_mtx_lock(mbuf_mlock);
1979
1980 num = cslab_free(class, list, purged);
1981 m_free_cnt(class) += num;
1982
1983 if ((w = mb_waiters) > 0)
1984 mb_waiters = 0;
1985
1986 lck_mtx_unlock(mbuf_mlock);
1987
1988 if (w != 0)
1989 wakeup(mb_waitchan);
1990 }
1991
1992 /*
1993 * Common auditor for composite objects called by the CPU cache layer
1994 * during an allocation or free request. For the former, this is called
1995 * after the objects are obtained from either the bucket or slab layer
1996 * and before they are returned to the caller. For the latter, this is
1997 * called immediately during free and before placing the objects into
1998 * the bucket or slab layer.
1999 */
2000 static void
2001 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2002 {
2003 mbuf_class_t class = (mbuf_class_t)arg;
2004 mcache_audit_t *mca;
2005 struct mbuf *m, *ms;
2006 mcl_slab_t *clsp, *nsp;
2007 size_t size;
2008 void *cl;
2009
2010 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2011
2012 while ((m = ms = (struct mbuf *)list) != NULL) {
2013 lck_mtx_lock(mbuf_mlock);
2014 /* Do the mbuf sanity checks and record its transaction */
2015 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2016 mcl_audit_mbuf(mca, m, TRUE, alloc);
2017 mcache_buffer_log(mca, m, m_cache(class));
2018 if (alloc)
2019 mca->mca_uflags |= MB_COMP_INUSE;
2020 else
2021 mca->mca_uflags &= ~MB_COMP_INUSE;
2022
2023 /*
2024 * Use the shadow mbuf in the audit structure if we are
2025 * freeing, since the contents of the actual mbuf have been
2026 * pattern-filled by the above call to mcl_audit_mbuf().
2027 */
2028 if (!alloc)
2029 ms = (struct mbuf *)mca->mca_contents;
2030
2031 /* Do the cluster sanity checks and record its transaction */
2032 cl = ms->m_ext.ext_buf;
2033 clsp = slab_get(cl);
2034 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2035 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2036 VERIFY(clsp->sl_refcnt == 1);
2037 if (class == MC_MBUF_BIGCL) {
2038 nsp = clsp->sl_next;
2039 /* Next slab must already be present */
2040 VERIFY(nsp != NULL);
2041 VERIFY(nsp->sl_refcnt == 1);
2042 } else if (class == MC_MBUF_16KCL) {
2043 int k;
2044 for (nsp = clsp, k = 1;
2045 k < (M16KCLBYTES / MCLBYTES); k++) {
2046 nsp = nsp->sl_next;
2047 /* Next slab must already be present */
2048 VERIFY(nsp != NULL);
2049 VERIFY(nsp->sl_refcnt == 1);
2050 }
2051 }
2052
2053 mca = mcl_audit_buf2mca(MC_CL, cl);
2054 if (class == MC_MBUF_CL)
2055 size = m_maxsize(MC_CL);
2056 else if (class == MC_MBUF_BIGCL)
2057 size = m_maxsize(MC_BIGCL);
2058 else
2059 size = m_maxsize(MC_16KCL);
2060 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2061 mcache_buffer_log(mca, cl, m_cache(class));
2062 if (alloc)
2063 mca->mca_uflags |= MB_COMP_INUSE;
2064 else
2065 mca->mca_uflags &= ~MB_COMP_INUSE;
2066 lck_mtx_unlock(mbuf_mlock);
2067
2068 list = list->obj_next;
2069 }
2070 }
2071
2072 /*
2073 * Allocate some number of mbuf clusters and place them on the cluster freelist.
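 * In the blocking case this returns the number of buffers of the requested
 * class that were added; otherwise (no growth needed, or a non-blocking
 * caller) it returns 1 if at least num buffers are already free and 0 if not.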
2074 */
2075 static int
2076 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2077 {
2078 int i;
2079 vm_size_t size = 0;
2080 int numpages = 0;
2081 vm_offset_t page = 0;
2082 mcache_audit_t *mca_list = NULL;
2083 mcache_obj_t *con_list = NULL;
2084 mcl_slab_t *sp;
2085
2086 VERIFY(bufsize == m_maxsize(MC_CL) ||
2087 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2088
2089 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2090
2091 /*
2092 * Multiple threads may attempt to populate the cluster map one
2093 * after another. Since we drop the lock below prior to acquiring
2094 * the physical page(s), our view of the cluster map may no longer
2095 * be accurate, and we could end up over-committing the pages beyond
2096 * the maximum allowed for each class. To prevent it, this entire
2097 * operation (including the page mapping) is serialized.
2098 */
2099 while (mb_clalloc_busy) {
2100 mb_clalloc_waiters++;
2101 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2102 (PZERO-1), "m_clalloc", NULL);
2103 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2104 }
2105
2106 /* We are busy now; tell everyone else to go away */
2107 mb_clalloc_busy = TRUE;
2108
2109 /*
2110 * Honor the caller's wish to block or not block. We have a way
2111 * to grow the pool asynchronously using the mbuf worker thread.
2112 */
2113 i = m_howmany(num, bufsize);
2114 if (i == 0 || (wait & M_DONTWAIT))
2115 goto out;
2116
2117 lck_mtx_unlock(mbuf_mlock);
2118
2119 size = round_page_32(i * bufsize);
2120 page = kmem_mb_alloc(mb_map, size);
2121
2122 if (page == 0) {
2123 if (bufsize <= m_maxsize(MC_BIGCL)) {
2124 /* Try for 1 page if failed, only for 2KB/4KB request */
2125 size = NBPG;
2126 page = kmem_mb_alloc(mb_map, size);
2127 }
2128
2129 if (page == 0) {
2130 lck_mtx_lock(mbuf_mlock);
2131 goto out;
2132 }
2133 }
2134
2135 VERIFY(IS_P2ALIGNED(page, NBPG));
2136 numpages = size / NBPG;
2137
2138 /* If auditing is enabled, allocate the audit structures now */
2139 if (mclaudit != NULL) {
2140 int needed;
2141
2142 /*
2143 * Yes, I realize this is a waste of memory for clusters
2144 * that never get transformed into mbufs, as we may end
2145 * up with NMBPCL-1 unused audit structures per cluster.
2146 * But doing so tremendously simplifies the allocation
2147 * strategy, since at this point we are not holding the
2148 * mbuf lock and the caller is okay to be blocked. For
2149 * the case of big clusters, we allocate one structure
2150 * for each as we never turn them into mbufs.
2151 */
2152 if (bufsize == m_maxsize(MC_CL)) {
2153 needed = numpages * 2 * NMBPCL;
2154
2155 i = mcache_alloc_ext(mcl_audit_con_cache,
2156 &con_list, needed, MCR_SLEEP);
2157
2158 VERIFY(con_list != NULL && i == needed);
2159 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2160 needed = numpages;
2161 } else {
2162 needed = numpages / (M16KCLBYTES / NBPG);
2163 }
2164
2165 i = mcache_alloc_ext(mcache_audit_cache,
2166 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2167
2168 VERIFY(mca_list != NULL && i == needed);
2169 }
2170
2171 lck_mtx_lock(mbuf_mlock);
2172
2173 for (i = 0; i < numpages; i++, page += NBPG) {
2174 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2175 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2176 (vm_address_t)page);
2177
2178 /*
2179 * If no mapper is available, the following code is a no-op
2180 * and returns the input page; if there is a mapper, the
2181 * appropriate I/O page is returned.
2182 */
2183 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2184 mcl_paddr[offset] = new_page << PGSHIFT;
2185
2186 /* Pattern-fill this fresh page */
2187 if (mclaudit != NULL)
2188 mcache_set_pattern(MCACHE_FREE_PATTERN,
2189 (caddr_t)page, NBPG);
2190
2191 if (bufsize == m_maxsize(MC_CL)) {
2192 union mcluster *mcl = (union mcluster *)page;
2193
2194 /* 1st cluster in the page */
2195 sp = slab_get(mcl);
2196 if (mclaudit != NULL)
2197 mcl_audit_init(mcl, &mca_list, &con_list,
2198 AUDIT_CONTENTS_SIZE, NMBPCL);
2199
2200 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2201 slab_init(sp, MC_CL, SLF_MAPPED,
2202 mcl, mcl, bufsize, 0, 1);
2203
2204 /* Insert this slab */
2205 slab_insert(sp, MC_CL);
2206
2207 /* Update stats now since slab_get() drops the lock */
2208 mbstat.m_clfree = ++m_infree(MC_CL) +
2209 m_infree(MC_MBUF_CL);
2210 mbstat.m_clusters = ++m_total(MC_CL);
2211 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2212
2213 /* 2nd cluster in the page */
2214 sp = slab_get(++mcl);
2215 if (mclaudit != NULL)
2216 mcl_audit_init(mcl, &mca_list, &con_list,
2217 AUDIT_CONTENTS_SIZE, NMBPCL);
2218
2219 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2220 slab_init(sp, MC_CL, SLF_MAPPED,
2221 mcl, mcl, bufsize, 0, 1);
2222
2223 /* Insert this slab */
2224 slab_insert(sp, MC_CL);
2225
2226 /* Update stats now since slab_get() drops the lock */
2227 mbstat.m_clfree = ++m_infree(MC_CL) +
2228 m_infree(MC_MBUF_CL);
2229 mbstat.m_clusters = ++m_total(MC_CL);
2230 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2231 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2232 union mbigcluster *mbc = (union mbigcluster *)page;
2233 mcl_slab_t *nsp;
2234
2235 /* One for the entire page */
2236 sp = slab_get(mbc);
2237 if (mclaudit != NULL)
2238 mcl_audit_init(mbc, &mca_list, NULL, 0, 1);
2239
2240 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2241 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2242 mbc, mbc, bufsize, 0, 1);
2243
2244 /* 2nd cluster's slab is part of the previous one */
2245 nsp = slab_get(((union mcluster *)page) + 1);
2246 slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
2247 mbc, NULL, 0, 0, 0);
2248
2249 /* Insert this slab */
2250 slab_insert(sp, MC_BIGCL);
2251
2252 /* Update stats now since slab_get() drops the lock */
2253 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2254 m_infree(MC_MBUF_BIGCL);
2255 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2256 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2257 } else if ((i % (M16KCLBYTES / NBPG)) == 0) {
2258 union m16kcluster *m16kcl = (union m16kcluster *)page;
2259 mcl_slab_t *nsp;
2260 int k;
2261
2262 VERIFY(njcl > 0);
2263 /* One for the entire 16KB */
2264 sp = slab_get(m16kcl);
2265 if (mclaudit != NULL)
2266 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2267
2268 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2269 slab_init(sp, MC_16KCL, SLF_MAPPED,
2270 m16kcl, m16kcl, bufsize, 0, 1);
2271
2272 /* 2nd-8th clusters' slabs are part of the first one */
2273 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
2274 nsp = slab_get(((union mcluster *)page) + k);
2275 VERIFY(nsp->sl_refcnt == 0 &&
2276 nsp->sl_flags == 0);
2277 slab_init(nsp, MC_16KCL,
2278 SLF_MAPPED | SLF_PARTIAL,
2279 m16kcl, NULL, 0, 0, 0);
2280 }
2281
2282 /* Insert this slab */
2283 slab_insert(sp, MC_16KCL);
2284
2285 /* Update stats now since slab_get() drops the lock */
2286 m_infree(MC_16KCL)++;
2287 m_total(MC_16KCL)++;
2288 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2289 }
2290 }
2291 VERIFY(mca_list == NULL && con_list == NULL);
2292
2293 /* We're done; let others enter */
2294 mb_clalloc_busy = FALSE;
2295 if (mb_clalloc_waiters > 0) {
2296 mb_clalloc_waiters = 0;
2297 wakeup(mb_clalloc_waitchan);
2298 }
2299
2300 if (bufsize == m_maxsize(MC_CL))
2301 return (numpages << 1);
2302 else if (bufsize == m_maxsize(MC_BIGCL))
2303 return (numpages);
2304
2305 VERIFY(bufsize == m_maxsize(MC_16KCL));
2306 return (numpages / (M16KCLBYTES / NBPG));
2307
2308 out:
2309 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2310
2311 /* We're done; let others enter */
2312 mb_clalloc_busy = FALSE;
2313 if (mb_clalloc_waiters > 0) {
2314 mb_clalloc_waiters = 0;
2315 wakeup(mb_clalloc_waitchan);
2316 }
2317
2318 /*
2319 * When non-blocking, we kick the worker thread if we have to grow
2320 * the pool or if the number of free clusters is less than requested.
2321 */
2322 if (bufsize == m_maxsize(MC_CL)) {
2323 if (i > 0) {
2324 /*
2325 * Remember total number of clusters needed
2326 * at this time.
2327 */
2328 i += m_total(MC_CL);
2329 if (i > mbuf_expand_mcl) {
2330 mbuf_expand_mcl = i;
2331 if (mbuf_worker_ready)
2332 wakeup((caddr_t)&mbuf_worker_run);
2333 }
2334 }
2335
2336 if (m_infree(MC_CL) >= num)
2337 return (1);
2338 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2339 if (i > 0) {
2340 /*
2341 * Remember total number of 4KB clusters needed
2342 * at this time.
2343 */
2344 i += m_total(MC_BIGCL);
2345 if (i > mbuf_expand_big) {
2346 mbuf_expand_big = i;
2347 if (mbuf_worker_ready)
2348 wakeup((caddr_t)&mbuf_worker_run);
2349 }
2350 }
2351
2352 if (m_infree(MC_BIGCL) >= num)
2353 return (1);
2354 } else {
2355 if (i > 0) {
2356 /*
2357 * Remember total number of 16KB clusters needed
2358 * at this time.
2359 */
2360 i += m_total(MC_16KCL);
2361 if (i > mbuf_expand_16k) {
2362 mbuf_expand_16k = i;
2363 if (mbuf_worker_ready)
2364 wakeup((caddr_t)&mbuf_worker_run);
2365 }
2366 }
2367
2368 if (m_infree(MC_16KCL) >= num)
2369 return (1);
2370 }
2371 return (0);
2372 }
2373
2374 /*
2375 * Populate the global freelist of the corresponding buffer class.
2376 */
2377 static int
2378 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2379 {
2380 mcache_obj_t *o = NULL;
2381 int i;
2382
2383 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2384 class == MC_16KCL);
2385
2386 #if CONFIG_MBUF_NOEXPAND
2387 if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
2388 #if DEBUG
2389 static int printonce = 1;
2390 if (printonce == 1) {
2391 printonce = 0;
2392 printf("m_expand failed, allocated %ld out of %d "
2393 "clusters\n", mbstat.m_mbufs / NMBPCL,
2394 nmbclusters);
2395 }
2396 #endif /* DEBUG */
2397 return (0);
2398 }
2399 #endif /* CONFIG_MBUF_NOEXPAND */
2400
2401 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2402
2403 switch (class) {
2404 case MC_MBUF:
2405 case MC_CL:
2406 i = m_clalloc(num, wait, m_maxsize(MC_CL));
2407
2408 /* Respect the 2K clusters minimum limit */
2409 if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
2410 m_infree(MC_CL) <= m_minlimit(MC_CL)) {
2411 if (class != MC_CL || (wait & MCR_COMP))
2412 return (0);
2413 }
2414 if (class == MC_CL)
2415 return (i != 0);
2416 break;
2417
2418 case MC_BIGCL:
2419 case MC_16KCL:
2420 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2421 /* NOTREACHED */
2422
2423 default:
2424 VERIFY(0);
2425 /* NOTREACHED */
2426 }
2427
2428 /* Steal a cluster and cut it up to create NMBPCL mbufs */
2429 if ((o = slab_alloc(MC_CL, wait)) != NULL) {
2430 struct mbuf *m = (struct mbuf *)o;
2431 mcache_audit_t *mca = NULL;
2432 mcl_slab_t *sp = slab_get(o);
2433
2434 VERIFY(slab_is_detached(sp) &&
2435 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2436
2437 /* Make sure that the cluster is unmolested while in freelist */
2438 if (mclaudit != NULL) {
2439 mca = mcl_audit_buf2mca(MC_CL, o);
2440 mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
2441 }
2442
2443 /* Reinitialize it as an mbuf slab */
2444 slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
2445 sp->sl_len, 0, NMBPCL);
2446
2447 VERIFY(m == (struct mbuf *)sp->sl_base);
2448 VERIFY(sp->sl_head == NULL);
2449
2450 m_total(MC_MBUF) += NMBPCL;
2451 mbstat.m_mbufs = m_total(MC_MBUF);
2452 m_infree(MC_MBUF) += NMBPCL;
2453 mtype_stat_add(MT_FREE, NMBPCL);
2454
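/*
 * Thread the NMBPCL mbufs carved out of this cluster onto the slab's
 * free list (sl_head), marking each one MT_FREE; when auditing is
 * enabled, the type is set in the shadow mbuf instead.
 */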
2455 i = NMBPCL;
2456 while (i--) {
2457 /*
2458 * If auditing is enabled, construct the shadow mbuf
2459 * in the audit structure instead of the actual one.
2460 * mbuf_slab_audit() will take care of restoring the
2461 * contents after the integrity check.
2462 */
2463 if (mclaudit != NULL) {
2464 struct mbuf *ms;
2465 mca = mcl_audit_buf2mca(MC_MBUF,
2466 (mcache_obj_t *)m);
2467 ms = ((struct mbuf *)mca->mca_contents);
2468 ms->m_type = MT_FREE;
2469 } else {
2470 m->m_type = MT_FREE;
2471 }
2472 m->m_next = sp->sl_head;
2473 sp->sl_head = (void *)m++;
2474 }
2475
2476 /* Insert it into the mbuf class's slab list */
2477 slab_insert(sp, MC_MBUF);
2478
2479 if ((i = mb_waiters) > 0)
2480 mb_waiters = 0;
2481 if (i != 0)
2482 wakeup(mb_waitchan);
2483
2484 return (1);
2485 }
2486
2487 return (0);
2488 }
2489
2490 /*
2491 * (Inaccurately) check if it might be worth a trip back to the
2492 * mcache layer due to the availability of objects there. We'll
2493 * end up back here if there's nothing up there.
2494 */
2495 static boolean_t
2496 mbuf_cached_above(mbuf_class_t class, int wait)
2497 {
2498 switch (class) {
2499 case MC_MBUF:
2500 if (wait & MCR_COMP)
2501 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2502 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2503 break;
2504
2505 case MC_CL:
2506 if (wait & MCR_COMP)
2507 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2508 break;
2509
2510 case MC_BIGCL:
2511 if (wait & MCR_COMP)
2512 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2513 break;
2514
2515 case MC_16KCL:
2516 if (wait & MCR_COMP)
2517 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2518 break;
2519
2520 case MC_MBUF_CL:
2521 case MC_MBUF_BIGCL:
2522 case MC_MBUF_16KCL:
2523 break;
2524
2525 default:
2526 VERIFY(0);
2527 /* NOTREACHED */
2528 }
2529
2530 return (!mcache_bkt_isempty(m_cache(class)));
2531 }
2532
2533 /*
2534 * If possible, convert constructed objects to raw ones.
2535 */
2536 static boolean_t
2537 mbuf_steal(mbuf_class_t class, unsigned int num)
2538 {
2539 mcache_obj_t *top = NULL;
2540 mcache_obj_t **list = &top;
2541 unsigned int tot = 0;
2542
2543 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2544
2545 switch (class) {
2546 case MC_MBUF:
2547 case MC_CL:
2548 case MC_BIGCL:
2549 case MC_16KCL:
2550 return (FALSE);
2551
2552 case MC_MBUF_CL:
2553 case MC_MBUF_BIGCL:
2554 case MC_MBUF_16KCL:
2555 /* Get the required number of constructed objects if possible */
2556 if (m_infree(class) > m_minlimit(class)) {
2557 tot = cslab_alloc(class, &list,
2558 MIN(num, m_infree(class)));
2559 }
2560
2561 /* And destroy them to get back the raw objects */
2562 if (top != NULL)
2563 (void) cslab_free(class, top, 1);
2564 break;
2565
2566 default:
2567 VERIFY(0);
2568 /* NOTREACHED */
2569 }
2570
2571 return (tot == num);
2572 }
2573
2574 static void
2575 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
2576 {
2577 int m, bmap = 0;
2578
2579 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2580
2581 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2582 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2583 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2584
2585 /*
2586 * This logic can be made smarter; for now, simply mark
2587 * all other related classes as potential victims.
2588 */
2589 switch (class) {
2590 case MC_MBUF:
2591 m_wantpurge(MC_CL)++;
2592 m_wantpurge(MC_MBUF_CL)++;
2593 m_wantpurge(MC_MBUF_BIGCL)++;
2594 break;
2595
2596 case MC_CL:
2597 m_wantpurge(MC_MBUF)++;
2598 if (!comp)
2599 m_wantpurge(MC_MBUF_CL)++;
2600 break;
2601
2602 case MC_BIGCL:
2603 if (!comp)
2604 m_wantpurge(MC_MBUF_BIGCL)++;
2605 break;
2606
2607 case MC_16KCL:
2608 if (!comp)
2609 m_wantpurge(MC_MBUF_16KCL)++;
2610 break;
2611
2612 default:
2613 VERIFY(0);
2614 /* NOTREACHED */
2615 }
2616
2617 /*
2618 * Run through each marked class and check if we really need to
2619 * purge (and therefore temporarily disable) the per-CPU caches
2620 * layer used by the class. If so, remember the classes since
2621 * we are going to drop the lock below prior to purging.
2622 */
2623 for (m = 0; m < NELEM(mbuf_table); m++) {
2624 if (m_wantpurge(m) > 0) {
2625 m_wantpurge(m) = 0;
2626 /*
2627 * Try hard to steal the required number of objects
2628 * from the freelist of other mbuf classes. Only
2629 * purge and disable the per-CPU caches layer when
2630 * we don't have enough; it's the last resort.
2631 */
2632 if (!mbuf_steal(m, num))
2633 bmap |= (1 << m);
2634 }
2635 }
2636
2637 lck_mtx_unlock(mbuf_mlock);
2638
2639 if (bmap != 0) {
2640 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2641 do_reclaim = 1;
2642
2643 /* Sigh; we have no other choices but to ask mcache to purge */
2644 for (m = 0; m < NELEM(mbuf_table); m++) {
2645 if ((bmap & (1 << m)) &&
2646 mcache_purge_cache(m_cache(m))) {
2647 lck_mtx_lock(mbuf_mlock);
2648 m_purge_cnt(m)++;
2649 mbstat.m_drain++;
2650 lck_mtx_unlock(mbuf_mlock);
2651 }
2652 }
2653 } else {
2654 /*
2655 * Request mcache to reap extra elements from all of its caches;
2656 * note that all reaps are serialized and happen only at a fixed
2657 * interval.
2658 */
2659 mcache_reap();
2660 }
2661 lck_mtx_lock(mbuf_mlock);
2662 }
2663
2664 static inline struct mbuf *
2665 m_get_common(int wait, short type, int hdr)
2666 {
2667 struct mbuf *m;
2668 int mcflags = MSLEEPF(wait);
2669
2670 /* Is this due to a non-blocking retry? If so, then try harder */
2671 if (mcflags & MCR_NOSLEEP)
2672 mcflags |= MCR_TRYHARD;
2673
2674 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2675 if (m != NULL) {
2676 MBUF_INIT(m, hdr, type);
2677 mtype_stat_inc(type);
2678 mtype_stat_dec(MT_FREE);
2679 #if CONFIG_MACF_NET
2680 if (hdr && mac_init_mbuf(m, wait) != 0) {
2681 m_free(m);
2682 return (NULL);
2683 }
2684 #endif /* MAC_NET */
2685 }
2686 return (m);
2687 }
2688
2689 /*
2690 * Space allocation routines; these are also available as macros
2691 * for critical paths.
2692 */
2693 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2694 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2695 #define _M_RETRY(wait, type) _M_GET(wait, type)
2696 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2697 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2698 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
2699
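/*
 * Illustrative sketch of typical usage (not part of the build): grab a
 * packet-header mbuf without blocking, and release it with m_free();
 * up to MHLEN bytes of data fit in the mbuf itself.
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m->m_len = m->m_pkthdr.len = 0;
 *		(void) m_free(m);
 *	}
 *
 * A NULL return here only means a non-blocking request could not be
 * satisfied; an M_WAIT caller may block until memory becomes available.
 */
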
2700 struct mbuf *
2701 m_get(int wait, int type)
2702 {
2703 return (_M_GET(wait, type));
2704 }
2705
2706 struct mbuf *
2707 m_gethdr(int wait, int type)
2708 {
2709 return (_M_GETHDR(wait, type));
2710 }
2711
2712 struct mbuf *
2713 m_retry(int wait, int type)
2714 {
2715 return (_M_RETRY(wait, type));
2716 }
2717
2718 struct mbuf *
2719 m_retryhdr(int wait, int type)
2720 {
2721 return (_M_RETRYHDR(wait, type));
2722 }
2723
2724 struct mbuf *
2725 m_getclr(int wait, int type)
2726 {
2727 struct mbuf *m;
2728
2729 _MGET(m, wait, type);
2730 if (m != NULL)
2731 bzero(MTOD(m, caddr_t), MLEN);
2732 return (m);
2733 }
2734
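/*
 * Free a single mbuf, releasing its cluster (or dropping a reference on a
 * shared one) if M_EXT is set; returns the next mbuf in the m_next chain.
 */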
2735 struct mbuf *
2736 m_free(struct mbuf *m)
2737 {
2738 struct mbuf *n = m->m_next;
2739
2740 if (m->m_type == MT_FREE)
2741 panic("m_free: freeing an already freed mbuf");
2742
2743 /* Free the aux data and tags if there are any */
2744 if (m->m_flags & M_PKTHDR) {
2745 m_tag_delete_chain(m, NULL);
2746 }
2747
2748 if (m->m_flags & M_EXT) {
2749 u_int32_t refcnt;
2750 u_int32_t flags;
2751
2752 refcnt = m_decref(m);
2753 flags = MEXT_FLAGS(m);
2754 if (refcnt == 0 && flags == 0) {
2755 if (m->m_ext.ext_free == NULL) {
2756 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2757 } else if (m->m_ext.ext_free == m_bigfree) {
2758 mcache_free(m_cache(MC_BIGCL),
2759 m->m_ext.ext_buf);
2760 } else if (m->m_ext.ext_free == m_16kfree) {
2761 mcache_free(m_cache(MC_16KCL),
2762 m->m_ext.ext_buf);
2763 } else {
2764 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2765 m->m_ext.ext_size, m->m_ext.ext_arg);
2766 }
2767 mcache_free(ref_cache, MEXT_RFA(m));
2768 MEXT_RFA(m) = NULL;
2769 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2770 VERIFY(m->m_type != MT_FREE);
2771
2772 mtype_stat_dec(m->m_type);
2773 mtype_stat_inc(MT_FREE);
2774
2775 m->m_type = MT_FREE;
2776 m->m_flags = M_EXT;
2777 m->m_len = 0;
2778 m->m_next = m->m_nextpkt = NULL;
2779
2780 /* "Free" into the intermediate cache */
2781 if (m->m_ext.ext_free == NULL) {
2782 mcache_free(m_cache(MC_MBUF_CL), m);
2783 } else if (m->m_ext.ext_free == m_bigfree) {
2784 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2785 } else {
2786 VERIFY(m->m_ext.ext_free == m_16kfree);
2787 mcache_free(m_cache(MC_MBUF_16KCL), m);
2788 }
2789 return (n);
2790 }
2791 }
2792
2793 if (m->m_type != MT_FREE) {
2794 mtype_stat_dec(m->m_type);
2795 mtype_stat_inc(MT_FREE);
2796 }
2797
2798 m->m_type = MT_FREE;
2799 m->m_flags = m->m_len = 0;
2800 m->m_next = m->m_nextpkt = NULL;
2801
2802 mcache_free(m_cache(MC_MBUF), m);
2803
2804 return (n);
2805 }
2806
2807 __private_extern__ struct mbuf *
2808 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
2809 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
2810 int wait)
2811 {
2812 struct ext_ref *rfa = NULL;
2813
2814 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
2815 return (NULL);
2816
2817 if (m->m_flags & M_EXT) {
2818 u_int32_t refcnt;
2819 u_int32_t flags;
2820
2821 refcnt = m_decref(m);
2822 flags = MEXT_FLAGS(m);
2823 if (refcnt == 0 && flags == 0) {
2824 if (m->m_ext.ext_free == NULL) {
2825 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2826 } else if (m->m_ext.ext_free == m_bigfree) {
2827 mcache_free(m_cache(MC_BIGCL),
2828 m->m_ext.ext_buf);
2829 } else if (m->m_ext.ext_free == m_16kfree) {
2830 mcache_free(m_cache(MC_16KCL),
2831 m->m_ext.ext_buf);
2832 } else {
2833 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2834 m->m_ext.ext_size, m->m_ext.ext_arg);
2835 }
2836 /* Re-use the reference structure */
2837 rfa = MEXT_RFA(m);
2838 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2839 VERIFY(m->m_type != MT_FREE);
2840
2841 mtype_stat_dec(m->m_type);
2842 mtype_stat_inc(MT_FREE);
2843
2844 m->m_type = MT_FREE;
2845 m->m_flags = M_EXT;
2846 m->m_len = 0;
2847 m->m_next = m->m_nextpkt = NULL;
2848 /* "Free" into the intermediate cache */
2849 if (m->m_ext.ext_free == NULL) {
2850 mcache_free(m_cache(MC_MBUF_CL), m);
2851 } else if (m->m_ext.ext_free == m_bigfree) {
2852 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2853 } else {
2854 VERIFY(m->m_ext.ext_free == m_16kfree);
2855 mcache_free(m_cache(MC_MBUF_16KCL), m);
2856 }
2857 /*
2858 * Allocate a new mbuf, since we didn't divorce
2859 * the composite mbuf + cluster pair above.
2860 */
2861 if ((m = _M_GETHDR(wait, type)) == NULL)
2862 return (NULL);
2863 }
2864 }
2865
2866 if (rfa == NULL &&
2867 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
2868 m_free(m);
2869 return (NULL);
2870 }
2871
2872 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
2873
2874 return (m);
2875 }
2876
2877 /* m_mclget() adds an mbuf cluster to a normal mbuf */
2878 struct mbuf *
2879 m_mclget(struct mbuf *m, int wait)
2880 {
2881 struct ext_ref *rfa;
2882
2883 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2884 return (m);
2885
2886 m->m_ext.ext_buf = m_mclalloc(wait);
2887 if (m->m_ext.ext_buf != NULL) {
2888 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2889 } else {
2890 mcache_free(ref_cache, rfa);
2891 }
2892 return (m);
2893 }
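/*
 * Illustrative sketch (not part of the build): m_mclget() returns the mbuf
 * whether or not a cluster could be attached, so callers conventionally
 * test M_EXT afterwards:
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m = m_mclget(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);
 *			m = NULL;
 *		}
 *	}
 */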
2894
2895 /* Allocate an mbuf cluster */
2896 caddr_t
2897 m_mclalloc(int wait)
2898 {
2899 int mcflags = MSLEEPF(wait);
2900
2901 /* Is this due to a non-blocking retry? If so, then try harder */
2902 if (mcflags & MCR_NOSLEEP)
2903 mcflags |= MCR_TRYHARD;
2904
2905 return (mcache_alloc(m_cache(MC_CL), mcflags));
2906 }
2907
2908 /* Free an mbuf cluster */
2909 void
2910 m_mclfree(caddr_t p)
2911 {
2912 mcache_free(m_cache(MC_CL), p);
2913 }
2914
2915 /*
2916 * m_mclhasreference() checks whether the cluster of an mbuf is
2917 * referenced by another mbuf.
2918 */
2919 int
2920 m_mclhasreference(struct mbuf *m)
2921 {
2922 if (!(m->m_flags & M_EXT))
2923 return (0);
2924
2925 ASSERT(MEXT_RFA(m) != NULL);
2926
2927 return (MEXT_REF(m) > 1);
2928 }
2929
2930 __private_extern__ caddr_t
2931 m_bigalloc(int wait)
2932 {
2933 int mcflags = MSLEEPF(wait);
2934
2935 /* Is this due to a non-blocking retry? If so, then try harder */
2936 if (mcflags & MCR_NOSLEEP)
2937 mcflags |= MCR_TRYHARD;
2938
2939 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
2940 }
2941
2942 __private_extern__ void
2943 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2944 {
2945 mcache_free(m_cache(MC_BIGCL), p);
2946 }
2947
2948 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
2949 __private_extern__ struct mbuf *
2950 m_mbigget(struct mbuf *m, int wait)
2951 {
2952 struct ext_ref *rfa;
2953
2954 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2955 return (m);
2956
2957 m->m_ext.ext_buf = m_bigalloc(wait);
2958 if (m->m_ext.ext_buf != NULL) {
2959 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2960 } else {
2961 mcache_free(ref_cache, rfa);
2962 }
2963 return (m);
2964 }
2965
2966 __private_extern__ caddr_t
2967 m_16kalloc(int wait)
2968 {
2969 int mcflags = MSLEEPF(wait);
2970
2971 /* Is this due to a non-blocking retry? If so, then try harder */
2972 if (mcflags & MCR_NOSLEEP)
2973 mcflags |= MCR_TRYHARD;
2974
2975 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
2976 }
2977
2978 __private_extern__ void
2979 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2980 {
2981 mcache_free(m_cache(MC_16KCL), p);
2982 }
2983
2984 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
2985 __private_extern__ struct mbuf *
2986 m_m16kget(struct mbuf *m, int wait)
2987 {
2988 struct ext_ref *rfa;
2989
2990 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2991 return (m);
2992
2993 m->m_ext.ext_buf = m_16kalloc(wait);
2994 if (m->m_ext.ext_buf != NULL) {
2995 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2996 } else {
2997 mcache_free(ref_cache, rfa);
2998 }
2999 return (m);
3000 }
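/*
 * As with m_mclget() above, m_mbigget() and m_m16kget() return the mbuf
 * even when no cluster could be attached; callers should test M_EXT in
 * m_flags to detect that case.
 */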
3001
3002 /* Move the packet header (and its tags) from "from" to "to". */
3003 void
3004 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3005 {
3006 #if CONFIG_MACF_NET
3007 /* We will be taking over the tags of 'to' */
3008 if (to->m_flags & M_PKTHDR)
3009 m_tag_delete_chain(to, NULL);
3010 #endif /* MAC_NET */
3011 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3012 m_tag_init(from); /* purge tags from src */
3013 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3014 if ((to->m_flags & M_EXT) == 0)
3015 to->m_data = to->m_pktdat;
3016 }
3017
3018 /*
3019 * Duplicate "from"'s mbuf pkthdr in "to".
3020 * "from" must have M_PKTHDR set, and "to" must be empty.
3021 * In particular, this does a deep copy of the packet tags.
3022 */
3023 static int
3024 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3025 {
3026 #if CONFIG_MACF_NET
3027 if (to->m_flags & M_PKTHDR)
3028 m_tag_delete_chain(to, NULL);
3029 #endif /* MAC_NET */
3030 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3031 if ((to->m_flags & M_EXT) == 0)
3032 to->m_data = to->m_pktdat;
3033 to->m_pkthdr = from->m_pkthdr;
3034 m_tag_init(to);
3035 return (m_tag_copy_chain(to, from, how));
3036 }
3037
3038 /*
3039 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3040 * if wantall is not set, return whatever number is available. Set up the
3041 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3042 * are chained on the m_nextpkt field. Any packets requested beyond this
3043 * are chained onto the last packet header's m_next field. The size of
3044 * the cluster is controlled by the parameter bufsize.
3045 */
3046 __private_extern__ struct mbuf *
3047 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3048 int wait, int wantall, size_t bufsize)
3049 {
3050 struct mbuf *m;
3051 struct mbuf **np, *top;
3052 unsigned int pnum, needed = *num_needed;
3053 mcache_obj_t *mp_list = NULL;
3054 int mcflags = MSLEEPF(wait);
3055 u_int32_t flag;
3056 struct ext_ref *rfa;
3057 mcache_t *cp;
3058 void *cl;
3059
3060 ASSERT(bufsize == m_maxsize(MC_CL) ||
3061 bufsize == m_maxsize(MC_BIGCL) ||
3062 bufsize == m_maxsize(MC_16KCL));
3063
3064 /*
3065 * Caller must first check for njcl because this
3066 * routine is internal and not exposed/used via KPI.
3067 */
3068 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3069
3070 top = NULL;
3071 np = &top;
3072 pnum = 0;
3073
3074 /*
3075 * The caller doesn't want all the requested buffers; only some.
3076 * Try hard to get what we can, but don't block. This effectively
3077 * overrides MCR_SLEEP, since this thread will not go to sleep
3078 * if we can't get all the buffers.
3079 */
3080 if (!wantall || (mcflags & MCR_NOSLEEP))
3081 mcflags |= MCR_TRYHARD;
3082
3083 /* Allocate the composite mbuf + cluster elements from the cache */
3084 if (bufsize == m_maxsize(MC_CL))
3085 cp = m_cache(MC_MBUF_CL);
3086 else if (bufsize == m_maxsize(MC_BIGCL))
3087 cp = m_cache(MC_MBUF_BIGCL);
3088 else
3089 cp = m_cache(MC_MBUF_16KCL);
3090 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3091
3092 for (pnum = 0; pnum < needed; pnum++) {
3093 m = (struct mbuf *)mp_list;
3094 mp_list = mp_list->obj_next;
3095
3096 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3097 cl = m->m_ext.ext_buf;
3098 rfa = MEXT_RFA(m);
3099
3100 ASSERT(cl != NULL && rfa != NULL);
3101 VERIFY(MBUF_IS_COMPOSITE(m));
3102
3103 flag = MEXT_FLAGS(m);
3104
3105 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3106 if (bufsize == m_maxsize(MC_16KCL)) {
3107 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3108 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3109 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3110 } else {
3111 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3112 }
3113
3114 if (num_with_pkthdrs > 0) {
3115 --num_with_pkthdrs;
3116 #if CONFIG_MACF_NET
3117 if (mac_mbuf_label_init(m, wait) != 0) {
3118 m_free(m);
3119 break;
3120 }
3121 #endif /* MAC_NET */
3122 }
3123
3124 *np = m;
3125 if (num_with_pkthdrs > 0)
3126 np = &m->m_nextpkt;
3127 else
3128 np = &m->m_next;
3129 }
3130 ASSERT(pnum != *num_needed || mp_list == NULL);
3131 if (mp_list != NULL)
3132 mcache_free_ext(cp, mp_list);
3133
3134 if (pnum > 0) {
3135 mtype_stat_add(MT_DATA, pnum);
3136 mtype_stat_sub(MT_FREE, pnum);
3137 }
3138
3139 if (wantall && (pnum != *num_needed)) {
3140 if (top != NULL)
3141 m_freem_list(top);
3142 return (NULL);
3143 }
3144
3145 *num_needed = pnum;
3146 return (top);
3147 }
3148
3149 /*
3150 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3151 * wantall is not set, return whatever number is available. The size of
3152 * each mbuf in the list is controlled by the parameter packetlen. Each
3153 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3154 * in the chain is called a segment. If maxsegments is not NULL and the
3155 * value pointed to is not zero, it specifies the maximum number of segments
3156 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3157 * is zero, the caller has no restriction on the number of segments.
3158 * The actual number of segments of an mbuf chain is returned in the value
3159 * pointed to by maxsegments.
3160 */
3161 __private_extern__ struct mbuf *
3162 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3163 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3164 {
3165 struct mbuf **np, *top, *first = NULL;
3166 size_t bufsize, r_bufsize;
3167 unsigned int num = 0;
3168 unsigned int nsegs = 0;
3169 unsigned int needed, resid;
3170 int mcflags = MSLEEPF(wait);
3171 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3172 mcache_t *cp = NULL, *rcp = NULL;
3173
3174 if (*numlist == 0)
3175 return (NULL);
3176
3177 top = NULL;
3178 np = &top;
3179
3180 if (wantsize == 0) {
3181 if (packetlen <= MINCLSIZE) {
3182 bufsize = packetlen;
3183 } else if (packetlen > m_maxsize(MC_CL)) {
3184 /* Use 4KB if jumbo cluster pool isn't available */
3185 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3186 bufsize = m_maxsize(MC_BIGCL);
3187 else
3188 bufsize = m_maxsize(MC_16KCL);
3189 } else {
3190 bufsize = m_maxsize(MC_CL);
3191 }
3192 } else if (wantsize == m_maxsize(MC_CL) ||
3193 wantsize == m_maxsize(MC_BIGCL) ||
3194 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3195 bufsize = wantsize;
3196 } else {
3197 return (NULL);
3198 }
3199
3200 if (bufsize <= MHLEN) {
3201 nsegs = 1;
3202 } else if (bufsize <= MINCLSIZE) {
3203 if (maxsegments != NULL && *maxsegments == 1) {
3204 bufsize = m_maxsize(MC_CL);
3205 nsegs = 1;
3206 } else {
3207 nsegs = 2;
3208 }
3209 } else if (bufsize == m_maxsize(MC_16KCL)) {
3210 VERIFY(njcl > 0);
3211 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3212 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3213 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3214 } else {
3215 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3216 }
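/*
 * For the cluster-backed cases above, nsegs works out to the number of
 * bufsize-sized clusters needed to cover packetlen, i.e. packetlen
 * divided by the chosen cluster size and rounded up.
 */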
3217 if (maxsegments != NULL) {
3218 if (*maxsegments && nsegs > *maxsegments) {
3219 *maxsegments = nsegs;
3220 return (NULL);
3221 }
3222 *maxsegments = nsegs;
3223 }
3224
3225 /*
3226 * The caller doesn't want all the requested buffers; only some.
3227 * Try hard to get what we can, but don't block. This effectively
3228 * overrides MCR_SLEEP, since this thread will not go to sleep
3229 * if we can't get all the buffers.
3230 */
3231 if (!wantall || (mcflags & MCR_NOSLEEP))
3232 mcflags |= MCR_TRYHARD;
3233
3234 /*
3235 * Simple case where all elements in the lists/chains are mbufs.
3236 * Unless bufsize is greater than MHLEN, each segment chain is made
3237 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3238 * of 2 mbufs; the second one is used for the residual data, i.e.
3239 * the remaining data that cannot fit into the first mbuf.
3240 */
3241 if (bufsize <= MINCLSIZE) {
3242 /* Allocate the elements in one shot from the mbuf cache */
3243 ASSERT(bufsize <= MHLEN || nsegs == 2);
3244 cp = m_cache(MC_MBUF);
3245 needed = mcache_alloc_ext(cp, &mp_list,
3246 (*numlist) * nsegs, mcflags);
3247
3248 /*
3249 * The number of elements must be even if we are to use an
3250 * mbuf (instead of a cluster) to store the residual data.
3251 * If we couldn't allocate the requested number of mbufs,
3252 * trim the number down (if it's odd) in order to avoid
3253 * creating a partial segment chain.
3254 */
3255 if (bufsize > MHLEN && (needed & 0x1))
3256 needed--;
3257
3258 while (num < needed) {
3259 struct mbuf *m;
3260
3261 m = (struct mbuf *)mp_list;
3262 mp_list = mp_list->obj_next;
3263 ASSERT(m != NULL);
3264
3265 MBUF_INIT(m, 1, MT_DATA);
3266 #if CONFIG_MACF_NET
3267 if (mac_init_mbuf(m, wait) != 0) {
3268 m_free(m);
3269 break;
3270 }
3271 #endif /* MAC_NET */
3272 num++;
3273 if (bufsize > MHLEN) {
3274 /* A second mbuf for this segment chain */
3275 m->m_next = (struct mbuf *)mp_list;
3276 mp_list = mp_list->obj_next;
3277 ASSERT(m->m_next != NULL);
3278
3279 MBUF_INIT(m->m_next, 0, MT_DATA);
3280 num++;
3281 }
3282 *np = m;
3283 np = &m->m_nextpkt;
3284 }
3285 ASSERT(num != *numlist || mp_list == NULL);
3286
3287 if (num > 0) {
3288 mtype_stat_add(MT_DATA, num);
3289 mtype_stat_sub(MT_FREE, num);
3290 }
3291 num /= nsegs;
3292
3293 /* We've got them all; return to caller */
3294 if (num == *numlist)
3295 return (top);
3296
3297 goto fail;
3298 }
3299
3300 /*
3301 * Complex cases where elements are made up of one or more composite
3302 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3303 * be illustrated as follows:
3304 *
3305 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3306 *
3307 * Every composite mbuf + cluster element comes from the intermediate
3308 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3309 * the last composite element will come from the MC_MBUF_CL cache,
3310 * unless the residual data is larger than 2KB where we use the
3311 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3312 * data is defined as extra data beyond the first element that cannot
3313 * fit into the previous element, i.e. there is no residual data if
3314 * the chain only has 1 segment.
3315 */
3316 r_bufsize = bufsize;
3317 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3318 if (resid > 0) {
3319 /* There is residual data; figure out the cluster size */
3320 if (wantsize == 0 && packetlen > MINCLSIZE) {
3321 /*
3322 * Caller didn't request that all of the segments
3323 * in the chain use the same cluster size; use the
3324 * smaller of the cluster sizes.
3325 */
3326 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3327 r_bufsize = m_maxsize(MC_16KCL);
3328 else if (resid > m_maxsize(MC_CL))
3329 r_bufsize = m_maxsize(MC_BIGCL);
3330 else
3331 r_bufsize = m_maxsize(MC_CL);
3332 } else {
3333 /* Use the same cluster size as the other segments */
3334 resid = 0;
3335 }
3336 }
3337
3338 needed = *numlist;
3339 if (resid > 0) {
3340 /*
3341 * Attempt to allocate composite mbuf + cluster elements for
3342 * the residual data in each chain; record the number of such
3343 * elements that can be allocated so that we know how many
3344 * segment chains we can afford to create.
3345 */
3346 if (r_bufsize <= m_maxsize(MC_CL))
3347 rcp = m_cache(MC_MBUF_CL);
3348 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3349 rcp = m_cache(MC_MBUF_BIGCL);
3350 else
3351 rcp = m_cache(MC_MBUF_16KCL);
3352 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3353
3354 if (needed == 0)
3355 goto fail;
3356
3357 /* This is temporarily reduced for calculation */
3358 ASSERT(nsegs > 1);
3359 nsegs--;
3360 }
3361
3362 /*
3363 * Attempt to allocate the rest of the composite mbuf + cluster
3364 * elements for the number of segment chains that we need.
3365 */
3366 if (bufsize <= m_maxsize(MC_CL))
3367 cp = m_cache(MC_MBUF_CL);
3368 else if (bufsize <= m_maxsize(MC_BIGCL))
3369 cp = m_cache(MC_MBUF_BIGCL);
3370 else
3371 cp = m_cache(MC_MBUF_16KCL);
3372 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3373
3374 /* Round it down to avoid creating a partial segment chain */
3375 needed = (needed / nsegs) * nsegs;
3376 if (needed == 0)
3377 goto fail;
3378
3379 if (resid > 0) {
3380 /*
3381 * We're about to construct the chain(s); take into account
3382 * the number of segments we have created above to hold the
3383 * residual data for each chain, as well as restore the
3384 * original count of segments per chain.
3385 */
3386 ASSERT(nsegs > 0);
3387 needed += needed / nsegs;
3388 nsegs++;
3389 }
3390
3391 for (;;) {
3392 struct mbuf *m;
3393 u_int32_t flag;
3394 struct ext_ref *rfa;
3395 void *cl;
3396 int pkthdr;
3397
3398 ++num;
3399 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3400 m = (struct mbuf *)mp_list;
3401 mp_list = mp_list->obj_next;
3402 } else {
3403 m = (struct mbuf *)rmp_list;
3404 rmp_list = rmp_list->obj_next;
3405 }
3406 ASSERT(m != NULL);
3407 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3408 VERIFY(m->m_ext.ext_free == NULL ||
3409 m->m_ext.ext_free == m_bigfree ||
3410 m->m_ext.ext_free == m_16kfree);
3411
3412 cl = m->m_ext.ext_buf;
3413 rfa = MEXT_RFA(m);
3414
3415 ASSERT(cl != NULL && rfa != NULL);
3416 VERIFY(MBUF_IS_COMPOSITE(m));
3417
3418 flag = MEXT_FLAGS(m);
3419
3420 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3421 if (pkthdr)
3422 first = m;
3423 MBUF_INIT(m, pkthdr, MT_DATA);
3424 if (m->m_ext.ext_free == m_16kfree) {
3425 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3426 } else if (m->m_ext.ext_free == m_bigfree) {
3427 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3428 } else {
3429 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3430 }
3431 #if CONFIG_MACF_NET
3432 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3433 --num;
3434 m_free(m);
3435 break;
3436 }
3437 #endif /* MAC_NET */
3438
3439 *np = m;
3440 if ((num % nsegs) == 0)
3441 np = &first->m_nextpkt;
3442 else
3443 np = &m->m_next;
3444
3445 if (num == needed)
3446 break;
3447 }
3448
3449 if (num > 0) {
3450 mtype_stat_add(MT_DATA, num);
3451 mtype_stat_sub(MT_FREE, num);
3452 }
3453
3454 num /= nsegs;
3455
3456 /* We've got them all; return to caller */
3457 if (num == *numlist) {
3458 ASSERT(mp_list == NULL && rmp_list == NULL);
3459 return (top);
3460 }
3461
3462 fail:
3463 /* Free up what's left of the above */
3464 if (mp_list != NULL)
3465 mcache_free_ext(cp, mp_list);
3466 if (rmp_list != NULL)
3467 mcache_free_ext(rcp, rmp_list);
3468 if (wantall && top != NULL) {
3469 m_freem(top);
3470 return (NULL);
3471 }
3472 *numlist = num;
3473 return (top);
3474 }
3475
3476 /*
3477 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3478 * packets on the receive ring.
3479 */
3480 __private_extern__ struct mbuf *
3481 m_getpacket_how(int wait)
3482 {
3483 unsigned int num_needed = 1;
3484
3485 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3486 m_maxsize(MC_CL)));
3487 }
3488
3489 /*
3490 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3491 * packets on the receive ring.
3492 */
3493 struct mbuf *
3494 m_getpacket(void)
3495 {
3496 unsigned int num_needed = 1;
3497
3498 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
3499 m_maxsize(MC_CL)));
3500 }
3501
3502 /*
3503 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3504 * if this can't be met, return whatever number were available. Set up the
3505 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3506 * are chained on the m_nextpkt field. Any packets requested beyond this are
3507 * chained onto the last packet header's m_next field.
3508 */
3509 struct mbuf *
3510 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
3511 {
3512 unsigned int n = num_needed;
3513
3514 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
3515 m_maxsize(MC_CL)));
3516 }
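/*
 * Illustrative sketch (not part of the build): request a batch of
 * cluster-backed packets and walk the m_nextpkt chain, e.g. to fill a
 * hypothetical receive ring:
 *
 *	struct mbuf *m, *list = m_getpackets(16, 16, M_DONTWAIT);
 *	for (m = list; m != NULL; m = m->m_nextpkt) {
 *		... attach m to a descriptor ...
 *	}
 *
 * Fewer than 16 packets may be returned; an unused chain can be released
 * in one call with m_freem_list().
 */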
3517
3518 /*
3519 * Return a list of mbuf hdrs set up as packet hdrs chained together
3520 * on the m_nextpkt field
3521 */
3522 struct mbuf *
3523 m_getpackethdrs(int num_needed, int how)
3524 {
3525 struct mbuf *m;
3526 struct mbuf **np, *top;
3527
3528 top = NULL;
3529 np = &top;
3530
3531 while (num_needed--) {
3532 m = _M_RETRYHDR(how, MT_DATA);
3533 if (m == NULL)
3534 break;
3535
3536 *np = m;
3537 np = &m->m_nextpkt;
3538 }
3539
3540 return (top);
3541 }
3542
3543 /*
3544 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3545 * of packets freed. Used by the drivers.
3546 */
3547 int
3548 m_freem_list(struct mbuf *m)
3549 {
3550 struct mbuf *nextpkt;
3551 mcache_obj_t *mp_list = NULL;
3552 mcache_obj_t *mcl_list = NULL;
3553 mcache_obj_t *mbc_list = NULL;
3554 mcache_obj_t *m16k_list = NULL;
3555 mcache_obj_t *m_mcl_list = NULL;
3556 mcache_obj_t *m_mbc_list = NULL;
3557 mcache_obj_t *m_m16k_list = NULL;
3558 mcache_obj_t *ref_list = NULL;
3559 int pktcount = 0;
3560 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
3561
3562 while (m != NULL) {
3563 pktcount++;
3564
3565 nextpkt = m->m_nextpkt;
3566 m->m_nextpkt = NULL;
3567
3568 while (m != NULL) {
3569 struct mbuf *next = m->m_next;
3570 mcache_obj_t *o, *rfa;
3571 u_int32_t refcnt, flags;
3572
3573 if (m->m_type == MT_FREE)
3574 panic("m_free: freeing an already freed mbuf");
3575
3576 if (m->m_type != MT_FREE)
3577 mt_free++;
3578
3579 if (m->m_flags & M_PKTHDR) {
3580 m_tag_delete_chain(m, NULL);
3581 }
3582
3583 if (!(m->m_flags & M_EXT))
3584 goto simple_free;
3585
3586 o = (mcache_obj_t *)m->m_ext.ext_buf;
3587 refcnt = m_decref(m);
3588 flags = MEXT_FLAGS(m);
3589 if (refcnt == 0 && flags == 0) {
3590 if (m->m_ext.ext_free == NULL) {
3591 o->obj_next = mcl_list;
3592 mcl_list = o;
3593 } else if (m->m_ext.ext_free == m_bigfree) {
3594 o->obj_next = mbc_list;
3595 mbc_list = o;
3596 } else if (m->m_ext.ext_free == m_16kfree) {
3597 o->obj_next = m16k_list;
3598 m16k_list = o;
3599 } else {
3600 (*(m->m_ext.ext_free))((caddr_t)o,
3601 m->m_ext.ext_size,
3602 m->m_ext.ext_arg);
3603 }
3604 rfa = (mcache_obj_t *)MEXT_RFA(m);
3605 rfa->obj_next = ref_list;
3606 ref_list = rfa;
3607 MEXT_RFA(m) = NULL;
3608 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
3609 VERIFY(m->m_type != MT_FREE);
3610 /*
3611 * Amortize the costs of atomic operations
3612 * by doing them at the end, if possible.
3613 */
3614 if (m->m_type == MT_DATA)
3615 mt_data++;
3616 else if (m->m_type == MT_HEADER)
3617 mt_header++;
3618 else if (m->m_type == MT_SONAME)
3619 mt_soname++;
3620 else if (m->m_type == MT_TAG)
3621 mt_tag++;
3622 else
3623 mtype_stat_dec(m->m_type);
3624
3625 m->m_type = MT_FREE;
3626 m->m_flags = M_EXT;
3627 m->m_len = 0;
3628 m->m_next = m->m_nextpkt = NULL;
3629
3630 /* "Free" into the intermediate cache */
3631 o = (mcache_obj_t *)m;
3632 if (m->m_ext.ext_free == NULL) {
3633 o->obj_next = m_mcl_list;
3634 m_mcl_list = o;
3635 } else if (m->m_ext.ext_free == m_bigfree) {
3636 o->obj_next = m_mbc_list;
3637 m_mbc_list = o;
3638 } else {
3639 VERIFY(m->m_ext.ext_free == m_16kfree);
3640 o->obj_next = m_m16k_list;
3641 m_m16k_list = o;
3642 }
3643 m = next;
3644 continue;
3645 }
3646 simple_free:
3647 /*
3648 * Amortize the costs of atomic operations
3649 * by doing them at the end, if possible.
3650 */
3651 if (m->m_type == MT_DATA)
3652 mt_data++;
3653 else if (m->m_type == MT_HEADER)
3654 mt_header++;
3655 else if (m->m_type == MT_SONAME)
3656 mt_soname++;
3657 else if (m->m_type == MT_TAG)
3658 mt_tag++;
3659 else if (m->m_type != MT_FREE)
3660 mtype_stat_dec(m->m_type);
3661
3662 m->m_type = MT_FREE;
3663 m->m_flags = m->m_len = 0;
3664 m->m_next = m->m_nextpkt = NULL;
3665
3666 ((mcache_obj_t *)m)->obj_next = mp_list;
3667 mp_list = (mcache_obj_t *)m;
3668
3669 m = next;
3670 }
3671
3672 m = nextpkt;
3673 }
3674
3675 if (mt_free > 0)
3676 mtype_stat_add(MT_FREE, mt_free);
3677 if (mt_data > 0)
3678 mtype_stat_sub(MT_DATA, mt_data);
3679 if (mt_header > 0)
3680 mtype_stat_sub(MT_HEADER, mt_header);
3681 if (mt_soname > 0)
3682 mtype_stat_sub(MT_SONAME, mt_soname);
3683 if (mt_tag > 0)
3684 mtype_stat_sub(MT_TAG, mt_tag);
3685
3686 if (mp_list != NULL)
3687 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3688 if (mcl_list != NULL)
3689 mcache_free_ext(m_cache(MC_CL), mcl_list);
3690 if (mbc_list != NULL)
3691 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
3692 if (m16k_list != NULL)
3693 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
3694 if (m_mcl_list != NULL)
3695 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
3696 if (m_mbc_list != NULL)
3697 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
3698 if (m_m16k_list != NULL)
3699 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
3700 if (ref_list != NULL)
3701 mcache_free_ext(ref_cache, ref_list);
3702
3703 return (pktcount);
3704 }
3705
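/*
 * Illustrative sketch (kept out of the build): freeing a batch of
 * completed packets with a single m_freem_list() call instead of one
 * m_freem() per packet.  tx_ring_reclaim() is a hypothetical placeholder
 * assumed to return packets chained on m_nextpkt.
 */
#if 0
static void
example_tx_complete(void)
{
	struct mbuf *done;
	int freed;

	done = tx_ring_reclaim();	/* hypothetical driver routine */
	if (done != NULL) {
		freed = m_freem_list(done);
		printf("example: freed %d packets\n", freed);
	}
}
#endif
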
3706 void
3707 m_freem(struct mbuf *m)
3708 {
3709 while (m != NULL)
3710 m = m_free(m);
3711 }
3712
3713 /*
3714 * Mbuffer utility routines.
3715 */
3716
3717 /*
3718 * Compute the amount of space available before the current start
3719 * of data in an mbuf.
3720 */
3721 int
3722 m_leadingspace(struct mbuf *m)
3723 {
3724 if (m->m_flags & M_EXT) {
3725 if (MCLHASREFERENCE(m))
3726 return (0);
3727 return (m->m_data - m->m_ext.ext_buf);
3728 }
3729 if (m->m_flags & M_PKTHDR)
3730 return (m->m_data - m->m_pktdat);
3731 return (m->m_data - m->m_dat);
3732 }
3733
3734 /*
3735 * Compute the amount of space available after the end of data in an mbuf.
3736 */
3737 int
3738 m_trailingspace(struct mbuf *m)
3739 {
3740 if (m->m_flags & M_EXT) {
3741 if (MCLHASREFERENCE(m))
3742 return (0);
3743 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3744 (m->m_data + m->m_len));
3745 }
3746 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
3747 }
3748
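/*
 * Illustrative sketch (kept out of the build): using the space routines
 * above to decide whether data can be added in place.  Note that both
 * return 0 for a shared cluster, which also guards against modifying
 * data that another mbuf still references.
 */
#if 0
static int
example_can_prepend(struct mbuf *m, int len)
{
	return (m_leadingspace(m) >= len);
}

static int
example_can_append(struct mbuf *m, int len)
{
	return (m_trailingspace(m) >= len);
}
#endif
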
3749 /*
3750 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3751 * copy junk along. Does not adjust packet header length.
3752 */
3753 struct mbuf *
3754 m_prepend(struct mbuf *m, int len, int how)
3755 {
3756 struct mbuf *mn;
3757
3758 _MGET(mn, how, m->m_type);
3759 if (mn == NULL) {
3760 m_freem(m);
3761 return (NULL);
3762 }
3763 if (m->m_flags & M_PKTHDR) {
3764 M_COPY_PKTHDR(mn, m);
3765 m->m_flags &= ~M_PKTHDR;
3766 }
3767 mn->m_next = m;
3768 m = mn;
3769 if (len < MHLEN)
3770 MH_ALIGN(m, len);
3771 m->m_len = len;
3772 return (m);
3773 }
3774
3775 /*
3776 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3777 * chain, copy junk along, and adjust length.
3778 */
3779 struct mbuf *
3780 m_prepend_2(struct mbuf *m, int len, int how)
3781 {
3782 if (M_LEADINGSPACE(m) >= len) {
3783 m->m_data -= len;
3784 m->m_len += len;
3785 } else {
3786 m = m_prepend(m, len, how);
3787 }
3788 if ((m) && (m->m_flags & M_PKTHDR))
3789 m->m_pkthdr.len += len;
3790 return (m);
3791 }
3792
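/*
 * Illustrative sketch (kept out of the build): pushing a small header in
 * front of a packet with m_prepend_2() above.  struct example_hdr is a
 * made-up header type, not a real protocol structure.
 */
#if 0
struct example_hdr {
	u_int16_t eh_type;
	u_int16_t eh_len;
};

static struct mbuf *
example_add_header(struct mbuf *m, u_int16_t type)
{
	struct example_hdr *eh;

	/* Grows the front of the chain and bumps m_pkthdr.len */
	m = m_prepend_2(m, sizeof (*eh), M_DONTWAIT);
	if (m == NULL)
		return (NULL);	/* the chain was freed on failure */
	eh = MTOD(m, struct example_hdr *);
	eh->eh_type = type;
	eh->eh_len = sizeof (*eh);
	return (m);
}
#endif
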
3793 /*
3794 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3795 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
3796 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3797 */
3798 int MCFail;
3799
3800 struct mbuf *
3801 m_copym(struct mbuf *m, int off0, int len, int wait)
3802 {
3803 struct mbuf *n, *mhdr = NULL, **np;
3804 int off = off0;
3805 struct mbuf *top;
3806 int copyhdr = 0;
3807
3808 if (off < 0 || len < 0)
3809 panic("m_copym: invalid offset %d or len %d", off, len);
3810
3811 if (off == 0 && (m->m_flags & M_PKTHDR)) {
3812 mhdr = m;
3813 copyhdr = 1;
3814 }
3815
3816 while (off >= m->m_len) {
3817 if (m->m_next == NULL)
3818 panic("m_copym: invalid mbuf chain");
3819 off -= m->m_len;
3820 m = m->m_next;
3821 }
3822 np = &top;
3823 top = NULL;
3824
3825 while (len > 0) {
3826 if (m == NULL) {
3827 if (len != M_COPYALL)
3828 panic("m_copym: len != M_COPYALL");
3829 break;
3830 }
3831
3832 n = _M_RETRY(wait, m->m_type);
3833 *np = n;
3834
3835 if (n == NULL)
3836 goto nospace;
3837
3838 if (copyhdr != 0) {
3839 M_COPY_PKTHDR(n, mhdr);
3840 if (len == M_COPYALL)
3841 n->m_pkthdr.len -= off0;
3842 else
3843 n->m_pkthdr.len = len;
3844 copyhdr = 0;
3845 }
3846 if (len == M_COPYALL) {
3847 if (MIN(len, (m->m_len - off)) == len) {
3848 printf("m->m_len %ld - off %d = %ld, %ld\n",
3849 m->m_len, off, m->m_len - off,
3850 MIN(len, (m->m_len - off)));
3851 }
3852 }
3853 n->m_len = MIN(len, (m->m_len - off));
3854 if (n->m_len == M_COPYALL) {
3855 printf("n->m_len == M_COPYALL, fixing\n");
3856 n->m_len = MHLEN;
3857 }
3858 if (m->m_flags & M_EXT) {
3859 n->m_ext = m->m_ext;
3860 m_incref(m);
3861 n->m_data = m->m_data + off;
3862 n->m_flags |= M_EXT;
3863 } else {
3864 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3865 (unsigned)n->m_len);
3866 }
3867 if (len != M_COPYALL)
3868 len -= n->m_len;
3869 off = 0;
3870 m = m->m_next;
3871 np = &n->m_next;
3872 }
3873
3874 if (top == NULL)
3875 MCFail++;
3876
3877 return (top);
3878 nospace:
3879
3880 m_freem(top);
3881 MCFail++;
3882 return (NULL);
3883 }
3884
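/*
 * Illustrative sketch (kept out of the build): taking a cheap, read-only
 * copy of an entire packet, e.g. for retransmission.  Cluster-backed data
 * is shared by reference (m_incref() above), so the copy must not be
 * written to.
 */
#if 0
static struct mbuf *
example_copy_packet(struct mbuf *pkt)
{
	return (m_copym(pkt, 0, M_COPYALL, M_DONTWAIT));	/* NULL on failure */
}
#endif
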
3885 /*
3886 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
3887 * within this routine. Also, the last mbuf and offset accessed are passed
3888 * out and can be passed back in to avoid having to rescan the entire mbuf
3889 * list (normally hung off of the socket).
3890 */
3891 struct mbuf *
3892 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
3893 struct mbuf **m_last, int *m_off)
3894 {
3895 struct mbuf *n, **np = NULL;
3896 int off = off0, len = len0;
3897 struct mbuf *top = NULL;
3898 int mcflags = MSLEEPF(wait);
3899 int copyhdr = 0;
3900 int type = 0;
3901 mcache_obj_t *list = NULL;
3902 int needed = 0;
3903
3904 if (off == 0 && (m->m_flags & M_PKTHDR))
3905 copyhdr = 1;
3906
3907 if (*m_last != NULL) {
3908 m = *m_last;
3909 off = *m_off;
3910 } else {
3911 while (off >= m->m_len) {
3912 off -= m->m_len;
3913 m = m->m_next;
3914 }
3915 }
3916
3917 n = m;
3918 while (len > 0) {
3919 needed++;
3920 ASSERT(n != NULL);
3921 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
3922 n = n->m_next;
3923 }
3924 needed++;
3925 len = len0;
3926
3927 /*
3928 * If the caller doesn't want to be put to sleep, mark it with
3929 * MCR_TRYHARD so that we may reclaim buffers from other places
3930 * before giving up.
3931 */
3932 if (mcflags & MCR_NOSLEEP)
3933 mcflags |= MCR_TRYHARD;
3934
3935 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
3936 mcflags) != needed)
3937 goto nospace;
3938
3939 needed = 0;
3940 while (len > 0) {
3941 n = (struct mbuf *)list;
3942 list = list->obj_next;
3943 ASSERT(n != NULL && m != NULL);
3944
3945 type = (top == NULL) ? MT_HEADER : m->m_type;
3946 MBUF_INIT(n, (top == NULL), type);
3947 #if CONFIG_MACF_NET
3948 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
3949 mtype_stat_inc(MT_HEADER);
3950 mtype_stat_dec(MT_FREE);
3951 m_free(n);
3952 goto nospace;
3953 }
3954 #endif /* CONFIG_MACF_NET */
3955
3956 if (top == NULL) {
3957 top = n;
3958 np = &top->m_next;
3959 continue;
3960 } else {
3961 needed++;
3962 *np = n;
3963 }
3964
3965 if (copyhdr) {
3966 M_COPY_PKTHDR(n, m);
3967 n->m_pkthdr.len = len;
3968 copyhdr = 0;
3969 }
3970 n->m_len = MIN(len, (m->m_len - off));
3971
3972 if (m->m_flags & M_EXT) {
3973 n->m_ext = m->m_ext;
3974 m_incref(m);
3975 n->m_data = m->m_data + off;
3976 n->m_flags |= M_EXT;
3977 } else {
3978 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3979 (unsigned)n->m_len);
3980 }
3981 len -= n->m_len;
3982
3983 if (len == 0) {
3984 if ((off + n->m_len) == m->m_len) {
3985 *m_last = m->m_next;
3986 *m_off = 0;
3987 } else {
3988 *m_last = m;
3989 *m_off = off + n->m_len;
3990 }
3991 break;
3992 }
3993 off = 0;
3994 m = m->m_next;
3995 np = &n->m_next;
3996 }
3997
3998 mtype_stat_inc(MT_HEADER);
3999 mtype_stat_add(type, needed);
4000 mtype_stat_sub(MT_FREE, needed + 1);
4001
4002 ASSERT(list == NULL);
4003 return (top);
4004
4005 nospace:
4006 if (list != NULL)
4007 mcache_free_ext(m_cache(MC_MBUF), list);
4008 if (top != NULL)
4009 m_freem(top);
4010 MCFail++;
4011 return (NULL);
4012 }
4013
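/*
 * Illustrative sketch (kept out of the build): copying a buffered chain
 * in fixed-size chunks while resuming from the saved m_last/m_off cursor,
 * so each call avoids rescanning the chain from the start.  The chain is
 * assumed to hold at least `total' bytes of data.
 */
#if 0
static void
example_copy_in_chunks(struct mbuf *chain, int total, int chunk)
{
	struct mbuf *m_last = NULL, *seg;
	int m_off = 0, off = 0, len;

	while (off < total) {
		len = MIN(chunk, total - off);
		seg = m_copym_with_hdrs(chain, off, len, M_WAIT,
		    &m_last, &m_off);
		if (seg == NULL)
			break;
		m_freem(seg);	/* a real caller would hand this off */
		off += len;
	}
}
#endif
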
4014 /*
4015 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4016 * continuing for "len" bytes, into the indicated buffer.
4017 */
4018 void
4019 m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
4020 {
4021 unsigned count;
4022
4023 if (off < 0 || len < 0)
4024 panic("m_copydata: invalid offset %d or len %d", off, len);
4025
4026 while (off > 0) {
4027 if (m == NULL)
4028 panic("m_copydata: invalid mbuf chain");
4029 if (off < m->m_len)
4030 break;
4031 off -= m->m_len;
4032 m = m->m_next;
4033 }
4034 while (len > 0) {
4035 if (m == NULL)
4036 panic("m_copydata: invalid mbuf chain");
4037 count = MIN(m->m_len - off, len);
4038 bcopy(MTOD(m, caddr_t) + off, cp, count);
4039 len -= count;
4040 cp += count;
4041 off = 0;
4042 m = m->m_next;
4043 }
4044 }
4045
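/*
 * Illustrative sketch (kept out of the build): using m_copydata() above to
 * read a small field that may straddle mbuf boundaries.  `m' is assumed to
 * be a packet-header mbuf so m_pkthdr.len is valid.
 */
#if 0
static int
example_peek_word(struct mbuf *m, u_int32_t *valp)
{
	if (m->m_pkthdr.len < (int)sizeof (*valp))
		return (0);
	m_copydata(m, 0, sizeof (*valp), (caddr_t)valp);
	return (1);
}
#endif
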
4046 /*
4047 * Concatenate mbuf chain n to m. Both chains must be of the same type
4048 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4049 */
4050 void
4051 m_cat(struct mbuf *m, struct mbuf *n)
4052 {
4053 while (m->m_next)
4054 m = m->m_next;
4055 while (n) {
4056 if ((m->m_flags & M_EXT) ||
4057 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4058 /* just join the two chains */
4059 m->m_next = n;
4060 return;
4061 }
4062 /* splat the data from one into the other */
4063 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4064 (u_int)n->m_len);
4065 m->m_len += n->m_len;
4066 n = m_free(n);
4067 }
4068 }
4069
4070 void
4071 m_adj(struct mbuf *mp, int req_len)
4072 {
4073 int len = req_len;
4074 struct mbuf *m;
4075 int count;
4076
4077 if ((m = mp) == NULL)
4078 return;
4079 if (len >= 0) {
4080 /*
4081 * Trim from head.
4082 */
4083 while (m != NULL && len > 0) {
4084 if (m->m_len <= len) {
4085 len -= m->m_len;
4086 m->m_len = 0;
4087 m = m->m_next;
4088 } else {
4089 m->m_len -= len;
4090 m->m_data += len;
4091 len = 0;
4092 }
4093 }
4094 m = mp;
4095 if (m->m_flags & M_PKTHDR)
4096 m->m_pkthdr.len -= (req_len - len);
4097 } else {
4098 /*
4099 * Trim from tail. Scan the mbuf chain,
4100 * calculating its length and finding the last mbuf.
4101 * If the adjustment only affects this mbuf, then just
4102 * adjust and return. Otherwise, rescan and truncate
4103 * after the remaining size.
4104 */
4105 len = -len;
4106 count = 0;
4107 for (;;) {
4108 count += m->m_len;
4109 if (m->m_next == (struct mbuf *)0)
4110 break;
4111 m = m->m_next;
4112 }
4113 if (m->m_len >= len) {
4114 m->m_len -= len;
4115 m = mp;
4116 if (m->m_flags & M_PKTHDR)
4117 m->m_pkthdr.len -= len;
4118 return;
4119 }
4120 count -= len;
4121 if (count < 0)
4122 count = 0;
4123 /*
4124 * Correct length for chain is "count".
4125 * Find the mbuf with last data, adjust its length,
4126 * and toss data from remaining mbufs on chain.
4127 */
4128 m = mp;
4129 if (m->m_flags & M_PKTHDR)
4130 m->m_pkthdr.len = count;
4131 for (; m; m = m->m_next) {
4132 if (m->m_len >= count) {
4133 m->m_len = count;
4134 break;
4135 }
4136 count -= m->m_len;
4137 }
4138 while ((m = m->m_next))
4139 m->m_len = 0;
4140 }
4141 }
4142
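/*
 * Illustrative sketch (kept out of the build): trimming a received frame
 * with m_adj() above.  The header and trailer lengths are made-up values,
 * not constants used elsewhere in this file.
 */
#if 0
#define	EXAMPLE_LINK_HDR_LEN	14	/* hypothetical link-layer header */
#define	EXAMPLE_FCS_LEN		4	/* hypothetical trailing checksum */

static void
example_trim_frame(struct mbuf *m)
{
	m_adj(m, EXAMPLE_LINK_HDR_LEN);	/* positive: trim from the head */
	m_adj(m, -EXAMPLE_FCS_LEN);	/* negative: trim from the tail */
}
#endif
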
4143 /*
4144 * Rearrange an mbuf chain so that len bytes are contiguous
4145 * and in the data area of an mbuf (so that mtod and dtom
4146 * will work for a structure of size len). Returns the resulting
4147 * mbuf chain on success, frees it and returns null on failure.
4148 * If there is room, it will add up to max_protohdr-len extra bytes to the
4149 * contiguous region in an attempt to avoid being called next time.
4150 */
4151 int MPFail;
4152
4153 struct mbuf *
4154 m_pullup(struct mbuf *n, int len)
4155 {
4156 struct mbuf *m;
4157 int count;
4158 int space;
4159
4160 /*
4161 * If first mbuf has no cluster, and has room for len bytes
4162 * without shifting current data, pullup into it;
4163 * otherwise allocate a new mbuf to prepend to the chain.
4164 */
4165 if ((n->m_flags & M_EXT) == 0 &&
4166 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4167 if (n->m_len >= len)
4168 return (n);
4169 m = n;
4170 n = n->m_next;
4171 len -= m->m_len;
4172 } else {
4173 if (len > MHLEN)
4174 goto bad;
4175 _MGET(m, M_DONTWAIT, n->m_type);
4176 if (m == 0)
4177 goto bad;
4178 m->m_len = 0;
4179 if (n->m_flags & M_PKTHDR) {
4180 M_COPY_PKTHDR(m, n);
4181 n->m_flags &= ~M_PKTHDR;
4182 }
4183 }
4184 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4185 do {
4186 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4187 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4188 (unsigned)count);
4189 len -= count;
4190 m->m_len += count;
4191 n->m_len -= count;
4192 space -= count;
4193 if (n->m_len)
4194 n->m_data += count;
4195 else
4196 n = m_free(n);
4197 } while (len > 0 && n);
4198 if (len > 0) {
4199 (void) m_free(m);
4200 goto bad;
4201 }
4202 m->m_next = n;
4203 return (m);
4204 bad:
4205 m_freem(n);
4206 MPFail++;
4207 return (0);
4208 }
4209
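/*
 * Illustrative sketch (kept out of the build): the classic m_pullup()
 * pattern used before dereferencing a fixed-size header through MTOD().
 * EXAMPLE_FIXED_HDR_LEN is a made-up length.
 */
#if 0
#define	EXAMPLE_FIXED_HDR_LEN	20

static struct mbuf *
example_make_header_contiguous(struct mbuf *m)
{
	if (m->m_len < EXAMPLE_FIXED_HDR_LEN &&
	    (m = m_pullup(m, EXAMPLE_FIXED_HDR_LEN)) == NULL)
		return (NULL);	/* m_pullup() already freed the chain */
	return (m);
}
#endif
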
4210 /*
4211 * Partition an mbuf chain in two pieces, returning the tail --
4212 * all but the first len0 bytes. In case of failure, it returns NULL and
4213 * attempts to restore the chain to its original state.
4214 */
4215 struct mbuf *
4216 m_split(struct mbuf *m0, int len0, int wait)
4217 {
4218 struct mbuf *m, *n;
4219 unsigned len = len0, remain;
4220
4221 for (m = m0; m && len > m->m_len; m = m->m_next)
4222 len -= m->m_len;
4223 if (m == NULL)
4224 return (NULL);
4225 remain = m->m_len - len;
4226 if (m0->m_flags & M_PKTHDR) {
4227 _MGETHDR(n, wait, m0->m_type);
4228 if (n == NULL)
4229 return (NULL);
4230 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4231 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4232 m0->m_pkthdr.len = len0;
4233 if (m->m_flags & M_EXT)
4234 goto extpacket;
4235 if (remain > MHLEN) {
4236 /* m can't be the lead packet */
4237 MH_ALIGN(n, 0);
4238 n->m_next = m_split(m, len, wait);
4239 if (n->m_next == NULL) {
4240 (void) m_free(n);
4241 return (NULL);
4242 } else
4243 return (n);
4244 } else
4245 MH_ALIGN(n, remain);
4246 } else if (remain == 0) {
4247 n = m->m_next;
4248 m->m_next = NULL;
4249 return (n);
4250 } else {
4251 _MGET(n, wait, m->m_type);
4252 if (n == NULL)
4253 return (NULL);
4254 M_ALIGN(n, remain);
4255 }
4256 extpacket:
4257 if (m->m_flags & M_EXT) {
4258 n->m_flags |= M_EXT;
4259 n->m_ext = m->m_ext;
4260 m_incref(m);
4261 n->m_data = m->m_data + len;
4262 } else {
4263 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4264 }
4265 n->m_len = remain;
4266 m->m_len = len;
4267 n->m_next = m->m_next;
4268 m->m_next = NULL;
4269 return (n);
4270 }
4271
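/*
 * Illustrative sketch (kept out of the build): splitting a packet into a
 * head of `first_len' bytes and a tail holding the remainder, as described
 * in the comment above m_split().
 */
#if 0
static void
example_split_packet(struct mbuf *pkt, int first_len)
{
	struct mbuf *tail;

	tail = m_split(pkt, first_len, M_DONTWAIT);
	if (tail == NULL)
		return;		/* failure; `pkt' is left as it was */
	m_freem(tail);		/* a real caller would send both pieces */
}
#endif
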
4272 /*
4273 * Routine to copy from device local memory into mbufs.
4274 */
4275 struct mbuf *
4276 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4277 void (*copy)(const void *, void *, size_t))
4278 {
4279 struct mbuf *m;
4280 struct mbuf *top = NULL, **mp = &top;
4281 int off = off0, len;
4282 char *cp;
4283 char *epkt;
4284
4285 cp = buf;
4286 epkt = cp + totlen;
4287 if (off) {
4288 /*
4289 * If 'off' is non-zero, packet is trailer-encapsulated,
4290 * so we have to skip the type and length fields.
4291 */
4292 cp += off + 2 * sizeof (u_int16_t);
4293 totlen -= 2 * sizeof (u_int16_t);
4294 }
4295 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4296 if (m == NULL)
4297 return (NULL);
4298 m->m_pkthdr.rcvif = ifp;
4299 m->m_pkthdr.len = totlen;
4300 m->m_len = MHLEN;
4301
4302 while (totlen > 0) {
4303 if (top != NULL) {
4304 _MGET(m, M_DONTWAIT, MT_DATA);
4305 if (m == NULL) {
4306 m_freem(top);
4307 return (NULL);
4308 }
4309 m->m_len = MLEN;
4310 }
4311 len = MIN(totlen, epkt - cp);
4312 if (len >= MINCLSIZE) {
4313 MCLGET(m, M_DONTWAIT);
4314 if (m->m_flags & M_EXT) {
4315 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4316 } else {
4317 /* give up when it's out of cluster mbufs */
4318 if (top != NULL)
4319 m_freem(top);
4320 m_freem(m);
4321 return (NULL);
4322 }
4323 } else {
4324 /*
4325 * Place initial small packet/header at end of mbuf.
4326 */
4327 if (len < m->m_len) {
4328 if (top == NULL &&
4329 len + max_linkhdr <= m->m_len)
4330 m->m_data += max_linkhdr;
4331 m->m_len = len;
4332 } else {
4333 len = m->m_len;
4334 }
4335 }
4336 if (copy)
4337 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4338 else
4339 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4340 cp += len;
4341 *mp = m;
4342 mp = &m->m_next;
4343 totlen -= len;
4344 if (cp == epkt)
4345 cp = buf;
4346 }
4347 return (top);
4348 }
4349
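/*
 * Illustrative sketch (kept out of the build): pulling a frame out of
 * device-local memory with m_devget() above.  Passing a NULL copy routine
 * falls back to bcopy(); `dev_ram' and `dev_len' are hypothetical values
 * supplied by the caller.
 */
#if 0
static struct mbuf *
example_rx_from_device(struct ifnet *ifp, char *dev_ram, int dev_len)
{
	return (m_devget(dev_ram, dev_len, 0, ifp, NULL));
}
#endif
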
4350 /*
4351 * Cluster freelist allocation check.
4352 */
4353 static int
4354 m_howmany(int num, size_t bufsize)
4355 {
4356 int i = 0, j = 0;
4357 u_int32_t m_clusters, m_bigclusters, m_16kclusters;
4358 u_int32_t m_clfree, m_bigclfree, m_16kclfree;
4359
4360 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4361
4362 m_clusters = m_total(MC_CL);
4363 m_bigclusters = m_total(MC_BIGCL);
4364 m_16kclusters = m_total(MC_16KCL);
4365 m_clfree = m_infree(MC_CL);
4366 m_bigclfree = m_infree(MC_BIGCL);
4367 m_16kclfree = m_infree(MC_16KCL);
4368
4369 /* Bail if we've maxed out the mbuf memory map */
4370 if ((bufsize != m_maxsize(MC_16KCL) &&
4371 (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
4372 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4373 (m_16kclusters << 3) >= njcl)) {
4374 #if DEBUG
4375 if (bufsize == MCLBYTES && num > m_clfree) {
4376 printf("m_howmany - out of small clusters, "
4377 "%d short\n", num - mbstat.m_clfree);
4378 }
4379 #endif /* DEBUG */
4380 return (0);
4381 }
4382
4383 if (bufsize == m_maxsize(MC_CL)) {
4384 /* Under minimum */
4385 if (m_clusters < MINCL)
4386 return (MINCL - m_clusters);
4387 /* Too few (free < 1/16 total) and not over maximum */
4388 if (m_clusters < m_maxlimit(MC_CL)) {
4389 if (m_clfree >= MCL_LOWAT)
4390 return (0);
4391 if (num >= m_clfree)
4392 i = num - m_clfree;
4393 if (((m_clusters + num) >> 4) > m_clfree)
4394 j = ((m_clusters + num) >> 4) - m_clfree;
4395 i = MAX(i, j);
4396 if (i + m_clusters >= m_maxlimit(MC_CL))
4397 i = m_maxlimit(MC_CL) - m_clusters;
4398 }
4399 VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
4400 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4401 /* Under minimum */
4402 if (m_bigclusters < MINBIGCL)
4403 return (MINBIGCL - m_bigclusters);
4404 /* Too few (free < 1/16 total) and not over maximum */
4405 if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
4406 if (m_bigclfree >= MBIGCL_LOWAT)
4407 return (0);
4408 if (num >= m_bigclfree)
4409 i = num - m_bigclfree;
4410 if (((m_bigclusters + num) >> 4) > m_bigclfree)
4411 j = ((m_bigclusters + num) >> 4) - m_bigclfree;
4412 i = MAX(i, j);
4413 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
4414 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
4415 }
4416 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
4417 } else {
4418 VERIFY(njcl > 0);
4419 /* Under minimum */
4420 if (m_16kclusters < MIN16KCL)
4421 return (MIN16KCL - m_16kclusters);
4422 /* Too few (free < 1/16 total) and not over maximum */
4423 if (m_16kclusters < m_maxlimit(MC_16KCL)) {
4424 if (m_16kclfree >= M16KCL_LOWAT)
4425 return (0);
4426 if (num >= m_16kclfree)
4427 i = num - m_16kclfree;
4428 if (((m_16kclusters + num) >> 4) > m_16kclfree)
4429 j = ((m_16kclusters + num) >> 4) - m_16kclfree;
4430 i = MAX(i, j);
4431 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
4432 i = m_maxlimit(MC_16KCL) - m_16kclusters;
4433 }
4434 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
4435 }
4436
4437 return (i);
4438 }
4439
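/*
 * Worked example of the low-water arithmetic above, with made-up numbers:
 * suppose m_clusters = 1600, num = 10 and m_clfree = 40, and assume the
 * free count is below MCL_LOWAT so the growth branch is taken.  Then
 * i = 0 (since m_clfree >= num), the desired free count is
 * (1600 + 10) >> 4 = 100, so j = 100 - 40 = 60, and MAX(i, j) = 60
 * clusters are requested, subject to the m_maxlimit(MC_CL) cap.
 */
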
4440 /*
4441 * Copy data from a buffer back into the indicated mbuf chain,
4442 * starting "off" bytes from the beginning, extending the mbuf
4443 * chain if necessary.
4444 */
4445 void
4446 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
4447 {
4448 int mlen;
4449 struct mbuf *m = m0, *n;
4450 int totlen = 0;
4451
4452 if (m0 == NULL)
4453 return;
4454 while (off > (mlen = m->m_len)) {
4455 off -= mlen;
4456 totlen += mlen;
4457 if (m->m_next == NULL) {
4458 n = m_getclr(M_DONTWAIT, m->m_type);
4459 if (n == NULL)
4460 goto out;
4461 n->m_len = MIN(MLEN, len + off);
4462 m->m_next = n;
4463 }
4464 m = m->m_next;
4465 }
4466 while (len > 0) {
4467 mlen = MIN(m->m_len - off, len);
4468 bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen);
4469 cp += mlen;
4470 len -= mlen;
4471 mlen += off;
4472 off = 0;
4473 totlen += mlen;
4474 if (len == 0)
4475 break;
4476 if (m->m_next == NULL) {
4477 n = _M_GET(M_DONTWAIT, m->m_type);
4478 if (n == NULL)
4479 break;
4480 n->m_len = MIN(MLEN, len);
4481 m->m_next = n;
4482 }
4483 m = m->m_next;
4484 }
4485 out:
4486 if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
4487 m->m_pkthdr.len = totlen;
4488 }
4489
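/*
 * Illustrative sketch (kept out of the build): patching a 4-byte value at
 * a given offset with m_copyback() above.  Note this legacy interface
 * extends the chain with zero-filled mbufs as needed and silently gives
 * up if that allocation fails.
 */
#if 0
static void
example_patch_word(struct mbuf *m, int off, u_int32_t value)
{
	m_copyback(m, off, sizeof (value), (caddr_t)&value);
}
#endif
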
4490 char *
4491 mcl_to_paddr(char *addr)
4492 {
4493 int base_phys;
4494
4495 if (!MBUF_IN_MAP(addr))
4496 return (NULL);
4497 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
4498
4499 if (base_phys == 0)
4500 return (NULL);
4501 return ((char *)((int)base_phys | ((int)addr & PGOFSET)));
4502 }
4503
4504 /*
4505 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
4506 * And really copy the thing. That way, we don't "precompute" checksums
4507 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
4508 * small packets, don't dup into a cluster. That way received packets
4509 * don't take up too much room in the sockbuf (cf. sbspace()).
4510 */
4511 int MDFail;
4512
4513 struct mbuf *
4514 m_dup(struct mbuf *m, int how)
4515 {
4516 struct mbuf *n, **np;
4517 struct mbuf *top;
4518 int copyhdr = 0;
4519
4520 np = &top;
4521 top = NULL;
4522 if (m->m_flags & M_PKTHDR)
4523 copyhdr = 1;
4524
4525 /*
4526 * Quick check: if we have one mbuf and its data fits in an
4527 * mbuf with packet header, just copy and go.
4528 */
4529 if (m->m_next == NULL) {
4530 /* Then just move the data into an mbuf and be done... */
4531 if (copyhdr) {
4532 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
4533 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
4534 return (NULL);
4535 n->m_len = m->m_len;
4536 m_dup_pkthdr(n, m, how);
4537 bcopy(m->m_data, n->m_data, m->m_len);
4538 return (n);
4539 }
4540 } else if (m->m_len <= MLEN) {
4541 if ((n = _M_GET(how, m->m_type)) == NULL)
4542 return (NULL);
4543 bcopy(m->m_data, n->m_data, m->m_len);
4544 n->m_len = m->m_len;
4545 return (n);
4546 }
4547 }
4548 while (m != NULL) {
4549 #if BLUE_DEBUG
4550 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
4551 m->m_data);
4552 #endif
4553 if (copyhdr)
4554 n = _M_GETHDR(how, m->m_type);
4555 else
4556 n = _M_GET(how, m->m_type);
4557 if (n == NULL)
4558 goto nospace;
4559 if (m->m_flags & M_EXT) {
4560 if (m->m_len <= m_maxsize(MC_CL))
4561 MCLGET(n, how);
4562 else if (m->m_len <= m_maxsize(MC_BIGCL))
4563 n = m_mbigget(n, how);
4564 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
4565 n = m_m16kget(n, how);
4566 if (!(n->m_flags & M_EXT)) {
4567 (void) m_free(n);
4568 goto nospace;
4569 }
4570 }
4571 *np = n;
4572 if (copyhdr) {
4573 /* Don't use M_COPY_PKTHDR: preserve m_data */
4574 m_dup_pkthdr(n, m, how);
4575 copyhdr = 0;
4576 if (!(n->m_flags & M_EXT))
4577 n->m_data = n->m_pktdat;
4578 }
4579 n->m_len = m->m_len;
4580 /*
4581 * Get the dup on the same boundary as the original.
4582 * Assume that the two mbufs have the same offset to the data area
4583 * (up to word boundaries).
4584 */
4585 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
4586 m = m->m_next;
4587 np = &n->m_next;
4588 #if BLUE_DEBUG
4589 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
4590 n->m_data);
4591 #endif
4592 }
4593
4594 if (top == NULL)
4595 MDFail++;
4596 return (top);
4597
4598 nospace:
4599 m_freem(top);
4600 MDFail++;
4601 return (NULL);
4602 }
4603
4604 #define MBUF_MULTIPAGES(m) \
4605 (((m)->m_flags & M_EXT) && \
4606 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
4607 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
4608 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
4609
4610 static struct mbuf *
4611 m_expand(struct mbuf *m, struct mbuf **last)
4612 {
4613 struct mbuf *top = NULL;
4614 struct mbuf **nm = &top;
4615 uintptr_t data0, data;
4616 unsigned int len0, len;
4617
4618 VERIFY(MBUF_MULTIPAGES(m));
4619 VERIFY(m->m_next == NULL);
4620 data0 = (uintptr_t)m->m_data;
4621 len0 = m->m_len;
4622 *last = top;
4623
4624 for (;;) {
4625 struct mbuf *n;
4626
4627 data = data0;
4628 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
4629 len = NBPG;
4630 else if (!IS_P2ALIGNED(data, NBPG) &&
4631 P2ROUNDUP(data, NBPG) < (data + len0))
4632 len = P2ROUNDUP(data, NBPG) - data;
4633 else
4634 len = len0;
4635
4636 VERIFY(len > 0);
4637 VERIFY(m->m_flags & M_EXT);
4638 m->m_data = (void *)data;
4639 m->m_len = len;
4640
4641 *nm = *last = m;
4642 nm = &m->m_next;
4643 m->m_next = NULL;
4644
4645 data0 += len;
4646 len0 -= len;
4647 if (len0 == 0)
4648 break;
4649
4650 n = _M_RETRY(M_DONTWAIT, MT_DATA);
4651 if (n == NULL) {
4652 m_freem(top);
4653 top = *last = NULL;
4654 break;
4655 }
4656
4657 n->m_ext = m->m_ext;
4658 m_incref(m);
4659 n->m_flags |= M_EXT;
4660 m = n;
4661 }
4662 return (top);
4663 }
4664
4665 struct mbuf *
4666 m_normalize(struct mbuf *m)
4667 {
4668 struct mbuf *top = NULL;
4669 struct mbuf **nm = &top;
4670 boolean_t expanded = FALSE;
4671
4672 while (m != NULL) {
4673 struct mbuf *n;
4674
4675 n = m->m_next;
4676 m->m_next = NULL;
4677
4678 /* Does the data cross one or more page boundaries? */
4679 if (MBUF_MULTIPAGES(m)) {
4680 struct mbuf *last;
4681 if ((m = m_expand(m, &last)) == NULL) {
4682 m_freem(n);
4683 m_freem(top);
4684 top = NULL;
4685 break;
4686 }
4687 *nm = m;
4688 nm = &last->m_next;
4689 expanded = TRUE;
4690 } else {
4691 *nm = m;
4692 nm = &m->m_next;
4693 }
4694 m = n;
4695 }
4696 if (expanded)
4697 atomic_add_32(&mb_normalized, 1);
4698 return (top);
4699 }
4700
4701 void
4702 m_mchtype(struct mbuf *m, int t)
4703 {
4704 mtype_stat_inc(t);
4705 mtype_stat_dec(m->m_type);
4706 (m)->m_type = t;
4707 }
4708
4709 void *
4710 m_mtod(struct mbuf *m)
4711 {
4712 return (MTOD(m, void *));
4713 }
4714
4715 struct mbuf *
4716 m_dtom(void *x)
4717 {
4718 return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1)));
4719 }
4720
4721 void
4722 m_mcheck(struct mbuf *m)
4723 {
4724 _MCHECK(m);
4725 }
4726
4727 /*
4728 * Inform the corresponding mcache(s) that there's a waiter below.
4729 */
4730 static void
4731 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
4732 {
4733 mcache_waiter_inc(m_cache(class));
4734 if (comp) {
4735 if (class == MC_CL) {
4736 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4737 } else if (class == MC_BIGCL) {
4738 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4739 } else if (class == MC_16KCL) {
4740 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
4741 } else {
4742 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4743 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4744 }
4745 }
4746 }
4747
4748 /*
4749 * Inform the corresponding mcache(s) that there's no more waiter below.
4750 */
4751 static void
4752 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
4753 {
4754 mcache_waiter_dec(m_cache(class));
4755 if (comp) {
4756 if (class == MC_CL) {
4757 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4758 } else if (class == MC_BIGCL) {
4759 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4760 } else if (class == MC_16KCL) {
4761 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
4762 } else {
4763 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4764 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4765 }
4766 }
4767 }
4768
4769 /*
4770 * Called during blocking allocation. Returns TRUE if one or more objects
4771 * are available at the per-CPU cache layer and that the allocation should be
4772 * retried at that level.
4773 */
4774 static boolean_t
4775 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
4776 {
4777 boolean_t mcache_retry = FALSE;
4778
4779 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4780
4781 /* Check if there's anything at the cache layer */
4782 if (mbuf_cached_above(class, wait)) {
4783 mcache_retry = TRUE;
4784 goto done;
4785 }
4786
4787 /* Nothing? Then try hard to get it from somewhere */
4788 m_reclaim(class, num, (wait & MCR_COMP));
4789
4790 /* We tried hard and got something? */
4791 if (m_infree(class) > 0) {
4792 mbstat.m_wait++;
4793 goto done;
4794 } else if (mbuf_cached_above(class, wait)) {
4795 mbstat.m_wait++;
4796 mcache_retry = TRUE;
4797 goto done;
4798 } else if (wait & MCR_TRYHARD) {
4799 mcache_retry = TRUE;
4800 goto done;
4801 }
4802
4803 /*
4804 * There's really nothing for us right now; inform the
4805 * cache(s) that there is a waiter below and go to sleep.
4806 */
4807 mbuf_waiter_inc(class, (wait & MCR_COMP));
4808
4809 VERIFY(!(wait & MCR_NOSLEEP));
4810 mb_waiters++;
4811 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
4812
4813 /* We are now up; stop getting notified until next round */
4814 mbuf_waiter_dec(class, (wait & MCR_COMP));
4815
4816 /* We waited and got something */
4817 if (m_infree(class) > 0) {
4818 mbstat.m_wait++;
4819 goto done;
4820 } else if (mbuf_cached_above(class, wait)) {
4821 mbstat.m_wait++;
4822 mcache_retry = TRUE;
4823 }
4824 done:
4825 return (mcache_retry);
4826 }
4827
4828 static void
4829 mbuf_worker_thread(void)
4830 {
4831 int mbuf_expand;
4832
4833 while (1) {
4834 lck_mtx_lock(mbuf_mlock);
4835
4836 mbuf_expand = 0;
4837 if (mbuf_expand_mcl) {
4838 int n;
4839
4840 /* Adjust to the current number of clusters in use */
4841 n = mbuf_expand_mcl -
4842 (m_total(MC_CL) - m_infree(MC_CL));
4843 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
4844 n = m_maxlimit(MC_CL) - m_total(MC_CL);
4845 mbuf_expand_mcl = 0;
4846
4847 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
4848 mbuf_expand++;
4849 }
4850 if (mbuf_expand_big) {
4851 int n;
4852
4853 /* Adjust to the current number of 4 KB clusters in use */
4854 n = mbuf_expand_big -
4855 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
4856 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
4857 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
4858 mbuf_expand_big = 0;
4859
4860 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
4861 mbuf_expand++;
4862 }
4863 if (mbuf_expand_16k) {
4864 int n;
4865
4866 /* Adjust to the current number of 16 KB clusters in use */
4867 n = mbuf_expand_16k -
4868 (m_total(MC_16KCL) - m_infree(MC_16KCL));
4869 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
4870 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
4871 mbuf_expand_16k = 0;
4872
4873 if (n > 0)
4874 (void) freelist_populate(MC_16KCL, n, M_WAIT);
4875 }
4876
4877 /*
4878 * Because we can run out of memory before filling the mbuf
4879 * map, we should not allocate more clusters than there are
4880 * mbufs -- otherwise we could have a large number of useless
4881 * clusters allocated.
4882 */
4883 if (mbuf_expand) {
4884 while (m_total(MC_MBUF) <
4885 (m_total(MC_BIGCL) + m_total(MC_CL))) {
4886 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
4887 break;
4888 }
4889 }
4890
4891 lck_mtx_unlock(mbuf_mlock);
4892
4893 assert_wait(&mbuf_worker_run, THREAD_UNINT);
4894 (void) thread_block((thread_continue_t)mbuf_worker_thread);
4895 }
4896 }
4897
4898 static void
4899 mbuf_worker_thread_init(void)
4900 {
4901 mbuf_worker_ready++;
4902 mbuf_worker_thread();
4903 }
4904
4905 static mcl_slab_t *
4906 slab_get(void *buf)
4907 {
4908 mcl_slabg_t *slg;
4909 unsigned int ix, k;
4910
4911 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4912
4913 VERIFY(MBUF_IN_MAP(buf));
4914 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
4915 VERIFY(ix < maxslabgrp);
4916
4917 if ((slg = slabstbl[ix]) == NULL) {
4918 /*
4919 * In the current implementation, we never shrink the memory
4920 * pool (hence the cluster map); if we attempt to reallocate
4921 * a cluster group when it's already allocated, panic since
4922 * this is a sign of a memory corruption (slabstbl[ix] got
4923 * nullified). This also means that there shouldn't be any
4924 * hole in the kernel sub-map for the mbuf pool.
4925 */
4926 ++slabgrp;
4927 VERIFY(ix < slabgrp);
4928 /*
4929 * Slabs expansion can only be done single threaded; when
4930 * we get here, it must be as a result of m_clalloc() which
4931 * is serialized and therefore mb_clalloc_busy must be set.
4932 */
4933 VERIFY(mb_clalloc_busy);
4934 lck_mtx_unlock(mbuf_mlock);
4935
4936 /* This is a new buffer; create the slabs group for it */
4937 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
4938 M_WAITOK | M_ZERO);
4939 VERIFY(slg != NULL);
4940
4941 lck_mtx_lock(mbuf_mlock);
4942 /*
4943 * No other thread could have gone into m_clalloc() after
4944 * we dropped the lock above, so verify that it's true.
4945 */
4946 VERIFY(mb_clalloc_busy);
4947
4948 slabstbl[ix] = slg;
4949
4950 /* Chain each slab in the group to its forward neighbor */
4951 for (k = 1; k < NSLABSPMB; k++)
4952 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
4953 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
4954
4955 /* And chain the last slab in the previous group to this */
4956 if (ix > 0) {
4957 VERIFY(slabstbl[ix - 1]->
4958 slg_slab[NSLABSPMB - 1].sl_next == NULL);
4959 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
4960 &slg->slg_slab[0];
4961 }
4962 }
4963
4964 ix = MTOCL(buf) % NSLABSPMB;
4965 VERIFY(ix < NSLABSPMB);
4966
4967 return (&slg->slg_slab[ix]);
4968 }
4969
4970 static void
4971 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
4972 void *base, void *head, unsigned int len, int refcnt, int chunks)
4973 {
4974 sp->sl_class = class;
4975 sp->sl_flags = flags;
4976 sp->sl_base = base;
4977 sp->sl_head = head;
4978 sp->sl_len = len;
4979 sp->sl_refcnt = refcnt;
4980 sp->sl_chunks = chunks;
4981 slab_detach(sp);
4982 }
4983
4984 static void
4985 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
4986 {
4987 VERIFY(slab_is_detached(sp));
4988 m_slab_cnt(class)++;
4989 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
4990 sp->sl_flags &= ~SLF_DETACHED;
4991 if (class == MC_BIGCL) {
4992 sp = sp->sl_next;
4993 /* Next slab must already be present */
4994 VERIFY(sp != NULL);
4995 VERIFY(slab_is_detached(sp));
4996 sp->sl_flags &= ~SLF_DETACHED;
4997 } else if (class == MC_16KCL) {
4998 int k;
4999 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5000 sp = sp->sl_next;
5001 /* Next slab must already be present */
5002 VERIFY(sp != NULL);
5003 VERIFY(slab_is_detached(sp));
5004 sp->sl_flags &= ~SLF_DETACHED;
5005 }
5006 }
5007 }
5008
5009 static void
5010 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5011 {
5012 VERIFY(!slab_is_detached(sp));
5013 VERIFY(m_slab_cnt(class) > 0);
5014 m_slab_cnt(class)--;
5015 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5016 slab_detach(sp);
5017 if (class == MC_BIGCL) {
5018 sp = sp->sl_next;
5019 /* Next slab must already be present */
5020 VERIFY(sp != NULL);
5021 VERIFY(!slab_is_detached(sp));
5022 slab_detach(sp);
5023 } else if (class == MC_16KCL) {
5024 int k;
5025 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5026 sp = sp->sl_next;
5027 /* Next slab must already be present */
5028 VERIFY(sp != NULL);
5029 VERIFY(!slab_is_detached(sp));
5030 slab_detach(sp);
5031 }
5032 }
5033 }
5034
5035 static boolean_t
5036 slab_inrange(mcl_slab_t *sp, void *buf)
5037 {
5038 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5039 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5040 }
5041
5042 #undef panic
5043
5044 static void
5045 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5046 {
5047 int i;
5048 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5049 uintptr_t buf = (uintptr_t)sp->sl_base;
5050
5051 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
5052 void *next = ((mcache_obj_t *)buf)->obj_next;
5053 if (next != addr)
5054 continue;
5055 if (mclaudit == NULL) {
5056 if (next != NULL && !MBUF_IN_MAP(next)) {
5057 mcache_t *cp = m_cache(sp->sl_class);
5058 panic("%s: %s buffer %p in slab %p modified "
5059 "after free at offset 0: %p out of range "
5060 "[%p-%p)\n", __func__, cp->mc_name,
5061 (void *)buf, sp, next, mbutl, embutl);
5062 /* NOTREACHED */
5063 }
5064 } else {
5065 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
5066 (mcache_obj_t *)buf);
5067 mcl_audit_verify_nextptr(next, mca);
5068 }
5069 }
5070 }
5071
5072 static void
5073 slab_detach(mcl_slab_t *sp)
5074 {
5075 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
5076 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
5077 sp->sl_flags |= SLF_DETACHED;
5078 }
5079
5080 static boolean_t
5081 slab_is_detached(mcl_slab_t *sp)
5082 {
5083 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
5084 (intptr_t)sp->sl_link.tqe_prev == -1 &&
5085 (sp->sl_flags & SLF_DETACHED));
5086 }
5087
5088 static void
5089 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
5090 mcache_obj_t **con_list, size_t con_size, unsigned int num)
5091 {
5092 mcache_audit_t *mca, *mca_tail;
5093 mcache_obj_t *con = NULL;
5094 boolean_t save_contents = (con_list != NULL);
5095 unsigned int i, ix;
5096
5097 ASSERT(num <= NMBPCL);
5098 ASSERT(con_list == NULL || con_size != 0);
5099
5100 ix = MTOCL(buf);
5101 /* Make sure we haven't been here before */
5102 for (i = 0; i < NMBPCL; i++)
5103 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
5104
5105 mca = mca_tail = *mca_list;
5106 if (save_contents)
5107 con = *con_list;
5108
5109 for (i = 0; i < num; i++) {
5110 mcache_audit_t *next;
5111
5112 next = mca->mca_next;
5113 bzero(mca, sizeof (*mca));
5114 mca->mca_next = next;
5115 mclaudit[ix].cl_audit[i] = mca;
5116
5117 /* Attach the contents buffer if requested */
5118 if (save_contents) {
5119 VERIFY(con != NULL);
5120 mca->mca_contents_size = con_size;
5121 mca->mca_contents = con;
5122 con = con->obj_next;
5123 bzero(mca->mca_contents, mca->mca_contents_size);
5124 }
5125
5126 mca_tail = mca;
5127 mca = mca->mca_next;
5128 }
5129
5130 if (save_contents)
5131 *con_list = con;
5132
5133 *mca_list = mca_tail->mca_next;
5134 mca_tail->mca_next = NULL;
5135 }
5136
5137 /*
5138 * Given an address of a buffer (mbuf/cluster/big cluster), return
5139 * the corresponding audit structure for that buffer.
5140 */
5141 static mcache_audit_t *
5142 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
5143 {
5144 mcache_audit_t *mca = NULL;
5145 int ix = MTOCL(o);
5146
5147 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
5148
5149 switch (class) {
5150 case MC_MBUF:
5151 /*
5152 * For the mbuf case, find the index of the cluster
5153 * used by the mbuf and use that index to locate the
5154 * base address of the cluster. Then find out the
5155 * mbuf index relative to the cluster base and use
5156 * it to locate the audit structure.
5157 */
5158 VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
5159 mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
5160 break;
5161
5162 case MC_CL:
5163 case MC_BIGCL:
5164 case MC_16KCL:
5165 /*
5166 * Same as above, but only return the first element.
5167 */
5168 mca = mclaudit[ix].cl_audit[0];
5169 break;
5170
5171 default:
5172 VERIFY(0);
5173 /* NOTREACHED */
5174 }
5175
5176 return (mca);
5177 }
5178
5179 static void
5180 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
5181 boolean_t alloc)
5182 {
5183 struct mbuf *m = addr;
5184 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
5185
5186 VERIFY(mca->mca_contents != NULL &&
5187 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
5188
5189 mcl_audit_verify_nextptr(next, mca);
5190
5191 if (!alloc) {
5192 /* Save constructed mbuf fields */
5193 mcl_audit_save_mbuf(m, mca);
5194 mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
5195 ((mcache_obj_t *)m)->obj_next = next;
5196 return;
5197 }
5198
5199 /* Check if the buffer has been corrupted while in freelist */
5200 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
5201
5202 /* Restore constructed mbuf fields */
5203 mcl_audit_restore_mbuf(m, mca, composite);
5204 }
5205
5206 static void
5207 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
5208 {
5209 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
5210
5211 if (composite) {
5212 struct mbuf *next = m->m_next;
5213 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
5214 MBUF_IS_COMPOSITE(ms));
5215 /*
5216 * We could have hand-picked the mbuf fields and restored
5217 * them individually, but that would be a maintenance
5218 * headache. Instead, restore everything that was saved;
5219 * the mbuf layer will recheck and reinitialize anyway.
5220 */
5221 bcopy(ms, m, mca->mca_contents_size);
5222 m->m_next = next;
5223 } else {
5224 /*
5225 * For a regular mbuf (no cluster attached) there's nothing
5226 * to restore other than the type field, which is expected
5227 * to be MT_FREE.
5228 */
5229 m->m_type = ms->m_type;
5230 }
5231 _MCHECK(m);
5232 }
5233
5234 static void
5235 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
5236 {
5237 _MCHECK(m);
5238 bcopy(m, mca->mca_contents, mca->mca_contents_size);
5239 }
5240
5241 static void
5242 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
5243 boolean_t save_next)
5244 {
5245 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
5246
5247 if (!alloc) {
5248 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
5249 if (save_next) {
5250 mcl_audit_verify_nextptr(next, mca);
5251 ((mcache_obj_t *)addr)->obj_next = next;
5252 }
5253 } else {
5254 /* Check if the buffer has been corrupted while in freelist */
5255 mcl_audit_verify_nextptr(next, mca);
5256 mcache_audit_free_verify_set(mca, addr, 0, size);
5257 }
5258 }
5259
5260 static void
5261 mcl_audit_mcheck_panic(struct mbuf *m)
5262 {
5263 mcache_audit_t *mca;
5264
5265 MRANGE(m);
5266 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
5267
5268 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
5269 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
5270 /* NOTREACHED */
5271 }
5272
5273 static void
5274 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5275 {
5276 if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
5277 !MBUF_IN_MAP(next)) {
5278 panic("mcl_audit: buffer %p modified after free at offset 0: "
5279 "%p out of range [%p-%p)\n%s\n",
5280 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
5281 /* NOTREACHED */
5282 }
5283 }
5284
5285 SYSCTL_DECL(_kern_ipc);
5286 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
5287 0, 0, mbstat_sysctl, "S,mbstat", "");
5288 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
5289 0, 0, mb_stat_sysctl, "S,mb_stat", "");
5290 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
5291 &mb_normalized, 0, "");