1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/simple_lock.h>
83 #include <kern/queue.h>
84 #include <kern/sched_prim.h>
85 #include <kern/cpu_number.h>
86
87 #include <libkern/OSAtomic.h>
88 #include <libkern/libkern.h>
89
90 #include <IOKit/IOMapper.h>
91
92 #include <machine/limits.h>
93 #include <machine/machine_routines.h>
94
95 #if CONFIG_MACF_NET
96 #include <security/mac_framework.h>
97 #endif /* CONFIG_MACF_NET */
98
99 #include <sys/mcache.h>
100
101 /*
102 * MBUF IMPLEMENTATION NOTES.
103 *
104 * There is a total of 5 per-CPU caches:
105 *
106 * MC_MBUF:
107 * This is a cache of rudimentary objects of MSIZE in size; each
108 * object represents an mbuf structure. This cache preserves only
109 * the m_type field of the mbuf during its transactions.
110 *
111 * MC_CL:
112 * This is a cache of rudimentary objects of MCLBYTES in size; each
113 * object represents an mcluster structure. This cache does not
114 * preserve the contents of the objects during its transactions.
115 *
116 * MC_BIGCL:
117 * This is a cache of rudimentary objects of NBPG in size; each
118 * object represents an mbigcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_MBUF_CL:
122 * This is a cache of mbufs each having a cluster attached to it.
123 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
124 * fields of the mbuf related to the external cluster are preserved
125 * during transactions.
126 *
127 * MC_MBUF_BIGCL:
128 * This is a cache of mbufs each having a big cluster attached to it.
129 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
130 * fields of the mbuf related to the external cluster are preserved
131 * during transactions.
132 *
133 * OBJECT ALLOCATION:
134 *
135 * Allocation requests are handled first at the per-CPU (mcache) layer
136 * before falling back to the slab layer. Performance is optimal when
137 * the request is satisfied at the CPU layer because global data/lock
138 * never gets accessed. When the slab layer is entered for allocation,
139 * the slab freelist will be checked first for available objects before
140 * the VM backing store is invoked. Slab layer operations are serialized
141 * for all of the caches as the mbuf global lock is held most of the time.
142 * Allocation paths are different depending on the class of objects:
143 *
144 * a. Rudimentary object:
145 *
146 * { m_get_common(), m_clattach(), m_mclget(),
147 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
148 * composite object allocation }
149 * | ^
150 * | |
151 * | +-----------------------+
152 * v |
153 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
154 * | ^
155 * v |
156 * [CPU cache] -------> (found?) -------+
157 * | |
158 * v |
159 * mbuf_slab_alloc() |
160 * | |
161 * v |
162 * +---------> [freelist] -------> (found?) -------+
163 * | |
164 * | v
165 * | m_clalloc()
166 * | |
167 * | v
168 * +---<<---- kmem_mb_alloc()
169 *
170 * b. Composite object:
171 *
172 * { m_getpackets_internal(), m_allocpacket_internal() }
173 * | ^
174 * | |
175 * | +------ (done) ---------+
176 * v |
177 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
178 * | ^
179 * v |
180 * [CPU cache] -------> (found?) -------+
181 * | |
182 * v |
183 * mbuf_cslab_alloc() |
184 * | |
185 * v |
186 * [freelist] -------> (found?) -------+
187 * | |
188 * v |
189 * (rudimentary object) |
190 * mcache_alloc/mcache_alloc_ext() ------>>-----+
191 *
192 * Auditing notes: If auditing is enabled, buffers will be subjected to
193 * integrity checks by the audit routine. This is done by verifying their
194 * contents against DEADBEEF (free) pattern before returning them to caller.
195 * As part of this step, the routine will also record the transaction and
196 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
197 * also restore any constructed data structure fields if necessary.
198 *
199 * OBJECT DEALLOCATION:
200 *
201 * Freeing an object simply involves placing it into the CPU cache; this
202 * pollutes the cache to benefit subsequent allocations. The slab layer
203 * will only be entered if the object is to be purged out of the cache.
204 * During normal operations, this happens only when the CPU layer resizes
205 * its bucket while it's adjusting to the allocation load. Deallocation
206 * paths are different depending on the class of objects:
207 *
208 * a. Rudimentary object:
209 *
210 * { m_free(), m_freem_list(), composite object deallocation }
211 * | ^
212 * | |
213 * | +------ (done) ---------+
214 * v |
215 * mcache_free/mcache_free_ext() |
216 * | |
217 * v |
218 * mbuf_slab_audit() |
219 * | |
220 * v |
221 * [CPU cache] ---> (not purging?) -----+
222 * | |
223 * v |
224 * mbuf_slab_free() |
225 * | |
226 * v |
227 * [freelist] ----------->>------------+
228 * (objects never get purged to VM)
229 *
230 * b. Composite object:
231 *
232 * { m_free(), m_freem_list() }
233 * | ^
234 * | |
235 * | +------ (done) ---------+
236 * v |
237 * mcache_free/mcache_free_ext() |
238 * | |
239 * v |
240 * mbuf_cslab_audit() |
241 * | |
242 * v |
243 * [CPU cache] ---> (not purging?) -----+
244 * | |
245 * v |
246 * mbuf_cslab_free() |
247 * | |
248 * v |
249 * [freelist] ---> (not purging?) -----+
250 * | |
251 * v |
252 * (rudimentary object) |
253 * mcache_free/mcache_free_ext() ------->>------+
254 *
255 * Auditing notes: If auditing is enabled, the audit routine will save
256 * any constructed data structure fields (if necessary) before filling the
257 * contents of the buffers with DEADBEEF (free) pattern and recording the
258 * transaction. Buffers that are freed (whether at CPU or slab layer) are
259 * expected to contain the free pattern.
260 *
261 * DEBUGGING:
262 *
263 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
264 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
265 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
266 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note
267 * that debugging consumes more CPU and memory.
268 *
269 * Each object is associated with exactly one mcache_audit_t structure that
270 * contains the information related to its last buffer transaction. Given
271 * an address of an object, the audit structure can be retrieved by finding
272 * the position of the object relative to the base address of the cluster:
273 *
274 * +------------+ +=============+
275 * | mbuf addr | | mclaudit[i] |
276 * +------------+ +=============+
277 * | | cl_audit[0] |
278 * i = MTOCL(addr) +-------------+
279 * | +-----> | cl_audit[1] | -----> mcache_audit_t
280 * b = CLTOM(i) | +-------------+
281 * | | | ... |
282 * x = MCLIDX(b, addr) | +-------------+
283 * | | | cl_audit[7] |
284 * +-----------------+ +-------------+
285 * (e.g. x == 1)
286 *
287 * The mclaudit[] array is allocated at initialization time, but its contents
288 * get populated when the corresponding cluster is created. Because a cluster
289 * can be turned into NMBPCL mbufs, we preserve enough space for all of the
290 * mbufs so that there is a 1-to-1 mapping between them. A cluster that never
291 * gets (or has not yet been) turned into mbufs will use only cl_audit[0], with
292 * the remaining entries unused. For big clusters, only one entry is allocated
293 * and used for the entire cluster pair.
294 */
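/*
 * As a concrete illustration of the audit lookup shown above, mapping an
 * object address to its mcache_audit_t amounts to roughly the following
 * (a minimal sketch; the actual lookup is performed by mcl_audit_buf2mca(),
 * declared further below, using the MTOCL/CLTOM/MCLIDX macros defined in
 * this file):
 *
 *	int i = MTOCL(addr);				(cluster index of the object)
 *	union mcluster *b = CLTOM(i);			(base address of that cluster)
 *	int x = MCLIDX(b, addr);			(mbuf slot within the cluster)
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
 */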
295
296 /* TODO: should be in header file */
297 /* kernel translator */
298 extern vm_offset_t kmem_mb_alloc(vm_map_t, int);
299 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
300 extern vm_map_t mb_map; /* special map */
301
302 /* Global lock */
303 static lck_mtx_t *mbuf_mlock;
304 static lck_attr_t *mbuf_mlock_attr;
305 static lck_grp_t *mbuf_mlock_grp;
306 static lck_grp_attr_t *mbuf_mlock_grp_attr;
307
308 /* Back-end (common) layer */
309 static void *mbuf_worker_run; /* wait channel for worker thread */
310 static int mbuf_worker_ready; /* worker thread is runnable */
311 static int mbuf_expand_mcl; /* number of cluster creation requests */
312 static int mbuf_expand_big; /* number of big cluster creation requests */
313 static int mbuf_expand_16k; /* number of 16K cluster creation requests */
314 static int ncpu; /* number of CPUs */
315 static int *mcl_paddr; /* Array of cluster physical addresses */
316 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
317 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
318 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
319 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
320 static unsigned int mb_normalized; /* number of packets "normalized" */
321
322 typedef enum {
323 MC_MBUF = 0, /* Regular mbuf */
324 MC_CL, /* Cluster */
325 MC_BIGCL, /* Large (4K) cluster */
326 MC_16KCL, /* Jumbo (16K) cluster */
327 MC_MBUF_CL, /* mbuf + cluster */
328 MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */
329 MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */
330 } mbuf_class_t;
331
332 #define MBUF_CLASS_MIN MC_MBUF
333 #define MBUF_CLASS_MAX MC_MBUF_16KCL
334 #define MBUF_CLASS_LAST MC_16KCL
335 #define MBUF_CLASS_VALID(c) \
336 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
337 #define MBUF_CLASS_COMPOSITE(c) \
338 ((int)(c) > MBUF_CLASS_LAST)
339
340
341 /*
342 * mbuf specific mcache allocation request flags.
343 */
344 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
345
346 /*
347 * Per-cluster slab structure.
348 *
349 * A slab is a cluster control structure that contains one or more object
350 * chunks; the available chunks are chained in the slab's freelist (sl_head).
351 * Each time a chunk is taken out of the slab, the slab's reference count
352 * gets incremented. When all chunks have been taken out, the empty slab
353 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
354 * returned to a slab causes the slab's reference count to be decremented;
355 * it also causes the slab to be reinserted into the class's slab list, if
356 * it is not already there.
357 *
358 * Compartmentalizing the object chunks into slabs allows us to easily
359 * merge one or more slabs together when the adjacent slabs are idle, as
360 * well as to convert or move a slab from one class to another; e.g. the
361 * mbuf cluster slab can be converted to a regular cluster slab when all
362 * mbufs in the slab have been freed.
363 *
364 * A slab may also span across multiple clusters for chunks larger than
365 * a cluster's size. In this case, only the slab of the first cluster is
366 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
367 * that they are part of the larger slab.
368 */
369 typedef struct mcl_slab {
370 struct mcl_slab *sl_next; /* neighboring slab */
371 u_int8_t sl_class; /* controlling mbuf class */
372 int8_t sl_refcnt; /* outstanding allocations */
373 int8_t sl_chunks; /* chunks (bufs) in this slab */
374 u_int16_t sl_flags; /* slab flags (see below) */
375 u_int16_t sl_len; /* slab length */
376 void *sl_base; /* base of allocated memory */
377 void *sl_head; /* first free buffer */
378 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
379 } mcl_slab_t;
380
381 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
382 #define SLF_PARTIAL 0x0002 /* part of another slab */
383 #define SLF_DETACHED 0x0004 /* not in slab freelist */
384
385 /*
386 * The array of slabs is broken into groups, one group per 1MB of kernel
387 * memory, to reduce the footprint. Each group is allocated on demand
388 * whenever a new piece of memory mapped in from the VM crosses the 1MB
389 * boundary.
390 */
391 #define MBSHIFT 20 /* 1MB */
392 #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */
393
394 typedef struct mcl_slabg {
395 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
396 } mcl_slabg_t;
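/*
 * Illustrative sketch of how a buffer address is mapped to its slab
 * through these groups; this is roughly the lookup performed by
 * slab_get(), declared further below (slabstbl, also defined below,
 * is the table of groups):
 *
 *	unsigned int ix = MTOCL(addr);				(cluster index)
 *	mcl_slabg_t *slg = slabstbl[ix / NSLABSPMB];		(1MB slab group)
 *	mcl_slab_t *sp = &slg->slg_slab[ix % NSLABSPMB];	(slab within group)
 */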
397
398 /*
399 * Per-cluster audit structure.
400 */
401 typedef struct {
402 mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */
403 } mcl_audit_t;
404
405 #if CONFIG_MBUF_NOEXPAND
406 static unsigned int maxmbufcl;
407 #endif /* CONFIG_MBUF_NOEXPAND */
408
409 /*
410 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
411 * and m_ext structures. If auditing is enabled, we allocate a shadow
412 * mbuf structure of this size inside each audit structure, and the
413 * contents of the real mbuf get copied into it when the mbuf is freed.
414 * This allows us to pattern-fill the mbuf for integrity check, and to
415 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
416 * Note that we don't save the contents of clusters when they are freed;
417 * we simply pattern-fill them.
418 */
419 #if defined(__LP64__)
420 #define AUDIT_CONTENTS_SIZE 160
421 #else
422 #define AUDIT_CONTENTS_SIZE 80
423 #endif /* __LP64__ */
424
425 /*
426 * mbuf specific mcache audit flags
427 */
428 #define MB_INUSE 0x01 /* object has not been returned to slab */
429 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
430 #define MB_SCVALID 0x04 /* object has valid saved contents */
431
432 /*
433 * Each of the following two arrays holds up to nmbclusters elements.
434 */
435 static mcl_audit_t *mclaudit; /* array of cluster audit information */
436 static mcl_slabg_t **slabstbl; /* cluster slabs table */
437 static unsigned int maxslabgrp; /* max # of entries in slabs table */
438 static unsigned int slabgrp; /* # of entries in slabs table */
439
440 /* Globals */
441 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
442 int njcl; /* # of clusters for jumbo sizes */
443 int njclbytes; /* size of a jumbo cluster */
444 union mcluster *mbutl; /* first mapped cluster address */
445 union mcluster *embutl; /* ending virtual address of mclusters */
446 int max_linkhdr; /* largest link-level header */
447 int max_protohdr; /* largest protocol header */
448 int max_hdr; /* largest link+protocol header */
449 int max_datalen; /* MHLEN - max_hdr */
450
451 /* TODO: should be in header file */
452 int do_reclaim = 0;
453
454 /* The minimum number of objects that are allocated, to start. */
455 #define MINCL 32
456 #define MINBIGCL (MINCL >> 1)
457 #define MIN16KCL (MINCL >> 2)
458
459 /* Low watermarks (only map in pages once free counts go below) */
460 #define MCL_LOWAT MINCL
461 #define MBIGCL_LOWAT MINBIGCL
462 #define M16KCL_LOWAT MIN16KCL
463
464 typedef struct {
465 mbuf_class_t mtbl_class; /* class type */
466 mcache_t *mtbl_cache; /* mcache for this buffer class */
467 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
468 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
469 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
470 u_int32_t mtbl_maxsize; /* maximum buffer size */
471 int mtbl_minlimit; /* minimum allowed */
472 int mtbl_maxlimit; /* maximum allowed */
473 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
474 } mbuf_table_t;
475
476 #define m_class(c) mbuf_table[c].mtbl_class
477 #define m_cache(c) mbuf_table[c].mtbl_cache
478 #define m_slablist(c) mbuf_table[c].mtbl_slablist
479 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
480 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
481 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
482 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
483 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
484 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
485 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
486 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
487 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
488 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
489 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
490 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
491 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
492 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
493 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
494 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
495 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
496
497 static mbuf_table_t mbuf_table[] = {
498 /*
499 * The caches for mbufs, regular clusters and big clusters.
500 */
501 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
502 NULL, NULL, 0, 0, 0, 0 },
503 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
504 NULL, NULL, 0, 0, 0, 0 },
505 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
506 NULL, NULL, 0, 0, 0, 0 },
507 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
508 NULL, NULL, 0, 0, 0, 0 },
509 /*
510 * The following are special caches; they serve as intermediate
511 * caches backed by the above rudimentary caches. Each object
512 * in the cache is an mbuf with a cluster attached to it. Unlike
513 * the above caches, these intermediate caches do not directly
514 * deal with the slab structures; instead, the constructed
515 * cached elements are simply stored in the freelists.
516 */
517 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
518 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
519 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
520 };
521
522 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
523
524 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
525 static int mb_waiters; /* number of sleepers */
526
527 /* The following are used to serialize m_clalloc() */
528 static boolean_t mb_clalloc_busy;
529 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
530 static int mb_clalloc_waiters;
531
532 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
533 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
534 static void mbuf_table_init(void);
535 static inline void m_incref(struct mbuf *);
536 static inline u_int32_t m_decref(struct mbuf *);
537 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
538 static void mbuf_worker_thread_init(void);
539 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
540 static void slab_free(mbuf_class_t, mcache_obj_t *);
541 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
542 unsigned int, int);
543 static void mbuf_slab_free(void *, mcache_obj_t *, int);
544 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
545 static void mbuf_slab_notify(void *, u_int32_t);
546 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
547 unsigned int);
548 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
549 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
550 unsigned int, int);
551 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
552 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
553 static int freelist_populate(mbuf_class_t, unsigned int, int);
554 static boolean_t mbuf_cached_above(mbuf_class_t, int);
555 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
556 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
557 static int m_howmany(int, size_t);
558 static void mbuf_worker_thread(void);
559 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
560
561 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
562 size_t, unsigned int);
563 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
564 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
565 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
566 boolean_t);
567 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
568 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
569 static void mcl_audit_mcheck_panic(struct mbuf *);
570 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
571
572 static mcl_slab_t *slab_get(void *);
573 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
574 void *, void *, unsigned int, int, int);
575 static void slab_insert(mcl_slab_t *, mbuf_class_t);
576 static void slab_remove(mcl_slab_t *, mbuf_class_t);
577 static boolean_t slab_inrange(mcl_slab_t *, void *);
578 static void slab_nextptr_panic(mcl_slab_t *, void *);
579 static void slab_detach(mcl_slab_t *);
580 static boolean_t slab_is_detached(mcl_slab_t *);
581
582 /*
583 * This flag is set for all mbufs that come out of and into the composite
584 * mbuf + cluster caches, i.e. MC_MBUF_CL, MC_MBUF_BIGCL and MC_MBUF_16KCL. mbufs that
585 * are marked with such a flag have clusters attached to them, and will be
586 * treated differently when they are freed; instead of being placed back
587 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
588 * are placed back into the appropriate composite cache's freelist, and the
589 * actual freeing is deferred until the composite objects are purged. At
590 * such a time, this flag will be cleared from the mbufs and the objects
591 * will be freed into their own separate freelists.
592 */
593 #define EXTF_COMPOSITE 0x1
594
595 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
596 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
597 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
598 #define MBUF_IS_COMPOSITE(m) \
599 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
600
601 /*
602 * Macros used to verify the integrity of the mbuf.
603 */
604 #define _MCHECK(m) { \
605 if ((m)->m_type != MT_FREE) { \
606 if (mclaudit == NULL) \
607 panic("MCHECK: m_type=%d m=%p", \
608 (u_int16_t)(m)->m_type, m); \
609 else \
610 mcl_audit_mcheck_panic(m); \
611 } \
612 }
613
614 #define MBUF_IN_MAP(addr) \
615 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
616
617 #define MRANGE(addr) { \
618 if (!MBUF_IN_MAP(addr)) \
619 panic("MRANGE: address out of range 0x%p", addr); \
620 }
621
622 /*
623 * Macro version of mtod.
624 */
625 #define MTOD(m, t) ((t)((m)->m_data))
626
627 /*
628 * Macros to obtain cluster index and base cluster address.
629 */
630 #define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT)
631 #define CLTOM(x) ((union mcluster *)(mbutl + (x)))
632
633 /*
634 * Macro to find the mbuf index relative to the cluster base.
635 */
636 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8)
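/*
 * Worked example (assuming the usual 2KB clusters, i.e. MCLSHIFT == 11,
 * and 256-byte mbufs, so that NMBPCL == 8): for an mbuf located 0x1300
 * bytes past mbutl, MTOCL() yields 0x1300 >> 11 == 2, CLTOM(2) points at
 * byte offset 0x1000 (the base of cluster 2), and MCLIDX() then yields
 * (0x1300 - 0x1000) >> 8 == 3, i.e. the 4th mbuf slot in that cluster.
 */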
637
638 /*
639 * Macros used during mbuf and cluster initialization.
640 */
641 #define MBUF_INIT(m, pkthdr, type) { \
642 _MCHECK(m); \
643 (m)->m_next = (m)->m_nextpkt = NULL; \
644 (m)->m_len = 0; \
645 (m)->m_type = type; \
646 if ((pkthdr) == 0) { \
647 (m)->m_data = (m)->m_dat; \
648 (m)->m_flags = 0; \
649 } else { \
650 (m)->m_data = (m)->m_pktdat; \
651 (m)->m_flags = M_PKTHDR; \
652 (m)->m_pkthdr.rcvif = NULL; \
653 (m)->m_pkthdr.len = 0; \
654 (m)->m_pkthdr.header = NULL; \
655 (m)->m_pkthdr.csum_flags = 0; \
656 (m)->m_pkthdr.csum_data = 0; \
657 (m)->m_pkthdr.reserved0 = NULL; \
658 (m)->m_pkthdr.vlan_tag = 0; \
659 (m)->m_pkthdr.socket_id = 0; \
660 m_tag_init(m); \
661 } \
662 }
663
664 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
665 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
666 (m)->m_flags |= M_EXT; \
667 (m)->m_ext.ext_size = (size); \
668 (m)->m_ext.ext_free = (free); \
669 (m)->m_ext.ext_arg = (arg); \
670 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
671 &(m)->m_ext.ext_refs; \
672 MEXT_RFA(m) = (rfa); \
673 MEXT_REF(m) = (ref); \
674 MEXT_FLAGS(m) = (flag); \
675 }
676
677 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
678 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
679
680 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
681 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
682
683 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
684 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
685
686 /*
687 * Macro to convert BSD malloc sleep flag to mcache's
688 */
689 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
690
691 /*
692 * The structure that holds all mbuf class statistics exportable via sysctl.
693 * Similar to mbstat structure, the mb_stat structure is protected by the
694 * global mbuf lock. It contains additional information about the classes
695 * that allows for a more accurate view of the state of the allocator.
696 */
697 struct mb_stat *mb_stat;
698
699 #define MB_STAT_SIZE(n) \
700 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
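/*
 * MB_STAT_SIZE(n) computes the number of bytes needed for an mb_stat
 * structure whose variable-length mbs_class[] array holds n entries;
 * it is effectively offsetof(mb_stat_t, mbs_class[n]). For example,
 * mbuf_table_init() below allocates mb_stat with
 * MB_STAT_SIZE(NELEM(mbuf_table)) bytes, and mb_stat_sysctl() copies
 * out exactly that many bytes.
 */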
701
702 /*
703 * The legacy structure holding all of the mbuf allocation statistics.
704 * The actual statistics used by the kernel are stored in the mbuf_table
705 * instead, and are updated atomically while the global mbuf lock is held.
706 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
707 * Unlike before, the kernel no longer relies on the contents of mbstat for
708 * its operations (e.g. cluster expansion) because the structure is exposed
709 * to the outside and could possibly be modified, making it unsafe to rely on.
710 * With the exception of the mbstat.m_mtypes array (see below), all of the
711 * statistics are updated as they change.
712 */
713 struct mbstat mbstat;
714
715 #define MBSTAT_MTYPES_MAX \
716 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
717
718 /*
719 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
720 * atomically and stored in a per-CPU structure which is lock-free; this is
721 * done in order to avoid writing to the global mbstat data structure which
722 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
723 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
724 * array and returned to the application. Any updates for types greater
725 * than or equal to MT_MAX are done atomically on mbstat; this slows down
726 * performance but is okay since the kernel uses only up to MT_MAX-1, while
727 * anything beyond that (up to type 255) is considered a corner case.
728 */
729 typedef struct {
730 unsigned int cpu_mtypes[MT_MAX];
731 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
732
733 typedef struct {
734 mtypes_cpu_t mbs_cpu[1];
735 } mbuf_mtypes_t;
736
737 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
738
739 #define MBUF_MTYPES_SIZE(n) \
740 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
741
742 #define MTYPES_CPU(p) \
743 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
744
745 /* This should be in a header file */
746 #define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a))
747
748 #define mtype_stat_add(type, n) { \
749 if ((unsigned)(type) < MT_MAX) { \
750 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
751 atomic_add_32(&mbs->cpu_mtypes[type], n); \
752 } else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \
753 atomic_add_32(&mbstat.m_mtypes[type], n); \
754 } \
755 }
756
757 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
758 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
759 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
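/*
 * For example, the mtype_stat_add(MT_FREE, -NMBPCL) call in slab_free()
 * below adjusts the calling CPU's cpu_mtypes[MT_FREE] counter atomically,
 * without taking any global lock; only type values >= MT_MAX fall through
 * to the global mbstat.m_mtypes[] array.
 */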
760
761 static int
762 mbstat_sysctl SYSCTL_HANDLER_ARGS
763 {
764 #pragma unused(oidp, arg1, arg2)
765 int m, n;
766 mtypes_cpu_t mtc;
767
768 bzero(&mtc, sizeof (mtc));
769 for (m = 0; m < ncpu; m++) {
770 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
771 mtypes_cpu_t temp;
772
773 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
774 sizeof (temp.cpu_mtypes));
775
776 for (n = 0; n < MT_MAX; n++)
777 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
778 }
779 lck_mtx_lock(mbuf_mlock);
780 for (n = 0; n < MT_MAX; n++)
781 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
782 lck_mtx_unlock(mbuf_mlock);
783
784 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
785 }
786
787 static int
788 mb_stat_sysctl SYSCTL_HANDLER_ARGS
789 {
790 #pragma unused(oidp, arg1, arg2)
791 mcache_t *cp;
792 mcache_cpu_t *ccp;
793 mb_class_stat_t *sp;
794 int k, m, bktsize;
795
796 lck_mtx_lock(mbuf_mlock);
797 for (k = 0; k < NELEM(mbuf_table); k++) {
798 cp = m_cache(k);
799 ccp = &cp->mc_cpu[0];
800 bktsize = ccp->cc_bktsize;
801 sp = mbuf_table[k].mtbl_stats;
802
803 if (cp->mc_flags & MCF_NOCPUCACHE)
804 sp->mbcl_mc_state = MCS_DISABLED;
805 else if (cp->mc_purge_cnt > 0)
806 sp->mbcl_mc_state = MCS_PURGING;
807 else if (bktsize == 0)
808 sp->mbcl_mc_state = MCS_OFFLINE;
809 else
810 sp->mbcl_mc_state = MCS_ONLINE;
811
812 sp->mbcl_mc_cached = 0;
813 for (m = 0; m < ncpu; m++) {
814 ccp = &cp->mc_cpu[m];
815 if (ccp->cc_objs > 0)
816 sp->mbcl_mc_cached += ccp->cc_objs;
817 if (ccp->cc_pobjs > 0)
818 sp->mbcl_mc_cached += ccp->cc_pobjs;
819 }
820 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
821 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
822 sp->mbcl_infree;
823
824 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
825 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
826 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
827
828 /* Calculate total count specific to each class */
829 sp->mbcl_ctotal = sp->mbcl_total;
830 switch (m_class(k)) {
831 case MC_MBUF:
832 /* Deduct mbufs used in composite caches */
833 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
834 m_total(MC_MBUF_BIGCL));
835 break;
836
837 case MC_CL:
838 /* Deduct clusters used in composite cache and mbufs */
839 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
840 (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
841 break;
842
843 case MC_BIGCL:
844 /* Deduct clusters used in composite cache */
845 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
846 break;
847
848 case MC_16KCL:
849 /* Deduct clusters used in composite cache */
850 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
851 break;
852
853 default:
854 break;
855 }
856 }
857 lck_mtx_unlock(mbuf_mlock);
858
859 return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table))));
860 }
861
862 static inline void
863 m_incref(struct mbuf *m)
864 {
865 UInt32 old, new;
866 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
867
868 do {
869 old = *addr;
870 new = old + 1;
871 ASSERT(new != 0);
872 } while (!OSCompareAndSwap(old, new, addr));
873 }
874
875 static inline u_int32_t
876 m_decref(struct mbuf *m)
877 {
878 UInt32 old, new;
879 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
880
881 do {
882 old = *addr;
883 new = old - 1;
884 ASSERT(old != 0);
885 } while (!OSCompareAndSwap(old, new, addr));
886
887 return (new);
888 }
889
890 static void
891 mbuf_table_init(void)
892 {
893 int m;
894
895 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
896 M_TEMP, M_WAITOK | M_ZERO);
897 VERIFY(mb_stat != NULL);
898
899 mb_stat->mbs_cnt = NELEM(mbuf_table);
900 for (m = 0; m < NELEM(mbuf_table); m++)
901 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
902
903 #if CONFIG_MBUF_JUMBO
904 /*
905 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
906 * this only on platforms where jumbo cluster pool is enabled.
907 */
908 njcl = nmbclusters / 3;
909 njclbytes = M16KCLBYTES;
910 #endif /* CONFIG_MBUF_JUMBO */
911
912 /*
913 * nclusters is going to be split in 2 to hold both the 2K
914 * and the 4K pools, so make sure each half is even.
915 */
916 nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
917 if (njcl > 0) {
918 /*
919 * Each jumbo cluster takes 8 2K clusters, so make
920 * sure that the pool size is evenly divisible by 8.
921 */
922 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
923 }
924
925 #if CONFIG_MBUF_NOEXPAND
926 /* Only use 4k clusters if we're setting aside more than 256k */
927 if (nmbclusters <= 128) {
928 maxmbufcl = nmbclusters / 4;
929 } else {
930 /* Half to big clusters, half to small */
931 maxmbufcl = (nmbclusters / 4) * 3;
932 }
933 #endif /* CONFIG_MBUF_NOEXPAND */
934
935 /*
936 * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th
937 * of the total number of 2K clusters allocated is reserved and cannot
938 * be turned into mbufs. It can only be used for pure cluster objects.
939 */
940 m_minlimit(MC_CL) = (nclusters >> 5);
941 m_maxlimit(MC_CL) = (nclusters >> 1);
942 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
943 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
944
945 /*
946 * The remaining (15/16th) can be turned into mbufs.
947 */
948 m_minlimit(MC_MBUF) = 0;
949 m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
950 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
951 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
952
953 /*
954 * The other 1/2 of the map is reserved for 4K clusters.
955 */
956 m_minlimit(MC_BIGCL) = 0;
957 m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
958 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
959 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
960
961 /*
962 * Set limits for the composite classes.
963 */
964 m_minlimit(MC_MBUF_CL) = 0;
965 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
966 m_maxsize(MC_MBUF_CL) = MCLBYTES;
967 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
968 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
969
970 m_minlimit(MC_MBUF_BIGCL) = 0;
971 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
972 m_maxsize(MC_MBUF_BIGCL) = NBPG;
973 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
974 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
975
976 /*
977 * And for jumbo classes.
978 */
979 m_minlimit(MC_16KCL) = 0;
980 m_maxlimit(MC_16KCL) = (njcl >> 3);
981 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
982 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
983
984 m_minlimit(MC_MBUF_16KCL) = 0;
985 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
986 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
987 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
988 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
989
990 /*
991 * Initialize the legacy mbstat structure.
992 */
993 bzero(&mbstat, sizeof (mbstat));
994 mbstat.m_msize = m_maxsize(MC_MBUF);
995 mbstat.m_mclbytes = m_maxsize(MC_CL);
996 mbstat.m_minclsize = MINCLSIZE;
997 mbstat.m_mlen = MLEN;
998 mbstat.m_mhlen = MHLEN;
999 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1000 }
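/*
 * Worked example of the limits computed above, using a hypothetical
 * nmbclusters of 32768 with no jumbo pool (njcl == 0) and NMBPCL == 8:
 *
 *	nclusters		= 32768
 *	m_maxlimit(MC_CL)	= 32768 >> 1 = 16384
 *	m_minlimit(MC_CL)	= 32768 >> 5 = 1024 (pure cluster objects only)
 *	m_maxlimit(MC_MBUF)	= (16384 - 1024) * 8 = 122880
 *	m_maxlimit(MC_BIGCL)	= 16384 >> 1 = 8192
 */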
1001
1002 __private_extern__ void
1003 mbinit(void)
1004 {
1005 unsigned int m;
1006 int initmcl = MINCL;
1007 int mcl_pages;
1008 void *buf;
1009
1010 if (nmbclusters == 0)
1011 nmbclusters = NMBCLUSTERS;
1012
1013 /* Setup the mbuf table */
1014 mbuf_table_init();
1015
1016 /* Global lock for common layer */
1017 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1018 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1019 mbuf_mlock_attr = lck_attr_alloc_init();
1020 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1021
1022 /* Allocate cluster slabs table */
1023 maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
1024 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1025 M_TEMP, M_WAITOK | M_ZERO);
1026 VERIFY(slabstbl != NULL);
1027
1028 /* Allocate audit structures if needed */
1029 PE_parse_boot_arg("mbuf_debug", &mbuf_debug);
1030 mbuf_debug |= mcache_getflags();
1031 if (mbuf_debug & MCF_AUDIT) {
1032 MALLOC(mclaudit, mcl_audit_t *,
1033 nmbclusters * sizeof (*mclaudit), M_TEMP,
1034 M_WAITOK | M_ZERO);
1035 VERIFY(mclaudit != NULL);
1036
1037 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1038 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1039 VERIFY(mcl_audit_con_cache != NULL);
1040 }
1041
1042 /* Calculate the number of pages assigned to the cluster pool */
1043 mcl_pages = nmbclusters/(NBPG/CLBYTES);
1044 MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK);
1045 VERIFY(mcl_paddr != NULL);
1046
1047 /* Register with the I/O Bus mapper */
1048 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1049 bzero((char *)mcl_paddr, mcl_pages * sizeof (int));
1050
1051 embutl = (union mcluster *)
1052 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1053
1054 PE_parse_boot_arg("initmcl", &initmcl);
1055
1056 lck_mtx_lock(mbuf_mlock);
1057
1058 if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
1059 panic("mbinit: m_clalloc failed\n");
1060
1061 lck_mtx_unlock(mbuf_mlock);
1062
1063 (void) kernel_thread(kernel_task, mbuf_worker_thread_init);
1064
1065 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1066 0, 0, MCR_SLEEP);
1067
1068 /* Create the cache for each class */
1069 for (m = 0; m < NELEM(mbuf_table); m++) {
1070 void *allocfunc, *freefunc, *auditfunc;
1071 u_int32_t flags;
1072
1073 flags = mbuf_debug;
1074 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1075 m_class(m) == MC_MBUF_16KCL) {
1076 allocfunc = mbuf_cslab_alloc;
1077 freefunc = mbuf_cslab_free;
1078 auditfunc = mbuf_cslab_audit;
1079 } else {
1080 allocfunc = mbuf_slab_alloc;
1081 freefunc = mbuf_slab_free;
1082 auditfunc = mbuf_slab_audit;
1083 }
1084
1085 /*
1086 * Disable per-CPU caches for jumbo classes if there
1087 * is no jumbo cluster pool available in the system.
1088 * The cache itself is still created (but will never
1089 * be populated) since it simplifies the code.
1090 */
1091 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1092 njcl == 0)
1093 flags |= MCF_NOCPUCACHE;
1094
1095 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1096 allocfunc, freefunc, auditfunc, mbuf_slab_notify,
1097 (void *)m, flags, MCR_SLEEP);
1098 }
1099
1100 /*
1101 * Allocate structure for per-CPU statistics that's aligned
1102 * on the CPU cache boundary; this code assumes that we never
1103 * uninitialize this framework, since the original address
1104 * before alignment is not saved.
1105 */
1106 ncpu = ml_get_max_cpus();
1107 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1108 M_TEMP, M_WAITOK);
1109 VERIFY(buf != NULL);
1110
1111 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1112 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1113
1114 printf("mbinit: done\n");
1115 }
1116
1117 /*
1118 * Obtain a slab of object(s) from the class's freelist.
1119 */
1120 static mcache_obj_t *
1121 slab_alloc(mbuf_class_t class, int wait)
1122 {
1123 mcl_slab_t *sp;
1124 mcache_obj_t *buf;
1125
1126 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1127
1128 VERIFY(class != MC_16KCL || njcl > 0);
1129
1130 /* This should always be NULL for us */
1131 VERIFY(m_cobjlist(class) == NULL);
1132
1133 /*
1134 * Treat composite objects as having longer lifespan by using
1135 * a slab from the reverse direction, in hoping that this could
1136 * reduce the probability of fragmentation for slabs that hold
1137 * more than one buffer chunks (e.g. mbuf slabs). For other
1138 * slabs, this probably doesn't make much of a difference.
1139 */
1140 if (class == MC_MBUF && (wait & MCR_COMP))
1141 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1142 else
1143 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1144
1145 if (sp == NULL) {
1146 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1147 /* The slab list for this class is empty */
1148 return (NULL);
1149 }
1150
1151 VERIFY(m_infree(class) > 0);
1152 VERIFY(!slab_is_detached(sp));
1153 VERIFY(sp->sl_class == class &&
1154 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1155 buf = sp->sl_head;
1156 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1157
1158 if (class == MC_MBUF) {
1159 sp->sl_head = buf->obj_next;
1160 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
1161 } else {
1162 sp->sl_head = NULL;
1163 }
1164 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1165 slab_nextptr_panic(sp, sp->sl_head);
1166 /* In case sl_head is in the map but not in the slab */
1167 VERIFY(slab_inrange(sp, sp->sl_head));
1168 /* NOTREACHED */
1169 }
1170
1171 /* Increment slab reference */
1172 sp->sl_refcnt++;
1173
1174 if (mclaudit != NULL) {
1175 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1176 mca->mca_uflags = 0;
1177 /* Save contents on mbuf objects only */
1178 if (class == MC_MBUF)
1179 mca->mca_uflags |= MB_SCVALID;
1180 }
1181
1182 if (class == MC_CL) {
1183 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1184 /*
1185 * A 2K cluster slab can have at most 1 reference.
1186 */
1187 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1188 sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
1189 } else if (class == MC_BIGCL) {
1190 mcl_slab_t *nsp = sp->sl_next;
1191 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1192 m_infree(MC_MBUF_BIGCL);
1193 /*
1194 * Increment 2nd slab. A 4K big cluster takes
1195 * 2 slabs, each having at most 1 reference.
1196 */
1197 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1198 sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
1199 /* Next slab must already be present */
1200 VERIFY(nsp != NULL);
1201 nsp->sl_refcnt++;
1202 VERIFY(!slab_is_detached(nsp));
1203 VERIFY(nsp->sl_class == MC_BIGCL &&
1204 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1205 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1206 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1207 nsp->sl_head == NULL);
1208 } else if (class == MC_16KCL) {
1209 mcl_slab_t *nsp;
1210 int k;
1211
1212 --m_infree(MC_16KCL);
1213 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1214 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1215 /*
1216 * Increment 2nd-8th slab. A 16K big cluster takes
1217 * 8 cluster slabs, each having at most 1 reference.
1218 */
1219 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1220 nsp = nsp->sl_next;
1221 /* Next slab must already be present */
1222 VERIFY(nsp != NULL);
1223 nsp->sl_refcnt++;
1224 VERIFY(!slab_is_detached(nsp));
1225 VERIFY(nsp->sl_class == MC_16KCL &&
1226 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1227 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1228 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1229 nsp->sl_head == NULL);
1230 }
1231 } else {
1232 ASSERT(class == MC_MBUF);
1233 --m_infree(MC_MBUF);
1234 /*
1235 * If auditing is turned on, this check is
1236 * deferred until later in mbuf_slab_audit().
1237 */
1238 if (mclaudit == NULL)
1239 _MCHECK((struct mbuf *)buf);
1240 /*
1241 * Since we have incremented the reference count above,
1242 * an mbuf slab (formerly a 2K cluster slab that was cut
1243 * up into mbufs) must have a reference count between 1
1244 * and NMBPCL at this point.
1245 */
1246 VERIFY(sp->sl_refcnt >= 1 &&
1247 (unsigned short)sp->sl_refcnt <= NMBPCL &&
1248 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1249 VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
1250 sp->sl_head == NULL);
1251 }
1252
1253 /* If empty, remove this slab from the class's freelist */
1254 if (sp->sl_head == NULL) {
1255 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
1256 slab_remove(sp, class);
1257 }
1258
1259 return (buf);
1260 }
1261
1262 /*
1263 * Place a slab of object(s) back into a class's slab list.
1264 */
1265 static void
1266 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1267 {
1268 mcl_slab_t *sp;
1269
1270 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1271
1272 VERIFY(class != MC_16KCL || njcl > 0);
1273 VERIFY(buf->obj_next == NULL);
1274 sp = slab_get(buf);
1275 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1276 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1277
1278 /* Decrement slab reference */
1279 sp->sl_refcnt--;
1280
1281 if (class == MC_CL || class == MC_BIGCL) {
1282 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1283 /*
1284 * A 2K cluster slab can have at most 1 reference
1285 * which must be 0 at this point.
1286 */
1287 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1288 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1289 VERIFY(slab_is_detached(sp));
1290 if (class == MC_BIGCL) {
1291 mcl_slab_t *nsp = sp->sl_next;
1292 VERIFY(IS_P2ALIGNED(buf, NBPG));
1293 /* Next slab must already be present */
1294 VERIFY(nsp != NULL);
1295 /* Decrement 2nd slab reference */
1296 nsp->sl_refcnt--;
1297 /*
1298 * A 4K big cluster takes 2 slabs, both
1299 * must now have 0 reference.
1300 */
1301 VERIFY(slab_is_detached(nsp));
1302 VERIFY(nsp->sl_class == MC_BIGCL &&
1303 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1304 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1305 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1306 nsp->sl_head == NULL);
1307 }
1308 } else if (class == MC_16KCL) {
1309 mcl_slab_t *nsp;
1310 int k;
1311 /*
1312 * A 16K cluster takes 8 cluster slabs, all must
1313 * now have 0 reference.
1314 */
1315 VERIFY(IS_P2ALIGNED(buf, NBPG));
1316 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1317 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1318 VERIFY(slab_is_detached(sp));
1319 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1320 nsp = nsp->sl_next;
1321 /* Next slab must already be present */
1322 VERIFY(nsp != NULL);
1323 nsp->sl_refcnt--;
1324 VERIFY(slab_is_detached(nsp));
1325 VERIFY(nsp->sl_class == MC_16KCL &&
1326 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1327 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1328 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1329 nsp->sl_head == NULL);
1330 }
1331 } else {
1332 /*
1333 * An mbuf slab has a total of NMBPCL reference counts.
1334 * Since we have decremented the reference above, it
1335 * must now be between 0 and NMBPCL-1.
1336 */
1337 VERIFY(sp->sl_refcnt >= 0 &&
1338 (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
1339 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1340 VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
1341 (slab_is_detached(sp) && sp->sl_head == NULL));
1342 }
1343
1344 /*
1345 * When auditing is enabled, ensure that the buffer still
1346 * contains the free pattern. Otherwise it got corrupted
1347 * while at the CPU cache layer.
1348 */
1349 if (mclaudit != NULL) {
1350 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1351 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1352 mca->mca_uflags &= ~MB_SCVALID;
1353 }
1354
1355 if (class == MC_CL) {
1356 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1357 } else if (class == MC_BIGCL) {
1358 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1359 m_infree(MC_MBUF_BIGCL);
1360 } else if (class == MC_16KCL) {
1361 ++m_infree(MC_16KCL);
1362 } else {
1363 ++m_infree(MC_MBUF);
1364 buf->obj_next = sp->sl_head;
1365 }
1366 sp->sl_head = buf;
1367
1368 /* All mbufs are freed; return the cluster that we stole earlier */
1369 if (sp->sl_refcnt == 0 && class == MC_MBUF) {
1370 int i = NMBPCL;
1371
1372 m_total(MC_MBUF) -= NMBPCL;
1373 mbstat.m_mbufs = m_total(MC_MBUF);
1374 m_infree(MC_MBUF) -= NMBPCL;
1375 mtype_stat_add(MT_FREE, -NMBPCL);
1376
1377 while (i--) {
1378 struct mbuf *m = sp->sl_head;
1379 VERIFY(m != NULL);
1380 sp->sl_head = m->m_next;
1381 m->m_next = NULL;
1382 }
1383 VERIFY(sp->sl_head == NULL);
1384
1385 /* Remove the slab from the mbuf class's slab list */
1386 slab_remove(sp, class);
1387
1388 /* Reinitialize it as a 2K cluster slab */
1389 slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
1390 sp->sl_len, 0, 1);
1391
1392 if (mclaudit != NULL)
1393 mcache_set_pattern(MCACHE_FREE_PATTERN,
1394 (caddr_t)sp->sl_head, m_maxsize(MC_CL));
1395
1396 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1397
1398 VERIFY(slab_is_detached(sp));
1399 /* And finally switch class */
1400 class = MC_CL;
1401 }
1402
1403 /* Reinsert the slab to the class's slab list */
1404 if (slab_is_detached(sp))
1405 slab_insert(sp, class);
1406 }
1407
1408 /*
1409 * Common allocator for rudimentary objects called by the CPU cache layer
1410 * during an allocation request whenever there is no available element in the
1411 * bucket layer. It returns one or more elements from the appropriate global
1412 * freelist. If the freelist is empty, it will attempt to populate it and
1413 * retry the allocation.
1414 */
1415 static unsigned int
1416 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1417 {
1418 mbuf_class_t class = (mbuf_class_t)arg;
1419 unsigned int need = num;
1420 mcache_obj_t **list = *plist;
1421
1422 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1423 ASSERT(need > 0);
1424
1425 lck_mtx_lock(mbuf_mlock);
1426
1427 for (;;) {
1428 if ((*list = slab_alloc(class, wait)) != NULL) {
1429 (*list)->obj_next = NULL;
1430 list = *plist = &(*list)->obj_next;
1431
1432 if (--need == 0) {
1433 /*
1434 * If the number of elements in freelist has
1435 * dropped below low watermark, asynchronously
1436 * populate the freelist now rather than doing
1437 * it later when we run out of elements.
1438 */
1439 if (!mbuf_cached_above(class, wait) &&
1440 m_infree(class) < m_total(class) >> 5) {
1441 (void) freelist_populate(class, 1,
1442 M_DONTWAIT);
1443 }
1444 break;
1445 }
1446 } else {
1447 VERIFY(m_infree(class) == 0 || class == MC_CL);
1448
1449 (void) freelist_populate(class, 1,
1450 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1451
1452 if (m_infree(class) > 0)
1453 continue;
1454
1455 /* Check if there's anything at the cache layer */
1456 if (mbuf_cached_above(class, wait))
1457 break;
1458
1459 /* We have nothing and cannot block; give up */
1460 if (wait & MCR_NOSLEEP) {
1461 if (!(wait & MCR_TRYHARD)) {
1462 m_fail_cnt(class)++;
1463 mbstat.m_drops++;
1464 break;
1465 }
1466 }
1467
1468 /*
1469 * If the freelist is still empty and the caller is
1470 * willing to be blocked, sleep on the wait channel
1471 * until an element is available. Otherwise, if
1472 * MCR_TRYHARD is set, do our best to satisfy the
1473 * request without having to go to sleep.
1474 */
1475 if (mbuf_worker_ready &&
1476 mbuf_sleep(class, need, wait))
1477 break;
1478
1479 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1480 }
1481 }
1482
1483 m_alloc_cnt(class) += num - need;
1484 lck_mtx_unlock(mbuf_mlock);
1485
1486 return (num - need);
1487 }
1488
1489 /*
1490 * Common de-allocator for rudimentary objects called by the CPU cache
1491 * layer when one or more elements need to be returned to the appropriate
1492 * global freelist.
1493 */
1494 static void
1495 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1496 {
1497 mbuf_class_t class = (mbuf_class_t)arg;
1498 mcache_obj_t *nlist;
1499 unsigned int num = 0;
1500 int w;
1501
1502 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1503
1504 lck_mtx_lock(mbuf_mlock);
1505
1506 for (;;) {
1507 nlist = list->obj_next;
1508 list->obj_next = NULL;
1509 slab_free(class, list);
1510 ++num;
1511 if ((list = nlist) == NULL)
1512 break;
1513 }
1514 m_free_cnt(class) += num;
1515
1516 if ((w = mb_waiters) > 0)
1517 mb_waiters = 0;
1518
1519 lck_mtx_unlock(mbuf_mlock);
1520
1521 if (w != 0)
1522 wakeup(mb_waitchan);
1523 }
1524
1525 /*
1526 * Common auditor for rudimentary objects called by the CPU cache layer
1527 * during an allocation or free request. For the former, this is called
1528 * after the objects are obtained from either the bucket or slab layer
1529 * and before they are returned to the caller. For the latter, this is
1530 * called immediately during free and before placing the objects into
1531 * the bucket or slab layer.
1532 */
1533 static void
1534 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1535 {
1536 mbuf_class_t class = (mbuf_class_t)arg;
1537 mcache_audit_t *mca;
1538
1539 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1540
1541 while (list != NULL) {
1542 lck_mtx_lock(mbuf_mlock);
1543 mca = mcl_audit_buf2mca(class, list);
1544
1545 /* Do the sanity checks */
1546 if (class == MC_MBUF) {
1547 mcl_audit_mbuf(mca, list, FALSE, alloc);
1548 ASSERT(mca->mca_uflags & MB_SCVALID);
1549 } else {
1550 mcl_audit_cluster(mca, list, m_maxsize(class),
1551 alloc, TRUE);
1552 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1553 }
1554 /* Record this transaction */
1555 mcache_buffer_log(mca, list, m_cache(class));
1556 if (alloc)
1557 mca->mca_uflags |= MB_INUSE;
1558 else
1559 mca->mca_uflags &= ~MB_INUSE;
1560 /* Unpair the object (unconditionally) */
1561 mca->mca_uptr = NULL;
1562 lck_mtx_unlock(mbuf_mlock);
1563
1564 list = list->obj_next;
1565 }
1566 }
1567
1568 /*
1569 * Common notify routine for all caches. It is called by mcache when
1570 * one or more objects get freed. We use this indication to trigger
1571 * the wakeup of any sleeping threads so that they can retry their
1572 * allocation requests.
1573 */
1574 static void
1575 mbuf_slab_notify(void *arg, u_int32_t reason)
1576 {
1577 mbuf_class_t class = (mbuf_class_t)arg;
1578 int w;
1579
1580 ASSERT(MBUF_CLASS_VALID(class));
1581
1582 if (reason != MCN_RETRYALLOC)
1583 return;
1584
1585 lck_mtx_lock(mbuf_mlock);
1586 if ((w = mb_waiters) > 0) {
1587 m_notified(class)++;
1588 mb_waiters = 0;
1589 }
1590 lck_mtx_unlock(mbuf_mlock);
1591
1592 if (w != 0)
1593 wakeup(mb_waitchan);
1594 }
1595
1596 /*
1597 * Obtain object(s) from the composite class's freelist.
1598 */
1599 static unsigned int
1600 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1601 {
1602 unsigned int need = num;
1603 mcl_slab_t *sp, *clsp, *nsp;
1604 struct mbuf *m;
1605 mcache_obj_t **list = *plist;
1606 void *cl;
1607
1608 VERIFY(need > 0);
1609 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1610 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1611
1612 /* Get what we can from the freelist */
1613 while ((*list = m_cobjlist(class)) != NULL) {
1614 MRANGE(*list);
1615
1616 m = (struct mbuf *)*list;
1617 sp = slab_get(m);
1618 cl = m->m_ext.ext_buf;
1619 clsp = slab_get(cl);
1620 VERIFY(m->m_flags == M_EXT && cl != NULL);
1621 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1622 VERIFY(clsp->sl_refcnt == 1);
1623 if (class == MC_MBUF_BIGCL) {
1624 nsp = clsp->sl_next;
1625 /* Next slab must already be present */
1626 VERIFY(nsp != NULL);
1627 VERIFY(nsp->sl_refcnt == 1);
1628 } else if (class == MC_MBUF_16KCL) {
1629 int k;
1630 for (nsp = clsp, k = 1;
1631 k < (M16KCLBYTES / MCLBYTES); k++) {
1632 nsp = nsp->sl_next;
1633 /* Next slab must already be present */
1634 VERIFY(nsp != NULL);
1635 VERIFY(nsp->sl_refcnt == 1);
1636 }
1637 }
1638
1639 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1640 !MBUF_IN_MAP(m_cobjlist(class))) {
1641 slab_nextptr_panic(sp, m_cobjlist(class));
1642 /* NOTREACHED */
1643 }
1644 (*list)->obj_next = NULL;
1645 list = *plist = &(*list)->obj_next;
1646
1647 if (--need == 0)
1648 break;
1649 }
1650 m_infree(class) -= (num - need);
1651
1652 return (num - need);
1653 }
1654
1655 /*
1656 * Place object(s) back into a composite class's freelist.
1657 */
1658 static unsigned int
1659 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
1660 {
1661 mcache_obj_t *o, *tail;
1662 unsigned int num = 0;
1663 struct mbuf *m, *ms;
1664 mcache_audit_t *mca = NULL;
1665 mcache_obj_t *ref_list = NULL;
1666 mcl_slab_t *clsp, *nsp;
1667 void *cl;
1668
1669 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1670 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1671 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1672
1673 o = tail = list;
1674
1675 while ((m = ms = (struct mbuf *)o) != NULL) {
1676 mcache_obj_t *rfa, *nexto = o->obj_next;
1677
1678 /* Do the mbuf sanity checks */
1679 if (mclaudit != NULL) {
1680 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1681 mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
1682 ms = (struct mbuf *)mca->mca_contents;
1683 }
1684
1685 /* Do the cluster sanity checks */
1686 cl = ms->m_ext.ext_buf;
1687 clsp = slab_get(cl);
1688 if (mclaudit != NULL) {
1689 size_t size;
1690 if (class == MC_MBUF_CL)
1691 size = m_maxsize(MC_CL);
1692 else if (class == MC_MBUF_BIGCL)
1693 size = m_maxsize(MC_BIGCL);
1694 else
1695 size = m_maxsize(MC_16KCL);
1696 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
1697 (mcache_obj_t *)cl), cl, 0, size);
1698 }
1699 VERIFY(ms->m_type == MT_FREE);
1700 VERIFY(ms->m_flags == M_EXT);
1701 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1702 VERIFY(clsp->sl_refcnt == 1);
1703 if (class == MC_MBUF_BIGCL) {
1704 nsp = clsp->sl_next;
1705 /* Next slab must already be present */
1706 VERIFY(nsp != NULL);
1707 VERIFY(nsp->sl_refcnt == 1);
1708 } else if (class == MC_MBUF_16KCL) {
1709 int k;
1710 for (nsp = clsp, k = 1;
1711 k < (M16KCLBYTES / MCLBYTES); k++) {
1712 nsp = nsp->sl_next;
1713 /* Next slab must already be present */
1714 VERIFY(nsp != NULL);
1715 VERIFY(nsp->sl_refcnt == 1);
1716 }
1717 }
1718
1719 /*
1720 * If we're asked to purge, restore the actual mbuf using the
1721 * contents of the shadow structure (if auditing is enabled)
1722 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
1723 * about to free it and the attached cluster into their caches.
1724 */
1725 if (purged) {
1726 /* Restore constructed mbuf fields */
1727 if (mclaudit != NULL)
1728 mcl_audit_restore_mbuf(m, mca, TRUE);
1729
1730 MEXT_REF(m) = 0;
1731 MEXT_FLAGS(m) = 0;
1732
1733 rfa = (mcache_obj_t *)MEXT_RFA(m);
1734 rfa->obj_next = ref_list;
1735 ref_list = rfa;
1736 MEXT_RFA(m) = NULL;
1737
1738 m->m_type = MT_FREE;
1739 m->m_flags = m->m_len = 0;
1740 m->m_next = m->m_nextpkt = NULL;
1741
1742 /* Save mbuf fields and make auditing happy */
1743 if (mclaudit != NULL)
1744 mcl_audit_mbuf(mca, o, FALSE, FALSE);
1745
1746 VERIFY(m_total(class) > 0);
1747 m_total(class)--;
1748
1749 /* Free the mbuf */
1750 o->obj_next = NULL;
1751 slab_free(MC_MBUF, o);
1752
1753 /* And free the cluster */
1754 ((mcache_obj_t *)cl)->obj_next = NULL;
1755 if (class == MC_MBUF_CL)
1756 slab_free(MC_CL, cl);
1757 else if (class == MC_MBUF_BIGCL)
1758 slab_free(MC_BIGCL, cl);
1759 else
1760 slab_free(MC_16KCL, cl);
1761 }
1762
1763 ++num;
1764 tail = o;
1765 o = nexto;
1766 }
1767
1768 if (!purged) {
1769 tail->obj_next = m_cobjlist(class);
1770 m_cobjlist(class) = list;
1771 m_infree(class) += num;
1772 } else if (ref_list != NULL) {
1773 mcache_free_ext(ref_cache, ref_list);
1774 }
1775
1776 return (num);
1777 }
1778
1779 /*
1780 * Common allocator for composite objects called by the CPU cache layer
1781 * during an allocation request whenever there is no available element in
1782 * the bucket layer. It returns one or more composite elements from the
1783 * appropriate global freelist. If the freelist is empty, it will attempt
1784 * to obtain the rudimentary objects from their caches and construct them
1785 * into composite mbuf + cluster objects.
1786 */
1787 static unsigned int
1788 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
1789 int wait)
1790 {
1791 mbuf_class_t class = (mbuf_class_t)arg;
1792 mcache_t *cp = NULL;
1793 unsigned int num = 0, cnum = 0, want = needed;
1794 mcache_obj_t *ref_list = NULL;
1795 mcache_obj_t *mp_list = NULL;
1796 mcache_obj_t *clp_list = NULL;
1797 mcache_obj_t **list;
1798 struct ext_ref *rfa;
1799 struct mbuf *m;
1800 void *cl;
1801
1802 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1803 ASSERT(needed > 0);
1804
1805 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1806
1807 /* There should not be any slab for this class */
1808 VERIFY(m_slab_cnt(class) == 0 &&
1809 m_slablist(class).tqh_first == NULL &&
1810 m_slablist(class).tqh_last == NULL);
1811
1812 lck_mtx_lock(mbuf_mlock);
1813
1814 /* Try using the freelist first */
1815 num = cslab_alloc(class, plist, needed);
1816 list = *plist;
1817 if (num == needed) {
1818 m_alloc_cnt(class) += num;
1819 lck_mtx_unlock(mbuf_mlock);
1820 return (needed);
1821 }
1822
1823 lck_mtx_unlock(mbuf_mlock);
1824
1825 /*
1826 * We could not satisfy the request using the freelist alone;
1827 * allocate from the appropriate rudimentary caches and use
1828 * whatever we can get to construct the composite objects.
1829 */
1830 needed -= num;
1831
1832 /*
1833 * Mark these allocation requests as coming from a composite cache.
1834 * Also, if the caller is willing to be blocked, mark the request
1835 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1836 * slab layer waiting for the individual object when one or more
1837 * of the already-constructed composite objects are available.
1838 */
1839 wait |= MCR_COMP;
1840 if (!(wait & MCR_NOSLEEP))
1841 wait |= MCR_FAILOK;
1842
1843 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
1844 if (needed == 0) {
1845 ASSERT(mp_list == NULL);
1846 goto fail;
1847 }
1848 if (class == MC_MBUF_CL)
1849 cp = m_cache(MC_CL);
1850 else if (class == MC_MBUF_BIGCL)
1851 cp = m_cache(MC_BIGCL);
1852 else
1853 cp = m_cache(MC_16KCL);
1854 needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
1855 if (needed == 0) {
1856 ASSERT(clp_list == NULL);
1857 goto fail;
1858 }
1859 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
1860 if (needed == 0) {
1861 ASSERT(ref_list == NULL);
1862 goto fail;
1863 }
1864
1865 /*
1866 * By this time "needed" is MIN(mbuf, cluster, ref). Any
1867 * leftovers will get freed accordingly before we return to the caller.
1868 */
1869 for (cnum = 0; cnum < needed; cnum++) {
1870 struct mbuf *ms;
1871
1872 m = ms = (struct mbuf *)mp_list;
1873 mp_list = mp_list->obj_next;
1874
1875 cl = clp_list;
1876 clp_list = clp_list->obj_next;
1877 ((mcache_obj_t *)cl)->obj_next = NULL;
1878
1879 rfa = (struct ext_ref *)ref_list;
1880 ref_list = ref_list->obj_next;
1881 ((mcache_obj_t *)rfa)->obj_next = NULL;
1882
1883 /*
1884 * If auditing is enabled, construct the shadow mbuf
1885 * in the audit structure instead of in the actual one.
1886 * mbuf_cslab_audit() will take care of restoring the
1887 * contents after the integrity check.
1888 */
1889 if (mclaudit != NULL) {
1890 mcache_audit_t *mca, *cl_mca;
1891 size_t size;
1892
1893 lck_mtx_lock(mbuf_mlock);
1894 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1895 ms = ((struct mbuf *)mca->mca_contents);
1896 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
1897
1898 /*
1899 * Pair them up. Note that this is done at the time
1900 * the mbuf+cluster objects are constructed. This
1901 * information should be treated as a "best effort"
1902 * debugging hint, since more than one mbuf can refer
1903 * to a cluster. In that case, the cluster might not
1904 * be freed along with the mbuf it was paired with.
1905 */
1906 mca->mca_uptr = cl_mca;
1907 cl_mca->mca_uptr = mca;
1908
1909 ASSERT(mca->mca_uflags & MB_SCVALID);
1910 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
1911 lck_mtx_unlock(mbuf_mlock);
1912
1913 /* Technically, they are in the freelist */
1914 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
1915 m_maxsize(MC_MBUF));
1916 if (class == MC_MBUF_CL)
1917 size = m_maxsize(MC_CL);
1918 else if (class == MC_MBUF_BIGCL)
1919 size = m_maxsize(MC_BIGCL);
1920 else
1921 size = m_maxsize(MC_16KCL);
1922 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
1923 }
1924
1925 MBUF_INIT(ms, 0, MT_FREE);
1926 if (class == MC_MBUF_16KCL) {
1927 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1928 } else if (class == MC_MBUF_BIGCL) {
1929 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1930 } else {
1931 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1932 }
1933 VERIFY(ms->m_flags == M_EXT);
1934 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1935
1936 *list = (mcache_obj_t *)m;
1937 (*list)->obj_next = NULL;
1938 list = *plist = &(*list)->obj_next;
1939 }
1940
1941 fail:
1942 /*
1943 * Free up what's left of the above.
1944 */
1945 if (mp_list != NULL)
1946 mcache_free_ext(m_cache(MC_MBUF), mp_list);
1947 if (clp_list != NULL)
1948 mcache_free_ext(cp, clp_list);
1949 if (ref_list != NULL)
1950 mcache_free_ext(ref_cache, ref_list);
1951
1952 lck_mtx_lock(mbuf_mlock);
1953 if (num > 0 || cnum > 0) {
1954 m_total(class) += cnum;
1955 VERIFY(m_total(class) <= m_maxlimit(class));
1956 m_alloc_cnt(class) += num + cnum;
1957 }
1958 if ((num + cnum) < want)
1959 m_fail_cnt(class) += (want - (num + cnum));
1960 lck_mtx_unlock(mbuf_mlock);
1961
1962 return (num + cnum);
1963 }
1964
1965 /*
1966 * Common de-allocator for composite objects called by the CPU cache
1967 * layer when one or more elements need to be returned to the appropriate
1968 * global freelist.
1969 */
1970 static void
1971 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
1972 {
1973 mbuf_class_t class = (mbuf_class_t)arg;
1974 unsigned int num;
1975 int w;
1976
1977 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1978
1979 lck_mtx_lock(mbuf_mlock);
1980
1981 num = cslab_free(class, list, purged);
1982 m_free_cnt(class) += num;
1983
1984 if ((w = mb_waiters) > 0)
1985 mb_waiters = 0;
1986
1987 lck_mtx_unlock(mbuf_mlock);
1988
1989 if (w != 0)
1990 wakeup(mb_waitchan);
1991 }
1992
1993 /*
1994 * Common auditor for composite objects called by the CPU cache layer
1995 * during an allocation or free request. For the former, this is called
1996 * after the objects are obtained from either the bucket or slab layer
1997 * and before they are returned to the caller. For the latter, this is
1998 * called immediately during free and before placing the objects into
1999 * the bucket or slab layer.
2000 */
2001 static void
2002 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2003 {
2004 mbuf_class_t class = (mbuf_class_t)arg;
2005 mcache_audit_t *mca;
2006 struct mbuf *m, *ms;
2007 mcl_slab_t *clsp, *nsp;
2008 size_t size;
2009 void *cl;
2010
2011 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2012
2013 while ((m = ms = (struct mbuf *)list) != NULL) {
2014 lck_mtx_lock(mbuf_mlock);
2015 /* Do the mbuf sanity checks and record its transaction */
2016 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2017 mcl_audit_mbuf(mca, m, TRUE, alloc);
2018 mcache_buffer_log(mca, m, m_cache(class));
2019 if (alloc)
2020 mca->mca_uflags |= MB_COMP_INUSE;
2021 else
2022 mca->mca_uflags &= ~MB_COMP_INUSE;
2023
2024 /*
2025 * Use the shadow mbuf in the audit structure if we are
2026 * freeing, since the contents of the actual mbuf have been
2027 * pattern-filled by the above call to mcl_audit_mbuf().
2028 */
2029 if (!alloc)
2030 ms = (struct mbuf *)mca->mca_contents;
2031
2032 /* Do the cluster sanity checks and record its transaction */
2033 cl = ms->m_ext.ext_buf;
2034 clsp = slab_get(cl);
2035 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2036 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2037 VERIFY(clsp->sl_refcnt == 1);
2038 if (class == MC_MBUF_BIGCL) {
2039 nsp = clsp->sl_next;
2040 /* Next slab must already be present */
2041 VERIFY(nsp != NULL);
2042 VERIFY(nsp->sl_refcnt == 1);
2043 } else if (class == MC_MBUF_16KCL) {
2044 int k;
2045 for (nsp = clsp, k = 1;
2046 k < (M16KCLBYTES / MCLBYTES); k++) {
2047 nsp = nsp->sl_next;
2048 /* Next slab must already be present */
2049 VERIFY(nsp != NULL);
2050 VERIFY(nsp->sl_refcnt == 1);
2051 }
2052 }
2053
2054 mca = mcl_audit_buf2mca(MC_CL, cl);
2055 if (class == MC_MBUF_CL)
2056 size = m_maxsize(MC_CL);
2057 else if (class == MC_MBUF_BIGCL)
2058 size = m_maxsize(MC_BIGCL);
2059 else
2060 size = m_maxsize(MC_16KCL);
2061 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2062 mcache_buffer_log(mca, cl, m_cache(class));
2063 if (alloc)
2064 mca->mca_uflags |= MB_COMP_INUSE;
2065 else
2066 mca->mca_uflags &= ~MB_COMP_INUSE;
2067 lck_mtx_unlock(mbuf_mlock);
2068
2069 list = list->obj_next;
2070 }
2071 }
2072
2073 /*
2074 * Allocate some number of mbuf clusters and place them on the cluster freelist.
2075 */
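/*
 * A sketch of the page-to-buffer arithmetic used below, assuming the
 * usual 4KB page size for NBPG: each page yields two 2KB clusters or
 * one 4KB cluster, and every four pages yield one 16KB cluster, which
 * is why the routine returns numpages << 1, numpages, or
 * numpages / (M16KCLBYTES / NBPG), respectively.
 */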
2076 static int
2077 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2078 {
2079 int i;
2080 vm_size_t size = 0;
2081 int numpages = 0;
2082 vm_offset_t page = 0;
2083 mcache_audit_t *mca_list = NULL;
2084 mcache_obj_t *con_list = NULL;
2085 mcl_slab_t *sp;
2086
2087 VERIFY(bufsize == m_maxsize(MC_CL) ||
2088 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2089
2090 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2091
2092 /*
2093 * Multiple threads may attempt to populate the cluster map one
2094 * after another. Since we drop the lock below prior to acquiring
2095 * the physical page(s), our view of the cluster map may no longer
2096 * be accurate, and we could end up over-committing the pages beyond
2097 * the maximum allowed for each class. To prevent it, this entire
2098 * operation (including the page mapping) is serialized.
2099 */
2100 while (mb_clalloc_busy) {
2101 mb_clalloc_waiters++;
2102 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2103 (PZERO-1), "m_clalloc", NULL);
2104 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2105 }
2106
2107 /* We are busy now; tell everyone else to go away */
2108 mb_clalloc_busy = TRUE;
2109
2110 /*
2111 * Honor the caller's wish to block or not block. We have a way
2112 * to grow the pool asynchronously using the mbuf worker thread.
2113 */
2114 i = m_howmany(num, bufsize);
2115 if (i == 0 || (wait & M_DONTWAIT))
2116 goto out;
2117
2118 lck_mtx_unlock(mbuf_mlock);
2119
2120 size = round_page_32(i * bufsize);
2121 page = kmem_mb_alloc(mb_map, size);
2122
2123 if (page == 0) {
2124 if (bufsize <= m_maxsize(MC_BIGCL)) {
2125 /* If that failed, fall back to 1 page (2KB/4KB requests only) */
2126 size = NBPG;
2127 page = kmem_mb_alloc(mb_map, size);
2128 }
2129
2130 if (page == 0) {
2131 lck_mtx_lock(mbuf_mlock);
2132 goto out;
2133 }
2134 }
2135
2136 VERIFY(IS_P2ALIGNED(page, NBPG));
2137 numpages = size / NBPG;
2138
2139 /* If auditing is enabled, allocate the audit structures now */
2140 if (mclaudit != NULL) {
2141 int needed;
2142
2143 /*
2144 * Yes, I realize this is a waste of memory for clusters
2145 * that never get transformed into mbufs, as we may end
2146 * up with NMBPCL-1 unused audit structures per cluster.
2147 * But doing so tremendously simplifies the allocation
2148 * strategy, since at this point we are not holding the
2149 * mbuf lock and the caller is okay to be blocked. For
2150 * the case of big clusters, we allocate one structure
2151 * for each as we never turn them into mbufs.
2152 */
2153 if (bufsize == m_maxsize(MC_CL)) {
2154 needed = numpages * 2 * NMBPCL;
2155
2156 i = mcache_alloc_ext(mcl_audit_con_cache,
2157 &con_list, needed, MCR_SLEEP);
2158
2159 VERIFY(con_list != NULL && i == needed);
2160 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2161 needed = numpages;
2162 } else {
2163 needed = numpages / (M16KCLBYTES / NBPG);
2164 }
2165
2166 i = mcache_alloc_ext(mcache_audit_cache,
2167 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2168
2169 VERIFY(mca_list != NULL && i == needed);
2170 }
2171
2172 lck_mtx_lock(mbuf_mlock);
2173
2174 for (i = 0; i < numpages; i++, page += NBPG) {
2175 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2176 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2177 (vm_address_t)page);
2178
2179 /*
2180 * If no mapper is available, the following call is a no-op
2181 * and returns the input page; if there is a mapper, the
2182 * appropriate I/O page is returned.
2183 */
2184 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2185 mcl_paddr[offset] = new_page << PGSHIFT;
2186
2187 /* Pattern-fill this fresh page */
2188 if (mclaudit != NULL)
2189 mcache_set_pattern(MCACHE_FREE_PATTERN,
2190 (caddr_t)page, NBPG);
2191
2192 if (bufsize == m_maxsize(MC_CL)) {
2193 union mcluster *mcl = (union mcluster *)page;
2194
2195 /* 1st cluster in the page */
2196 sp = slab_get(mcl);
2197 if (mclaudit != NULL)
2198 mcl_audit_init(mcl, &mca_list, &con_list,
2199 AUDIT_CONTENTS_SIZE, NMBPCL);
2200
2201 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2202 slab_init(sp, MC_CL, SLF_MAPPED,
2203 mcl, mcl, bufsize, 0, 1);
2204
2205 /* Insert this slab */
2206 slab_insert(sp, MC_CL);
2207
2208 /* Update stats now since slab_get() drops the lock */
2209 mbstat.m_clfree = ++m_infree(MC_CL) +
2210 m_infree(MC_MBUF_CL);
2211 mbstat.m_clusters = ++m_total(MC_CL);
2212 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2213
2214 /* 2nd cluster in the page */
2215 sp = slab_get(++mcl);
2216 if (mclaudit != NULL)
2217 mcl_audit_init(mcl, &mca_list, &con_list,
2218 AUDIT_CONTENTS_SIZE, NMBPCL);
2219
2220 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2221 slab_init(sp, MC_CL, SLF_MAPPED,
2222 mcl, mcl, bufsize, 0, 1);
2223
2224 /* Insert this slab */
2225 slab_insert(sp, MC_CL);
2226
2227 /* Update stats now since slab_get() drops the lock */
2228 mbstat.m_clfree = ++m_infree(MC_CL) +
2229 m_infree(MC_MBUF_CL);
2230 mbstat.m_clusters = ++m_total(MC_CL);
2231 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2232 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2233 union mbigcluster *mbc = (union mbigcluster *)page;
2234 mcl_slab_t *nsp;
2235
2236 /* One for the entire page */
2237 sp = slab_get(mbc);
2238 if (mclaudit != NULL)
2239 mcl_audit_init(mbc, &mca_list, NULL, 0, 1);
2240
2241 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2242 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2243 mbc, mbc, bufsize, 0, 1);
2244
2245 /* 2nd cluster's slab is part of the previous one */
2246 nsp = slab_get(((union mcluster *)page) + 1);
2247 slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
2248 mbc, NULL, 0, 0, 0);
2249
2250 /* Insert this slab */
2251 slab_insert(sp, MC_BIGCL);
2252
2253 /* Update stats now since slab_get() drops the lock */
2254 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2255 m_infree(MC_MBUF_BIGCL);
2256 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2257 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2258 } else if ((i % (M16KCLBYTES / NBPG)) == 0) {
2259 union m16kcluster *m16kcl = (union m16kcluster *)page;
2260 mcl_slab_t *nsp;
2261 int k;
2262
2263 VERIFY(njcl > 0);
2264 /* One for the entire 16KB */
2265 sp = slab_get(m16kcl);
2266 if (mclaudit != NULL)
2267 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2268
2269 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2270 slab_init(sp, MC_16KCL, SLF_MAPPED,
2271 m16kcl, m16kcl, bufsize, 0, 1);
2272
2273 /* The 2nd-8th clusters' slabs are part of the first one */
2274 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
2275 nsp = slab_get(((union mcluster *)page) + k);
2276 VERIFY(nsp->sl_refcnt == 0 &&
2277 nsp->sl_flags == 0);
2278 slab_init(nsp, MC_16KCL,
2279 SLF_MAPPED | SLF_PARTIAL,
2280 m16kcl, NULL, 0, 0, 0);
2281 }
2282
2283 /* Insert this slab */
2284 slab_insert(sp, MC_16KCL);
2285
2286 /* Update stats now since slab_get() drops the lock */
2287 m_infree(MC_16KCL)++;
2288 m_total(MC_16KCL)++;
2289 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2290 }
2291 }
2292 VERIFY(mca_list == NULL && con_list == NULL);
2293
2294 /* We're done; let others enter */
2295 mb_clalloc_busy = FALSE;
2296 if (mb_clalloc_waiters > 0) {
2297 mb_clalloc_waiters = 0;
2298 wakeup(mb_clalloc_waitchan);
2299 }
2300
2301 if (bufsize == m_maxsize(MC_CL))
2302 return (numpages << 1);
2303 else if (bufsize == m_maxsize(MC_BIGCL))
2304 return (numpages);
2305
2306 VERIFY(bufsize == m_maxsize(MC_16KCL));
2307 return (numpages / (M16KCLBYTES / NBPG));
2308
2309 out:
2310 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2311
2312 /* We're done; let others enter */
2313 mb_clalloc_busy = FALSE;
2314 if (mb_clalloc_waiters > 0) {
2315 mb_clalloc_waiters = 0;
2316 wakeup(mb_clalloc_waitchan);
2317 }
2318
2319 /*
2320 * When non-blocking, we kick the mbuf worker thread if we have to grow the
2321 * pool or if the number of free clusters is less than requested.
2322 */
2323 if (bufsize == m_maxsize(MC_CL)) {
2324 if (i > 0) {
2325 /*
2326 * Remember total number of clusters needed
2327 * at this time.
2328 */
2329 i += m_total(MC_CL);
2330 if (i > mbuf_expand_mcl) {
2331 mbuf_expand_mcl = i;
2332 if (mbuf_worker_ready)
2333 wakeup((caddr_t)&mbuf_worker_run);
2334 }
2335 }
2336
2337 if (m_infree(MC_CL) >= num)
2338 return (1);
2339 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2340 if (i > 0) {
2341 /*
2342 * Remember total number of 4KB clusters needed
2343 * at this time.
2344 */
2345 i += m_total(MC_BIGCL);
2346 if (i > mbuf_expand_big) {
2347 mbuf_expand_big = i;
2348 if (mbuf_worker_ready)
2349 wakeup((caddr_t)&mbuf_worker_run);
2350 }
2351 }
2352
2353 if (m_infree(MC_BIGCL) >= num)
2354 return (1);
2355 } else {
2356 if (i > 0) {
2357 /*
2358 * Remember total number of 16KB clusters needed
2359 * at this time.
2360 */
2361 i += m_total(MC_16KCL);
2362 if (i > mbuf_expand_16k) {
2363 mbuf_expand_16k = i;
2364 if (mbuf_worker_ready)
2365 wakeup((caddr_t)&mbuf_worker_run);
2366 }
2367 }
2368
2369 if (m_infree(MC_16KCL) >= num)
2370 return (1);
2371 }
2372 return (0);
2373 }
2374
2375 /*
2376 * Populate the global freelist of the corresponding buffer class.
2377 */
2378 static int
2379 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2380 {
2381 mcache_obj_t *o = NULL;
2382 int i;
2383
2384 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2385 class == MC_16KCL);
2386
2387 #if CONFIG_MBUF_NOEXPAND
2388 if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
2389 #if DEBUG
2390 static int printonce = 1;
2391 if (printonce == 1) {
2392 printonce = 0;
2393 printf("m_expand failed, allocated %ld out of %d "
2394 "clusters\n", mbstat.m_mbufs / NMBPCL,
2395 nmbclusters);
2396 }
2397 #endif /* DEBUG */
2398 return (0);
2399 }
2400 #endif /* CONFIG_MBUF_NOEXPAND */
2401
2402 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2403
2404 switch (class) {
2405 case MC_MBUF:
2406 case MC_CL:
2407 i = m_clalloc(num, wait, m_maxsize(MC_CL));
2408
2409 /* Respect the 2K clusters minimum limit */
2410 if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
2411 m_infree(MC_CL) <= m_minlimit(MC_CL)) {
2412 if (class != MC_CL || (wait & MCR_COMP))
2413 return (0);
2414 }
2415 if (class == MC_CL)
2416 return (i != 0);
2417 break;
2418
2419 case MC_BIGCL:
2420 case MC_16KCL:
2421 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2422 /* NOTREACHED */
2423
2424 default:
2425 VERIFY(0);
2426 /* NOTREACHED */
2427 }
2428
2429 /* Steal a cluster and cut it up to create NMBPCL mbufs */
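/*
 * (Illustrative arithmetic: with the usual 256-byte mbufs and 2KB
 * clusters, NMBPCL works out to 2048 / 256 = 8 mbufs per stolen
 * cluster.)
 */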
2430 if ((o = slab_alloc(MC_CL, wait)) != NULL) {
2431 struct mbuf *m = (struct mbuf *)o;
2432 mcache_audit_t *mca = NULL;
2433 mcl_slab_t *sp = slab_get(o);
2434
2435 VERIFY(slab_is_detached(sp) &&
2436 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2437
2438 /* Make sure that the cluster is unmolested while in the freelist */
2439 if (mclaudit != NULL) {
2440 mca = mcl_audit_buf2mca(MC_CL, o);
2441 mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
2442 }
2443
2444 /* Reinitialize it as an mbuf slab */
2445 slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
2446 sp->sl_len, 0, NMBPCL);
2447
2448 VERIFY(m == (struct mbuf *)sp->sl_base);
2449 VERIFY(sp->sl_head == NULL);
2450
2451 m_total(MC_MBUF) += NMBPCL;
2452 mbstat.m_mbufs = m_total(MC_MBUF);
2453 m_infree(MC_MBUF) += NMBPCL;
2454 mtype_stat_add(MT_FREE, NMBPCL);
2455
2456 i = NMBPCL;
2457 while (i--) {
2458 /*
2459 * If auditing is enabled, construct the shadow mbuf
2460 * in the audit structure instead of the actual one.
2461 * mbuf_slab_audit() will take care of restoring the
2462 * contents after the integrity check.
2463 */
2464 if (mclaudit != NULL) {
2465 struct mbuf *ms;
2466 mca = mcl_audit_buf2mca(MC_MBUF,
2467 (mcache_obj_t *)m);
2468 ms = ((struct mbuf *)mca->mca_contents);
2469 ms->m_type = MT_FREE;
2470 } else {
2471 m->m_type = MT_FREE;
2472 }
2473 m->m_next = sp->sl_head;
2474 sp->sl_head = (void *)m++;
2475 }
2476
2477 /* Insert it into the mbuf class's slab list */
2478 slab_insert(sp, MC_MBUF);
2479
2480 if ((i = mb_waiters) > 0)
2481 mb_waiters = 0;
2482 if (i != 0)
2483 wakeup(mb_waitchan);
2484
2485 return (1);
2486 }
2487
2488 return (0);
2489 }
2490
2491 /*
2492 * (Inaccurately) check if it might be worth a trip back to the
2493 * mcache layer due to the availability of objects there. We'll
2494 * end up back here if there's nothing up there.
2495 */
2496 static boolean_t
2497 mbuf_cached_above(mbuf_class_t class, int wait)
2498 {
2499 switch (class) {
2500 case MC_MBUF:
2501 if (wait & MCR_COMP)
2502 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2503 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2504 break;
2505
2506 case MC_CL:
2507 if (wait & MCR_COMP)
2508 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2509 break;
2510
2511 case MC_BIGCL:
2512 if (wait & MCR_COMP)
2513 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2514 break;
2515
2516 case MC_16KCL:
2517 if (wait & MCR_COMP)
2518 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2519 break;
2520
2521 case MC_MBUF_CL:
2522 case MC_MBUF_BIGCL:
2523 case MC_MBUF_16KCL:
2524 break;
2525
2526 default:
2527 VERIFY(0);
2528 /* NOTREACHED */
2529 }
2530
2531 return (!mcache_bkt_isempty(m_cache(class)));
2532 }
2533
2534 /*
2535 * If possible, convert constructed objects to raw ones.
2536 */
2537 static boolean_t
2538 mbuf_steal(mbuf_class_t class, unsigned int num)
2539 {
2540 mcache_obj_t *top = NULL;
2541 mcache_obj_t **list = &top;
2542 unsigned int tot = 0;
2543
2544 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2545
2546 switch (class) {
2547 case MC_MBUF:
2548 case MC_CL:
2549 case MC_BIGCL:
2550 case MC_16KCL:
2551 return (FALSE);
2552
2553 case MC_MBUF_CL:
2554 case MC_MBUF_BIGCL:
2555 case MC_MBUF_16KCL:
2556 /* Get the required number of constructed objects if possible */
2557 if (m_infree(class) > m_minlimit(class)) {
2558 tot = cslab_alloc(class, &list,
2559 MIN(num, m_infree(class)));
2560 }
2561
2562 /* And destroy them to get back the raw objects */
2563 if (top != NULL)
2564 (void) cslab_free(class, top, 1);
2565 break;
2566
2567 default:
2568 VERIFY(0);
2569 /* NOTREACHED */
2570 }
2571
2572 return (tot == num);
2573 }
2574
2575 static void
2576 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
2577 {
2578 int m, bmap = 0;
2579
2580 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2581
2582 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2583 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2584 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2585
2586 /*
2587 * This logic can be made smarter; for now, simply mark
2588 * all other related classes as potential victims.
2589 */
2590 switch (class) {
2591 case MC_MBUF:
2592 m_wantpurge(MC_CL)++;
2593 m_wantpurge(MC_MBUF_CL)++;
2594 m_wantpurge(MC_MBUF_BIGCL)++;
2595 break;
2596
2597 case MC_CL:
2598 m_wantpurge(MC_MBUF)++;
2599 if (!comp)
2600 m_wantpurge(MC_MBUF_CL)++;
2601 break;
2602
2603 case MC_BIGCL:
2604 if (!comp)
2605 m_wantpurge(MC_MBUF_BIGCL)++;
2606 break;
2607
2608 case MC_16KCL:
2609 if (!comp)
2610 m_wantpurge(MC_MBUF_16KCL)++;
2611 break;
2612
2613 default:
2614 VERIFY(0);
2615 /* NOTREACHED */
2616 }
2617
2618 /*
2619 * Run through each marked class and check if we really need to
2620 * purge (and therefore temporarily disable) the per-CPU caches
2621 * layer used by the class. If so, remember the classes since
2622 * we are going to drop the lock below prior to purging.
2623 */
2624 for (m = 0; m < NELEM(mbuf_table); m++) {
2625 if (m_wantpurge(m) > 0) {
2626 m_wantpurge(m) = 0;
2627 /*
2628 * Try hard to steal the required number of objects
2629 * from the freelist of other mbuf classes. Only
2630 * purge and disable the per-CPU caches layer when
2631 * we don't have enough; it's the last resort.
2632 */
2633 if (!mbuf_steal(m, num))
2634 bmap |= (1 << m);
2635 }
2636 }
2637
2638 lck_mtx_unlock(mbuf_mlock);
2639
2640 if (bmap != 0) {
2641 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2642 do_reclaim = 1;
2643
2644 /* Sigh; we have no other choice but to ask mcache to purge */
2645 for (m = 0; m < NELEM(mbuf_table); m++) {
2646 if ((bmap & (1 << m)) &&
2647 mcache_purge_cache(m_cache(m))) {
2648 lck_mtx_lock(mbuf_mlock);
2649 m_purge_cnt(m)++;
2650 mbstat.m_drain++;
2651 lck_mtx_unlock(mbuf_mlock);
2652 }
2653 }
2654 } else {
2655 /*
2656 * Request mcache to reap extra elements from all of its caches;
2657 * note that all reaps are serialized and happen only at a fixed
2658 * interval.
2659 */
2660 mcache_reap();
2661 }
2662 lck_mtx_lock(mbuf_mlock);
2663 }
2664
2665 static inline struct mbuf *
2666 m_get_common(int wait, short type, int hdr)
2667 {
2668 struct mbuf *m;
2669 int mcflags = MSLEEPF(wait);
2670
2671 /* Is this due to a non-blocking retry? If so, then try harder */
2672 if (mcflags & MCR_NOSLEEP)
2673 mcflags |= MCR_TRYHARD;
2674
2675 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2676 if (m != NULL) {
2677 MBUF_INIT(m, hdr, type);
2678 mtype_stat_inc(type);
2679 mtype_stat_dec(MT_FREE);
2680 #if CONFIG_MACF_NET
2681 if (hdr && mac_init_mbuf(m, wait) != 0) {
2682 m_free(m);
2683 return (NULL);
2684 }
2685 #endif /* CONFIG_MACF_NET */
2686 }
2687 return (m);
2688 }
2689
2690 /*
2691 * Space allocation routines; these are also available as macros
2692 * for critical paths.
2693 */
2694 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2695 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2696 #define _M_RETRY(wait, type) _M_GET(wait, type)
2697 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2698 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2699 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
2700
2701 struct mbuf *
2702 m_get(int wait, int type)
2703 {
2704 return (_M_GET(wait, type));
2705 }
2706
2707 struct mbuf *
2708 m_gethdr(int wait, int type)
2709 {
2710 return (_M_GETHDR(wait, type));
2711 }
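
/*
 * Usage sketch (illustrative only): the wait argument is M_WAIT or
 * M_DONTWAIT; a non-blocking request may return NULL.
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_freem(m);
 */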
2712
2713 struct mbuf *
2714 m_retry(int wait, int type)
2715 {
2716 return (_M_RETRY(wait, type));
2717 }
2718
2719 struct mbuf *
2720 m_retryhdr(int wait, int type)
2721 {
2722 return (_M_RETRYHDR(wait, type));
2723 }
2724
2725 struct mbuf *
2726 m_getclr(int wait, int type)
2727 {
2728 struct mbuf *m;
2729
2730 _MGET(m, wait, type);
2731 if (m != NULL)
2732 bzero(MTOD(m, caddr_t), MLEN);
2733 return (m);
2734 }
2735
2736 struct mbuf *
2737 m_free(struct mbuf *m)
2738 {
2739 struct mbuf *n = m->m_next;
2740
2741 if (m->m_type == MT_FREE)
2742 panic("m_free: freeing an already freed mbuf");
2743
2744 /* Free the aux data and tags, if there are any */
2745 if (m->m_flags & M_PKTHDR) {
2746 m_tag_delete_chain(m, NULL);
2747 }
2748
2749 if (m->m_flags & M_EXT) {
2750 u_int32_t refcnt;
2751 u_int32_t flags;
2752
2753 refcnt = m_decref(m);
2754 flags = MEXT_FLAGS(m);
2755 if (refcnt == 0 && flags == 0) {
2756 if (m->m_ext.ext_free == NULL) {
2757 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2758 } else if (m->m_ext.ext_free == m_bigfree) {
2759 mcache_free(m_cache(MC_BIGCL),
2760 m->m_ext.ext_buf);
2761 } else if (m->m_ext.ext_free == m_16kfree) {
2762 mcache_free(m_cache(MC_16KCL),
2763 m->m_ext.ext_buf);
2764 } else {
2765 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2766 m->m_ext.ext_size, m->m_ext.ext_arg);
2767 }
2768 mcache_free(ref_cache, MEXT_RFA(m));
2769 MEXT_RFA(m) = NULL;
2770 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2771 VERIFY(m->m_type != MT_FREE);
2772
2773 mtype_stat_dec(m->m_type);
2774 mtype_stat_inc(MT_FREE);
2775
2776 m->m_type = MT_FREE;
2777 m->m_flags = M_EXT;
2778 m->m_len = 0;
2779 m->m_next = m->m_nextpkt = NULL;
2780
2781 /* "Free" into the intermediate cache */
2782 if (m->m_ext.ext_free == NULL) {
2783 mcache_free(m_cache(MC_MBUF_CL), m);
2784 } else if (m->m_ext.ext_free == m_bigfree) {
2785 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2786 } else {
2787 VERIFY(m->m_ext.ext_free == m_16kfree);
2788 mcache_free(m_cache(MC_MBUF_16KCL), m);
2789 }
2790 return (n);
2791 }
2792 }
2793
2794 if (m->m_type != MT_FREE) {
2795 mtype_stat_dec(m->m_type);
2796 mtype_stat_inc(MT_FREE);
2797 }
2798
2799 m->m_type = MT_FREE;
2800 m->m_flags = m->m_len = 0;
2801 m->m_next = m->m_nextpkt = NULL;
2802
2803 mcache_free(m_cache(MC_MBUF), m);
2804
2805 return (n);
2806 }
2807
2808 __private_extern__ struct mbuf *
2809 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
2810 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
2811 int wait)
2812 {
2813 struct ext_ref *rfa = NULL;
2814
2815 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
2816 return (NULL);
2817
2818 if (m->m_flags & M_EXT) {
2819 u_int32_t refcnt;
2820 u_int32_t flags;
2821
2822 refcnt = m_decref(m);
2823 flags = MEXT_FLAGS(m);
2824 if (refcnt == 0 && flags == 0) {
2825 if (m->m_ext.ext_free == NULL) {
2826 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2827 } else if (m->m_ext.ext_free == m_bigfree) {
2828 mcache_free(m_cache(MC_BIGCL),
2829 m->m_ext.ext_buf);
2830 } else if (m->m_ext.ext_free == m_16kfree) {
2831 mcache_free(m_cache(MC_16KCL),
2832 m->m_ext.ext_buf);
2833 } else {
2834 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2835 m->m_ext.ext_size, m->m_ext.ext_arg);
2836 }
2837 /* Re-use the reference structure */
2838 rfa = MEXT_RFA(m);
2839 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2840 VERIFY(m->m_type != MT_FREE);
2841
2842 mtype_stat_dec(m->m_type);
2843 mtype_stat_inc(MT_FREE);
2844
2845 m->m_type = MT_FREE;
2846 m->m_flags = M_EXT;
2847 m->m_len = 0;
2848 m->m_next = m->m_nextpkt = NULL;
2849 /* "Free" into the intermediate cache */
2850 if (m->m_ext.ext_free == NULL) {
2851 mcache_free(m_cache(MC_MBUF_CL), m);
2852 } else if (m->m_ext.ext_free == m_bigfree) {
2853 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2854 } else {
2855 VERIFY(m->m_ext.ext_free == m_16kfree);
2856 mcache_free(m_cache(MC_MBUF_16KCL), m);
2857 }
2858 /*
2859 * Allocate a new mbuf, since we didn't divorce
2860 * the composite mbuf + cluster pair above.
2861 */
2862 if ((m = _M_GETHDR(wait, type)) == NULL)
2863 return (NULL);
2864 }
2865 }
2866
2867 if (rfa == NULL &&
2868 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
2869 m_free(m);
2870 return (NULL);
2871 }
2872
2873 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
2874
2875 return (m);
2876 }
2877
2878 /* m_mclget() adds an mbuf cluster to a normal mbuf */
2879 struct mbuf *
2880 m_mclget(struct mbuf *m, int wait)
2881 {
2882 struct ext_ref *rfa;
2883
2884 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2885 return (m);
2886
2887 m->m_ext.ext_buf = m_mclalloc(wait);
2888 if (m->m_ext.ext_buf != NULL) {
2889 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2890 } else {
2891 mcache_free(ref_cache, rfa);
2892 }
2893 return (m);
2894 }
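
/*
 * Typical usage sketch (illustrative only): attach a cluster to a fresh
 * pkthdr mbuf and check M_EXT, since m_mclget() returns the mbuf even
 * when the cluster allocation fails.
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
 *		return (ENOBUFS);
 *	m = m_mclget(m, M_DONTWAIT);
 *	if (!(m->m_flags & M_EXT)) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */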
2895
2896 /* Allocate an mbuf cluster */
2897 caddr_t
2898 m_mclalloc(int wait)
2899 {
2900 int mcflags = MSLEEPF(wait);
2901
2902 /* Is this due to a non-blocking retry? If so, then try harder */
2903 if (mcflags & MCR_NOSLEEP)
2904 mcflags |= MCR_TRYHARD;
2905
2906 return (mcache_alloc(m_cache(MC_CL), mcflags));
2907 }
2908
2909 /* Free an mbuf cluster */
2910 void
2911 m_mclfree(caddr_t p)
2912 {
2913 mcache_free(m_cache(MC_CL), p);
2914 }
2915
2916 /*
2917 * m_mclhasreference() checks whether the cluster of an mbuf is
2918 * referenced by another mbuf.
2919 */
2920 int
2921 m_mclhasreference(struct mbuf *m)
2922 {
2923 if (!(m->m_flags & M_EXT))
2924 return (0);
2925
2926 ASSERT(MEXT_RFA(m) != NULL);
2927
2928 return (MEXT_REF(m) > 1);
2929 }
2930
2931 __private_extern__ caddr_t
2932 m_bigalloc(int wait)
2933 {
2934 int mcflags = MSLEEPF(wait);
2935
2936 /* Is this due to a non-blocking retry? If so, then try harder */
2937 if (mcflags & MCR_NOSLEEP)
2938 mcflags |= MCR_TRYHARD;
2939
2940 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
2941 }
2942
2943 __private_extern__ void
2944 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2945 {
2946 mcache_free(m_cache(MC_BIGCL), p);
2947 }
2948
2949 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
2950 __private_extern__ struct mbuf *
2951 m_mbigget(struct mbuf *m, int wait)
2952 {
2953 struct ext_ref *rfa;
2954
2955 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2956 return (m);
2957
2958 m->m_ext.ext_buf = m_bigalloc(wait);
2959 if (m->m_ext.ext_buf != NULL) {
2960 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2961 } else {
2962 mcache_free(ref_cache, rfa);
2963 }
2964 return (m);
2965 }
2966
2967 __private_extern__ caddr_t
2968 m_16kalloc(int wait)
2969 {
2970 int mcflags = MSLEEPF(wait);
2971
2972 /* Is this due to a non-blocking retry? If so, then try harder */
2973 if (mcflags & MCR_NOSLEEP)
2974 mcflags |= MCR_TRYHARD;
2975
2976 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
2977 }
2978
2979 __private_extern__ void
2980 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2981 {
2982 mcache_free(m_cache(MC_16KCL), p);
2983 }
2984
2985 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
2986 __private_extern__ struct mbuf *
2987 m_m16kget(struct mbuf *m, int wait)
2988 {
2989 struct ext_ref *rfa;
2990
2991 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2992 return (m);
2993
2994 m->m_ext.ext_buf = m_16kalloc(wait);
2995 if (m->m_ext.ext_buf != NULL) {
2996 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2997 } else {
2998 mcache_free(ref_cache, rfa);
2999 }
3000 return (m);
3001 }
3002
3003 /* Copy the pkthdr from "from" to "to"; tags move to "to" and are purged from "from" */
3004 void
3005 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3006 {
3007 #if CONFIG_MACF_NET
3008 /* We will be taking over the tags of 'to' */
3009 if (to->m_flags & M_PKTHDR)
3010 m_tag_delete_chain(to, NULL);
3011 #endif /* CONFIG_MACF_NET */
3012 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3013 m_tag_init(from); /* purge tags from src */
3014 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3015 if ((to->m_flags & M_EXT) == 0)
3016 to->m_data = to->m_pktdat;
3017 }
3018
3019 /*
3020 * Duplicate "from"'s mbuf pkthdr in "to".
3021 * "from" must have M_PKTHDR set, and "to" must be empty.
3022 * In particular, this does a deep copy of the packet tags.
3023 */
3024 static int
3025 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3026 {
3027 #if CONFIG_MACF_NET
3028 if (to->m_flags & M_PKTHDR)
3029 m_tag_delete_chain(to, NULL);
3030 #endif /* CONFIG_MACF_NET */
3031 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3032 if ((to->m_flags & M_EXT) == 0)
3033 to->m_data = to->m_pktdat;
3034 to->m_pkthdr = from->m_pkthdr;
3035 m_tag_init(to);
3036 return (m_tag_copy_chain(to, from, how));
3037 }
3038
3039 /*
3040 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3041 * if wantall is not set, return whatever number was available. Set up the
3042 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3043 * are chained on the m_nextpkt field. Any packets requested beyond this
3044 * are chained onto the last packet header's m_next field. The size of
3045 * the cluster is controlled by the parameter bufsize.
3046 */
3047 __private_extern__ struct mbuf *
3048 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3049 int wait, int wantall, size_t bufsize)
3050 {
3051 struct mbuf *m;
3052 struct mbuf **np, *top;
3053 unsigned int pnum, needed = *num_needed;
3054 mcache_obj_t *mp_list = NULL;
3055 int mcflags = MSLEEPF(wait);
3056 u_int32_t flag;
3057 struct ext_ref *rfa;
3058 mcache_t *cp;
3059 void *cl;
3060
3061 ASSERT(bufsize == m_maxsize(MC_CL) ||
3062 bufsize == m_maxsize(MC_BIGCL) ||
3063 bufsize == m_maxsize(MC_16KCL));
3064
3065 /*
3066 * Caller must first check for njcl because this
3067 * routine is internal and not exposed/used via KPI.
3068 */
3069 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3070
3071 top = NULL;
3072 np = &top;
3073 pnum = 0;
3074
3075 /*
3076 * If the caller doesn't want all the requested buffers, or isn't
3077 * willing to block, try hard to get what we can without blocking.
3078 * This effectively overrides MCR_SLEEP, since this thread will not
3079 * go to sleep if we can't get all the buffers.
3080 */
3081 if (!wantall || (mcflags & MCR_NOSLEEP))
3082 mcflags |= MCR_TRYHARD;
3083
3084 /* Allocate the composite mbuf + cluster elements from the cache */
3085 if (bufsize == m_maxsize(MC_CL))
3086 cp = m_cache(MC_MBUF_CL);
3087 else if (bufsize == m_maxsize(MC_BIGCL))
3088 cp = m_cache(MC_MBUF_BIGCL);
3089 else
3090 cp = m_cache(MC_MBUF_16KCL);
3091 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3092
3093 for (pnum = 0; pnum < needed; pnum++) {
3094 m = (struct mbuf *)mp_list;
3095 mp_list = mp_list->obj_next;
3096
3097 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3098 cl = m->m_ext.ext_buf;
3099 rfa = MEXT_RFA(m);
3100
3101 ASSERT(cl != NULL && rfa != NULL);
3102 VERIFY(MBUF_IS_COMPOSITE(m));
3103
3104 flag = MEXT_FLAGS(m);
3105
3106 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3107 if (bufsize == m_maxsize(MC_16KCL)) {
3108 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3109 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3110 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3111 } else {
3112 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3113 }
3114
3115 if (num_with_pkthdrs > 0) {
3116 --num_with_pkthdrs;
3117 #if CONFIG_MACF_NET
3118 if (mac_mbuf_label_init(m, wait) != 0) {
3119 m_free(m);
3120 break;
3121 }
3122 #endif /* CONFIG_MACF_NET */
3123 }
3124
3125 *np = m;
3126 if (num_with_pkthdrs > 0)
3127 np = &m->m_nextpkt;
3128 else
3129 np = &m->m_next;
3130 }
3131 ASSERT(pnum != *num_needed || mp_list == NULL);
3132 if (mp_list != NULL)
3133 mcache_free_ext(cp, mp_list);
3134
3135 if (pnum > 0) {
3136 mtype_stat_add(MT_DATA, pnum);
3137 mtype_stat_sub(MT_FREE, pnum);
3138 }
3139
3140 if (wantall && (pnum != *num_needed)) {
3141 if (top != NULL)
3142 m_freem_list(top);
3143 return (NULL);
3144 }
3145
3146 *num_needed = pnum;
3147 return (top);
3148 }
3149
3150 /*
3151 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3152 * wantall is not set, return whatever number was available. The size of
3153 * each mbuf in the list is controlled by the parameter packetlen. Each
3154 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3155 * in the chain is called a segment. If maxsegments is not NULL and the
3156 * value pointed to is not zero, it specifies the maximum number of segments
3157 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3158 * is zero, the caller has no restriction on the number of segments.
3159 * The actual number of segments of an mbuf chain is returned in the value
3160 * pointed to by maxsegments.
3161 */
3162 __private_extern__ struct mbuf *
3163 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3164 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3165 {
3166 struct mbuf **np, *top, *first = NULL;
3167 size_t bufsize, r_bufsize;
3168 unsigned int num = 0;
3169 unsigned int nsegs = 0;
3170 unsigned int needed, resid;
3171 int mcflags = MSLEEPF(wait);
3172 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3173 mcache_t *cp = NULL, *rcp = NULL;
3174
3175 if (*numlist == 0)
3176 return (NULL);
3177
3178 top = NULL;
3179 np = &top;
3180
3181 if (wantsize == 0) {
3182 if (packetlen <= MINCLSIZE) {
3183 bufsize = packetlen;
3184 } else if (packetlen > m_maxsize(MC_CL)) {
3185 /* Use 4KB if jumbo cluster pool isn't available */
3186 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3187 bufsize = m_maxsize(MC_BIGCL);
3188 else
3189 bufsize = m_maxsize(MC_16KCL);
3190 } else {
3191 bufsize = m_maxsize(MC_CL);
3192 }
3193 } else if (wantsize == m_maxsize(MC_CL) ||
3194 wantsize == m_maxsize(MC_BIGCL) ||
3195 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3196 bufsize = wantsize;
3197 } else {
3198 return (NULL);
3199 }
3200
3201 if (bufsize <= MHLEN) {
3202 nsegs = 1;
3203 } else if (bufsize <= MINCLSIZE) {
3204 if (maxsegments != NULL && *maxsegments == 1) {
3205 bufsize = m_maxsize(MC_CL);
3206 nsegs = 1;
3207 } else {
3208 nsegs = 2;
3209 }
3210 } else if (bufsize == m_maxsize(MC_16KCL)) {
3211 VERIFY(njcl > 0);
3212 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3213 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3214 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3215 } else {
3216 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3217 }
3218 if (maxsegments != NULL) {
3219 if (*maxsegments && nsegs > *maxsegments) {
3220 *maxsegments = nsegs;
3221 return (NULL);
3222 }
3223 *maxsegments = nsegs;
3224 }
3225
3226 /*
3227 * If the caller doesn't want all the requested buffers, or isn't
3228 * willing to block, try hard to get what we can without blocking.
3229 * This effectively overrides MCR_SLEEP, since this thread will not
3230 * go to sleep if we can't get all the buffers.
3231 */
3232 if (!wantall || (mcflags & MCR_NOSLEEP))
3233 mcflags |= MCR_TRYHARD;
3234
3235 /*
3236 * Simple case where all elements in the lists/chains are mbufs.
3237 * Unless bufsize is greater than MHLEN, each segment chain is made
3238 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3239 * of 2 mbufs; the second one is used for the residual data, i.e.
3240 * the remaining data that cannot fit into the first mbuf.
3241 */
3242 if (bufsize <= MINCLSIZE) {
3243 /* Allocate the elements in one shot from the mbuf cache */
3244 ASSERT(bufsize <= MHLEN || nsegs == 2);
3245 cp = m_cache(MC_MBUF);
3246 needed = mcache_alloc_ext(cp, &mp_list,
3247 (*numlist) * nsegs, mcflags);
3248
3249 /*
3250 * The number of elements must be even if we are to use an
3251 * mbuf (instead of a cluster) to store the residual data.
3252 * If we couldn't allocate the requested number of mbufs,
3253 * trim the number down (if it's odd) in order to avoid
3254 * creating a partial segment chain.
3255 */
3256 if (bufsize > MHLEN && (needed & 0x1))
3257 needed--;
3258
3259 while (num < needed) {
3260 struct mbuf *m;
3261
3262 m = (struct mbuf *)mp_list;
3263 mp_list = mp_list->obj_next;
3264 ASSERT(m != NULL);
3265
3266 MBUF_INIT(m, 1, MT_DATA);
3267 #if CONFIG_MACF_NET
3268 if (mac_init_mbuf(m, wait) != 0) {
3269 m_free(m);
3270 break;
3271 }
3272 #endif /* CONFIG_MACF_NET */
3273 num++;
3274 if (bufsize > MHLEN) {
3275 /* A second mbuf for this segment chain */
3276 m->m_next = (struct mbuf *)mp_list;
3277 mp_list = mp_list->obj_next;
3278 ASSERT(m->m_next != NULL);
3279
3280 MBUF_INIT(m->m_next, 0, MT_DATA);
3281 num++;
3282 }
3283 *np = m;
3284 np = &m->m_nextpkt;
3285 }
3286 ASSERT(num != *numlist || mp_list == NULL);
3287
3288 if (num > 0) {
3289 mtype_stat_add(MT_DATA, num);
3290 mtype_stat_sub(MT_FREE, num);
3291 }
3292 num /= nsegs;
3293
3294 /* We've got them all; return to caller */
3295 if (num == *numlist)
3296 return (top);
3297
3298 goto fail;
3299 }
3300
3301 /*
3302 * Complex cases where elements are made up of one or more composite
3303 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3304 * be illustrated as follows:
3305 *
3306 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3307 *
3308 * Every composite mbuf + cluster element comes from the intermediate
3309 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3310 * the last composite element will come from the MC_MBUF_CL cache,
3311 * unless the residual data is larger than 2KB, in which case we use the
3312 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3313 * data is defined as extra data beyond the first element that cannot
3314 * fit into the previous element, i.e. there is no residual data if
3315 * the chain only has 1 segment.
3316 */
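/*
 * Worked example (illustrative numbers): for packetlen = 5000 with 2KB
 * clusters (bufsize = 2048), nsegs above came out to
 * ((5000 - 1) >> MCLSHIFT) + 1 = 3, and the residual computed below is
 * 5000 % 2048 = 904 bytes; since 904 fits in a 2KB cluster, the last
 * segment also comes from the MC_MBUF_CL composite cache.
 */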
3317 r_bufsize = bufsize;
3318 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3319 if (resid > 0) {
3320 /* There is residual data; figure out the cluster size */
3321 if (wantsize == 0 && packetlen > MINCLSIZE) {
3322 /*
3323 * Caller didn't request that all of the segments
3324 * in the chain use the same cluster size; use the
3325 * smaller of the cluster sizes.
3326 */
3327 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3328 r_bufsize = m_maxsize(MC_16KCL);
3329 else if (resid > m_maxsize(MC_CL))
3330 r_bufsize = m_maxsize(MC_BIGCL);
3331 else
3332 r_bufsize = m_maxsize(MC_CL);
3333 } else {
3334 /* Use the same cluster size as the other segments */
3335 resid = 0;
3336 }
3337 }
3338
3339 needed = *numlist;
3340 if (resid > 0) {
3341 /*
3342 * Attempt to allocate composite mbuf + cluster elements for
3343 * the residual data in each chain; record the number of such
3344 * elements that can be allocated so that we know how many
3345 * segment chains we can afford to create.
3346 */
3347 if (r_bufsize <= m_maxsize(MC_CL))
3348 rcp = m_cache(MC_MBUF_CL);
3349 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3350 rcp = m_cache(MC_MBUF_BIGCL);
3351 else
3352 rcp = m_cache(MC_MBUF_16KCL);
3353 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3354
3355 if (needed == 0)
3356 goto fail;
3357
3358 /* This is temporarily reduced for calculation */
3359 ASSERT(nsegs > 1);
3360 nsegs--;
3361 }
3362
3363 /*
3364 * Attempt to allocate the rest of the composite mbuf + cluster
3365 * elements for the number of segment chains that we need.
3366 */
3367 if (bufsize <= m_maxsize(MC_CL))
3368 cp = m_cache(MC_MBUF_CL);
3369 else if (bufsize <= m_maxsize(MC_BIGCL))
3370 cp = m_cache(MC_MBUF_BIGCL);
3371 else
3372 cp = m_cache(MC_MBUF_16KCL);
3373 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3374
3375 /* Round it down to avoid creating a partial segment chain */
3376 needed = (needed / nsegs) * nsegs;
3377 if (needed == 0)
3378 goto fail;
3379
3380 if (resid > 0) {
3381 /*
3382 * We're about to construct the chain(s); take into account
3383 * the number of segments we have created above to hold the
3384 * residual data for each chain, as well as restore the
3385 * original count of segments per chain.
3386 */
3387 ASSERT(nsegs > 0);
3388 needed += needed / nsegs;
3389 nsegs++;
3390 }
3391
3392 for (;;) {
3393 struct mbuf *m;
3394 u_int32_t flag;
3395 struct ext_ref *rfa;
3396 void *cl;
3397 int pkthdr;
3398
3399 ++num;
3400 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3401 m = (struct mbuf *)mp_list;
3402 mp_list = mp_list->obj_next;
3403 } else {
3404 m = (struct mbuf *)rmp_list;
3405 rmp_list = rmp_list->obj_next;
3406 }
3407 ASSERT(m != NULL);
3408 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3409 VERIFY(m->m_ext.ext_free == NULL ||
3410 m->m_ext.ext_free == m_bigfree ||
3411 m->m_ext.ext_free == m_16kfree);
3412
3413 cl = m->m_ext.ext_buf;
3414 rfa = MEXT_RFA(m);
3415
3416 ASSERT(cl != NULL && rfa != NULL);
3417 VERIFY(MBUF_IS_COMPOSITE(m));
3418
3419 flag = MEXT_FLAGS(m);
3420
3421 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3422 if (pkthdr)
3423 first = m;
3424 MBUF_INIT(m, pkthdr, MT_DATA);
3425 if (m->m_ext.ext_free == m_16kfree) {
3426 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3427 } else if (m->m_ext.ext_free == m_bigfree) {
3428 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3429 } else {
3430 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3431 }
3432 #if CONFIG_MACF_NET
3433 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3434 --num;
3435 m_free(m);
3436 break;
3437 }
3438 #endif /* CONFIG_MACF_NET */
3439
3440 *np = m;
3441 if ((num % nsegs) == 0)
3442 np = &first->m_nextpkt;
3443 else
3444 np = &m->m_next;
3445
3446 if (num == needed)
3447 break;
3448 }
3449
3450 if (num > 0) {
3451 mtype_stat_add(MT_DATA, num);
3452 mtype_stat_sub(MT_FREE, num);
3453 }
3454
3455 num /= nsegs;
3456
3457 /* We've got them all; return to caller */
3458 if (num == *numlist) {
3459 ASSERT(mp_list == NULL && rmp_list == NULL);
3460 return (top);
3461 }
3462
3463 fail:
3464 /* Free up what's left of the above */
3465 if (mp_list != NULL)
3466 mcache_free_ext(cp, mp_list);
3467 if (rmp_list != NULL)
3468 mcache_free_ext(rcp, rmp_list);
3469 if (wantall && top != NULL) {
3470 m_freem(top);
3471 return (NULL);
3472 }
3473 *numlist = num;
3474 return (top);
3475 }
3476
3477 /*
3478 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3479 * packets on the receive ring.
3480 */
3481 __private_extern__ struct mbuf *
3482 m_getpacket_how(int wait)
3483 {
3484 unsigned int num_needed = 1;
3485
3486 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3487 m_maxsize(MC_CL)));
3488 }
3489
3490 /*
3491 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3492 * packets on the receive ring.
3493 */
3494 struct mbuf *
3495 m_getpacket(void)
3496 {
3497 unsigned int num_needed = 1;
3498
3499 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
3500 m_maxsize(MC_CL)));
3501 }
3502
3503 /*
3504 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3505 * if this can't be met, return whatever number was available. Set up the
3506 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3507 * are chained on the m_nextpkt field. Any packets requested beyond this are
3508 * chained onto the last packet header's m_next field.
3509 */
3510 struct mbuf *
3511 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
3512 {
3513 unsigned int n = num_needed;
3514
3515 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
3516 m_maxsize(MC_CL)));
3517 }
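
/*
 * Usage sketch (illustrative only): ask for up to 32 packets, each a
 * pkthdr mbuf with a 2KB cluster attached; since wantall is 0, fewer
 * (or none) may be returned.
 *
 *	struct mbuf *list, *m;
 *
 *	list = m_getpackets(32, 32, M_DONTWAIT);
 *	for (m = list; m != NULL; m = m->m_nextpkt)
 *		driver_input(m);
 *
 * where driver_input() stands for a hypothetical per-packet handler.
 */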
3518
3519 /*
3520 * Return a list of mbuf hdrs set up as packet hdrs chained together
3521 * on the m_nextpkt field.
3522 */
3523 struct mbuf *
3524 m_getpackethdrs(int num_needed, int how)
3525 {
3526 struct mbuf *m;
3527 struct mbuf **np, *top;
3528
3529 top = NULL;
3530 np = &top;
3531
3532 while (num_needed--) {
3533 m = _M_RETRYHDR(how, MT_DATA);
3534 if (m == NULL)
3535 break;
3536
3537 *np = m;
3538 np = &m->m_nextpkt;
3539 }
3540
3541 return (top);
3542 }
3543
3544 /*
3545 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3546 * of packets freed. Used by the drivers.
3547 */
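/*
 * Usage sketch (illustrative only; "rx_done_list" is a hypothetical
 * driver variable holding packets linked via m_nextpkt):
 *
 *	int freed;
 *
 *	freed = m_freem_list(rx_done_list);
 *	rx_done_list = NULL;
 */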
3548 int
3549 m_freem_list(struct mbuf *m)
3550 {
3551 struct mbuf *nextpkt;
3552 mcache_obj_t *mp_list = NULL;
3553 mcache_obj_t *mcl_list = NULL;
3554 mcache_obj_t *mbc_list = NULL;
3555 mcache_obj_t *m16k_list = NULL;
3556 mcache_obj_t *m_mcl_list = NULL;
3557 mcache_obj_t *m_mbc_list = NULL;
3558 mcache_obj_t *m_m16k_list = NULL;
3559 mcache_obj_t *ref_list = NULL;
3560 int pktcount = 0;
3561 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
3562
3563 while (m != NULL) {
3564 pktcount++;
3565
3566 nextpkt = m->m_nextpkt;
3567 m->m_nextpkt = NULL;
3568
3569 while (m != NULL) {
3570 struct mbuf *next = m->m_next;
3571 mcache_obj_t *o, *rfa;
3572 u_int32_t refcnt, flags;
3573
3574 if (m->m_type == MT_FREE)
3575 panic("m_freem_list: freeing an already freed mbuf");
3576
3577 if (m->m_type != MT_FREE)
3578 mt_free++;
3579
3580 if (m->m_flags & M_PKTHDR) {
3581 m_tag_delete_chain(m, NULL);
3582 }
3583
3584 if (!(m->m_flags & M_EXT))
3585 goto simple_free;
3586
3587 o = (mcache_obj_t *)m->m_ext.ext_buf;
3588 refcnt = m_decref(m);
3589 flags = MEXT_FLAGS(m);
3590 if (refcnt == 0 && flags == 0) {
3591 if (m->m_ext.ext_free == NULL) {
3592 o->obj_next = mcl_list;
3593 mcl_list = o;
3594 } else if (m->m_ext.ext_free == m_bigfree) {
3595 o->obj_next = mbc_list;
3596 mbc_list = o;
3597 } else if (m->m_ext.ext_free == m_16kfree) {
3598 o->obj_next = m16k_list;
3599 m16k_list = o;
3600 } else {
3601 (*(m->m_ext.ext_free))((caddr_t)o,
3602 m->m_ext.ext_size,
3603 m->m_ext.ext_arg);
3604 }
3605 rfa = (mcache_obj_t *)MEXT_RFA(m);
3606 rfa->obj_next = ref_list;
3607 ref_list = rfa;
3608 MEXT_RFA(m) = NULL;
3609 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
3610 VERIFY(m->m_type != MT_FREE);
3611 /*
3612 * Amortize the costs of atomic operations
3613 * by doing them at the end, if possible.
3614 */
3615 if (m->m_type == MT_DATA)
3616 mt_data++;
3617 else if (m->m_type == MT_HEADER)
3618 mt_header++;
3619 else if (m->m_type == MT_SONAME)
3620 mt_soname++;
3621 else if (m->m_type == MT_TAG)
3622 mt_tag++;
3623 else
3624 mtype_stat_dec(m->m_type);
3625
3626 m->m_type = MT_FREE;
3627 m->m_flags = M_EXT;
3628 m->m_len = 0;
3629 m->m_next = m->m_nextpkt = NULL;
3630
3631 /* "Free" into the intermediate cache */
3632 o = (mcache_obj_t *)m;
3633 if (m->m_ext.ext_free == NULL) {
3634 o->obj_next = m_mcl_list;
3635 m_mcl_list = o;
3636 } else if (m->m_ext.ext_free == m_bigfree) {
3637 o->obj_next = m_mbc_list;
3638 m_mbc_list = o;
3639 } else {
3640 VERIFY(m->m_ext.ext_free == m_16kfree);
3641 o->obj_next = m_m16k_list;
3642 m_m16k_list = o;
3643 }
3644 m = next;
3645 continue;
3646 }
3647 simple_free:
3648 /*
3649 * Amortize the costs of atomic operations
3650 * by doing them at the end, if possible.
3651 */
3652 if (m->m_type == MT_DATA)
3653 mt_data++;
3654 else if (m->m_type == MT_HEADER)
3655 mt_header++;
3656 else if (m->m_type == MT_SONAME)
3657 mt_soname++;
3658 else if (m->m_type == MT_TAG)
3659 mt_tag++;
3660 else if (m->m_type != MT_FREE)
3661 mtype_stat_dec(m->m_type);
3662
3663 m->m_type = MT_FREE;
3664 m->m_flags = m->m_len = 0;
3665 m->m_next = m->m_nextpkt = NULL;
3666
3667 ((mcache_obj_t *)m)->obj_next = mp_list;
3668 mp_list = (mcache_obj_t *)m;
3669
3670 m = next;
3671 }
3672
3673 m = nextpkt;
3674 }
3675
3676 if (mt_free > 0)
3677 mtype_stat_add(MT_FREE, mt_free);
3678 if (mt_data > 0)
3679 mtype_stat_sub(MT_DATA, mt_data);
3680 if (mt_header > 0)
3681 mtype_stat_sub(MT_HEADER, mt_header);
3682 if (mt_soname > 0)
3683 mtype_stat_sub(MT_SONAME, mt_soname);
3684 if (mt_tag > 0)
3685 mtype_stat_sub(MT_TAG, mt_tag);
3686
3687 if (mp_list != NULL)
3688 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3689 if (mcl_list != NULL)
3690 mcache_free_ext(m_cache(MC_CL), mcl_list);
3691 if (mbc_list != NULL)
3692 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
3693 if (m16k_list != NULL)
3694 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
3695 if (m_mcl_list != NULL)
3696 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
3697 if (m_mbc_list != NULL)
3698 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
3699 if (m_m16k_list != NULL)
3700 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
3701 if (ref_list != NULL)
3702 mcache_free_ext(ref_cache, ref_list);
3703
3704 return (pktcount);
3705 }
3706
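/*
 * Illustrative sketch, not part of the original file: a driver completing
 * a batch of transmitted packets (chained via m_nextpkt) in a single call
 * and using the returned count, e.g. to feed its output-packet statistics.
 * The function name is hypothetical.
 */
#if 0	/* example only */
static int
example_tx_complete(struct mbuf *done_list)
{
	/* One call frees every packet and every mbuf in each packet */
	return (m_freem_list(done_list));
}
#endif
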
3707 void
3708 m_freem(struct mbuf *m)
3709 {
3710 while (m != NULL)
3711 m = m_free(m);
3712 }
3713
3714 /*
3715 * Mbuffer utility routines.
3716 */
3717
3718 /*
3719 * Compute the amount of space available before the current start
3720 * of data in an mbuf.
3721 */
3722 int
3723 m_leadingspace(struct mbuf *m)
3724 {
3725 if (m->m_flags & M_EXT) {
3726 if (MCLHASREFERENCE(m))
3727 return (0);
3728 return (m->m_data - m->m_ext.ext_buf);
3729 }
3730 if (m->m_flags & M_PKTHDR)
3731 return (m->m_data - m->m_pktdat);
3732 return (m->m_data - m->m_dat);
3733 }
3734
3735 /*
3736 * Compute the amount of space available after the end of data in an mbuf.
3737 */
3738 int
3739 m_trailingspace(struct mbuf *m)
3740 {
3741 if (m->m_flags & M_EXT) {
3742 if (MCLHASREFERENCE(m))
3743 return (0);
3744 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3745 (m->m_data + m->m_len));
3746 }
3747 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
3748 }
3749
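/*
 * Illustrative sketch, not part of the original file: using the two
 * routines above to decide whether a header and a trailer fit into the
 * existing mbuf without further allocation.  Since both return 0 when
 * the cluster is shared, this doubles as a "safe to write in place" test.
 * The function name is hypothetical.
 */
#if 0	/* example only */
static int
example_room_for(struct mbuf *m, int hdrlen, int trlen)
{
	return (m_leadingspace(m) >= hdrlen &&
	    m_trailingspace(m) >= trlen);
}
#endif
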
3750 /*
3751 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3752 * copy junk along. Does not adjust packet header length.
3753 */
3754 struct mbuf *
3755 m_prepend(struct mbuf *m, int len, int how)
3756 {
3757 struct mbuf *mn;
3758
3759 _MGET(mn, how, m->m_type);
3760 if (mn == NULL) {
3761 m_freem(m);
3762 return (NULL);
3763 }
3764 if (m->m_flags & M_PKTHDR) {
3765 M_COPY_PKTHDR(mn, m);
3766 m->m_flags &= ~M_PKTHDR;
3767 }
3768 mn->m_next = m;
3769 m = mn;
3770 if (len < MHLEN)
3771 MH_ALIGN(m, len);
3772 m->m_len = len;
3773 return (m);
3774 }
3775
3776 /*
3777 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3778 * chain, copy junk along, and adjust length.
3779 */
3780 struct mbuf *
3781 m_prepend_2(struct mbuf *m, int len, int how)
3782 {
3783 if (M_LEADINGSPACE(m) >= len) {
3784 m->m_data -= len;
3785 m->m_len += len;
3786 } else {
3787 m = m_prepend(m, len, how);
3788 }
3789 if ((m) && (m->m_flags & M_PKTHDR))
3790 m->m_pkthdr.len += len;
3791 return (m);
3792 }
3793
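/*
 * Illustrative sketch, not part of the original file: prepending a
 * 14-byte link-layer style header with m_prepend_2(), which reuses
 * leading space when available and falls back to m_prepend() otherwise.
 * The header size and function name are hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_add_header(struct mbuf *m)
{
	m = m_prepend_2(m, 14, M_DONTWAIT);
	if (m == NULL)
		return (NULL);		/* the chain was freed on failure */
	bzero(MTOD(m, caddr_t), 14);	/* caller fills in the real header */
	return (m);
}
#endif
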
3794 /*
3795 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3796 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
3797 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3798 */
3799 int MCFail;
3800
3801 struct mbuf *
3802 m_copym(struct mbuf *m, int off0, int len, int wait)
3803 {
3804 struct mbuf *n, *mhdr = NULL, **np;
3805 int off = off0;
3806 struct mbuf *top;
3807 int copyhdr = 0;
3808
3809 if (off < 0 || len < 0)
3810 panic("m_copym: invalid offset %d or len %d", off, len);
3811
3812 if (off == 0 && (m->m_flags & M_PKTHDR)) {
3813 mhdr = m;
3814 copyhdr = 1;
3815 }
3816
3817 while (off >= m->m_len) {
3818 if (m->m_next == NULL)
3819 panic("m_copym: invalid mbuf chain");
3820 off -= m->m_len;
3821 m = m->m_next;
3822 }
3823 np = &top;
3824 top = NULL;
3825
3826 while (len > 0) {
3827 if (m == NULL) {
3828 if (len != M_COPYALL)
3829 panic("m_copym: len != M_COPYALL");
3830 break;
3831 }
3832
3833 n = _M_RETRY(wait, m->m_type);
3834 *np = n;
3835
3836 if (n == NULL)
3837 goto nospace;
3838
3839 if (copyhdr != 0) {
3840 M_COPY_PKTHDR(n, mhdr);
3841 if (len == M_COPYALL)
3842 n->m_pkthdr.len -= off0;
3843 else
3844 n->m_pkthdr.len = len;
3845 copyhdr = 0;
3846 }
3847 if (len == M_COPYALL) {
3848 if (MIN(len, (m->m_len - off)) == len) {
3849 printf("m->m_len %ld - off %d = %ld, %ld\n",
3850 m->m_len, off, m->m_len - off,
3851 MIN(len, (m->m_len - off)));
3852 }
3853 }
3854 n->m_len = MIN(len, (m->m_len - off));
3855 if (n->m_len == M_COPYALL) {
3856 printf("n->m_len == M_COPYALL, fixing\n");
3857 n->m_len = MHLEN;
3858 }
3859 if (m->m_flags & M_EXT) {
3860 n->m_ext = m->m_ext;
3861 m_incref(m);
3862 n->m_data = m->m_data + off;
3863 n->m_flags |= M_EXT;
3864 } else {
3865 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3866 (unsigned)n->m_len);
3867 }
3868 if (len != M_COPYALL)
3869 len -= n->m_len;
3870 off = 0;
3871 m = m->m_next;
3872 np = &n->m_next;
3873 }
3874
3875 if (top == NULL)
3876 MCFail++;
3877
3878 return (top);
3879 nospace:
3880
3881 m_freem(top);
3882 MCFail++;
3883 return (NULL);
3884 }
3885
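/*
 * Illustrative sketch, not part of the original file: taking a reference-
 * counted copy of an entire packet before handing the original off, e.g.
 * for packet taps.  Cluster data is shared rather than duplicated; see
 * m_dup() below for a deep copy.  The function name is hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_snapshot(struct mbuf *m)
{
	/* Copy from offset 0 to the end of the chain */
	return (m_copym(m, 0, M_COPYALL, M_DONTWAIT));
}
#endif
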
3886 /*
3887 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
3888 * within this routine.  Also, the last mbuf and offset accessed are passed
3889 * out and can be passed back in to avoid having to rescan the entire mbuf
3890 * list (normally hung off of the socket).
3891 */
3892 struct mbuf *
3893 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
3894 struct mbuf **m_last, int *m_off)
3895 {
3896 struct mbuf *n, **np = NULL;
3897 int off = off0, len = len0;
3898 struct mbuf *top = NULL;
3899 int mcflags = MSLEEPF(wait);
3900 int copyhdr = 0;
3901 int type = 0;
3902 mcache_obj_t *list = NULL;
3903 int needed = 0;
3904
3905 if (off == 0 && (m->m_flags & M_PKTHDR))
3906 copyhdr = 1;
3907
3908 if (*m_last != NULL) {
3909 m = *m_last;
3910 off = *m_off;
3911 } else {
3912 while (off >= m->m_len) {
3913 off -= m->m_len;
3914 m = m->m_next;
3915 }
3916 }
3917
3918 n = m;
3919 while (len > 0) {
3920 needed++;
3921 ASSERT(n != NULL);
3922 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
3923 n = n->m_next;
3924 }
3925 needed++;
3926 len = len0;
3927
3928 /*
3929 * If the caller doesn't want to be put to sleep, mark it with
3930 * MCR_TRYHARD so that we may reclaim buffers from other places
3931 * before giving up.
3932 */
3933 if (mcflags & MCR_NOSLEEP)
3934 mcflags |= MCR_TRYHARD;
3935
3936 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
3937 mcflags) != needed)
3938 goto nospace;
3939
3940 needed = 0;
3941 while (len > 0) {
3942 n = (struct mbuf *)list;
3943 list = list->obj_next;
3944 ASSERT(n != NULL && m != NULL);
3945
3946 type = (top == NULL) ? MT_HEADER : m->m_type;
3947 MBUF_INIT(n, (top == NULL), type);
3948 #if CONFIG_MACF_NET
3949 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
3950 mtype_stat_inc(MT_HEADER);
3951 mtype_stat_dec(MT_FREE);
3952 m_free(n);
3953 goto nospace;
3954 }
3955 #endif /* CONFIG_MACF_NET */
3956
3957 if (top == NULL) {
3958 top = n;
3959 np = &top->m_next;
3960 continue;
3961 } else {
3962 needed++;
3963 *np = n;
3964 }
3965
3966 if (copyhdr) {
3967 M_COPY_PKTHDR(n, m);
3968 n->m_pkthdr.len = len;
3969 copyhdr = 0;
3970 }
3971 n->m_len = MIN(len, (m->m_len - off));
3972
3973 if (m->m_flags & M_EXT) {
3974 n->m_ext = m->m_ext;
3975 m_incref(m);
3976 n->m_data = m->m_data + off;
3977 n->m_flags |= M_EXT;
3978 } else {
3979 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3980 (unsigned)n->m_len);
3981 }
3982 len -= n->m_len;
3983
3984 if (len == 0) {
3985 if ((off + n->m_len) == m->m_len) {
3986 *m_last = m->m_next;
3987 *m_off = 0;
3988 } else {
3989 *m_last = m;
3990 *m_off = off + n->m_len;
3991 }
3992 break;
3993 }
3994 off = 0;
3995 m = m->m_next;
3996 np = &n->m_next;
3997 }
3998
3999 mtype_stat_inc(MT_HEADER);
4000 mtype_stat_add(type, needed);
4001 mtype_stat_sub(MT_FREE, needed + 1);
4002
4003 ASSERT(list == NULL);
4004 return (top);
4005
4006 nospace:
4007 if (list != NULL)
4008 mcache_free_ext(m_cache(MC_MBUF), list);
4009 if (top != NULL)
4010 m_freem(top);
4011 MCFail++;
4012 return (NULL);
4013 }
4014
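/*
 * Illustrative sketch, not part of the original file: copying a long
 * chain in fixed-size pieces while letting the routine above remember
 * where the previous copy stopped, so each call avoids rescanning from
 * the head.  The 4 KB piece size and function name are hypothetical;
 * "total" is assumed not to exceed the data in the chain.
 */
#if 0	/* example only */
static void
example_chunked_copy(struct mbuf *sb_chain, int total)
{
	struct mbuf *last = NULL, *piece;
	int off = 0, moff = 0, len;

	while (off < total) {
		len = MIN(4096, total - off);
		piece = m_copym_with_hdrs(sb_chain, off, len, M_DONTWAIT,
		    &last, &moff);
		if (piece == NULL)
			break;
		m_freem(piece);		/* a real caller would transmit it */
		off += len;
	}
}
#endif
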
4015 /*
4016 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4017 * continuing for "len" bytes, into the indicated buffer.
4018 */
4019 void
4020 m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
4021 {
4022 unsigned count;
4023
4024 if (off < 0 || len < 0)
4025 panic("m_copydata: invalid offset %d or len %d", off, len);
4026
4027 while (off > 0) {
4028 if (m == NULL)
4029 panic("m_copydata: invalid mbuf chain");
4030 if (off < m->m_len)
4031 break;
4032 off -= m->m_len;
4033 m = m->m_next;
4034 }
4035 while (len > 0) {
4036 if (m == NULL)
4037 panic("m_copydata: invalid mbuf chain");
4038 count = MIN(m->m_len - off, len);
4039 bcopy(MTOD(m, caddr_t) + off, cp, count);
4040 len -= count;
4041 cp += count;
4042 off = 0;
4043 m = m->m_next;
4044 }
4045 }
4046
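/*
 * Illustrative sketch, not part of the original file: peeking at a fixed
 * size header that may be split across several mbufs, without modifying
 * the chain.  Assumes the first mbuf carries a packet header; the
 * function name is hypothetical.
 */
#if 0	/* example only */
static int
example_peek_header(struct mbuf *m, u_int8_t *hdr, int hlen)
{
	if (m->m_pkthdr.len < hlen)
		return (0);		/* packet too short */
	m_copydata(m, 0, hlen, (caddr_t)hdr);
	return (1);
}
#endif
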
4047 /*
4048 * Concatenate mbuf chain n to m. Both chains must be of the same type
4049 * (e.g. MT_DATA). The m_pkthdr, if any, is not updated.
4050 */
4051 void
4052 m_cat(struct mbuf *m, struct mbuf *n)
4053 {
4054 while (m->m_next)
4055 m = m->m_next;
4056 while (n) {
4057 if ((m->m_flags & M_EXT) ||
4058 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4059 /* just join the two chains */
4060 m->m_next = n;
4061 return;
4062 }
4063 /* splat the data from one into the other */
4064 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4065 (u_int)n->m_len);
4066 m->m_len += n->m_len;
4067 n = m_free(n);
4068 }
4069 }
4070
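/*
 * Illustrative sketch, not part of the original file: appending a
 * continuation fragment to a partially reassembled packet, assuming both
 * chains carry packet headers.  Because m_cat() may copy small data into
 * the first chain and free the source, the caller must not touch "frag"
 * afterwards, and must fix up the packet header length itself.
 */
#if 0	/* example only */
static void
example_append_fragment(struct mbuf *head, struct mbuf *frag)
{
	int fraglen = frag->m_pkthdr.len;

	m_cat(head, frag);		/* "frag" may be freed here */
	head->m_pkthdr.len += fraglen;	/* m_cat() leaves pkthdr alone */
}
#endif
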
4071 void
4072 m_adj(struct mbuf *mp, int req_len)
4073 {
4074 int len = req_len;
4075 struct mbuf *m;
4076 int count;
4077
4078 if ((m = mp) == NULL)
4079 return;
4080 if (len >= 0) {
4081 /*
4082 * Trim from head.
4083 */
4084 while (m != NULL && len > 0) {
4085 if (m->m_len <= len) {
4086 len -= m->m_len;
4087 m->m_len = 0;
4088 m = m->m_next;
4089 } else {
4090 m->m_len -= len;
4091 m->m_data += len;
4092 len = 0;
4093 }
4094 }
4095 m = mp;
4096 if (m->m_flags & M_PKTHDR)
4097 m->m_pkthdr.len -= (req_len - len);
4098 } else {
4099 /*
4100 * Trim from tail. Scan the mbuf chain,
4101 * calculating its length and finding the last mbuf.
4102 * If the adjustment only affects this mbuf, then just
4103 * adjust and return. Otherwise, rescan and truncate
4104 * after the remaining size.
4105 */
4106 len = -len;
4107 count = 0;
4108 for (;;) {
4109 count += m->m_len;
4110 if (m->m_next == (struct mbuf *)0)
4111 break;
4112 m = m->m_next;
4113 }
4114 if (m->m_len >= len) {
4115 m->m_len -= len;
4116 m = mp;
4117 if (m->m_flags & M_PKTHDR)
4118 m->m_pkthdr.len -= len;
4119 return;
4120 }
4121 count -= len;
4122 if (count < 0)
4123 count = 0;
4124 /*
4125 * Correct length for chain is "count".
4126 * Find the mbuf with last data, adjust its length,
4127 * and toss data from remaining mbufs on chain.
4128 */
4129 m = mp;
4130 if (m->m_flags & M_PKTHDR)
4131 m->m_pkthdr.len = count;
4132 for (; m; m = m->m_next) {
4133 if (m->m_len >= count) {
4134 m->m_len = count;
4135 break;
4136 }
4137 count -= m->m_len;
4138 }
4139 while ((m = m->m_next))
4140 m->m_len = 0;
4141 }
4142 }
4143
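/*
 * Illustrative sketch, not part of the original file: using m_adj() to
 * strip an encapsulation header from the front of a packet and a 4-byte
 * trailer (e.g. a frame checksum) from the end.  The trailer size and
 * function name are hypothetical.
 */
#if 0	/* example only */
static void
example_strip_encap(struct mbuf *m, int encap_hlen)
{
	m_adj(m, encap_hlen);	/* positive length: trim from the head */
	m_adj(m, -4);		/* negative length: trim from the tail */
}
#endif
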
4144 /*
4145 * Rearrange an mbuf chain so that len bytes are contiguous
4146 * and in the data area of an mbuf (so that mtod and dtom
4147 * will work for a structure of size len). Returns the resulting
4148 * mbuf chain on success, frees it and returns null on failure.
4149 * If there is room, it will add up to max_protohdr-len extra bytes to the
4150 * contiguous region in an attempt to avoid being called next time.
4151 */
4152 int MPFail;
4153
4154 struct mbuf *
4155 m_pullup(struct mbuf *n, int len)
4156 {
4157 struct mbuf *m;
4158 int count;
4159 int space;
4160
4161 /*
4162 * If first mbuf has no cluster, and has room for len bytes
4163 * without shifting current data, pullup into it,
4164 * otherwise allocate a new mbuf to prepend to the chain.
4165 */
4166 if ((n->m_flags & M_EXT) == 0 &&
4167 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4168 if (n->m_len >= len)
4169 return (n);
4170 m = n;
4171 n = n->m_next;
4172 len -= m->m_len;
4173 } else {
4174 if (len > MHLEN)
4175 goto bad;
4176 _MGET(m, M_DONTWAIT, n->m_type);
4177 if (m == 0)
4178 goto bad;
4179 m->m_len = 0;
4180 if (n->m_flags & M_PKTHDR) {
4181 M_COPY_PKTHDR(m, n);
4182 n->m_flags &= ~M_PKTHDR;
4183 }
4184 }
4185 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4186 do {
4187 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4188 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4189 (unsigned)count);
4190 len -= count;
4191 m->m_len += count;
4192 n->m_len -= count;
4193 space -= count;
4194 if (n->m_len)
4195 n->m_data += count;
4196 else
4197 n = m_free(n);
4198 } while (len > 0 && n);
4199 if (len > 0) {
4200 (void) m_free(m);
4201 goto bad;
4202 }
4203 m->m_next = n;
4204 return (m);
4205 bad:
4206 m_freem(n);
4207 MPFail++;
4208 return (0);
4209 }
4210
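/*
 * Illustrative sketch, not part of the original file: the usual
 * m_pullup() pattern applied before casting the data pointer to a header
 * structure.  "struct example_hdr" and the function name are
 * hypothetical.
 */
#if 0	/* example only */
struct example_hdr {
	u_int16_t eh_type;
	u_int16_t eh_len;
};

static struct example_hdr *
example_get_hdr(struct mbuf **mp)
{
	struct mbuf *m = *mp;

	if (m->m_len < (int)sizeof (struct example_hdr)) {
		/* On failure the chain has already been freed */
		m = m_pullup(m, sizeof (struct example_hdr));
		*mp = m;
		if (m == NULL)
			return (NULL);
	}
	return (MTOD(m, struct example_hdr *));
}
#endif
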
4211 /*
4212 * Partition an mbuf chain in two pieces, returning the tail --
4213 * all but the first len0 bytes. In case of failure, it returns NULL and
4214 * attempts to restore the chain to its original state.
4215 */
4216 struct mbuf *
4217 m_split(struct mbuf *m0, int len0, int wait)
4218 {
4219 struct mbuf *m, *n;
4220 unsigned len = len0, remain;
4221
4222 for (m = m0; m && len > m->m_len; m = m->m_next)
4223 len -= m->m_len;
4224 if (m == NULL)
4225 return (NULL);
4226 remain = m->m_len - len;
4227 if (m0->m_flags & M_PKTHDR) {
4228 _MGETHDR(n, wait, m0->m_type);
4229 if (n == NULL)
4230 return (NULL);
4231 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4232 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4233 m0->m_pkthdr.len = len0;
4234 if (m->m_flags & M_EXT)
4235 goto extpacket;
4236 if (remain > MHLEN) {
4237 /* m can't be the lead packet */
4238 MH_ALIGN(n, 0);
4239 n->m_next = m_split(m, len, wait);
4240 if (n->m_next == NULL) {
4241 (void) m_free(n);
4242 return (NULL);
4243 } else
4244 return (n);
4245 } else
4246 MH_ALIGN(n, remain);
4247 } else if (remain == 0) {
4248 n = m->m_next;
4249 m->m_next = NULL;
4250 return (n);
4251 } else {
4252 _MGET(n, wait, m->m_type);
4253 if (n == NULL)
4254 return (NULL);
4255 M_ALIGN(n, remain);
4256 }
4257 extpacket:
4258 if (m->m_flags & M_EXT) {
4259 n->m_flags |= M_EXT;
4260 n->m_ext = m->m_ext;
4261 m_incref(m);
4262 n->m_data = m->m_data + len;
4263 } else {
4264 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4265 }
4266 n->m_len = remain;
4267 m->m_len = len;
4268 n->m_next = m->m_next;
4269 m->m_next = NULL;
4270 return (n);
4271 }
4272
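/*
 * Illustrative sketch, not part of the original file: splitting a packet
 * into a header part and a payload part at a byte offset, e.g. for
 * segmentation.  The function name is hypothetical.
 */
#if 0	/* example only */
static int
example_segment(struct mbuf *m, int hdrlen, struct mbuf **tailp)
{
	*tailp = m_split(m, hdrlen, M_DONTWAIT);
	if (*tailp == NULL)
		return (0);	/* "m" is left intact; caller may retry */
	/* "m" now holds the first hdrlen bytes, *tailp the remainder */
	return (1);
}
#endif
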
4273 /*
4274 * Routine to copy from device local memory into mbufs.
4275 */
4276 struct mbuf *
4277 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4278 void (*copy)(const void *, void *, size_t))
4279 {
4280 struct mbuf *m;
4281 struct mbuf *top = NULL, **mp = &top;
4282 int off = off0, len;
4283 char *cp;
4284 char *epkt;
4285
4286 cp = buf;
4287 epkt = cp + totlen;
4288 if (off) {
4289 /*
4290 * If 'off' is non-zero, packet is trailer-encapsulated,
4291 * so we have to skip the type and length fields.
4292 */
4293 cp += off + 2 * sizeof (u_int16_t);
4294 totlen -= 2 * sizeof (u_int16_t);
4295 }
4296 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4297 if (m == NULL)
4298 return (NULL);
4299 m->m_pkthdr.rcvif = ifp;
4300 m->m_pkthdr.len = totlen;
4301 m->m_len = MHLEN;
4302
4303 while (totlen > 0) {
4304 if (top != NULL) {
4305 _MGET(m, M_DONTWAIT, MT_DATA);
4306 if (m == NULL) {
4307 m_freem(top);
4308 return (NULL);
4309 }
4310 m->m_len = MLEN;
4311 }
4312 len = MIN(totlen, epkt - cp);
4313 if (len >= MINCLSIZE) {
4314 MCLGET(m, M_DONTWAIT);
4315 if (m->m_flags & M_EXT) {
4316 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4317 } else {
4318 /* give up when it's out of cluster mbufs */
4319 if (top != NULL)
4320 m_freem(top);
4321 m_freem(m);
4322 return (NULL);
4323 }
4324 } else {
4325 /*
4326 * Place initial small packet/header at end of mbuf.
4327 */
4328 if (len < m->m_len) {
4329 if (top == NULL &&
4330 len + max_linkhdr <= m->m_len)
4331 m->m_data += max_linkhdr;
4332 m->m_len = len;
4333 } else {
4334 len = m->m_len;
4335 }
4336 }
4337 if (copy)
4338 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4339 else
4340 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4341 cp += len;
4342 *mp = m;
4343 mp = &m->m_next;
4344 totlen -= len;
4345 if (cp == epkt)
4346 cp = buf;
4347 }
4348 return (top);
4349 }
4350
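/*
 * Illustrative sketch, not part of the original file: a driver with a
 * contiguous on-board receive buffer handing a frame to the stack.
 * Passing a NULL copy routine makes m_devget() fall back to bcopy();
 * the buffer and function names are hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_rx_frame(struct ifnet *ifp, char *devbuf, int framelen)
{
	/* off0 == 0: no trailer encapsulation to skip */
	return (m_devget(devbuf, framelen, 0, ifp, NULL));
}
#endif
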
4351 /*
4352 * Cluster freelist allocation check.
4353 */
4354 static int
4355 m_howmany(int num, size_t bufsize)
4356 {
4357 int i = 0, j = 0;
4358 u_int32_t m_clusters, m_bigclusters, m_16kclusters;
4359 u_int32_t m_clfree, m_bigclfree, m_16kclfree;
4360
4361 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4362
4363 m_clusters = m_total(MC_CL);
4364 m_bigclusters = m_total(MC_BIGCL);
4365 m_16kclusters = m_total(MC_16KCL);
4366 m_clfree = m_infree(MC_CL);
4367 m_bigclfree = m_infree(MC_BIGCL);
4368 m_16kclfree = m_infree(MC_16KCL);
4369
4370 /* Bail if we've maxed out the mbuf memory map */
4371 if ((bufsize != m_maxsize(MC_16KCL) &&
4372 (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
4373 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4374 (m_16kclusters << 3) >= njcl)) {
4375 #if DEBUG
4376 if (bufsize == MCLBYTES && num > m_clfree) {
4377 printf("m_howmany - out of small clusters, "
4378 "%d short\n", num - mbstat.m_clfree);
4379 }
4380 #endif /* DEBUG */
4381 return (0);
4382 }
4383
4384 if (bufsize == m_maxsize(MC_CL)) {
4385 /* Under minimum */
4386 if (m_clusters < MINCL)
4387 return (MINCL - m_clusters);
4388 /* Too few (free < 1/16 total) and not over maximum */
4389 if (m_clusters < m_maxlimit(MC_CL)) {
4390 if (m_clfree >= MCL_LOWAT)
4391 return (0);
4392 if (num >= m_clfree)
4393 i = num - m_clfree;
4394 if (((m_clusters + num) >> 4) > m_clfree)
4395 j = ((m_clusters + num) >> 4) - m_clfree;
4396 i = MAX(i, j);
4397 if (i + m_clusters >= m_maxlimit(MC_CL))
4398 i = m_maxlimit(MC_CL) - m_clusters;
4399 }
4400 VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
4401 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4402 /* Under minimum */
4403 if (m_bigclusters < MINBIGCL)
4404 return (MINBIGCL - m_bigclusters);
4405 /* Too few (free < 1/16 total) and not over maximum */
4406 if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
4407 if (m_bigclfree >= MBIGCL_LOWAT)
4408 return (0);
4409 if (num >= m_bigclfree)
4410 i = num - m_bigclfree;
4411 if (((m_bigclusters + num) >> 4) > m_bigclfree)
4412 j = ((m_bigclusters + num) >> 4) - m_bigclfree;
4413 i = MAX(i, j);
4414 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
4415 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
4416 }
4417 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
4418 } else {
4419 VERIFY(njcl > 0);
4420 /* Under minimum */
4421 if (m_16kclusters < MIN16KCL)
4422 return (MIN16KCL - m_16kclusters);
4423 /* Too few (free < 1/16 total) and not over maximum */
4424 if (m_16kclusters < m_maxlimit(MC_16KCL)) {
4425 if (m_16kclfree >= M16KCL_LOWAT)
4426 return (0);
4427 if (num >= m_16kclfree)
4428 i = num - m_16kclfree;
4429 if (((m_16kclusters + num) >> 4) > m_16kclfree)
4430 j = ((m_16kclusters + num) >> 4) - m_16kclfree;
4431 i = MAX(i, j);
4432 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
4433 i = m_maxlimit(MC_16KCL) - m_16kclusters;
4434 }
4435 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
4436 }
4437
4438 return (i);
4439 }
4440
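/*
 * Worked example, not part of the original file, of the "free < 1/16 of
 * total" branch above for 2 KB clusters.  Assume m_clusters = 1600 (at
 * least MINCL and below m_maxlimit(MC_CL)), m_clfree = 40 (below
 * MCL_LOWAT) and num = 64.  Then i = 64 - 40 = 24 and
 * j = ((1600 + 64) >> 4) - 40 = 104 - 40 = 64, so m_howmany() asks for
 * MAX(24, 64) = 64 additional clusters, clipped against m_maxlimit(MC_CL).
 */
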
4441 /*
4442 * Copy data from a buffer back into the indicated mbuf chain,
4443 * starting "off" bytes from the beginning, extending the mbuf
4444 * chain if necessary.
4445 */
4446 void
4447 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
4448 {
4449 int mlen;
4450 struct mbuf *m = m0, *n;
4451 int totlen = 0;
4452
4453 if (m0 == NULL)
4454 return;
4455 while (off > (mlen = m->m_len)) {
4456 off -= mlen;
4457 totlen += mlen;
4458 if (m->m_next == NULL) {
4459 n = m_getclr(M_DONTWAIT, m->m_type);
4460 if (n == NULL)
4461 goto out;
4462 n->m_len = MIN(MLEN, len + off);
4463 m->m_next = n;
4464 }
4465 m = m->m_next;
4466 }
4467 while (len > 0) {
4468 mlen = MIN(m->m_len - off, len);
4469 bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen);
4470 cp += mlen;
4471 len -= mlen;
4472 mlen += off;
4473 off = 0;
4474 totlen += mlen;
4475 if (len == 0)
4476 break;
4477 if (m->m_next == NULL) {
4478 n = _M_GET(M_DONTWAIT, m->m_type);
4479 if (n == NULL)
4480 break;
4481 n->m_len = MIN(MLEN, len);
4482 m->m_next = n;
4483 }
4484 m = m->m_next;
4485 }
4486 out:
4487 if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
4488 m->m_pkthdr.len = totlen;
4489 }
4490
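/*
 * Illustrative sketch, not part of the original file: overwriting a
 * 2-byte field at a known offset inside a packet (for instance a
 * checksum slot), letting m_copyback() cross mbuf boundaries and extend
 * the chain if the offset lies beyond the current data.  The function
 * name is hypothetical.
 */
#if 0	/* example only */
static void
example_store16(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof (val), (caddr_t)&val);
}
#endif
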
4491 char *
4492 mcl_to_paddr(char *addr)
4493 {
4494 int base_phys;
4495
4496 if (!MBUF_IN_MAP(addr))
4497 return (NULL);
4498 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
4499
4500 if (base_phys == 0)
4501 return (NULL);
4502 return ((char *)((int)base_phys | ((int)addr & PGOFSET)));
4503 }
4504
4505 /*
4506 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
4507 * And really copy the thing. That way, we don't "precompute" checksums
4508 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
4509 * small packets, don't dup into a cluster. That way received packets
4510 * don't take up too much room in the sockbuf (cf. sbspace()).
4511 */
4512 int MDFail;
4513
4514 struct mbuf *
4515 m_dup(struct mbuf *m, int how)
4516 {
4517 struct mbuf *n, **np;
4518 struct mbuf *top;
4519 int copyhdr = 0;
4520
4521 np = &top;
4522 top = NULL;
4523 if (m->m_flags & M_PKTHDR)
4524 copyhdr = 1;
4525
4526 /*
4527 * Quick check: if we have one mbuf and its data fits in an
4528 * mbuf with packet header, just copy and go.
4529 */
4530 if (m->m_next == NULL) {
4531 /* Then just move the data into an mbuf and be done... */
4532 if (copyhdr) {
4533 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
4534 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
4535 return (NULL);
4536 n->m_len = m->m_len;
4537 m_dup_pkthdr(n, m, how);
4538 bcopy(m->m_data, n->m_data, m->m_len);
4539 return (n);
4540 }
4541 } else if (m->m_len <= MLEN) {
4542 if ((n = _M_GET(how, m->m_type)) == NULL)
4543 return (NULL);
4544 bcopy(m->m_data, n->m_data, m->m_len);
4545 n->m_len = m->m_len;
4546 return (n);
4547 }
4548 }
4549 while (m != NULL) {
4550 #if BLUE_DEBUG
4551 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
4552 m->m_data);
4553 #endif
4554 if (copyhdr)
4555 n = _M_GETHDR(how, m->m_type);
4556 else
4557 n = _M_GET(how, m->m_type);
4558 if (n == NULL)
4559 goto nospace;
4560 if (m->m_flags & M_EXT) {
4561 if (m->m_len <= m_maxsize(MC_CL))
4562 MCLGET(n, how);
4563 else if (m->m_len <= m_maxsize(MC_BIGCL))
4564 n = m_mbigget(n, how);
4565 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
4566 n = m_m16kget(n, how);
4567 if (!(n->m_flags & M_EXT)) {
4568 (void) m_free(n);
4569 goto nospace;
4570 }
4571 }
4572 *np = n;
4573 if (copyhdr) {
4574 /* Don't use M_COPY_PKTHDR: preserve m_data */
4575 m_dup_pkthdr(n, m, how);
4576 copyhdr = 0;
4577 if (!(n->m_flags & M_EXT))
4578 n->m_data = n->m_pktdat;
4579 }
4580 n->m_len = m->m_len;
4581 /*
4582 * Get the dup on the same boundary as the original.
4583 * Assume that the two mbufs have the same offset to data area
4584 * (up to word boundaries).
4585 */
4586 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
4587 m = m->m_next;
4588 np = &n->m_next;
4589 #if BLUE_DEBUG
4590 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
4591 n->m_data);
4592 #endif
4593 }
4594
4595 if (top == NULL)
4596 MDFail++;
4597 return (top);
4598
4599 nospace:
4600 m_freem(top);
4601 MDFail++;
4602 return (NULL);
4603 }
4604
4605 #define MBUF_MULTIPAGES(m) \
4606 (((m)->m_flags & M_EXT) && \
4607 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
4608 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
4609 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
4610
4611 static struct mbuf *
4612 m_expand(struct mbuf *m, struct mbuf **last)
4613 {
4614 struct mbuf *top = NULL;
4615 struct mbuf **nm = &top;
4616 uintptr_t data0, data;
4617 unsigned int len0, len;
4618
4619 VERIFY(MBUF_MULTIPAGES(m));
4620 VERIFY(m->m_next == NULL);
4621 data0 = (uintptr_t)m->m_data;
4622 len0 = m->m_len;
4623 *last = top;
4624
4625 for (;;) {
4626 struct mbuf *n;
4627
4628 data = data0;
4629 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
4630 len = NBPG;
4631 else if (!IS_P2ALIGNED(data, NBPG) &&
4632 P2ROUNDUP(data, NBPG) < (data + len0))
4633 len = P2ROUNDUP(data, NBPG) - data;
4634 else
4635 len = len0;
4636
4637 VERIFY(len > 0);
4638 VERIFY(m->m_flags & M_EXT);
4639 m->m_data = (void *)data;
4640 m->m_len = len;
4641
4642 *nm = *last = m;
4643 nm = &m->m_next;
4644 m->m_next = NULL;
4645
4646 data0 += len;
4647 len0 -= len;
4648 if (len0 == 0)
4649 break;
4650
4651 n = _M_RETRY(M_DONTWAIT, MT_DATA);
4652 if (n == NULL) {
4653 m_freem(top);
4654 top = *last = NULL;
4655 break;
4656 }
4657
4658 n->m_ext = m->m_ext;
4659 m_incref(m);
4660 n->m_flags |= M_EXT;
4661 m = n;
4662 }
4663 return (top);
4664 }
4665
4666 struct mbuf *
4667 m_normalize(struct mbuf *m)
4668 {
4669 struct mbuf *top = NULL;
4670 struct mbuf **nm = &top;
4671 boolean_t expanded = FALSE;
4672
4673 while (m != NULL) {
4674 struct mbuf *n;
4675
4676 n = m->m_next;
4677 m->m_next = NULL;
4678
4679 /* Does the data cross one or more page boundaries? */
4680 if (MBUF_MULTIPAGES(m)) {
4681 struct mbuf *last;
4682 if ((m = m_expand(m, &last)) == NULL) {
4683 m_freem(n);
4684 m_freem(top);
4685 top = NULL;
4686 break;
4687 }
4688 *nm = m;
4689 nm = &last->m_next;
4690 expanded = TRUE;
4691 } else {
4692 *nm = m;
4693 nm = &m->m_next;
4694 }
4695 m = n;
4696 }
4697 if (expanded)
4698 atomic_add_32(&mb_normalized, 1);
4699 return (top);
4700 }
4701
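/*
 * Illustrative sketch, not part of the original file: a driver whose DMA
 * engine cannot handle buffers spanning page boundaries normalizing a
 * chain before mapping it for transmit.  The function name is
 * hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_prepare_for_dma(struct mbuf *m)
{
	/* On failure the chain is freed and NULL is returned */
	return (m_normalize(m));
}
#endif
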
4702 void
4703 m_mchtype(struct mbuf *m, int t)
4704 {
4705 mtype_stat_inc(t);
4706 mtype_stat_dec(m->m_type);
4707 (m)->m_type = t;
4708 }
4709
4710 void *
4711 m_mtod(struct mbuf *m)
4712 {
4713 return (MTOD(m, void *));
4714 }
4715
4716 struct mbuf *
4717 m_dtom(void *x)
4718 {
4719 return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1)));
4720 }
4721
4722 void
4723 m_mcheck(struct mbuf *m)
4724 {
4725 _MCHECK(m);
4726 }
4727
4728 /*
4729 * Inform the corresponding mcache(s) that there's a waiter below.
4730 */
4731 static void
4732 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
4733 {
4734 mcache_waiter_inc(m_cache(class));
4735 if (comp) {
4736 if (class == MC_CL) {
4737 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4738 } else if (class == MC_BIGCL) {
4739 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4740 } else if (class == MC_16KCL) {
4741 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
4742 } else {
4743 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4744 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4745 }
4746 }
4747 }
4748
4749 /*
4750 * Inform the corresponding mcache(s) that there's no more waiter below.
4751 */
4752 static void
4753 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
4754 {
4755 mcache_waiter_dec(m_cache(class));
4756 if (comp) {
4757 if (class == MC_CL) {
4758 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4759 } else if (class == MC_BIGCL) {
4760 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4761 } else if (class == MC_16KCL) {
4762 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
4763 } else {
4764 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4765 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4766 }
4767 }
4768 }
4769
4770 /*
4771 * Called during blocking allocation. Returns TRUE if one or more objects
4772 * are available at the per-CPU caches layer and that allocation should be
4773 * retried at that level.
4774 */
4775 static boolean_t
4776 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
4777 {
4778 boolean_t mcache_retry = FALSE;
4779
4780 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4781
4782 /* Check if there's anything at the cache layer */
4783 if (mbuf_cached_above(class, wait)) {
4784 mcache_retry = TRUE;
4785 goto done;
4786 }
4787
4788 /* Nothing? Then try hard to get it from somewhere */
4789 m_reclaim(class, num, (wait & MCR_COMP));
4790
4791 /* We tried hard and got something? */
4792 if (m_infree(class) > 0) {
4793 mbstat.m_wait++;
4794 goto done;
4795 } else if (mbuf_cached_above(class, wait)) {
4796 mbstat.m_wait++;
4797 mcache_retry = TRUE;
4798 goto done;
4799 } else if (wait & MCR_TRYHARD) {
4800 mcache_retry = TRUE;
4801 goto done;
4802 }
4803
4804 /*
4805 * There's really nothing for us right now; inform the
4806 * cache(s) that there is a waiter below and go to sleep.
4807 */
4808 mbuf_waiter_inc(class, (wait & MCR_COMP));
4809
4810 VERIFY(!(wait & MCR_NOSLEEP));
4811 mb_waiters++;
4812 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
4813
4814 /* We are now up; stop getting notified until next round */
4815 mbuf_waiter_dec(class, (wait & MCR_COMP));
4816
4817 /* We waited and got something */
4818 if (m_infree(class) > 0) {
4819 mbstat.m_wait++;
4820 goto done;
4821 } else if (mbuf_cached_above(class, wait)) {
4822 mbstat.m_wait++;
4823 mcache_retry = TRUE;
4824 }
4825 done:
4826 return (mcache_retry);
4827 }
4828
4829 static void
4830 mbuf_worker_thread(void)
4831 {
4832 int mbuf_expand;
4833
4834 while (1) {
4835 lck_mtx_lock(mbuf_mlock);
4836
4837 mbuf_expand = 0;
4838 if (mbuf_expand_mcl) {
4839 int n;
4840
4841 /* Adjust to current number of clusters in use */
4842 n = mbuf_expand_mcl -
4843 (m_total(MC_CL) - m_infree(MC_CL));
4844 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
4845 n = m_maxlimit(MC_CL) - m_total(MC_CL);
4846 mbuf_expand_mcl = 0;
4847
4848 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
4849 mbuf_expand++;
4850 }
4851 if (mbuf_expand_big) {
4852 int n;
4853
4854 /* Adjust to current number of 4 KB clusters in use */
4855 n = mbuf_expand_big -
4856 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
4857 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
4858 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
4859 mbuf_expand_big = 0;
4860
4861 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
4862 mbuf_expand++;
4863 }
4864 if (mbuf_expand_16k) {
4865 int n;
4866
4867 /* Adjust to current number of 16 KB clusters in use */
4868 n = mbuf_expand_16k -
4869 (m_total(MC_16KCL) - m_infree(MC_16KCL));
4870 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
4871 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
4872 mbuf_expand_16k = 0;
4873
4874 if (n > 0)
4875 (void) freelist_populate(MC_16KCL, n, M_WAIT);
4876 }
4877
4878 /*
4879 * Because we can run out of memory before filling the mbuf
4880 * map, we should not allocate more clusters than there are
4881 * mbufs -- otherwise we could have a large number of useless
4882 * clusters allocated.
4883 */
4884 if (mbuf_expand) {
4885 while (m_total(MC_MBUF) <
4886 (m_total(MC_BIGCL) + m_total(MC_CL))) {
4887 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
4888 break;
4889 }
4890 }
4891
4892 lck_mtx_unlock(mbuf_mlock);
4893
4894 assert_wait(&mbuf_worker_run, THREAD_UNINT);
4895 (void) thread_block((thread_continue_t)mbuf_worker_thread);
4896 }
4897 }
4898
4899 static void
4900 mbuf_worker_thread_init(void)
4901 {
4902 mbuf_worker_ready++;
4903 mbuf_worker_thread();
4904 }
4905
4906 static mcl_slab_t *
4907 slab_get(void *buf)
4908 {
4909 mcl_slabg_t *slg;
4910 unsigned int ix, k;
4911
4912 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4913
4914 VERIFY(MBUF_IN_MAP(buf));
4915 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
4916 VERIFY(ix < maxslabgrp);
4917
4918 if ((slg = slabstbl[ix]) == NULL) {
4919 /*
4920 * In the current implementation, we never shrink the memory
4921 * pool (hence the cluster map); if we attempt to reallocate
4922 * a cluster group when it's already allocated, panic since
4923 * this is a sign of a memory corruption (slabstbl[ix] got
4924 * nullified). This also means that there shouldn't be any
4925 * hole in the kernel sub-map for the mbuf pool.
4926 */
4927 ++slabgrp;
4928 VERIFY(ix < slabgrp);
4929 /*
4930 * Slabs expansion can only be done single threaded; when
4931 * we get here, it must be as a result of m_clalloc() which
4932 * is serialized and therefore mb_clalloc_busy must be set.
4933 */
4934 VERIFY(mb_clalloc_busy);
4935 lck_mtx_unlock(mbuf_mlock);
4936
4937 /* This is a new buffer; create the slabs group for it */
4938 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
4939 M_WAITOK | M_ZERO);
4940 VERIFY(slg != NULL);
4941
4942 lck_mtx_lock(mbuf_mlock);
4943 /*
4944 * No other thread could have gone into m_clalloc() after
4945 * we dropped the lock above, so verify that it's true.
4946 */
4947 VERIFY(mb_clalloc_busy);
4948
4949 slabstbl[ix] = slg;
4950
4951 /* Chain each slab in the group to its forward neighbor */
4952 for (k = 1; k < NSLABSPMB; k++)
4953 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
4954 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
4955
4956 /* And chain the last slab in the previous group to this */
4957 if (ix > 0) {
4958 VERIFY(slabstbl[ix - 1]->
4959 slg_slab[NSLABSPMB - 1].sl_next == NULL);
4960 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
4961 &slg->slg_slab[0];
4962 }
4963 }
4964
4965 ix = MTOCL(buf) % NSLABSPMB;
4966 VERIFY(ix < NSLABSPMB);
4967
4968 return (&slg->slg_slab[ix]);
4969 }
4970
4971 static void
4972 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
4973 void *base, void *head, unsigned int len, int refcnt, int chunks)
4974 {
4975 sp->sl_class = class;
4976 sp->sl_flags = flags;
4977 sp->sl_base = base;
4978 sp->sl_head = head;
4979 sp->sl_len = len;
4980 sp->sl_refcnt = refcnt;
4981 sp->sl_chunks = chunks;
4982 slab_detach(sp);
4983 }
4984
4985 static void
4986 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
4987 {
4988 VERIFY(slab_is_detached(sp));
4989 m_slab_cnt(class)++;
4990 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
4991 sp->sl_flags &= ~SLF_DETACHED;
4992 if (class == MC_BIGCL) {
4993 sp = sp->sl_next;
4994 /* Next slab must already be present */
4995 VERIFY(sp != NULL);
4996 VERIFY(slab_is_detached(sp));
4997 sp->sl_flags &= ~SLF_DETACHED;
4998 } else if (class == MC_16KCL) {
4999 int k;
5000 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5001 sp = sp->sl_next;
5002 /* Next slab must already be present */
5003 VERIFY(sp != NULL);
5004 VERIFY(slab_is_detached(sp));
5005 sp->sl_flags &= ~SLF_DETACHED;
5006 }
5007 }
5008 }
5009
5010 static void
5011 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5012 {
5013 VERIFY(!slab_is_detached(sp));
5014 VERIFY(m_slab_cnt(class) > 0);
5015 m_slab_cnt(class)--;
5016 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5017 slab_detach(sp);
5018 if (class == MC_BIGCL) {
5019 sp = sp->sl_next;
5020 /* Next slab must already be present */
5021 VERIFY(sp != NULL);
5022 VERIFY(!slab_is_detached(sp));
5023 slab_detach(sp);
5024 } else if (class == MC_16KCL) {
5025 int k;
5026 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5027 sp = sp->sl_next;
5028 /* Next slab must already be present */
5029 VERIFY(sp != NULL);
5030 VERIFY(!slab_is_detached(sp));
5031 slab_detach(sp);
5032 }
5033 }
5034 }
5035
5036 static boolean_t
5037 slab_inrange(mcl_slab_t *sp, void *buf)
5038 {
5039 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5040 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5041 }
5042
5044
5045 static void
5046 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5047 {
5048 int i;
5049 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5050 uintptr_t buf = (uintptr_t)sp->sl_base;
5051
5052 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
5053 void *next = ((mcache_obj_t *)buf)->obj_next;
5054 if (next != addr)
5055 continue;
5056 if (mclaudit == NULL) {
5057 if (next != NULL && !MBUF_IN_MAP(next)) {
5058 mcache_t *cp = m_cache(sp->sl_class);
5059 panic("%s: %s buffer %p in slab %p modified "
5060 "after free at offset 0: %p out of range "
5061 "[%p-%p)\n", __func__, cp->mc_name,
5062 (void *)buf, sp, next, mbutl, embutl);
5063 /* NOTREACHED */
5064 }
5065 } else {
5066 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
5067 (mcache_obj_t *)buf);
5068 mcl_audit_verify_nextptr(next, mca);
5069 }
5070 }
5071 }
5072
5073 static void
5074 slab_detach(mcl_slab_t *sp)
5075 {
5076 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
5077 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
5078 sp->sl_flags |= SLF_DETACHED;
5079 }
5080
5081 static boolean_t
5082 slab_is_detached(mcl_slab_t *sp)
5083 {
5084 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
5085 (intptr_t)sp->sl_link.tqe_prev == -1 &&
5086 (sp->sl_flags & SLF_DETACHED));
5087 }
5088
5089 static void
5090 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
5091 mcache_obj_t **con_list, size_t con_size, unsigned int num)
5092 {
5093 mcache_audit_t *mca, *mca_tail;
5094 mcache_obj_t *con = NULL;
5095 boolean_t save_contents = (con_list != NULL);
5096 unsigned int i, ix;
5097
5098 ASSERT(num <= NMBPCL);
5099 ASSERT(con_list == NULL || con_size != 0);
5100
5101 ix = MTOCL(buf);
5102 /* Make sure we haven't been here before */
5103 for (i = 0; i < NMBPCL; i++)
5104 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
5105
5106 mca = mca_tail = *mca_list;
5107 if (save_contents)
5108 con = *con_list;
5109
5110 for (i = 0; i < num; i++) {
5111 mcache_audit_t *next;
5112
5113 next = mca->mca_next;
5114 bzero(mca, sizeof (*mca));
5115 mca->mca_next = next;
5116 mclaudit[ix].cl_audit[i] = mca;
5117
5118 /* Attach the contents buffer if requested */
5119 if (save_contents) {
5120 VERIFY(con != NULL);
5121 mca->mca_contents_size = con_size;
5122 mca->mca_contents = con;
5123 con = con->obj_next;
5124 bzero(mca->mca_contents, mca->mca_contents_size);
5125 }
5126
5127 mca_tail = mca;
5128 mca = mca->mca_next;
5129 }
5130
5131 if (save_contents)
5132 *con_list = con;
5133
5134 *mca_list = mca_tail->mca_next;
5135 mca_tail->mca_next = NULL;
5136 }
5137
5138 /*
5139 * Given an address of a buffer (mbuf/cluster/big cluster), return
5140 * the corresponding audit structure for that buffer.
5141 */
5142 static mcache_audit_t *
5143 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
5144 {
5145 mcache_audit_t *mca = NULL;
5146 int ix = MTOCL(o);
5147
5148 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
5149
5150 switch (class) {
5151 case MC_MBUF:
5152 /*
5153 * For the mbuf case, find the index of the cluster
5154 * used by the mbuf and use that index to locate the
5155 * base address of the cluster. Then find out the
5156 * mbuf index relative to the cluster base and use
5157 * it to locate the audit structure.
5158 */
5159 VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
5160 mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
5161 break;
5162
5163 case MC_CL:
5164 case MC_BIGCL:
5165 case MC_16KCL:
5166 /*
5167 * Same as above, but only return the first element.
5168 */
5169 mca = mclaudit[ix].cl_audit[0];
5170 break;
5171
5172 default:
5173 VERIFY(0);
5174 /* NOTREACHED */
5175 }
5176
5177 return (mca);
5178 }
5179
5180 static void
5181 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
5182 boolean_t alloc)
5183 {
5184 struct mbuf *m = addr;
5185 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
5186
5187 VERIFY(mca->mca_contents != NULL &&
5188 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
5189
5190 mcl_audit_verify_nextptr(next, mca);
5191
5192 if (!alloc) {
5193 /* Save constructed mbuf fields */
5194 mcl_audit_save_mbuf(m, mca);
5195 mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
5196 ((mcache_obj_t *)m)->obj_next = next;
5197 return;
5198 }
5199
5200 /* Check if the buffer has been corrupted while in freelist */
5201 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
5202
5203 /* Restore constructed mbuf fields */
5204 mcl_audit_restore_mbuf(m, mca, composite);
5205 }
5206
5207 static void
5208 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
5209 {
5210 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
5211
5212 if (composite) {
5213 struct mbuf *next = m->m_next;
5214 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
5215 MBUF_IS_COMPOSITE(ms));
5216 /*
5217 * We could have hand-picked the mbuf fields and restored
5218 * them individually, but that would be a maintenance
5219 * headache. Instead, restore everything that was saved;
5220 * the mbuf layer will recheck and reinitialize anyway.
5221 */
5222 bcopy(ms, m, mca->mca_contents_size);
5223 m->m_next = next;
5224 } else {
5225 /*
5226 * For a regular mbuf (no cluster attached) there's nothing
5227 * to restore other than the type field, which is expected
5228 * to be MT_FREE.
5229 */
5230 m->m_type = ms->m_type;
5231 }
5232 _MCHECK(m);
5233 }
5234
5235 static void
5236 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
5237 {
5238 _MCHECK(m);
5239 bcopy(m, mca->mca_contents, mca->mca_contents_size);
5240 }
5241
5242 static void
5243 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
5244 boolean_t save_next)
5245 {
5246 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
5247
5248 if (!alloc) {
5249 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
5250 if (save_next) {
5251 mcl_audit_verify_nextptr(next, mca);
5252 ((mcache_obj_t *)addr)->obj_next = next;
5253 }
5254 } else {
5255 /* Check if the buffer has been corrupted while in freelist */
5256 mcl_audit_verify_nextptr(next, mca);
5257 mcache_audit_free_verify_set(mca, addr, 0, size);
5258 }
5259 }
5260
5261 static void
5262 mcl_audit_mcheck_panic(struct mbuf *m)
5263 {
5264 mcache_audit_t *mca;
5265
5266 MRANGE(m);
5267 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
5268
5269 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
5270 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
5271 /* NOTREACHED */
5272 }
5273
5274 static void
5275 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5276 {
5277 if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
5278 !MBUF_IN_MAP(next)) {
5279 panic("mcl_audit: buffer %p modified after free at offset 0: "
5280 "%p out of range [%p-%p)\n%s\n",
5281 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
5282 /* NOTREACHED */
5283 }
5284 }
5285
5286 SYSCTL_DECL(_kern_ipc);
5287 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
5288 0, 0, mbstat_sysctl, "S,mbstat", "");
5289 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
5290 0, 0, mb_stat_sysctl, "S,mb_stat", "");
5291 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
5292 &mb_normalized, 0, "");