1 /*
2 * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <dev/random/randomdev.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/simple_lock.h>
86 #include <kern/queue.h>
87 #include <kern/sched_prim.h>
88 #include <kern/backtrace.h>
89 #include <kern/cpu_number.h>
90 #include <kern/zalloc.h>
91
92 #include <libkern/OSAtomic.h>
93 #include <libkern/OSDebug.h>
94 #include <libkern/libkern.h>
95
96 #include <IOKit/IOMapper.h>
97
98 #include <machine/limits.h>
99 #include <machine/machine_routines.h>
100
101 #if CONFIG_MACF_NET
102 #include <security/mac_framework.h>
103 #endif /* CONFIG_MACF_NET */
104
105 #include <sys/mcache.h>
106 #include <net/ntstat.h>
107
108 /*
109 * MBUF IMPLEMENTATION NOTES.
110 *
111 * There are a total of 5 per-CPU caches:
112 *
113 * MC_MBUF:
114 * This is a cache of rudimentary objects of MSIZE in size; each
115 * object represents an mbuf structure. This cache preserves only
116 * the m_type field of the mbuf during its transactions.
117 *
118 * MC_CL:
119 * This is a cache of rudimentary objects of MCLBYTES in size; each
120 * object represents an mcluster structure. This cache does not
121 * preserve the contents of the objects during its transactions.
122 *
123 * MC_BIGCL:
124 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
125 * object represents an mbigcluster structure. This cache does not
126 * preserve the contents of the objects during its transactions.
127 *
128 * MC_MBUF_CL:
129 * This is a cache of mbufs each having a cluster attached to it.
130 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
131 * fields of the mbuf related to the external cluster are preserved
132 * during transactions.
133 *
134 * MC_MBUF_BIGCL:
135 * This is a cache of mbufs each having a big cluster attached to it.
136 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
137 * fields of the mbuf related to the external cluster are preserved
138 * during transactions.
139 *
140 * OBJECT ALLOCATION:
141 *
142 * Allocation requests are handled first at the per-CPU (mcache) layer
143 * before falling back to the slab layer. Performance is optimal when
144 * the request is satisfied at the CPU layer because global data/lock
145 * never gets accessed. When the slab layer is entered for allocation,
146 * the slab freelist will be checked first for available objects before
147 * the VM backing store is invoked. Slab layer operations are serialized
148 * for all of the caches as the mbuf global lock is held most of the time.
149 * Allocation paths are different depending on the class of objects:
150 *
151 * a. Rudimentary object:
152 *
153 * { m_get_common(), m_clattach(), m_mclget(),
154 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
155 * composite object allocation }
156 * | ^
157 * | |
158 * | +-----------------------+
159 * v |
160 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
161 * | ^
162 * v |
163 * [CPU cache] -------> (found?) -------+
164 * | |
165 * v |
166 * mbuf_slab_alloc() |
167 * | |
168 * v |
169 * +---------> [freelist] -------> (found?) -------+
170 * | |
171 * | v
172 * | m_clalloc()
173 * | |
174 * | v
175 * +---<<---- kmem_mb_alloc()
176 *
177 * b. Composite object:
178 *
179 * { m_getpackets_internal(), m_allocpacket_internal() }
180 * | ^
181 * | |
182 * | +------ (done) ---------+
183 * v |
184 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
185 * | ^
186 * v |
187 * [CPU cache] -------> (found?) -------+
188 * | |
189 * v |
190 * mbuf_cslab_alloc() |
191 * | |
192 * v |
193 * [freelist] -------> (found?) -------+
194 * | |
195 * v |
196 * (rudimentary object) |
197 * mcache_alloc/mcache_alloc_ext() ------>>-----+
198 *
199 * Auditing notes: If auditing is enabled, buffers will be subjected to
200 * integrity checks by the audit routine. This is done by verifying their
201 * contents against DEADBEEF (free) pattern before returning them to caller.
202 * As part of this step, the routine will also record the transaction and
203 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
204 * also restore any constructed data structure fields if necessary.
205 *
206 * OBJECT DEALLOCATION:
207 *
208 * Freeing an object simply involves placing it into the CPU cache; this
209 * pollutes the cache to benefit subsequent allocations. The slab layer
210 * will only be entered if the object is to be purged out of the cache.
211 * During normal operations, this happens only when the CPU layer resizes
212 * its bucket while it's adjusting to the allocation load. Deallocation
213 * paths are different depending on the class of objects:
214 *
215 * a. Rudimentary object:
216 *
217 * { m_free(), m_freem_list(), composite object deallocation }
218 * | ^
219 * | |
220 * | +------ (done) ---------+
221 * v |
222 * mcache_free/mcache_free_ext() |
223 * | |
224 * v |
225 * mbuf_slab_audit() |
226 * | |
227 * v |
228 * [CPU cache] ---> (not purging?) -----+
229 * | |
230 * v |
231 * mbuf_slab_free() |
232 * | |
233 * v |
234 * [freelist] ----------->>------------+
235 * (objects get purged to VM only on demand)
236 *
237 * b. Composite object:
238 *
239 * { m_free(), m_freem_list() }
240 * | ^
241 * | |
242 * | +------ (done) ---------+
243 * v |
244 * mcache_free/mcache_free_ext() |
245 * | |
246 * v |
247 * mbuf_cslab_audit() |
248 * | |
249 * v |
250 * [CPU cache] ---> (not purging?) -----+
251 * | |
252 * v |
253 * mbuf_cslab_free() |
254 * | |
255 * v |
256 * [freelist] ---> (not purging?) -----+
257 * | |
258 * v |
259 * (rudimentary object) |
260 * mcache_free/mcache_free_ext() ------->>------+
261 *
262 * Auditing notes: If auditing is enabled, the audit routine will save
263 * any constructed data structure fields (if necessary) before filling the
264 * contents of the buffers with DEADBEEF (free) pattern and recording the
265 * transaction. Buffers that are freed (whether at CPU or slab layer) are
266 * expected to contain the free pattern.
267 *
268 * DEBUGGING:
269 *
270 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
271 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
272 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
273 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
274 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
275 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
276 *
277 * Each object is associated with exactly one mcache_audit_t structure that
278 * contains the information related to its last buffer transaction. Given
279 * an address of an object, the audit structure can be retrieved by finding
280 * the position of the object relevant to the base address of the cluster:
281 *
282 * +------------+ +=============+
283 * | mbuf addr | | mclaudit[i] |
284 * +------------+ +=============+
285 * | | cl_audit[0] |
286 * i = MTOBG(addr) +-------------+
287 * | +-----> | cl_audit[1] | -----> mcache_audit_t
288 * b = BGTOM(i) | +-------------+
289 * | | | ... |
290 * x = MCLIDX(b, addr) | +-------------+
291 * | | | cl_audit[7] |
292 * +-----------------+ +-------------+
293 * (e.g. x == 1)
294 *
295 * The mclaudit[] array is allocated at initialization time, but its contents
296 * get populated when the corresponding cluster is created. Because a page
297 * can be turned into NMBPG mbufs, we preserve enough space for all of the
298 * mbufs so that there is a 1-to-1 mapping between them. A page that never
299 * gets (or has not yet been) turned into mbufs will use only cl_audit[0],
300 * with the remaining entries unused. For a 16KB cluster, only one entry from
301 * the first page is allocated and used for the entire object (see the sketch below).
302 */
303
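/*
 * Illustrative sketch only (the authoritative logic lives in
 * mcl_audit_buf2mca() further below): given the address of an mbuf "m",
 * its audit structure can be looked up roughly as
 *
 *	ix  = MTOPG(m);                      page index into mclaudit[]
 *	idx = MBPAGEIDX(PGTOM(ix), m);       mbuf slot within that page
 *	mca = mclaudit[ix].cl_audit[idx];    per-object mcache_audit_t
 *
 * with CLPAGEIDX()/BCLPAGEIDX() playing the same role for 2KB and 4KB
 * clusters, respectively.
 */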
304 /* TODO: should be in header file */
305 /* kernel translater */
306 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
307 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
308 extern vm_map_t mb_map; /* special map */
309
310 static uint32_t mb_kmem_contig_failed;
311 static uint32_t mb_kmem_failed;
312 static uint32_t mb_kmem_one_failed;
313 /* Timestamp of allocation failures. */
314 static uint64_t mb_kmem_contig_failed_ts;
315 static uint64_t mb_kmem_failed_ts;
316 static uint64_t mb_kmem_one_failed_ts;
317 static uint64_t mb_kmem_contig_failed_size;
318 static uint64_t mb_kmem_failed_size;
319 static uint32_t mb_kmem_stats[6];
320 static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
321 "INVALID_ADDRESS",
322 "RESOURCE_SHORTAGE",
323 "NO_SPACE",
324 "KERN_FAILURE",
325 "OTHERS" };
326
327 /* Global lock */
328 decl_lck_mtx_data(static, mbuf_mlock_data);
329 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
330 static lck_attr_t *mbuf_mlock_attr;
331 static lck_grp_t *mbuf_mlock_grp;
332 static lck_grp_attr_t *mbuf_mlock_grp_attr;
333
334 /* Back-end (common) layer */
335 static uint64_t mb_expand_cnt;
336 static uint64_t mb_expand_cl_cnt;
337 static uint64_t mb_expand_cl_total;
338 static uint64_t mb_expand_bigcl_cnt;
339 static uint64_t mb_expand_bigcl_total;
340 static uint64_t mb_expand_16kcl_cnt;
341 static uint64_t mb_expand_16kcl_total;
342 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
343 static uint32_t mbuf_worker_run_cnt;
344 static uint64_t mbuf_worker_last_runtime;
345 static int mbuf_worker_ready; /* worker thread is runnable */
346 static int ncpu; /* number of CPUs */
347 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
348 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
349 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
350 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
351 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
352 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
353 static unsigned int mb_normalized; /* number of packets "normalized" */
354
355 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
356 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
357
358 typedef enum {
359 MC_MBUF = 0, /* Regular mbuf */
360 MC_CL, /* Cluster */
361 MC_BIGCL, /* Large (4KB) cluster */
362 MC_16KCL, /* Jumbo (16KB) cluster */
363 MC_MBUF_CL, /* mbuf + cluster */
364 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
365 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
366 } mbuf_class_t;
367
368 #define MBUF_CLASS_MIN MC_MBUF
369 #define MBUF_CLASS_MAX MC_MBUF_16KCL
370 #define MBUF_CLASS_LAST MC_16KCL
371 #define MBUF_CLASS_VALID(c) \
372 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
373 #define MBUF_CLASS_COMPOSITE(c) \
374 ((int)(c) > MBUF_CLASS_LAST)
375
376
377 /*
378 * mbuf specific mcache allocation request flags.
379 */
380 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
381
382 /*
383 * Per-cluster slab structure.
384 *
385 * A slab is a cluster control structure that contains one or more object
386 * chunks; the available chunks are chained in the slab's freelist (sl_head).
387 * Each time a chunk is taken out of the slab, the slab's reference count
388 * gets incremented. When all chunks have been taken out, the empty slab
389 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
390 * returned to a slab causes the slab's reference count to be decremented;
391 * it also causes the slab to be reinserted into the class's slab list, if
392 * it is not already there.
393 *
394 * Compartmentalizing the object chunks into slabs allows us to easily
395 * merge one or more slabs together when the adjacent slabs are idle, as
396 * well as to convert or move a slab from one class to another; e.g. the
397 * mbuf cluster slab can be converted to a regular cluster slab when all
398 * mbufs in the slab have been freed.
399 *
400 * A slab may also span multiple clusters for chunks larger than
401 * a cluster's size. In this case, only the slab of the first cluster is
402 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
403 * that they are part of the larger slab.
404 *
405 * Each slab controls a page of memory.
406 */
407 typedef struct mcl_slab {
408 struct mcl_slab *sl_next; /* neighboring slab */
409 u_int8_t sl_class; /* controlling mbuf class */
410 int8_t sl_refcnt; /* outstanding allocations */
411 int8_t sl_chunks; /* chunks (bufs) in this slab */
412 u_int16_t sl_flags; /* slab flags (see below) */
413 u_int16_t sl_len; /* slab length */
414 void *sl_base; /* base of allocated memory */
415 void *sl_head; /* first free buffer */
416 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
417 } mcl_slab_t;
418
419 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
420 #define SLF_PARTIAL 0x0002 /* part of another slab */
421 #define SLF_DETACHED 0x0004 /* not in slab freelist */
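/*
 * Illustrative lifecycle sketch (not the exact slab_alloc()/slab_free()
 * logic below), for a slab "sl" of some class "class":
 *
 *	alloc:	buf = sl->sl_head; sl->sl_head = next; sl->sl_refcnt++;
 *		if (sl->sl_head == NULL)	all chunks handed out
 *			slab_remove(sl, class);	slab becomes SLF_DETACHED
 *	free:	chain buf back onto sl->sl_head; sl->sl_refcnt--;
 *		if (slab_is_detached(sl))
 *			slab_insert(sl, class);	back on the slab freelist
 */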
422
423 /*
424 * The array of slabs is broken into groups of arrays, one per 1MB of
425 * kernel memory, to reduce the footprint. Each group is allocated on demand
426 * whenever a new piece of memory mapped in from the VM crosses the 1MB
427 * boundary.
428 */
429 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
430
431 typedef struct mcl_slabg {
432 mcl_slab_t *slg_slab; /* group of slabs */
433 } mcl_slabg_t;
434
435 /*
436 * Number of slabs needed to control a 16KB cluster object.
437 */
438 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
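/*
 * For illustration, assuming 4KB pages (PAGE_SHIFT == 12):
 * NSLABSPMB = (1MB >> 12) = 256 slabs per 1MB group, and
 * NSLABSP16KB = (16KB >> 12) = 4 slabs per 16KB cluster object.
 */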
439
440 /*
441 * Per-cluster audit structure.
442 */
443 typedef struct {
444 mcache_audit_t **cl_audit; /* array of audits */
445 } mcl_audit_t;
446
447 typedef struct {
448 struct thread *msa_thread; /* thread doing transaction */
449 struct thread *msa_pthread; /* previous transaction thread */
450 uint32_t msa_tstamp; /* transaction timestamp (ms) */
451 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
452 uint16_t msa_depth; /* pc stack depth */
453 uint16_t msa_pdepth; /* previous transaction pc stack */
454 void *msa_stack[MCACHE_STACK_DEPTH];
455 void *msa_pstack[MCACHE_STACK_DEPTH];
456 } mcl_scratch_audit_t;
457
458 typedef struct {
459 /*
460 * Size of data from the beginning of an mbuf that covers m_hdr,
461 * pkthdr and m_ext structures. If auditing is enabled, we allocate
462 * a shadow mbuf structure of this size inside each audit structure,
463 * and the contents of the real mbuf gets copied into it when the mbuf
464 * is freed. This allows us to pattern-fill the mbuf for integrity
465 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
466 * cluster cache case). Note that we don't save the contents of
467 * clusters when they are freed; we simply pattern-fill them.
468 */
469 u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
470 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
471 } mcl_saved_contents_t;
472
473 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
474
475 #define MCA_SAVED_MBUF_PTR(_mca) \
476 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
477 (_mca)->mca_contents)->sc_mbuf)
478 #define MCA_SAVED_MBUF_SIZE \
479 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
480 #define MCA_SAVED_SCRATCH_PTR(_mca) \
481 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
482
483 /*
484 * mbuf specific mcache audit flags
485 */
486 #define MB_INUSE 0x01 /* object has not been returned to slab */
487 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
488 #define MB_SCVALID 0x04 /* object has valid saved contents */
489
490 /*
491 * Each of the following two arrays hold up to nmbclusters elements.
492 */
493 static mcl_audit_t *mclaudit; /* array of cluster audit information */
494 static unsigned int maxclaudit; /* max # of entries in audit table */
495 static mcl_slabg_t **slabstbl; /* cluster slabs table */
496 static unsigned int maxslabgrp; /* max # of entries in slabs table */
497 static unsigned int slabgrp; /* # of entries in slabs table */
498
499 /* Globals */
500 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
501 int njcl; /* # of clusters for jumbo sizes */
502 int njclbytes; /* size of a jumbo cluster */
503 unsigned char *mbutl; /* first mapped cluster address */
504 unsigned char *embutl; /* ending virtual address of mclusters */
505 int _max_linkhdr; /* largest link-level header */
506 int _max_protohdr; /* largest protocol header */
507 int max_hdr; /* largest link+protocol header */
508 int max_datalen; /* MHLEN - max_hdr */
509
510 static boolean_t mclverify; /* debug: pattern-checking */
511 static boolean_t mcltrace; /* debug: stack tracing */
512 static boolean_t mclfindleak; /* debug: leak detection */
513 static boolean_t mclexpleak; /* debug: expose leak info to user space */
514
515 static struct timeval mb_start; /* beginning of time */
516
517 /* mbuf leak detection variables */
518 static struct mleak_table mleak_table;
519 static mleak_stat_t *mleak_stat;
520
521 #define MLEAK_STAT_SIZE(n) \
522 __builtin_offsetof(mleak_stat_t, ml_trace[n])
523
524 struct mallocation {
525 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
526 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
527 u_int32_t count; /* How many objects were requested */
528 u_int64_t hitcount; /* for determining hash effectiveness */
529 };
530
531 struct mtrace {
532 u_int64_t collisions;
533 u_int64_t hitcount;
534 u_int64_t allocs;
535 u_int64_t depth;
536 uintptr_t addr[MLEAK_STACK_DEPTH];
537 };
538
539 /* Size must be a power of two for the zhash to be able to just mask off bits */
540 #define MLEAK_ALLOCATION_MAP_NUM 512
541 #define MLEAK_TRACE_MAP_NUM 256
542
543 /*
544 * Sample factor for how often to record a trace. This is overwritable
545 * by the boot-arg mleak_sample_factor.
546 */
547 #define MLEAK_SAMPLE_FACTOR 500
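/*
 * Illustration only (the variable names here are hypothetical; the real
 * check lives in mleak_logger() below): with a factor of 500, roughly one
 * allocation out of every 500 has its backtrace captured, e.g.
 *
 *	if ((num_traced_allocs++ % sample_factor) != 0)
 *		return;		recording every backtrace would be too costly
 */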
548
549 /*
550 * Number of top leakers recorded.
551 */
552 #define MLEAK_NUM_TRACES 5
553
554 #define MB_LEAK_SPACING_64 " "
555 #define MB_LEAK_SPACING_32 " "
556
557
558 #define MB_LEAK_HDR_32 "\n\
559 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
560 ---------- ---------- ---------- ---------- ---------- \n\
561 "
562
563 #define MB_LEAK_HDR_64 "\n\
564 trace [1] trace [2] trace [3] \
565 trace [4] trace [5] \n\
566 ------------------ ------------------ ------------------ \
567 ------------------ ------------------ \n\
568 "
569
570 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
571 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
572
573 /* Hashmaps of allocations and their corresponding traces */
574 static struct mallocation *mleak_allocations;
575 static struct mtrace *mleak_traces;
576 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
577
578 /* Lock to protect mleak tables from concurrent modification */
579 decl_lck_mtx_data(static, mleak_lock_data);
580 static lck_mtx_t *mleak_lock = &mleak_lock_data;
581 static lck_attr_t *mleak_lock_attr;
582 static lck_grp_t *mleak_lock_grp;
583 static lck_grp_attr_t *mleak_lock_grp_attr;
584
585 /* Lock to protect the completion callback table */
586 static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
587 static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
588 static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
589 decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
590 lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
591
592 extern u_int32_t high_sb_max;
593
594 /* The minimum number of objects that are allocated, to start. */
595 #define MINCL 32
596 #define MINBIGCL (MINCL >> 1)
597 #define MIN16KCL (MINCL >> 2)
598
599 /* Low watermarks (only map in pages once free counts go below) */
600 #define MBIGCL_LOWAT MINBIGCL
601 #define M16KCL_LOWAT MIN16KCL
602
603 typedef struct {
604 mbuf_class_t mtbl_class; /* class type */
605 mcache_t *mtbl_cache; /* mcache for this buffer class */
606 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
607 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
608 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
609 u_int32_t mtbl_maxsize; /* maximum buffer size */
610 int mtbl_minlimit; /* minimum allowed */
611 int mtbl_maxlimit; /* maximum allowed */
612 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
613 uint32_t mtbl_avgtotal; /* average total on iOS */
614 u_int32_t mtbl_expand; /* worker should expand the class */
615 } mbuf_table_t;
616
617 #define m_class(c) mbuf_table[c].mtbl_class
618 #define m_cache(c) mbuf_table[c].mtbl_cache
619 #define m_slablist(c) mbuf_table[c].mtbl_slablist
620 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
621 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
622 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
623 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
624 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
625 #define m_avgtotal(c) mbuf_table[c].mtbl_avgtotal
626 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
627 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
628 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
629 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
630 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
631 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
632 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
633 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
634 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
635 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
636 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
637 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
638 #define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
639 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
640 #define m_region_expand(c) mbuf_table[c].mtbl_expand
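/*
 * Example (illustrative): m_total(MC_CL) expands to
 * mbuf_table[MC_CL].mtbl_stats->mbcl_total, i.e. the number of 2KB
 * clusters currently known to the allocator, while m_maxlimit(MC_CL)
 * is the ceiling on how many may ever be created.
 */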
641
642 static mbuf_table_t mbuf_table[] = {
643 /*
644 * The caches for mbufs, regular clusters and big clusters.
645 * The average total values were based on data gathered from actual
646 * usage patterns on iOS.
647 */
648 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
649 NULL, NULL, 0, 0, 0, 0, 3000, 0 },
650 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
651 NULL, NULL, 0, 0, 0, 0, 2000, 0 },
652 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
653 NULL, NULL, 0, 0, 0, 0, 1000, 0 },
654 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
655 NULL, NULL, 0, 0, 0, 0, 200, 0 },
656 /*
657 * The following are special caches; they serve as intermediate
658 * caches backed by the above rudimentary caches. Each object
659 * in the cache is an mbuf with a cluster attached to it. Unlike
660 * the above caches, these intermediate caches do not directly
661 * deal with the slab structures; instead, the constructed
662 * cached elements are simply stored in the freelists.
663 */
664 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
665 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
666 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
667 };
668
669 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
670
671 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
672 static int mb_waiters; /* number of waiters */
673
674 boolean_t mb_peak_newreport = FALSE;
675 boolean_t mb_peak_firstreport = FALSE;
676
677 /* generate a report by default after 1 week of uptime */
678 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
679
680 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
681 static struct timeval mb_wdtstart; /* watchdog start timestamp */
682 static char *mbuf_dump_buf;
683
684 #define MBUF_DUMP_BUF_SIZE 3072
685
686 /*
687 * The mbuf watchdog is enabled by default on embedded platforms. It is
688 * also toggleable via the kern.ipc.mb_watchdog sysctl.
689 * Garbage collection is also enabled by default on embedded platforms.
690 * mb_drain_maxint controls the amount of time to wait (in seconds) between
691 * consecutive calls to m_drain().
692 */
693 #if CONFIG_EMBEDDED
694 static unsigned int mb_watchdog = 1;
695 static unsigned int mb_drain_maxint = 60;
696 #else
697 static unsigned int mb_watchdog = 0;
698 static unsigned int mb_drain_maxint = 0;
699 #endif /* CONFIG_EMBEDDED */
700
701 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
702 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
703
704 /* Red zone */
705 static u_int32_t mb_redzone_cookie;
706 static void m_redzone_init(struct mbuf *);
707 static void m_redzone_verify(struct mbuf *m);
708
709 /* The following are used to serialize m_clalloc() */
710 static boolean_t mb_clalloc_busy;
711 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
712 static int mb_clalloc_waiters;
713
714 static void mbuf_mtypes_sync(boolean_t);
715 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
716 static void mbuf_stat_sync(void);
717 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
718 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
719 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
720 static char *mbuf_dump(void);
721 static void mbuf_table_init(void);
722 static inline void m_incref(struct mbuf *);
723 static inline u_int16_t m_decref(struct mbuf *);
724 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
725 static void mbuf_worker_thread_init(void);
726 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
727 static void slab_free(mbuf_class_t, mcache_obj_t *);
728 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
729 unsigned int, int);
730 static void mbuf_slab_free(void *, mcache_obj_t *, int);
731 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
732 static void mbuf_slab_notify(void *, u_int32_t);
733 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
734 unsigned int);
735 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
736 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
737 unsigned int, int);
738 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
739 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
740 static int freelist_populate(mbuf_class_t, unsigned int, int);
741 static void freelist_init(mbuf_class_t);
742 static boolean_t mbuf_cached_above(mbuf_class_t, int);
743 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
744 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
745 static int m_howmany(int, size_t);
746 static void mbuf_worker_thread(void);
747 static void mbuf_watchdog(void);
748 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
749
750 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
751 size_t, unsigned int);
752 static void mcl_audit_free(void *, unsigned int);
753 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
754 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
755 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
756 boolean_t);
757 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
758 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
759 static void mcl_audit_scratch(mcache_audit_t *);
760 static void mcl_audit_mcheck_panic(struct mbuf *);
761 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
762
763 static void mleak_activate(void);
764 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
765 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
766 static void mleak_free(mcache_obj_t *);
767 static void mleak_sort_traces(void);
768 static void mleak_update_stats(void);
769
770 static mcl_slab_t *slab_get(void *);
771 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
772 void *, void *, unsigned int, int, int);
773 static void slab_insert(mcl_slab_t *, mbuf_class_t);
774 static void slab_remove(mcl_slab_t *, mbuf_class_t);
775 static boolean_t slab_inrange(mcl_slab_t *, void *);
776 static void slab_nextptr_panic(mcl_slab_t *, void *);
777 static void slab_detach(mcl_slab_t *);
778 static boolean_t slab_is_detached(mcl_slab_t *);
779
780 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
781 static struct mbuf *m_split0(struct mbuf *, int, int, int);
782 __private_extern__ void mbuf_report_peak_usage(void);
783 static boolean_t mbuf_report_usage(mbuf_class_t);
784
785 /* flags for m_copyback0 */
786 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
787 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
788 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
789 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
790
791 /*
792 * This flag is set for all mbufs that come out of and into the composite
793 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
794 * are marked with such a flag have clusters attached to them, and will be
795 * treated differently when they are freed; instead of being placed back
796 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
797 * are placed back into the appropriate composite cache's freelist, and the
798 * actual freeing is deferred until the composite objects are purged. At
799 * such a time, this flag will be cleared from the mbufs and the objects
800 * will be freed into their own separate freelists.
801 */
802 #define EXTF_COMPOSITE 0x1
803
804 /*
805 * This flag indicates that the external cluster is read-only, i.e. it is
806 * or was referred to by more than one mbuf. Once set, this flag is never
807 * cleared.
808 */
809 #define EXTF_READONLY 0x2
810 /*
811 * This flag indicates that the external cluster is paired with the mbuf.
812 * Pairing implies that an external free routine is defined and will be
813 * invoked when the reference count drops to the minimum at m_free time. This
814 * flag is never cleared.
815 */
816 #define EXTF_PAIRED 0x4
817
818 #define EXTF_MASK \
819 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
820
821 #define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
822 #define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
823 #define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
824 #define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
825 #define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
826 #define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
827 #define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
828 #define MBUF_IS_COMPOSITE(m) \
829 (MEXT_REF(m) == MEXT_MINREF(m) && \
830 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
831 /*
832 * This macro can be used to test if the mbuf is paired to an external
833 * cluster. The test for MEXT_PMBUF being equal to the mbuf in question
834 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
835 * and thus survives calls to m_free_paired.
836 */
837 #define MBUF_IS_PAIRED(m) \
838 (((m)->m_flags & M_EXT) && \
839 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
840 MEXT_PMBUF(m) == (m))
841
842 /*
843 * Macros used to verify the integrity of the mbuf.
844 */
845 #define _MCHECK(m) { \
846 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
847 if (mclaudit == NULL) \
848 panic("MCHECK: m_type=%d m=%p", \
849 (u_int16_t)(m)->m_type, m); \
850 else \
851 mcl_audit_mcheck_panic(m); \
852 } \
853 }
854
855 #define MBUF_IN_MAP(addr) \
856 ((unsigned char *)(addr) >= mbutl && \
857 (unsigned char *)(addr) < embutl)
858
859 #define MRANGE(addr) { \
860 if (!MBUF_IN_MAP(addr)) \
861 panic("MRANGE: address out of range 0x%p", addr); \
862 }
863
864 /*
865 * Macro version of mtod.
866 */
867 #define MTOD(m, t) ((t)((m)->m_data))
868
869 /*
870 * Macros to obtain page index given a base cluster address
871 */
872 #define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
873 #define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
874
875 /*
876 * Macro to find the mbuf index relative to a base.
877 */
878 #define MBPAGEIDX(c, m) \
879 (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)
880
881 /*
882 * Same thing for 2KB cluster index.
883 */
884 #define CLPAGEIDX(c, m) \
885 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
886
887 /*
888 * Macro to find 4KB cluster index relative to a base
889 */
890 #define BCLPAGEIDX(c, m) \
891 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
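/*
 * Example (illustrative, assuming 4KB pages): for an mbuf "m" occupying
 * the third MSIZE-sized slot of its page,
 *
 *	MTOPG(m)			page index of the mbuf within the map
 *	MBPAGEIDX(PGTOM(MTOPG(m)), m)	evaluates to 2 (slots are 0-based)
 *
 * CLPAGEIDX() and BCLPAGEIDX() do the same in 2KB and 4KB cluster units.
 */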
892
893 /*
894 * Macros used during mbuf and cluster initialization.
895 */
896 #define MBUF_INIT_PKTHDR(m) { \
897 (m)->m_pkthdr.rcvif = NULL; \
898 (m)->m_pkthdr.pkt_hdr = NULL; \
899 (m)->m_pkthdr.len = 0; \
900 (m)->m_pkthdr.csum_flags = 0; \
901 (m)->m_pkthdr.csum_data = 0; \
902 (m)->m_pkthdr.vlan_tag = 0; \
903 m_classifier_init(m, 0); \
904 m_tag_init(m, 1); \
905 m_scratch_init(m); \
906 m_redzone_init(m); \
907 }
908
909 #define MBUF_INIT(m, pkthdr, type) { \
910 _MCHECK(m); \
911 (m)->m_next = (m)->m_nextpkt = NULL; \
912 (m)->m_len = 0; \
913 (m)->m_type = type; \
914 if ((pkthdr) == 0) { \
915 (m)->m_data = (m)->m_dat; \
916 (m)->m_flags = 0; \
917 } else { \
918 (m)->m_data = (m)->m_pktdat; \
919 (m)->m_flags = M_PKTHDR; \
920 MBUF_INIT_PKTHDR(m); \
921 } \
922 }
923
924 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \
925 priv, pm) { \
926 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
927 (m)->m_flags |= M_EXT; \
928 m_set_ext((m), (rfa), (free), (arg)); \
929 (m)->m_ext.ext_size = (size); \
930 MEXT_MINREF(m) = (min); \
931 MEXT_REF(m) = (ref); \
932 MEXT_PREF(m) = (pref); \
933 MEXT_FLAGS(m) = (flag); \
934 MEXT_PRIV(m) = (priv); \
935 MEXT_PMBUF(m) = (pm); \
936 }
937
938 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
939 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
940 ref, 0, flag, 0, NULL)
941
942 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
943 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
944 ref, 0, flag, 0, NULL)
945
946 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
947 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
948 ref, 0, flag, 0, NULL)
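/*
 * Illustrative sketch (not the exact constructor code further below) of
 * how these macros combine to build an mbuf with a 2KB cluster attached,
 * where "cl" and "rfa" stand for a previously allocated cluster and its
 * reference/flags structure:
 *
 *	MBUF_INIT(m, 1, MT_DATA);		pkthdr mbuf of type MT_DATA
 *	MBUF_CL_INIT(m, cl, rfa, 1, 0);		attach "cl" with refcnt 1
 */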
949
950 /*
951 * Macro to convert BSD malloc sleep flag to mcache's
952 */
953 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
954
955 /*
956 * The structure that holds all mbuf class statistics exportable via sysctl.
957 * Similar to mbstat structure, the mb_stat structure is protected by the
958 * global mbuf lock. It contains additional information about the classes
959 * that allows for a more accurate view of the state of the allocator.
960 */
961 struct mb_stat *mb_stat;
962 struct omb_stat *omb_stat; /* For backwards compatibility */
963
964 #define MB_STAT_SIZE(n) \
965 __builtin_offsetof(mb_stat_t, mbs_class[n])
966 #define OMB_STAT_SIZE(n) \
967 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
968
969 /*
970 * The legacy structure holding all of the mbuf allocation statistics.
971 * The actual statistics used by the kernel are stored in the mbuf_table
972 * instead, and are updated atomically while the global mbuf lock is held.
973 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
974 * Unlike before, the kernel no longer relies on the contents of mbstat for
975 * its operations (e.g. cluster expansion) because the structure is exposed
976 * to the outside and could possibly be modified, which makes it unsafe.
977 * With the exception of the mbstat.m_mtypes array (see below), all of the
978 * statistics are updated as they change.
979 */
980 struct mbstat mbstat;
981
982 #define MBSTAT_MTYPES_MAX \
983 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
984
985 /*
986 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
987 * atomically and stored in a per-CPU structure which is lock-free; this is
988 * done in order to avoid writing to the global mbstat data structure which
989 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
990 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
991 * array and returned to the application. Any updates for types greater
992 * than or equal to MT_MAX are done atomically on mbstat; this slows down
993 * performance but is okay since the kernel uses only up to MT_MAX-1 while
994 * anything beyond that (up to type 255) is considered a corner case.
995 */
996 typedef struct {
997 unsigned int cpu_mtypes[MT_MAX];
998 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
999
1000 typedef struct {
1001 mtypes_cpu_t mbs_cpu[1];
1002 } mbuf_mtypes_t;
1003
1004 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
1005
1006 #define MBUF_MTYPES_SIZE(n) \
1007 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
1008
1009 #define MTYPES_CPU(p) \
1010 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
1011
1012 #define mtype_stat_add(type, n) { \
1013 if ((unsigned)(type) < MT_MAX) { \
1014 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
1015 atomic_add_32(&mbs->cpu_mtypes[type], n); \
1016 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1017 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
1018 } \
1019 }
1020
1021 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1022 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
1023 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
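/*
 * Typical usage (illustrative): when an mbuf changes type, the per-CPU
 * counters are adjusted in pairs, e.g.
 *
 *	mtype_stat_dec(MT_FREE);
 *	mtype_stat_inc(MT_DATA);
 *
 * which avoids touching the shared mbstat structure for types < MT_MAX.
 */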
1024
1025 static void
1026 mbuf_mtypes_sync(boolean_t locked)
1027 {
1028 int m, n;
1029 mtypes_cpu_t mtc;
1030
1031 if (locked)
1032 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1033
1034 bzero(&mtc, sizeof (mtc));
1035 for (m = 0; m < ncpu; m++) {
1036 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
1037 mtypes_cpu_t temp;
1038
1039 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
1040 sizeof (temp.cpu_mtypes));
1041
1042 for (n = 0; n < MT_MAX; n++)
1043 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
1044 }
1045 if (!locked)
1046 lck_mtx_lock(mbuf_mlock);
1047 for (n = 0; n < MT_MAX; n++)
1048 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1049 if (!locked)
1050 lck_mtx_unlock(mbuf_mlock);
1051 }
1052
1053 static int
1054 mbstat_sysctl SYSCTL_HANDLER_ARGS
1055 {
1056 #pragma unused(oidp, arg1, arg2)
1057 mbuf_mtypes_sync(FALSE);
1058
1059 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
1060 }
1061
1062 static void
1063 mbuf_stat_sync(void)
1064 {
1065 mb_class_stat_t *sp;
1066 mcache_cpu_t *ccp;
1067 mcache_t *cp;
1068 int k, m, bktsize;
1069
1070 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1071
1072 for (k = 0; k < NELEM(mbuf_table); k++) {
1073 cp = m_cache(k);
1074 ccp = &cp->mc_cpu[0];
1075 bktsize = ccp->cc_bktsize;
1076 sp = mbuf_table[k].mtbl_stats;
1077
1078 if (cp->mc_flags & MCF_NOCPUCACHE)
1079 sp->mbcl_mc_state = MCS_DISABLED;
1080 else if (cp->mc_purge_cnt > 0)
1081 sp->mbcl_mc_state = MCS_PURGING;
1082 else if (bktsize == 0)
1083 sp->mbcl_mc_state = MCS_OFFLINE;
1084 else
1085 sp->mbcl_mc_state = MCS_ONLINE;
1086
1087 sp->mbcl_mc_cached = 0;
1088 for (m = 0; m < ncpu; m++) {
1089 ccp = &cp->mc_cpu[m];
1090 if (ccp->cc_objs > 0)
1091 sp->mbcl_mc_cached += ccp->cc_objs;
1092 if (ccp->cc_pobjs > 0)
1093 sp->mbcl_mc_cached += ccp->cc_pobjs;
1094 }
1095 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1096 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1097 sp->mbcl_infree;
1098
1099 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1100 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1101 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1102
1103 /* Calculate total count specific to each class */
1104 sp->mbcl_ctotal = sp->mbcl_total;
1105 switch (m_class(k)) {
1106 case MC_MBUF:
1107 /* Deduct mbufs used in composite caches */
1108 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1109 m_total(MC_MBUF_BIGCL));
1110 break;
1111
1112 case MC_CL:
1113 /* Deduct clusters used in composite cache */
1114 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1115 break;
1116
1117 case MC_BIGCL:
1118 /* Deduct clusters used in composite cache */
1119 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1120 break;
1121
1122 case MC_16KCL:
1123 /* Deduct clusters used in composite cache */
1124 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1125 break;
1126
1127 default:
1128 break;
1129 }
1130 }
1131 }
1132
1133 static int
1134 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1135 {
1136 #pragma unused(oidp, arg1, arg2)
1137 void *statp;
1138 int k, statsz, proc64 = proc_is64bit(req->p);
1139
1140 lck_mtx_lock(mbuf_mlock);
1141 mbuf_stat_sync();
1142
1143 if (!proc64) {
1144 struct omb_class_stat *oc;
1145 struct mb_class_stat *c;
1146
1147 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1148 oc = &omb_stat->mbs_class[0];
1149 c = &mb_stat->mbs_class[0];
1150 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1151 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1152 "%s", c->mbcl_cname);
1153 oc->mbcl_size = c->mbcl_size;
1154 oc->mbcl_total = c->mbcl_total;
1155 oc->mbcl_active = c->mbcl_active;
1156 oc->mbcl_infree = c->mbcl_infree;
1157 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1158 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1159 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1160 oc->mbcl_notified = c->mbcl_notified;
1161 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1162 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1163 oc->mbcl_ctotal = c->mbcl_ctotal;
1164 oc->mbcl_release_cnt = c->mbcl_release_cnt;
1165 oc->mbcl_mc_state = c->mbcl_mc_state;
1166 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1167 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1168 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1169 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1170 }
1171 statp = omb_stat;
1172 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1173 } else {
1174 statp = mb_stat;
1175 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1176 }
1177
1178 lck_mtx_unlock(mbuf_mlock);
1179
1180 return (SYSCTL_OUT(req, statp, statsz));
1181 }
1182
1183 static int
1184 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1185 {
1186 #pragma unused(oidp, arg1, arg2)
1187 int i;
1188
1189 /* Ensure leak tracing turned on */
1190 if (!mclfindleak || !mclexpleak)
1191 return (ENXIO);
1192
1193 lck_mtx_lock(mleak_lock);
1194 mleak_update_stats();
1195 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1196 lck_mtx_unlock(mleak_lock);
1197
1198 return (i);
1199 }
1200
1201 static int
1202 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1203 {
1204 #pragma unused(oidp, arg1, arg2)
1205 int i = 0;
1206
1207 /* Ensure leak tracing turned on */
1208 if (!mclfindleak || !mclexpleak)
1209 return (ENXIO);
1210
1211 lck_mtx_lock(mleak_lock);
1212 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1213 lck_mtx_unlock(mleak_lock);
1214
1215 return (i);
1216 }
1217
1218 static inline void
1219 m_incref(struct mbuf *m)
1220 {
1221 UInt16 old, new;
1222 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1223
1224 do {
1225 old = *addr;
1226 new = old + 1;
1227 ASSERT(new != 0);
1228 } while (!OSCompareAndSwap16(old, new, addr));
1229
1230 /*
1231 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1232 * we don't clear the flag when the refcount goes back to the
1233 * minimum, to simplify code calling m_mclhasreference().
1234 */
1235 if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY))
1236 (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1237 }
1238
1239 static inline u_int16_t
1240 m_decref(struct mbuf *m)
1241 {
1242 UInt16 old, new;
1243 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1244
1245 do {
1246 old = *addr;
1247 new = old - 1;
1248 ASSERT(old != 0);
1249 } while (!OSCompareAndSwap16(old, new, addr));
1250
1251 return (new);
1252 }
1253
1254 static void
1255 mbuf_table_init(void)
1256 {
1257 unsigned int b, c, s;
1258 int m, config_mbuf_jumbo = 0;
1259
1260 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1261 M_TEMP, M_WAITOK | M_ZERO);
1262 VERIFY(omb_stat != NULL);
1263
1264 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1265 M_TEMP, M_WAITOK | M_ZERO);
1266 VERIFY(mb_stat != NULL);
1267
1268 mb_stat->mbs_cnt = NELEM(mbuf_table);
1269 for (m = 0; m < NELEM(mbuf_table); m++)
1270 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1271
1272 #if CONFIG_MBUF_JUMBO
1273 config_mbuf_jumbo = 1;
1274 #endif /* CONFIG_MBUF_JUMBO */
1275
1276 if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1277 /*
1278 * Set aside 1/3 of the mbuf cluster map for jumbo
1279 * clusters; we do this only on platforms where jumbo
1280 * cluster pool is enabled.
1281 */
1282 njcl = nmbclusters / 3;
1283 njclbytes = M16KCLBYTES;
1284 }
1285
1286 /*
1287 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1288 * a multiple of 4KB clusters.
1289 */
1290 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1291 if (njcl > 0) {
1292 /*
1293 * Each jumbo cluster takes 8 2KB clusters, so make
1294 * sure that the pool size is evenly divisible by 8;
1295 * njcl is in 2KB unit, hence treated as such.
1296 */
1297 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1298
1299 /* Update nclusters with rounded down value of njcl */
1300 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1301 }
1302
1303 /*
1304 * njcl is valid only on platforms with 16KB jumbo clusters or
1305 * with 16KB pages, where it is configured to 1/3 of the pool
1306 * size. On these platforms, the remainder is used for 2KB
1307 * and 4KB clusters. On platforms without 16KB jumbo clusters,
1308 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
1309 * cluster can be split either into 16 mbufs or into 2 2KB
1310 * clusters.
1311 *
1312 * +---+---+------------ ... -----------+------- ... -------+
1313 * | c | b | s | njcl |
1314 * +---+---+------------ ... -----------+------- ... -------+
1315 *
1316 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1317 * clusters (1/64th each.)
1318 */
1319 c = P2ROUNDDOWN((nclusters >> 6), NCLPG); /* in 2KB unit */
1320 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1321 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
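/*
 * Worked example (illustrative, assuming 4KB pages, no jumbo pool and
 * nmbclusters = 32768, i.e. a 64MB map):
 *
 *	nclusters = 32768
 *	c = 32768 >> 6		  =   512	(2KB clusters, 1MB)
 *	b = 32768 >> 7		  =   256	(4KB clusters, 1MB)
 *	s = 32768 - (512 + 512)	  = 31744	(in 2KB units, ~62MB)
 */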
1322
1323 /*
1324 * 1/64th (c) is reserved for 2KB clusters.
1325 */
1326 m_minlimit(MC_CL) = c;
1327 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1328 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1329 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1330
1331 /*
1332 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1333 * It cannot be turned into 2KB clusters or mbufs.
1334 */
1335 m_minlimit(MC_BIGCL) = b;
1336 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1337 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1338 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1339
1340 /*
1341 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1342 */
1343 m_minlimit(MC_MBUF) = 0;
1344 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1345 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1346 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1347
1348 /*
1349 * Set limits for the composite classes.
1350 */
1351 m_minlimit(MC_MBUF_CL) = 0;
1352 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1353 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1354 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1355 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1356
1357 m_minlimit(MC_MBUF_BIGCL) = 0;
1358 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1359 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1360 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1361 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1362
1363 /*
1364 * And for jumbo classes.
1365 */
1366 m_minlimit(MC_16KCL) = 0;
1367 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1368 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1369 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1370
1371 m_minlimit(MC_MBUF_16KCL) = 0;
1372 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1373 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1374 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1375 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1376
1377 /*
1378 * Initialize the legacy mbstat structure.
1379 */
1380 bzero(&mbstat, sizeof (mbstat));
1381 mbstat.m_msize = m_maxsize(MC_MBUF);
1382 mbstat.m_mclbytes = m_maxsize(MC_CL);
1383 mbstat.m_minclsize = MINCLSIZE;
1384 mbstat.m_mlen = MLEN;
1385 mbstat.m_mhlen = MHLEN;
1386 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1387 }
1388
1389 #if defined(__LP64__)
1390 typedef struct ncl_tbl {
1391 uint64_t nt_maxmem; /* memory (sane) size */
1392 uint32_t nt_mbpool; /* mbuf pool size */
1393 } ncl_tbl_t;
1394
1395 /* Non-server */
1396 static ncl_tbl_t ncl_table[] = {
1397 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1398 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1399 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1400 { 0, 0 }
1401 };
1402
1403 /* Server */
1404 static ncl_tbl_t ncl_table_srv[] = {
1405 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1406 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1407 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1408 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1409 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1410 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1411 { 0, 0 }
1412 };
1413 #endif /* __LP64__ */
1414
1415 __private_extern__ unsigned int
1416 mbuf_default_ncl(int server, uint64_t mem)
1417 {
1418 #if !defined(__LP64__)
1419 #pragma unused(server)
1420 unsigned int n;
1421 /*
1422 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1423 */
1424 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1425 n = 32768;
1426 #else
1427 unsigned int n, i;
1428 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1429 /*
1430 * 64-bit kernel (mbuf pool size based on table).
1431 */
1432 n = tbl[0].nt_mbpool;
1433 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1434 if (mem < tbl[i].nt_maxmem)
1435 break;
1436 n = tbl[i].nt_mbpool;
1437 }
1438 n >>= MCLSHIFT;
1439 #endif /* !__LP64__ */
1440 return (n);
1441 }
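/*
 * Worked example for the table lookup above (illustrative): a non-server
 * 64-bit system with 8GB of memory ends up with the 96MB pool entry, and
 * 96MB >> MCLSHIFT (2KB clusters) yields n = 49152 clusters.
 */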
1442
1443 __private_extern__ void
1444 mbinit(void)
1445 {
1446 unsigned int m;
1447 unsigned int initmcl = 0;
1448 void *buf;
1449 thread_t thread = THREAD_NULL;
1450
1451 microuptime(&mb_start);
1452
1453 /*
1454 * These MBUF_ values must be equal to their private counterparts.
1455 */
1456 _CASSERT(MBUF_EXT == M_EXT);
1457 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1458 _CASSERT(MBUF_EOR == M_EOR);
1459 _CASSERT(MBUF_LOOP == M_LOOP);
1460 _CASSERT(MBUF_BCAST == M_BCAST);
1461 _CASSERT(MBUF_MCAST == M_MCAST);
1462 _CASSERT(MBUF_FRAG == M_FRAG);
1463 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1464 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1465 _CASSERT(MBUF_PROMISC == M_PROMISC);
1466 _CASSERT(MBUF_HASFCS == M_HASFCS);
1467
1468 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1469 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1470 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1471 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1472 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1473 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1474 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1475 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1476 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1477 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1478 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1479 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1480 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1481 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1482 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1483
1484 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1485 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1486 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1487 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1488 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1489 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1490 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1491 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1492 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1493 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1494 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1495 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1496 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1497 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1498
1499 _CASSERT(MBUF_WAITOK == M_WAIT);
1500 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1501 _CASSERT(MBUF_COPYALL == M_COPYALL);
1502
1503 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1504 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1505 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1506 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1507 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1508 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1509 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1510 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1511 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1512 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1513
1514 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1515 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1516 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1517 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1518
1519 /* Module specific scratch space (32-bit alignment requirement) */
1520 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1521 sizeof (uint32_t)));
1522
1523 /* Initialize random red zone cookie value */
1524 _CASSERT(sizeof (mb_redzone_cookie) ==
1525 sizeof (((struct pkthdr *)0)->redzone));
1526 read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1527 read_random(&mb_obscure_extref, sizeof (mb_obscure_extref));
1528 read_random(&mb_obscure_extfree, sizeof (mb_obscure_extfree));
1529 mb_obscure_extref |= 0x3;
1530 mb_obscure_extfree |= 0x3;
1531
1532 /* Make sure we don't save more than we should */
1533 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1534
1535 if (nmbclusters == 0)
1536 nmbclusters = NMBCLUSTERS;
1537
1538 /* This should be a sane (at least even) value by now */
1539 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1540
1541 /* Setup the mbuf table */
1542 mbuf_table_init();
1543
1544 /* Global lock for common layer */
1545 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1546 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1547 mbuf_mlock_attr = lck_attr_alloc_init();
1548 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1549
1550 /*
1551 * Allocate cluster slabs table:
1552 *
1553 * maxslabgrp = (N * 2048) / (1024 * 1024)
1554 *
1555 * Where N is nmbclusters rounded up to the nearest 512. This yields
1556 * mcl_slabg_t units, each one representing 1 MB of memory.
1557 */
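/*
 * Worked example (assuming 2KB clusters, MCLBYTES == 2048): with
 * nmbclusters = 32768, N = P2ROUNDUP(32768, 512) = 32768, so
 * maxslabgrp = (32768 * 2048) / (1024 * 1024) = 64 slab groups,
 * i.e. one mcl_slabg_t per MB of a 64MB cluster map.
 */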
1558 maxslabgrp =
1559 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1560 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1561 M_TEMP, M_WAITOK | M_ZERO);
1562 VERIFY(slabstbl != NULL);
1563
1564 /*
1565 * Allocate audit structures, if needed:
1566 *
1567 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1568 *
1569 * This yields mcl_audit_t units, each one representing a page.
1570 */
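/*
 * Worked example: continuing the 64-slab-group case above with 4KB
 * pages (PAGE_SHIFT == 12), maxclaudit = (64 << 20) >> 12 = 16384
 * audit entries, one per page of the cluster map; they are only
 * allocated when MCF_DEBUG is set below.
 */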
1571 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1572 mbuf_debug |= mcache_getflags();
1573 if (mbuf_debug & MCF_DEBUG) {
1574 int l;
1575 mcl_audit_t *mclad;
1576 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1577 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1578 M_TEMP, M_WAITOK | M_ZERO);
1579 VERIFY(mclaudit != NULL);
1580 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1581 MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1582 NMBPG * sizeof(mcache_audit_t *),
1583 M_TEMP, M_WAITOK | M_ZERO);
1584 VERIFY(mclad[l].cl_audit != NULL);
1585 }
1586
1587 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1588 AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1589 VERIFY(mcl_audit_con_cache != NULL);
1590 }
1591 mclverify = (mbuf_debug & MCF_VERIFY);
1592 mcltrace = (mbuf_debug & MCF_TRACE);
1593 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1594 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1595
1596 /* Enable mbuf leak logging, with a lock to protect the tables */
1597
1598 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1599 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1600 mleak_lock_attr = lck_attr_alloc_init();
1601 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1602
1603 mleak_activate();
1604
1605 /*
1606 * Allocate structure for per-CPU statistics that's aligned
1607 * on the CPU cache boundary; this code assumes that we never
1608 * uninitialize this framework, since the original address
1609 * before alignment is not saved.
1610 */
1611 ncpu = ml_get_max_cpus();
1612 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1613 M_TEMP, M_WAITOK);
1614 VERIFY(buf != NULL);
1615
1616 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1617 CPU_CACHE_LINE_SIZE);
1618 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1619
1620 /* Calculate the number of pages assigned to the cluster pool */
1621 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1622 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1623 M_TEMP, M_WAITOK);
1624 VERIFY(mcl_paddr != NULL);
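/*
 * Worked example: with nmbclusters = 32768 and 4KB pages, mcl_pages =
 * (32768 << 11) / 4096 = 16384, i.e. one ppnum_t entry per page of
 * the 64MB cluster region.
 */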
1625
1626 /* Register with the I/O Bus mapper */
1627 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1628 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1629
1630 embutl = (mbutl + (nmbclusters * MCLBYTES));
1631 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1632
1633 /* Prime up the freelist */
1634 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1635 if (initmcl != 0) {
1636 initmcl >>= NCLPBGSHIFT; /* convert to 4KB cluster units */
1637 if (initmcl > m_maxlimit(MC_BIGCL))
1638 initmcl = m_maxlimit(MC_BIGCL);
1639 }
1640 if (initmcl < m_minlimit(MC_BIGCL))
1641 initmcl = m_minlimit(MC_BIGCL);
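/*
 * Example (assuming 2KB clusters and 4KB big clusters, so
 * NCLPBGSHIFT == 1): a boot-arg of initmcl=4096 requests 4096 2KB
 * clusters, which becomes 4096 >> 1 = 2048 4KB units and is then
 * clamped to the [m_minlimit(MC_BIGCL), m_maxlimit(MC_BIGCL)] range
 * before the freelist is primed below.
 */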
1642
1643 lck_mtx_lock(mbuf_mlock);
1644
1645 /*
1646 * For classes with non-zero minimum limits, populate their freelists
1647 * so that m_total(class) is at least m_minlimit(class).
1648 */
1649 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1650 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1651 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1652 freelist_init(m_class(MC_CL));
1653
1654 for (m = 0; m < NELEM(mbuf_table); m++) {
1655 /* Make sure we didn't miss any */
1656 VERIFY(m_minlimit(m_class(m)) == 0 ||
1657 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1658
1659 /* populate the initial sizes and report from there on */
1660 m_peak(m_class(m)) = m_total(m_class(m));
1661 }
1662 mb_peak_newreport = FALSE;
1663
1664 lck_mtx_unlock(mbuf_mlock);
1665
1666 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1667 NULL, &thread);
1668 thread_deallocate(thread);
1669
1670 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1671 0, 0, MCR_SLEEP);
1672
1673 /* Create the cache for each class */
1674 for (m = 0; m < NELEM(mbuf_table); m++) {
1675 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1676 u_int32_t flags;
1677
1678 flags = mbuf_debug;
1679 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1680 m_class(m) == MC_MBUF_16KCL) {
1681 allocfunc = mbuf_cslab_alloc;
1682 freefunc = mbuf_cslab_free;
1683 auditfunc = mbuf_cslab_audit;
1684 logfunc = mleak_logger;
1685 } else {
1686 allocfunc = mbuf_slab_alloc;
1687 freefunc = mbuf_slab_free;
1688 auditfunc = mbuf_slab_audit;
1689 logfunc = mleak_logger;
1690 }
1691
1692 /*
1693 * Disable per-CPU caches for jumbo classes if there
1694 * is no jumbo cluster pool available in the system.
1695 * The cache itself is still created (but will never
1696 * be populated) since it simplifies the code.
1697 */
1698 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1699 njcl == 0)
1700 flags |= MCF_NOCPUCACHE;
1701
1702 if (!mclfindleak)
1703 flags |= MCF_NOLEAKLOG;
1704
1705 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1706 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1707 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1708 }
1709
1710 /*
1711 * Set the upper limit on sb_max to 1/16th of the size of the
1712 * memory allocated for mbuf clusters.
1713 */
1714 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1715 if (high_sb_max < sb_max) {
1716 /* sb_max is too large for this configuration, scale it down */
1717 if (high_sb_max > (1 << MBSHIFT)) {
1718 /* We have at least 16 MB of mbuf pool */
1719 sb_max = high_sb_max;
1720 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1721 /*
1722 * If we have more than 1 MB of mbuf pool, cap the size of
1723 * the maximum socket buffer at 1 MB
1724 */
1725 sb_max = high_sb_max = (1 << MBSHIFT);
1726 } else {
1727 sb_max = high_sb_max;
1728 }
1729 }
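/*
 * Worked example: with a 64MB cluster pool (nmbclusters = 32768),
 * high_sb_max = 64MB >> 4 = 4MB.  If the configured sb_max is larger
 * than that, the first branch above applies (a pool over 16MB implies
 * high_sb_max > 1MB) and sb_max is scaled down to 4MB.
 */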
1730
1731 /* allocate space for mbuf_dump_buf */
1732 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1733 VERIFY(mbuf_dump_buf != NULL);
1734
1735 if (mbuf_debug & MCF_DEBUG) {
1736 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1737 (int)_MLEN, (int)_MHLEN);
1738 }
1739
1740 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1741 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1742 (nclusters << MCLSHIFT) >> MBSHIFT,
1743 (njcl << MCLSHIFT) >> MBSHIFT);
1744
1745 /* initialize the lock for the tx completion callback table */
1746 mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
1747 if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
1748 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1749 /* NOTREACHED */
1750 }
1751 mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
1752 mbuf_tx_compl_tbl_lck_grp_attr);
1753 if (mbuf_tx_compl_tbl_lck_grp == NULL) {
1754 panic("%s: lck_grp_alloc_init failed", __func__);
1755 /* NOTREACHED */
1756 }
1757 mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
1758 if (mbuf_tx_compl_tbl_lck_attr == NULL) {
1759 panic("%s: lck_attr_alloc_init failed", __func__);
1760 /* NOTREACHED */
1761 }
1762 lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
1763 mbuf_tx_compl_tbl_lck_attr);
1764
1765 }
1766
1767 /*
1768 * Obtain a slab of object(s) from the class's freelist.
1769 */
1770 static mcache_obj_t *
1771 slab_alloc(mbuf_class_t class, int wait)
1772 {
1773 mcl_slab_t *sp;
1774 mcache_obj_t *buf;
1775
1776 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1777
1778 /* This should always be NULL for us */
1779 VERIFY(m_cobjlist(class) == NULL);
1780
1781 /*
1782 * Treat composite objects as having a longer lifespan by using
1783 * a slab from the reverse direction, in the hope that this could
1784 * reduce the probability of fragmentation for slabs that hold
1785 * more than one buffer chunk (e.g. mbuf slabs). For other
1786 * slabs, this probably doesn't make much of a difference.
1787 */
1788 if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1789 && (wait & MCR_COMP))
1790 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1791 else
1792 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1793
1794 if (sp == NULL) {
1795 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1796 /* The slab list for this class is empty */
1797 return (NULL);
1798 }
1799
1800 VERIFY(m_infree(class) > 0);
1801 VERIFY(!slab_is_detached(sp));
1802 VERIFY(sp->sl_class == class &&
1803 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1804 buf = sp->sl_head;
1805 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1806 sp->sl_head = buf->obj_next;
1807 /* Increment slab reference */
1808 sp->sl_refcnt++;
1809
1810 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1811
1812 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1813 slab_nextptr_panic(sp, sp->sl_head);
1814 /* In case sl_head is in the map but not in the slab */
1815 VERIFY(slab_inrange(sp, sp->sl_head));
1816 /* NOTREACHED */
1817 }
1818
1819 if (mclaudit != NULL) {
1820 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1821 mca->mca_uflags = 0;
1822 /* Save contents on mbuf objects only */
1823 if (class == MC_MBUF)
1824 mca->mca_uflags |= MB_SCVALID;
1825 }
1826
1827 if (class == MC_CL) {
1828 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1829 /*
1830 * A 2K cluster slab can have at most NCLPG references.
1831 */
1832 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1833 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1834 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1835 } else if (class == MC_BIGCL) {
1836 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1837 m_infree(MC_MBUF_BIGCL);
1838 /*
1839 * A 4K cluster slab can have NBCLPG references.
1840 */
1841 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1842 sp->sl_len == PAGE_SIZE &&
1843 (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1844 } else if (class == MC_16KCL) {
1845 mcl_slab_t *nsp;
1846 int k;
1847
1848 --m_infree(MC_16KCL);
1849 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1850 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1851 /*
1852 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1853 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1854 * most 1 reference.
1855 */
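/*
 * Example: with 4KB pages, NSLABSP16KB == 16KB/4KB == 4, so the
 * loop below bumps the refcnt of slabs 2..4; each shares sl_base
 * with the first slab and carries SLF_PARTIAL.
 */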
1856 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1857 nsp = nsp->sl_next;
1858 /* Next slab must already be present */
1859 VERIFY(nsp != NULL);
1860 nsp->sl_refcnt++;
1861 VERIFY(!slab_is_detached(nsp));
1862 VERIFY(nsp->sl_class == MC_16KCL &&
1863 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1864 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1865 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1866 nsp->sl_head == NULL);
1867 }
1868 } else {
1869 VERIFY(class == MC_MBUF);
1870 --m_infree(MC_MBUF);
1871 /*
1872 * If auditing is turned on, this check is
1873 * deferred until later in mbuf_slab_audit().
1874 */
1875 if (mclaudit == NULL)
1876 _MCHECK((struct mbuf *)buf);
1877 /*
1878 * Since we have incremented the reference count above,
1879 * an mbuf slab (formerly a 4KB cluster slab that was cut
1880 * up into mbufs) must have a reference count between 1
1881 * and NMBPG at this point.
1882 */
1883 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1884 sp->sl_chunks == NMBPG &&
1885 sp->sl_len == PAGE_SIZE);
1886 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1887 }
1888
1889 /* If empty, remove this slab from the class's freelist */
1890 if (sp->sl_head == NULL) {
1891 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1892 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1893 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1894 slab_remove(sp, class);
1895 }
1896
1897 return (buf);
1898 }
1899
1900 /*
1901 * Place a slab of object(s) back into a class's slab list.
1902 */
1903 static void
1904 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1905 {
1906 mcl_slab_t *sp;
1907 boolean_t reinit_supercl = false;
1908 mbuf_class_t super_class;
1909
1910 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1911
1912 VERIFY(class != MC_16KCL || njcl > 0);
1913 VERIFY(buf->obj_next == NULL);
1914
1915 /*
1916 * Synchronize with m_clalloc, since it reads m_total while we
1917 * are modifying it here.
1918 */
1919 while (mb_clalloc_busy) {
1920 mb_clalloc_waiters++;
1921 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1922 (PZERO-1), "m_clalloc", NULL);
1923 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1924 }
1925
1926 /* We are busy now; tell everyone else to go away */
1927 mb_clalloc_busy = TRUE;
1928
1929 sp = slab_get(buf);
1930 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1931 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1932
1933 /* Decrement slab reference */
1934 sp->sl_refcnt--;
1935
1936 if (class == MC_CL) {
1937 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1938 /*
1939 * A slab that has been split for 2KB clusters can have
1940 * at most 1 outstanding reference at this point.
1941 */
1942 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1943 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1944 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1945 (slab_is_detached(sp) && sp->sl_head == NULL));
1946 } else if (class == MC_BIGCL) {
1947 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1948
1949 /* A 4KB cluster slab can have NBCLPG references at most */
1950 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1951 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1952 (slab_is_detached(sp) && sp->sl_head == NULL));
1953 } else if (class == MC_16KCL) {
1954 mcl_slab_t *nsp;
1955 int k;
1956 /*
1957 * A 16KB cluster takes NSLABSP16KB slabs, all of which must
1958 * now have 0 references.
1959 */
1960 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1961 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1962 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1963 VERIFY(slab_is_detached(sp));
1964 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1965 nsp = nsp->sl_next;
1966 /* Next slab must already be present */
1967 VERIFY(nsp != NULL);
1968 nsp->sl_refcnt--;
1969 VERIFY(slab_is_detached(nsp));
1970 VERIFY(nsp->sl_class == MC_16KCL &&
1971 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1972 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1973 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1974 nsp->sl_head == NULL);
1975 }
1976 } else {
1977 /*
1978 * A slab that has been split for mbufs has at most
1979 * NMBPG references. Since we have decremented
1980 * one reference above, it must now be between 0 and
1981 * NMBPG-1.
1982 */
1983 VERIFY(class == MC_MBUF);
1984 VERIFY(sp->sl_refcnt >= 0 &&
1985 sp->sl_refcnt <= (NMBPG - 1) &&
1986 sp->sl_chunks == NMBPG &&
1987 sp->sl_len == PAGE_SIZE);
1988 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
1989 (slab_is_detached(sp) && sp->sl_head == NULL));
1990 }
1991
1992 /*
1993 * When auditing is enabled, ensure that the buffer still
1994 * contains the free pattern. Otherwise it got corrupted
1995 * while at the CPU cache layer.
1996 */
1997 if (mclaudit != NULL) {
1998 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1999 if (mclverify) {
2000 mcache_audit_free_verify(mca, buf, 0,
2001 m_maxsize(class));
2002 }
2003 mca->mca_uflags &= ~MB_SCVALID;
2004 }
2005
2006 if (class == MC_CL) {
2007 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2008 buf->obj_next = sp->sl_head;
2009 } else if (class == MC_BIGCL) {
2010 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2011 m_infree(MC_MBUF_BIGCL);
2012 buf->obj_next = sp->sl_head;
2013 } else if (class == MC_16KCL) {
2014 ++m_infree(MC_16KCL);
2015 } else {
2016 ++m_infree(MC_MBUF);
2017 buf->obj_next = sp->sl_head;
2018 }
2019 sp->sl_head = buf;
2020
2021 /*
2022 * If a slab has been split into either 2KB clusters or mbufs,
2023 * turn it back into one which holds a single 4KB or 16KB
2024 * cluster, depending on the page size.
2025 */
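/*
 * Per-page object counts, assuming 4KB pages and 256-byte mbufs:
 * NMBPG == 16 mbufs, NCLPG == 2 2KB clusters, NBCLPG == 1 4KB
 * cluster.  A split slab is coalesced back into the super class
 * below only once its refcnt drops to 0 and the class can spare
 * the objects without dipping under m_minlimit().
 */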
2026 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2027 super_class = MC_BIGCL;
2028 } else {
2029 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2030 super_class = MC_16KCL;
2031 }
2032 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2033 m_total(class) >= (m_minlimit(class) + NMBPG) &&
2034 m_total(super_class) < m_maxlimit(super_class)) {
2035 int i = NMBPG;
2036
2037 m_total(MC_MBUF) -= NMBPG;
2038 mbstat.m_mbufs = m_total(MC_MBUF);
2039 m_infree(MC_MBUF) -= NMBPG;
2040 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2041
2042 while (i--) {
2043 struct mbuf *m = sp->sl_head;
2044 VERIFY(m != NULL);
2045 sp->sl_head = m->m_next;
2046 m->m_next = NULL;
2047 }
2048 reinit_supercl = true;
2049 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2050 m_total(class) >= (m_minlimit(class) + NCLPG) &&
2051 m_total(super_class) < m_maxlimit(super_class)) {
2052 int i = NCLPG;
2053
2054 m_total(MC_CL) -= NCLPG;
2055 mbstat.m_clusters = m_total(MC_CL);
2056 m_infree(MC_CL) -= NCLPG;
2057
2058 while (i--) {
2059 union mcluster *c = sp->sl_head;
2060 VERIFY(c != NULL);
2061 sp->sl_head = c->mcl_next;
2062 c->mcl_next = NULL;
2063 }
2064 reinit_supercl = true;
2065 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2066 sp->sl_refcnt == 0 &&
2067 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2068 m_total(super_class) < m_maxlimit(super_class)) {
2069 int i = NBCLPG;
2070
2071 VERIFY(super_class == MC_16KCL);
2072 m_total(MC_BIGCL) -= NBCLPG;
2073 mbstat.m_bigclusters = m_total(MC_BIGCL);
2074 m_infree(MC_BIGCL) -= NBCLPG;
2075
2076 while (i--) {
2077 union mbigcluster *bc = sp->sl_head;
2078 VERIFY(bc != NULL);
2079 sp->sl_head = bc->mbc_next;
2080 bc->mbc_next = NULL;
2081 }
2082 reinit_supercl = true;
2083 }
2084
2085 if (reinit_supercl) {
2086 VERIFY(sp->sl_head == NULL);
2087 VERIFY(m_total(class) >= m_minlimit(class));
2088 slab_remove(sp, class);
2089
2090 /* Reinitialize it as a cluster for the super class */
2091 m_total(super_class)++;
2092 m_infree(super_class)++;
2093 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2094 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2095
2096 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2097 sp->sl_base, PAGE_SIZE, 0, 1);
2098 if (mclverify)
2099 mcache_set_pattern(MCACHE_FREE_PATTERN,
2100 (caddr_t)sp->sl_base, sp->sl_len);
2101 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2102
2103 if (super_class == MC_BIGCL) {
2104 mbstat.m_bigclusters = m_total(MC_BIGCL);
2105 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2106 m_infree(MC_MBUF_BIGCL);
2107 }
2108
2109 VERIFY(slab_is_detached(sp));
2110 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2111
2112 /* And finally switch class */
2113 class = super_class;
2114 }
2115
2116 /* Reinsert the slab to the class's slab list */
2117 if (slab_is_detached(sp))
2118 slab_insert(sp, class);
2119
2120 /* We're done; let others enter */
2121 mb_clalloc_busy = FALSE;
2122 if (mb_clalloc_waiters > 0) {
2123 mb_clalloc_waiters = 0;
2124 wakeup(mb_clalloc_waitchan);
2125 }
2126 }
2127
2128 /*
2129 * Common allocator for rudimentary objects called by the CPU cache layer
2130 * during an allocation request whenever there is no available element in the
2131 * bucket layer. It returns one or more elements from the appropriate global
2132 * freelist. If the freelist is empty, it will attempt to populate it and
2133 * retry the allocation.
2134 */
2135 static unsigned int
2136 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2137 {
2138 mbuf_class_t class = (mbuf_class_t)arg;
2139 unsigned int need = num;
2140 mcache_obj_t **list = *plist;
2141
2142 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2143 ASSERT(need > 0);
2144
2145 lck_mtx_lock(mbuf_mlock);
2146
2147 for (;;) {
2148 if ((*list = slab_alloc(class, wait)) != NULL) {
2149 (*list)->obj_next = NULL;
2150 list = *plist = &(*list)->obj_next;
2151
2152 if (--need == 0) {
2153 /*
2154 * If the number of elements in the freelist has
2155 * dropped below the low watermark (1/32 of the total),
2156 * asynchronously populate the freelist now rather
2157 * than doing it later when we run out of elements.
2158 */
2159 if (!mbuf_cached_above(class, wait) &&
2160 m_infree(class) < (m_total(class) >> 5)) {
2161 (void) freelist_populate(class, 1,
2162 M_DONTWAIT);
2163 }
2164 break;
2165 }
2166 } else {
2167 VERIFY(m_infree(class) == 0 || class == MC_CL);
2168
2169 (void) freelist_populate(class, 1,
2170 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2171
2172 if (m_infree(class) > 0)
2173 continue;
2174
2175 /* Check if there's anything at the cache layer */
2176 if (mbuf_cached_above(class, wait))
2177 break;
2178
2179 /* watchdog checkpoint */
2180 mbuf_watchdog();
2181
2182 /* We have nothing and cannot block; give up */
2183 if (wait & MCR_NOSLEEP) {
2184 if (!(wait & MCR_TRYHARD)) {
2185 m_fail_cnt(class)++;
2186 mbstat.m_drops++;
2187 break;
2188 }
2189 }
2190
2191 /*
2192 * If the freelist is still empty and the caller is
2193 * willing to be blocked, sleep on the wait channel
2194 * until an element is available. Otherwise, if
2195 * MCR_TRYHARD is set, do our best to satisfy the
2196 * request without having to go to sleep.
2197 */
2198 if (mbuf_worker_ready &&
2199 mbuf_sleep(class, need, wait))
2200 break;
2201
2202 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2203 }
2204 }
2205
2206 m_alloc_cnt(class) += num - need;
2207 lck_mtx_unlock(mbuf_mlock);
2208
2209 return (num - need);
2210 }
2211
2212 /*
2213 * Common de-allocator for rudimentary objects called by the CPU cache
2214 * layer when one or more elements need to be returned to the appropriate
2215 * global freelist.
2216 */
2217 static void
2218 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2219 {
2220 mbuf_class_t class = (mbuf_class_t)arg;
2221 mcache_obj_t *nlist;
2222 unsigned int num = 0;
2223 int w;
2224
2225 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2226
2227 lck_mtx_lock(mbuf_mlock);
2228
2229 for (;;) {
2230 nlist = list->obj_next;
2231 list->obj_next = NULL;
2232 slab_free(class, list);
2233 ++num;
2234 if ((list = nlist) == NULL)
2235 break;
2236 }
2237 m_free_cnt(class) += num;
2238
2239 if ((w = mb_waiters) > 0)
2240 mb_waiters = 0;
2241
2242 lck_mtx_unlock(mbuf_mlock);
2243
2244 if (w != 0)
2245 wakeup(mb_waitchan);
2246 }
2247
2248 /*
2249 * Common auditor for rudimentary objects called by the CPU cache layer
2250 * during an allocation or free request. For the former, this is called
2251 * after the objects are obtained from either the bucket or slab layer
2252 * and before they are returned to the caller. For the latter, this is
2253 * called immediately during free and before placing the objects into
2254 * the bucket or slab layer.
2255 */
2256 static void
2257 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2258 {
2259 mbuf_class_t class = (mbuf_class_t)arg;
2260 mcache_audit_t *mca;
2261
2262 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2263
2264 while (list != NULL) {
2265 lck_mtx_lock(mbuf_mlock);
2266 mca = mcl_audit_buf2mca(class, list);
2267
2268 /* Do the sanity checks */
2269 if (class == MC_MBUF) {
2270 mcl_audit_mbuf(mca, list, FALSE, alloc);
2271 ASSERT(mca->mca_uflags & MB_SCVALID);
2272 } else {
2273 mcl_audit_cluster(mca, list, m_maxsize(class),
2274 alloc, TRUE);
2275 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2276 }
2277 /* Record this transaction */
2278 if (mcltrace)
2279 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2280
2281 if (alloc)
2282 mca->mca_uflags |= MB_INUSE;
2283 else
2284 mca->mca_uflags &= ~MB_INUSE;
2285 /* Unpair the object (unconditionally) */
2286 mca->mca_uptr = NULL;
2287 lck_mtx_unlock(mbuf_mlock);
2288
2289 list = list->obj_next;
2290 }
2291 }
2292
2293 /*
2294 * Common notify routine for all caches. It is called by mcache when
2295 * one or more objects get freed. We use this indication to trigger
2296 * the wakeup of any sleeping threads so that they can retry their
2297 * allocation requests.
2298 */
2299 static void
2300 mbuf_slab_notify(void *arg, u_int32_t reason)
2301 {
2302 mbuf_class_t class = (mbuf_class_t)arg;
2303 int w;
2304
2305 ASSERT(MBUF_CLASS_VALID(class));
2306
2307 if (reason != MCN_RETRYALLOC)
2308 return;
2309
2310 lck_mtx_lock(mbuf_mlock);
2311 if ((w = mb_waiters) > 0) {
2312 m_notified(class)++;
2313 mb_waiters = 0;
2314 }
2315 lck_mtx_unlock(mbuf_mlock);
2316
2317 if (w != 0)
2318 wakeup(mb_waitchan);
2319 }
2320
2321 /*
2322 * Obtain object(s) from the composite class's freelist.
2323 */
2324 static unsigned int
2325 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2326 {
2327 unsigned int need = num;
2328 mcl_slab_t *sp, *clsp, *nsp;
2329 struct mbuf *m;
2330 mcache_obj_t **list = *plist;
2331 void *cl;
2332
2333 VERIFY(need > 0);
2334 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2335 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2336
2337 /* Get what we can from the freelist */
2338 while ((*list = m_cobjlist(class)) != NULL) {
2339 MRANGE(*list);
2340
2341 m = (struct mbuf *)*list;
2342 sp = slab_get(m);
2343 cl = m->m_ext.ext_buf;
2344 clsp = slab_get(cl);
2345 VERIFY(m->m_flags == M_EXT && cl != NULL);
2346 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2347
2348 if (class == MC_MBUF_CL) {
2349 VERIFY(clsp->sl_refcnt >= 1 &&
2350 clsp->sl_refcnt <= NCLPG);
2351 } else {
2352 VERIFY(clsp->sl_refcnt >= 1 &&
2353 clsp->sl_refcnt <= NBCLPG);
2354 }
2355
2356 if (class == MC_MBUF_16KCL) {
2357 int k;
2358 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2359 nsp = nsp->sl_next;
2360 /* Next slab must already be present */
2361 VERIFY(nsp != NULL);
2362 VERIFY(nsp->sl_refcnt == 1);
2363 }
2364 }
2365
2366 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2367 !MBUF_IN_MAP(m_cobjlist(class))) {
2368 slab_nextptr_panic(sp, m_cobjlist(class));
2369 /* NOTREACHED */
2370 }
2371 (*list)->obj_next = NULL;
2372 list = *plist = &(*list)->obj_next;
2373
2374 if (--need == 0)
2375 break;
2376 }
2377 m_infree(class) -= (num - need);
2378
2379 return (num - need);
2380 }
2381
2382 /*
2383 * Place object(s) back into a composite class's freelist.
2384 */
2385 static unsigned int
2386 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2387 {
2388 mcache_obj_t *o, *tail;
2389 unsigned int num = 0;
2390 struct mbuf *m, *ms;
2391 mcache_audit_t *mca = NULL;
2392 mcache_obj_t *ref_list = NULL;
2393 mcl_slab_t *clsp, *nsp;
2394 void *cl;
2395 mbuf_class_t cl_class;
2396
2397 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2398 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2399 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2400
2401 if (class == MC_MBUF_CL) {
2402 cl_class = MC_CL;
2403 } else if (class == MC_MBUF_BIGCL) {
2404 cl_class = MC_BIGCL;
2405 } else {
2406 VERIFY(class == MC_MBUF_16KCL);
2407 cl_class = MC_16KCL;
2408 }
2409
2410 o = tail = list;
2411
2412 while ((m = ms = (struct mbuf *)o) != NULL) {
2413 mcache_obj_t *rfa, *nexto = o->obj_next;
2414
2415 /* Do the mbuf sanity checks */
2416 if (mclaudit != NULL) {
2417 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2418 if (mclverify) {
2419 mcache_audit_free_verify(mca, m, 0,
2420 m_maxsize(MC_MBUF));
2421 }
2422 ms = MCA_SAVED_MBUF_PTR(mca);
2423 }
2424
2425 /* Do the cluster sanity checks */
2426 cl = ms->m_ext.ext_buf;
2427 clsp = slab_get(cl);
2428 if (mclverify) {
2429 size_t size = m_maxsize(cl_class);
2430 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2431 (mcache_obj_t *)cl), cl, 0, size);
2432 }
2433 VERIFY(ms->m_type == MT_FREE);
2434 VERIFY(ms->m_flags == M_EXT);
2435 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2436 if (cl_class == MC_CL) {
2437 VERIFY(clsp->sl_refcnt >= 1 &&
2438 clsp->sl_refcnt <= NCLPG);
2439 } else {
2440 VERIFY(clsp->sl_refcnt >= 1 &&
2441 clsp->sl_refcnt <= NBCLPG);
2442 }
2443 if (cl_class == MC_16KCL) {
2444 int k;
2445 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2446 nsp = nsp->sl_next;
2447 /* Next slab must already be present */
2448 VERIFY(nsp != NULL);
2449 VERIFY(nsp->sl_refcnt == 1);
2450 }
2451 }
2452
2453 /*
2454 * If we're asked to purge, restore the actual mbuf using
2455 * contents of the shadow structure (if auditing is enabled)
2456 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2457 * about to free it and the attached cluster into their caches.
2458 */
2459 if (purged) {
2460 /* Restore constructed mbuf fields */
2461 if (mclaudit != NULL)
2462 mcl_audit_restore_mbuf(m, mca, TRUE);
2463
2464 MEXT_MINREF(m) = 0;
2465 MEXT_REF(m) = 0;
2466 MEXT_PREF(m) = 0;
2467 MEXT_FLAGS(m) = 0;
2468 MEXT_PRIV(m) = 0;
2469 MEXT_PMBUF(m) = NULL;
2470 MEXT_TOKEN(m) = 0;
2471
2472 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
2473 m_set_ext(m, NULL, NULL, NULL);
2474 rfa->obj_next = ref_list;
2475 ref_list = rfa;
2476
2477 m->m_type = MT_FREE;
2478 m->m_flags = m->m_len = 0;
2479 m->m_next = m->m_nextpkt = NULL;
2480
2481 /* Save mbuf fields and make auditing happy */
2482 if (mclaudit != NULL)
2483 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2484
2485 VERIFY(m_total(class) > 0);
2486 m_total(class)--;
2487
2488 /* Free the mbuf */
2489 o->obj_next = NULL;
2490 slab_free(MC_MBUF, o);
2491
2492 /* And free the cluster */
2493 ((mcache_obj_t *)cl)->obj_next = NULL;
2494 if (class == MC_MBUF_CL)
2495 slab_free(MC_CL, cl);
2496 else if (class == MC_MBUF_BIGCL)
2497 slab_free(MC_BIGCL, cl);
2498 else
2499 slab_free(MC_16KCL, cl);
2500 }
2501
2502 ++num;
2503 tail = o;
2504 o = nexto;
2505 }
2506
2507 if (!purged) {
2508 tail->obj_next = m_cobjlist(class);
2509 m_cobjlist(class) = list;
2510 m_infree(class) += num;
2511 } else if (ref_list != NULL) {
2512 mcache_free_ext(ref_cache, ref_list);
2513 }
2514
2515 return (num);
2516 }
2517
2518 /*
2519 * Common allocator for composite objects called by the CPU cache layer
2520 * during an allocation request whenever there is no available element in
2521 * the bucket layer. It returns one or more composite elements from the
2522 * appropriate global freelist. If the freelist is empty, it will attempt
2523 * to obtain the rudimentary objects from their caches and construct them
2524 * into composite mbuf + cluster objects.
2525 */
2526 static unsigned int
2527 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2528 int wait)
2529 {
2530 mbuf_class_t class = (mbuf_class_t)arg;
2531 mbuf_class_t cl_class = 0;
2532 unsigned int num = 0, cnum = 0, want = needed;
2533 mcache_obj_t *ref_list = NULL;
2534 mcache_obj_t *mp_list = NULL;
2535 mcache_obj_t *clp_list = NULL;
2536 mcache_obj_t **list;
2537 struct ext_ref *rfa;
2538 struct mbuf *m;
2539 void *cl;
2540
2541 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2542 ASSERT(needed > 0);
2543
2544 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2545
2546 /* There should not be any slab for this class */
2547 VERIFY(m_slab_cnt(class) == 0 &&
2548 m_slablist(class).tqh_first == NULL &&
2549 m_slablist(class).tqh_last == NULL);
2550
2551 lck_mtx_lock(mbuf_mlock);
2552
2553 /* Try using the freelist first */
2554 num = cslab_alloc(class, plist, needed);
2555 list = *plist;
2556 if (num == needed) {
2557 m_alloc_cnt(class) += num;
2558 lck_mtx_unlock(mbuf_mlock);
2559 return (needed);
2560 }
2561
2562 lck_mtx_unlock(mbuf_mlock);
2563
2564 /*
2565 * We could not satisfy the request using the freelist alone;
2566 * allocate from the appropriate rudimentary caches and use
2567 * whatever we can get to construct the composite objects.
2568 */
2569 needed -= num;
2570
2571 /*
2572 * Mark these allocation requests as coming from a composite cache.
2573 * Also, if the caller is willing to be blocked, mark the request
2574 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2575 * slab layer waiting for the individual object when one or more
2576 * of the already-constructed composite objects are available.
2577 */
2578 wait |= MCR_COMP;
2579 if (!(wait & MCR_NOSLEEP))
2580 wait |= MCR_FAILOK;
2581
2582 /* allocate mbufs */
2583 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2584 if (needed == 0) {
2585 ASSERT(mp_list == NULL);
2586 goto fail;
2587 }
2588
2589 /* allocate clusters */
2590 if (class == MC_MBUF_CL) {
2591 cl_class = MC_CL;
2592 } else if (class == MC_MBUF_BIGCL) {
2593 cl_class = MC_BIGCL;
2594 } else {
2595 VERIFY(class == MC_MBUF_16KCL);
2596 cl_class = MC_16KCL;
2597 }
2598 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2599 if (needed == 0) {
2600 ASSERT(clp_list == NULL);
2601 goto fail;
2602 }
2603
2604 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2605 if (needed == 0) {
2606 ASSERT(ref_list == NULL);
2607 goto fail;
2608 }
2609
2610 /*
2611 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2612 * leftovers will get freed accordingly before we return to the caller.
2613 */
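/*
 * Hypothetical example: if 8 composites were still needed and the
 * three mcache_alloc_ext() calls above returned 8 mbufs, then 5
 * clusters, then 5 ext_ref structures, "needed" ends up as 5 and
 * the 3 spare mbufs are released in the fail: path below.
 */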
2614 for (cnum = 0; cnum < needed; cnum++) {
2615 struct mbuf *ms;
2616
2617 m = ms = (struct mbuf *)mp_list;
2618 mp_list = mp_list->obj_next;
2619
2620 cl = clp_list;
2621 clp_list = clp_list->obj_next;
2622 ((mcache_obj_t *)cl)->obj_next = NULL;
2623
2624 rfa = (struct ext_ref *)ref_list;
2625 ref_list = ref_list->obj_next;
2626 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2627
2628 /*
2629 * If auditing is enabled, construct the shadow mbuf
2630 * in the audit structure instead of in the actual one.
2631 * mbuf_cslab_audit() will take care of restoring the
2632 * contents after the integrity check.
2633 */
2634 if (mclaudit != NULL) {
2635 mcache_audit_t *mca, *cl_mca;
2636
2637 lck_mtx_lock(mbuf_mlock);
2638 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2639 ms = MCA_SAVED_MBUF_PTR(mca);
2640 cl_mca = mcl_audit_buf2mca(cl_class,
2641 (mcache_obj_t *)cl);
2642
2643 /*
2644 * Pair them up. Note that this is done at the time
2645 * the mbuf+cluster objects are constructed. This
2646 * information should be treated as a "best effort"
2647 * debugging hint, since more than one mbuf can refer
2648 * to a cluster. In that case, the cluster might not
2649 * be freed along with the mbuf it was paired with.
2650 */
2651 mca->mca_uptr = cl_mca;
2652 cl_mca->mca_uptr = mca;
2653
2654 ASSERT(mca->mca_uflags & MB_SCVALID);
2655 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2656 lck_mtx_unlock(mbuf_mlock);
2657
2658 /* Technically, they are in the freelist */
2659 if (mclverify) {
2660 size_t size;
2661
2662 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2663 m_maxsize(MC_MBUF));
2664
2665 if (class == MC_MBUF_CL)
2666 size = m_maxsize(MC_CL);
2667 else if (class == MC_MBUF_BIGCL)
2668 size = m_maxsize(MC_BIGCL);
2669 else
2670 size = m_maxsize(MC_16KCL);
2671
2672 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2673 size);
2674 }
2675 }
2676
2677 MBUF_INIT(ms, 0, MT_FREE);
2678 if (class == MC_MBUF_16KCL) {
2679 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2680 } else if (class == MC_MBUF_BIGCL) {
2681 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2682 } else {
2683 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2684 }
2685 VERIFY(ms->m_flags == M_EXT);
2686 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2687
2688 *list = (mcache_obj_t *)m;
2689 (*list)->obj_next = NULL;
2690 list = *plist = &(*list)->obj_next;
2691 }
2692
2693 fail:
2694 /*
2695 * Free up what's left of the above.
2696 */
2697 if (mp_list != NULL)
2698 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2699 if (clp_list != NULL)
2700 mcache_free_ext(m_cache(cl_class), clp_list);
2701 if (ref_list != NULL)
2702 mcache_free_ext(ref_cache, ref_list);
2703
2704 lck_mtx_lock(mbuf_mlock);
2705 if (num > 0 || cnum > 0) {
2706 m_total(class) += cnum;
2707 VERIFY(m_total(class) <= m_maxlimit(class));
2708 m_alloc_cnt(class) += num + cnum;
2709 }
2710 if ((num + cnum) < want)
2711 m_fail_cnt(class) += (want - (num + cnum));
2712 lck_mtx_unlock(mbuf_mlock);
2713
2714 return (num + cnum);
2715 }
2716
2717 /*
2718 * Common de-allocator for composite objects called by the CPU cache
2719 * layer when one or more elements need to be returned to the appropriate
2720 * global freelist.
2721 */
2722 static void
2723 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2724 {
2725 mbuf_class_t class = (mbuf_class_t)arg;
2726 unsigned int num;
2727 int w;
2728
2729 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2730
2731 lck_mtx_lock(mbuf_mlock);
2732
2733 num = cslab_free(class, list, purged);
2734 m_free_cnt(class) += num;
2735
2736 if ((w = mb_waiters) > 0)
2737 mb_waiters = 0;
2738
2739 lck_mtx_unlock(mbuf_mlock);
2740
2741 if (w != 0)
2742 wakeup(mb_waitchan);
2743 }
2744
2745 /*
2746 * Common auditor for composite objects called by the CPU cache layer
2747 * during an allocation or free request. For the former, this is called
2748 * after the objects are obtained from either the bucket or slab layer
2749 * and before they are returned to the caller. For the latter, this is
2750 * called immediately during free and before placing the objects into
2751 * the bucket or slab layer.
2752 */
2753 static void
2754 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2755 {
2756 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2757 mcache_audit_t *mca;
2758 struct mbuf *m, *ms;
2759 mcl_slab_t *clsp, *nsp;
2760 size_t cl_size;
2761 void *cl;
2762
2763 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2764 if (class == MC_MBUF_CL)
2765 cl_class = MC_CL;
2766 else if (class == MC_MBUF_BIGCL)
2767 cl_class = MC_BIGCL;
2768 else
2769 cl_class = MC_16KCL;
2770 cl_size = m_maxsize(cl_class);
2771
2772 while ((m = ms = (struct mbuf *)list) != NULL) {
2773 lck_mtx_lock(mbuf_mlock);
2774 /* Do the mbuf sanity checks and record its transaction */
2775 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2776 mcl_audit_mbuf(mca, m, TRUE, alloc);
2777 if (mcltrace)
2778 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2779
2780 if (alloc)
2781 mca->mca_uflags |= MB_COMP_INUSE;
2782 else
2783 mca->mca_uflags &= ~MB_COMP_INUSE;
2784
2785 /*
2786 * Use the shadow mbuf in the audit structure if we are
2787 * freeing, since the contents of the actual mbuf have been
2788 * pattern-filled by the above call to mcl_audit_mbuf().
2789 */
2790 if (!alloc && mclverify)
2791 ms = MCA_SAVED_MBUF_PTR(mca);
2792
2793 /* Do the cluster sanity checks and record its transaction */
2794 cl = ms->m_ext.ext_buf;
2795 clsp = slab_get(cl);
2796 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2797 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2798 if (class == MC_MBUF_CL)
2799 VERIFY(clsp->sl_refcnt >= 1 &&
2800 clsp->sl_refcnt <= NCLPG);
2801 else
2802 VERIFY(clsp->sl_refcnt >= 1 &&
2803 clsp->sl_refcnt <= NBCLPG);
2804
2805 if (class == MC_MBUF_16KCL) {
2806 int k;
2807 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2808 nsp = nsp->sl_next;
2809 /* Next slab must already be present */
2810 VERIFY(nsp != NULL);
2811 VERIFY(nsp->sl_refcnt == 1);
2812 }
2813 }
2814
2815
2816 mca = mcl_audit_buf2mca(cl_class, cl);
2817 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2818 if (mcltrace)
2819 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2820
2821 if (alloc)
2822 mca->mca_uflags |= MB_COMP_INUSE;
2823 else
2824 mca->mca_uflags &= ~MB_COMP_INUSE;
2825 lck_mtx_unlock(mbuf_mlock);
2826
2827 list = list->obj_next;
2828 }
2829 }
2830
2831 static void
2832 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2833 uint64_t alloc_size, kern_return_t error)
2834 {
2835
2836 *cnt = *cnt + 1;
2837 *ts = net_uptime();
2838 if (size) {
2839 *size = alloc_size;
2840 }
2841 _CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]) ==
2842 sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[0]));
2843 switch (error) {
2844 case KERN_SUCCESS:
2845 break;
2846 case KERN_INVALID_ARGUMENT:
2847 mb_kmem_stats[0]++;
2848 break;
2849 case KERN_INVALID_ADDRESS:
2850 mb_kmem_stats[1]++;
2851 break;
2852 case KERN_RESOURCE_SHORTAGE:
2853 mb_kmem_stats[2]++;
2854 break;
2855 case KERN_NO_SPACE:
2856 mb_kmem_stats[3]++;
2857 break;
2858 case KERN_FAILURE:
2859 mb_kmem_stats[4]++;
2860 break;
2861 default:
2862 mb_kmem_stats[5]++;
2863 break;
2864 }
2865 }
2866
2867 /*
2868 * Allocate some number of mbuf clusters and place on cluster freelist.
2869 */
2870 static int
2871 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2872 {
2873 int i, count = 0;
2874 vm_size_t size = 0;
2875 int numpages = 0, large_buffer;
2876 vm_offset_t page = 0;
2877 mcache_audit_t *mca_list = NULL;
2878 mcache_obj_t *con_list = NULL;
2879 mcl_slab_t *sp;
2880 mbuf_class_t class;
2881 kern_return_t error;
2882
2883 /* Set if a single buffer allocation requires multiple pages */
2884 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2885 PAGE_SIZE < M16KCLBYTES);
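/*
 * e.g. with 4KB pages and 16KB jumbo clusters, a 16KB request
 * spans 4 pages, so large_buffer makes the first kmem_mb_alloc()
 * attempt below ask for physically contiguous memory.
 */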
2886 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2887 bufsize == m_maxsize(MC_16KCL));
2888
2889 VERIFY((bufsize == PAGE_SIZE) ||
2890 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2891
2892 if (bufsize == m_size(MC_BIGCL))
2893 class = MC_BIGCL;
2894 else
2895 class = MC_16KCL;
2896
2897 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2898
2899 /*
2900 * Multiple threads may attempt to populate the cluster map one
2901 * after another. Since we drop the lock below prior to acquiring
2902 * the physical page(s), our view of the cluster map may no longer
2903 * be accurate, and we could end up over-committing the pages beyond
2904 * the maximum allowed for each class. To prevent it, this entire
2905 * operation (including the page mapping) is serialized.
2906 */
2907 while (mb_clalloc_busy) {
2908 mb_clalloc_waiters++;
2909 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2910 (PZERO-1), "m_clalloc", NULL);
2911 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2912 }
2913
2914 /* We are busy now; tell everyone else to go away */
2915 mb_clalloc_busy = TRUE;
2916
2917 /*
2918 * Honor the caller's wish to block or not block. We have a way
2919 * to grow the pool asynchronously using the mbuf worker thread.
2920 */
2921 i = m_howmany(num, bufsize);
2922 if (i <= 0 || (wait & M_DONTWAIT))
2923 goto out;
2924
2925 lck_mtx_unlock(mbuf_mlock);
2926
2927 size = round_page(i * bufsize);
2928 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2929
2930 /*
2931 * If we did ask for "n" 16KB physically contiguous chunks
2932 * and didn't get them, try again without the contiguity
2933 * restriction.
2934 */
2935 net_update_uptime();
2936 if (large_buffer && page == 0) {
2937 m_vm_error_stats(&mb_kmem_contig_failed,
2938 &mb_kmem_contig_failed_ts,
2939 &mb_kmem_contig_failed_size,
2940 size, error);
2941 page = kmem_mb_alloc(mb_map, size, 0, &error);
2942 }
2943
2944 if (page == 0) {
2945 m_vm_error_stats(&mb_kmem_failed,
2946 &mb_kmem_failed_ts,
2947 &mb_kmem_failed_size,
2948 size, error);
2949 #if PAGE_SIZE == 4096
2950 if (bufsize == m_maxsize(MC_BIGCL)) {
2951 #else
2952 if (bufsize >= m_maxsize(MC_BIGCL)) {
2953 #endif
2954 /* Try for 1 page if failed */
2955 size = PAGE_SIZE;
2956 page = kmem_mb_alloc(mb_map, size, 0, &error);
2957 }
2958
2959 if (page == 0) {
2960 m_vm_error_stats(&mb_kmem_one_failed,
2961 &mb_kmem_one_failed_ts,
2962 NULL, size, error);
2963 lck_mtx_lock(mbuf_mlock);
2964 goto out;
2965 }
2966 }
2967
2968 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2969 numpages = size / PAGE_SIZE;
2970
2971 /* If auditing is enabled, allocate the audit structures now */
2972 if (mclaudit != NULL) {
2973 int needed;
2974
2975 /*
2976 * Yes, I realize this is a waste of memory for clusters
2977 * that never get transformed into mbufs, as we may end
2978 * up with NMBPG-1 unused audit structures per cluster.
2979 * But doing so tremendously simplifies the allocation
2980 * strategy, since at this point we are not holding the
2981 * mbuf lock and the caller is okay to be blocked.
2982 */
2983 if (bufsize == PAGE_SIZE) {
2984 needed = numpages * NMBPG;
2985
2986 i = mcache_alloc_ext(mcl_audit_con_cache,
2987 &con_list, needed, MCR_SLEEP);
2988
2989 VERIFY(con_list != NULL && i == needed);
2990 } else {
2991 /*
2992 * If multiple 4K pages are being used for a
2993 * 16K cluster, only one audit structure is needed per cluster.
2994 */
2995 needed = numpages / NSLABSP16KB;
2996 }
2997
2998 i = mcache_alloc_ext(mcache_audit_cache,
2999 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3000
3001 VERIFY(mca_list != NULL && i == needed);
3002 }
3003
3004 lck_mtx_lock(mbuf_mlock);
3005
3006 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3007 ppnum_t offset =
3008 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3009 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3010
3011 /*
3012 * If there is a mapper, the appropriate I/O page is
3013 * returned; zero out the page to discard its past
3014 * contents to prevent exposing leftover kernel memory.
3015 */
3016 VERIFY(offset < mcl_pages);
3017 if (mcl_paddr_base != 0) {
3018 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3019 new_page = IOMapperInsertPage(mcl_paddr_base,
3020 offset, new_page);
3021 }
3022 mcl_paddr[offset] = new_page;
3023
3024 /* Pattern-fill this fresh page */
3025 if (mclverify) {
3026 mcache_set_pattern(MCACHE_FREE_PATTERN,
3027 (caddr_t)page, PAGE_SIZE);
3028 }
3029 if (bufsize == PAGE_SIZE) {
3030 mcache_obj_t *buf;
3031 /* One for the entire page */
3032 sp = slab_get((void *)page);
3033 if (mclaudit != NULL) {
3034 mcl_audit_init((void *)page,
3035 &mca_list, &con_list,
3036 AUDIT_CONTENTS_SIZE, NMBPG);
3037 }
3038 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3039 slab_init(sp, class, SLF_MAPPED, (void *)page,
3040 (void *)page, PAGE_SIZE, 0, 1);
3041 buf = (mcache_obj_t *)page;
3042 buf->obj_next = NULL;
3043
3044 /* Insert this slab */
3045 slab_insert(sp, class);
3046
3047 /* Update stats now since slab_get drops the lock */
3048 ++m_infree(class);
3049 ++m_total(class);
3050 VERIFY(m_total(class) <= m_maxlimit(class));
3051 if (class == MC_BIGCL) {
3052 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3053 m_infree(MC_MBUF_BIGCL);
3054 mbstat.m_bigclusters = m_total(MC_BIGCL);
3055 }
3056 ++count;
3057 } else if ((bufsize > PAGE_SIZE) &&
3058 (i % NSLABSP16KB) == 0) {
3059 union m16kcluster *m16kcl = (union m16kcluster *)page;
3060 mcl_slab_t *nsp;
3061 int k;
3062
3063 /* One for the entire 16KB */
3064 sp = slab_get(m16kcl);
3065 if (mclaudit != NULL)
3066 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3067
3068 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3069 slab_init(sp, MC_16KCL, SLF_MAPPED,
3070 m16kcl, m16kcl, bufsize, 0, 1);
3071 m16kcl->m16kcl_next = NULL;
3072
3073 /*
3074 * 2nd-Nth page's slab is part of the first one,
3075 * where N is NSLABSP16KB.
3076 */
3077 for (k = 1; k < NSLABSP16KB; k++) {
3078 nsp = slab_get(((union mbigcluster *)page) + k);
3079 VERIFY(nsp->sl_refcnt == 0 &&
3080 nsp->sl_flags == 0);
3081 slab_init(nsp, MC_16KCL,
3082 SLF_MAPPED | SLF_PARTIAL,
3083 m16kcl, NULL, 0, 0, 0);
3084 }
3085 /* Insert this slab */
3086 slab_insert(sp, MC_16KCL);
3087
3088 /* Update stats now since slab_get drops the lock */
3089 ++m_infree(MC_16KCL);
3090 ++m_total(MC_16KCL);
3091 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3092 ++count;
3093 }
3094 }
3095 VERIFY(mca_list == NULL && con_list == NULL);
3096
3097 if (!mb_peak_newreport && mbuf_report_usage(class))
3098 mb_peak_newreport = TRUE;
3099
3100 /* We're done; let others enter */
3101 mb_clalloc_busy = FALSE;
3102 if (mb_clalloc_waiters > 0) {
3103 mb_clalloc_waiters = 0;
3104 wakeup(mb_clalloc_waitchan);
3105 }
3106
3107 return (count);
3108 out:
3109 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3110
3111 /* We're done; let others enter */
3112 mb_clalloc_busy = FALSE;
3113 if (mb_clalloc_waiters > 0) {
3114 mb_clalloc_waiters = 0;
3115 wakeup(mb_clalloc_waitchan);
3116 }
3117
3118 /*
3119 * When non-blocking, we kick the worker thread if we have to grow the
3120 * pool or if the number of free clusters is less than requested.
3121 */
3122 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3123 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3124 mbuf_worker_needs_wakeup = FALSE;
3125 }
3126 if (class == MC_BIGCL) {
3127 if (i > 0) {
3128 /*
3129 * Remember total number of 4KB clusters needed
3130 * at this time.
3131 */
3132 i += m_total(MC_BIGCL);
3133 if (i > m_region_expand(MC_BIGCL)) {
3134 m_region_expand(MC_BIGCL) = i;
3135 }
3136 }
3137 if (m_infree(MC_BIGCL) >= num)
3138 return (1);
3139 } else {
3140 if (i > 0) {
3141 /*
3142 * Remember total number of 16KB clusters needed
3143 * at this time.
3144 */
3145 i += m_total(MC_16KCL);
3146 if (i > m_region_expand(MC_16KCL)) {
3147 m_region_expand(MC_16KCL) = i;
3148 }
3149 }
3150 if (m_infree(MC_16KCL) >= num)
3151 return (1);
3152 }
3153 return (0);
3154 }
3155
3156 /*
3157 * Populate the global freelist of the corresponding buffer class.
3158 */
3159 static int
3160 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
3161 {
3162 mcache_obj_t *o = NULL;
3163 int i, numpages = 0, count;
3164 mbuf_class_t super_class;
3165
3166 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
3167 class == MC_16KCL);
3168
3169 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3170
3171 VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
3172 PAGE_SIZE == m_maxsize(MC_16KCL));
3173
3174 if (m_maxsize(class) >= PAGE_SIZE)
3175 return(m_clalloc(num, wait, m_maxsize(class)) != 0);
3176
3177 /*
3178 * The rest of the function allocates pages and slices them
3179 * up into objects of the requested class size.
3180 */
3181
3182 numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
3183
3184 /* Currently assume that pages are 4K or 16K */
3185 if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3186 super_class = MC_BIGCL;
3187 else
3188 super_class = MC_16KCL;
3189
3190 i = m_clalloc(numpages, wait, m_maxsize(super_class));
3191
3192 /* how many objects will we cut the page into? */
3193 int numobj = PAGE_SIZE / m_maxsize(class);
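/*
 * Worked example, assuming 4KB pages and 256-byte mbufs: a request
 * for 64 mbufs gives numpages = (64 * 256 + 4095) / 4096 = 4, and
 * each page is cut into numobj = 4096 / 256 = 16 mbufs; for MC_CL
 * the same page would yield 4096 / 2048 = 2 clusters.
 */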
3194
3195 for (count = 0; count < numpages; count++) {
3196 /* respect totals, minlimit, maxlimit */
3197 if (m_total(super_class) <= m_minlimit(super_class) ||
3198 m_total(class) >= m_maxlimit(class))
3199 break;
3200
3201 if ((o = slab_alloc(super_class, wait)) == NULL)
3202 break;
3203
3204 struct mbuf *m = (struct mbuf *)o;
3205 union mcluster *c = (union mcluster *)o;
3206 union mbigcluster *mbc = (union mbigcluster *)o;
3207 mcl_slab_t *sp = slab_get(o);
3208 mcache_audit_t *mca = NULL;
3209
3210 /*
3211 * Since one full page will be converted to MC_MBUF or
3212 * MC_CL, verify that the reference count matches that
3213 * assumption.
3214 */
3215 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
3216 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
3217 /*
3218 * Make sure that the cluster was left unmolested
3219 * while it was on the freelist.
3220 */
3221 if (mclverify) {
3222 mca = mcl_audit_buf2mca(super_class,
3223 (mcache_obj_t *)o);
3224 mcache_audit_free_verify(mca,
3225 (mcache_obj_t *)o, 0, m_maxsize(super_class));
3226 }
3227
3228 /* Reinitialize it as an mbuf or 2K or 4K slab */
3229 slab_init(sp, class, sp->sl_flags,
3230 sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
3231
3232 VERIFY(sp->sl_head == NULL);
3233
3234 VERIFY(m_total(super_class) >= 1);
3235 m_total(super_class)--;
3236
3237 if (super_class == MC_BIGCL)
3238 mbstat.m_bigclusters = m_total(MC_BIGCL);
3239
3240 m_total(class) += numobj;
3241 VERIFY(m_total(class) <= m_maxlimit(class));
3242 m_infree(class) += numobj;
3243
3244 if (!mb_peak_newreport && mbuf_report_usage(class))
3245 mb_peak_newreport = TRUE;
3246
3247 i = numobj;
3248 if (class == MC_MBUF) {
3249 mbstat.m_mbufs = m_total(MC_MBUF);
3250 mtype_stat_add(MT_FREE, NMBPG);
3251 while (i--) {
3252 /*
3253 * If auditing is enabled, construct the
3254 * shadow mbuf in the audit structure
3255 * instead of the actual one.
3256 * mbuf_slab_audit() will take care of
3257 * restoring the contents after the
3258 * integrity check.
3259 */
3260 if (mclaudit != NULL) {
3261 struct mbuf *ms;
3262 mca = mcl_audit_buf2mca(MC_MBUF,
3263 (mcache_obj_t *)m);
3264 ms = MCA_SAVED_MBUF_PTR(mca);
3265 ms->m_type = MT_FREE;
3266 } else {
3267 m->m_type = MT_FREE;
3268 }
3269 m->m_next = sp->sl_head;
3270 sp->sl_head = (void *)m++;
3271 }
3272 } else if (class == MC_CL) { /* MC_CL */
3273 mbstat.m_clfree =
3274 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3275 mbstat.m_clusters = m_total(MC_CL);
3276 while (i--) {
3277 c->mcl_next = sp->sl_head;
3278 sp->sl_head = (void *)c++;
3279 }
3280 } else {
3281 VERIFY(class == MC_BIGCL);
3282 mbstat.m_bigclusters = m_total(MC_BIGCL);
3283 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3284 m_infree(MC_MBUF_BIGCL);
3285 while (i--) {
3286 mbc->mbc_next = sp->sl_head;
3287 sp->sl_head = (void *)mbc++;
3288 }
3289 }
3290
3291 /* Insert into the mbuf or 2k or 4k slab list */
3292 slab_insert(sp, class);
3293
3294 if ((i = mb_waiters) > 0)
3295 mb_waiters = 0;
3296 if (i != 0)
3297 wakeup(mb_waitchan);
3298 }
3299 return (count != 0);
3300 }
3301
3302 /*
3303 * For each class, initialize the freelist to hold m_minlimit() objects.
3304 */
3305 static void
3306 freelist_init(mbuf_class_t class)
3307 {
3308 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3309
3310 VERIFY(class == MC_CL || class == MC_BIGCL);
3311 VERIFY(m_total(class) == 0);
3312 VERIFY(m_minlimit(class) > 0);
3313
3314 while (m_total(class) < m_minlimit(class))
3315 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3316
3317 VERIFY(m_total(class) >= m_minlimit(class));
3318 }
3319
3320 /*
3321 * (Inaccurately) check if it might be worth a trip back to the
3322 * mcache layer due to the availability of objects there. We'll
3323 * end up back here if there's nothing up there.
3324 */
3325 static boolean_t
3326 mbuf_cached_above(mbuf_class_t class, int wait)
3327 {
3328 switch (class) {
3329 case MC_MBUF:
3330 if (wait & MCR_COMP)
3331 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3332 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3333 break;
3334
3335 case MC_CL:
3336 if (wait & MCR_COMP)
3337 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3338 break;
3339
3340 case MC_BIGCL:
3341 if (wait & MCR_COMP)
3342 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3343 break;
3344
3345 case MC_16KCL:
3346 if (wait & MCR_COMP)
3347 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3348 break;
3349
3350 case MC_MBUF_CL:
3351 case MC_MBUF_BIGCL:
3352 case MC_MBUF_16KCL:
3353 break;
3354
3355 default:
3356 VERIFY(0);
3357 /* NOTREACHED */
3358 }
3359
3360 return (!mcache_bkt_isempty(m_cache(class)));
3361 }
3362
3363 /*
3364 * If possible, convert constructed objects to raw ones.
3365 */
3366 static boolean_t
3367 mbuf_steal(mbuf_class_t class, unsigned int num)
3368 {
3369 mcache_obj_t *top = NULL;
3370 mcache_obj_t **list = &top;
3371 unsigned int tot = 0;
3372
3373 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3374
3375 switch (class) {
3376 case MC_MBUF:
3377 case MC_CL:
3378 case MC_BIGCL:
3379 case MC_16KCL:
3380 return (FALSE);
3381
3382 case MC_MBUF_CL:
3383 case MC_MBUF_BIGCL:
3384 case MC_MBUF_16KCL:
3385 /* Get the required number of constructed objects if possible */
3386 if (m_infree(class) > m_minlimit(class)) {
3387 tot = cslab_alloc(class, &list,
3388 MIN(num, m_infree(class)));
3389 }
3390
3391 /* And destroy them to get back the raw objects */
3392 if (top != NULL)
3393 (void) cslab_free(class, top, 1);
3394 break;
3395
3396 default:
3397 VERIFY(0);
3398 /* NOTREACHED */
3399 }
3400
3401 return (tot == num);
3402 }
3403
3404 static void
3405 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3406 {
3407 int m, bmap = 0;
3408
3409 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3410
3411 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3412 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3413 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3414
3415 /*
3416 * This logic can be made smarter; for now, simply mark
3417 * all other related classes as potential victims.
3418 */
3419 switch (class) {
3420 case MC_MBUF:
3421 m_wantpurge(MC_CL)++;
3422 m_wantpurge(MC_BIGCL)++;
3423 m_wantpurge(MC_MBUF_CL)++;
3424 m_wantpurge(MC_MBUF_BIGCL)++;
3425 break;
3426
3427 case MC_CL:
3428 m_wantpurge(MC_MBUF)++;
3429 m_wantpurge(MC_BIGCL)++;
3430 m_wantpurge(MC_MBUF_BIGCL)++;
3431 if (!comp)
3432 m_wantpurge(MC_MBUF_CL)++;
3433 break;
3434
3435 case MC_BIGCL:
3436 m_wantpurge(MC_MBUF)++;
3437 m_wantpurge(MC_CL)++;
3438 m_wantpurge(MC_MBUF_CL)++;
3439 if (!comp)
3440 m_wantpurge(MC_MBUF_BIGCL)++;
3441 break;
3442
3443 case MC_16KCL:
3444 if (!comp)
3445 m_wantpurge(MC_MBUF_16KCL)++;
3446 break;
3447
3448 default:
3449 VERIFY(0);
3450 /* NOTREACHED */
3451 }
3452
3453 /*
3454 * Run through each marked class and check if we really need to
3455 * purge (and therefore temporarily disable) the per-CPU caches
3456 * layer used by the class. If so, remember the classes since
3457 * we are going to drop the lock below prior to purging.
3458 */
3459 for (m = 0; m < NELEM(mbuf_table); m++) {
3460 if (m_wantpurge(m) > 0) {
3461 m_wantpurge(m) = 0;
3462 /*
3463 * Try hard to steal the required number of objects
3464 * from the freelist of other mbuf classes. Only
3465 * purge and disable the per-CPU caches layer when
3466 * we don't have enough; it's the last resort.
3467 */
3468 if (!mbuf_steal(m, num))
3469 bmap |= (1 << m);
3470 }
3471 }
3472
3473 lck_mtx_unlock(mbuf_mlock);
3474
3475 if (bmap != 0) {
3476 /* signal the domains to drain */
3477 net_drain_domains();
3478
3479 /* Sigh; we have no other choices but to ask mcache to purge */
3480 for (m = 0; m < NELEM(mbuf_table); m++) {
3481 if ((bmap & (1 << m)) &&
3482 mcache_purge_cache(m_cache(m), TRUE)) {
3483 lck_mtx_lock(mbuf_mlock);
3484 m_purge_cnt(m)++;
3485 mbstat.m_drain++;
3486 lck_mtx_unlock(mbuf_mlock);
3487 }
3488 }
3489 } else {
3490 /*
3491 * Request mcache to reap extra elements from all of its caches;
3492 * note that all reaps are serialized and happen only at a fixed
3493 * interval.
3494 */
3495 mcache_reap();
3496 }
3497 lck_mtx_lock(mbuf_mlock);
3498 }
3499
3500 static inline struct mbuf *
3501 m_get_common(int wait, short type, int hdr)
3502 {
3503 struct mbuf *m;
3504 int mcflags = MSLEEPF(wait);
3505
3506 /* Is this due to a non-blocking retry? If so, then try harder */
3507 if (mcflags & MCR_NOSLEEP)
3508 mcflags |= MCR_TRYHARD;
3509
3510 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3511 if (m != NULL) {
3512 MBUF_INIT(m, hdr, type);
3513 mtype_stat_inc(type);
3514 mtype_stat_dec(MT_FREE);
3515 #if CONFIG_MACF_NET
3516 if (hdr && mac_init_mbuf(m, wait) != 0) {
3517 m_free(m);
3518 return (NULL);
3519 }
3520 #endif /* MAC_NET */
3521 }
3522 return (m);
3523 }
3524
3525 /*
3526 * Space allocation routines; these are also available as macros
3527 * for critical paths.
3528 */
3529 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3530 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3531 #define _M_RETRY(wait, type) _M_GET(wait, type)
3532 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3533 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3534 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3535
3536 struct mbuf *
3537 m_get(int wait, int type)
3538 {
3539 return (_M_GET(wait, type));
3540 }
3541
3542 struct mbuf *
3543 m_gethdr(int wait, int type)
3544 {
3545 return (_M_GETHDR(wait, type));
3546 }
3547
3548 struct mbuf *
3549 m_retry(int wait, int type)
3550 {
3551 return (_M_RETRY(wait, type));
3552 }
3553
3554 struct mbuf *
3555 m_retryhdr(int wait, int type)
3556 {
3557 return (_M_RETRYHDR(wait, type));
3558 }
3559
3560 struct mbuf *
3561 m_getclr(int wait, int type)
3562 {
3563 struct mbuf *m;
3564
3565 _MGET(m, wait, type);
3566 if (m != NULL)
3567 bzero(MTOD(m, caddr_t), MLEN);
3568 return (m);
3569 }
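
/*
 * A minimal usage sketch for the allocators above: non-blocking
 * allocation of a packet-header mbuf followed by release of the chain.
 * The `example_' identifier is hypothetical and the block is guarded out.
 */
#if 0
static void
example_mbuf_roundtrip(void)
{
	struct mbuf *m;

	/* may return NULL when the mbuf caches are exhausted */
	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return;

	/* ... fill MTOD(m, caddr_t), set m->m_len and m_pkthdr.len ... */

	m_freem(m);		/* frees the mbuf and any chained data */
}
#endif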
3570
3571 static int
3572 m_free_paired(struct mbuf *m)
3573 {
3574 VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
3575
3576 membar_sync();
3577 if (MEXT_PMBUF(m) == m) {
3578 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
3579 int16_t oprefcnt, prefcnt;
3580
3581 /*
3582 * Paired ref count might be negative in case we lose
3583 * against another thread clearing MEXT_PMBUF, in the
3584 * event it occurs after the above memory barrier sync.
3585 * In that case just ignore as things have been unpaired.
3586 */
3587 do {
3588 oprefcnt = *addr;
3589 prefcnt = oprefcnt - 1;
3590 } while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
3591
3592 if (prefcnt > 1) {
3593 return (1);
3594 } else if (prefcnt == 1) {
3595 (*(m_get_ext_free(m)))(m->m_ext.ext_buf,
3596 m->m_ext.ext_size, m_get_ext_arg(m));
3597 return (1);
3598 } else if (prefcnt == 0) {
3599 VERIFY(MBUF_IS_PAIRED(m));
3600
3601 /*
3602 * Restore minref to its natural value, so that
3603 * the caller will be able to free the cluster
3604 * as appropriate.
3605 */
3606 MEXT_MINREF(m) = 0;
3607
3608 /*
3609 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
3610 * as it is immutable. atomic_set_ptr also causes
3611 * memory barrier sync.
3612 */
3613 atomic_set_ptr(&MEXT_PMBUF(m), NULL);
3614
3615 switch (m->m_ext.ext_size) {
3616 case MCLBYTES:
3617 m_set_ext(m, m_get_rfa(m), NULL, NULL);
3618 break;
3619
3620 case MBIGCLBYTES:
3621 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
3622 break;
3623
3624 case M16KCLBYTES:
3625 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
3626 break;
3627
3628 default:
3629 VERIFY(0);
3630 /* NOTREACHED */
3631 }
3632 }
3633 }
3634
3635 /*
3636 * Tell caller the unpair has occurred, and that the reference
3637 * count on the external cluster held for the paired mbuf should
3638 * now be dropped.
3639 */
3640 return (0);
3641 }
3642
3643 struct mbuf *
3644 m_free(struct mbuf *m)
3645 {
3646 struct mbuf *n = m->m_next;
3647
3648 if (m->m_type == MT_FREE)
3649 panic("m_free: freeing an already freed mbuf");
3650
3651 if (m->m_flags & M_PKTHDR) {
3652 /* Check for scratch area overflow */
3653 m_redzone_verify(m);
3654 /* Free the aux data and tags if there is any */
3655 m_tag_delete_chain(m, NULL);
3656
3657 m_do_tx_compl_callback(m, NULL);
3658 }
3659
3660 if (m->m_flags & M_EXT) {
3661 u_int16_t refcnt;
3662 u_int32_t composite;
3663 m_ext_free_func_t m_free_func;
3664
3665 if (MBUF_IS_PAIRED(m) && m_free_paired(m))
3666 return (n);
3667
3668 refcnt = m_decref(m);
3669 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3670 m_free_func = m_get_ext_free(m);
3671
3672 if (refcnt == MEXT_MINREF(m) && !composite) {
3673 if (m_free_func == NULL) {
3674 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3675 } else if (m_free_func == m_bigfree) {
3676 mcache_free(m_cache(MC_BIGCL),
3677 m->m_ext.ext_buf);
3678 } else if (m_free_func == m_16kfree) {
3679 mcache_free(m_cache(MC_16KCL),
3680 m->m_ext.ext_buf);
3681 } else {
3682 (*m_free_func)(m->m_ext.ext_buf,
3683 m->m_ext.ext_size, m_get_ext_arg(m));
3684 }
3685 mcache_free(ref_cache, m_get_rfa(m));
3686 m_set_ext(m, NULL, NULL, NULL);
3687 } else if (refcnt == MEXT_MINREF(m) && composite) {
3688 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3689 VERIFY(m->m_type != MT_FREE);
3690
3691 mtype_stat_dec(m->m_type);
3692 mtype_stat_inc(MT_FREE);
3693
3694 m->m_type = MT_FREE;
3695 m->m_flags = M_EXT;
3696 m->m_len = 0;
3697 m->m_next = m->m_nextpkt = NULL;
3698
3699 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3700
3701 /* "Free" into the intermediate cache */
3702 if (m_free_func == NULL) {
3703 mcache_free(m_cache(MC_MBUF_CL), m);
3704 } else if (m_free_func == m_bigfree) {
3705 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3706 } else {
3707 VERIFY(m_free_func == m_16kfree);
3708 mcache_free(m_cache(MC_MBUF_16KCL), m);
3709 }
3710 return (n);
3711 }
3712 }
3713
3714 if (m->m_type != MT_FREE) {
3715 mtype_stat_dec(m->m_type);
3716 mtype_stat_inc(MT_FREE);
3717 }
3718
3719 m->m_type = MT_FREE;
3720 m->m_flags = m->m_len = 0;
3721 m->m_next = m->m_nextpkt = NULL;
3722
3723 mcache_free(m_cache(MC_MBUF), m);
3724
3725 return (n);
3726 }
3727
3728 __private_extern__ struct mbuf *
3729 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3730 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3731 int wait, int pair)
3732 {
3733 struct ext_ref *rfa = NULL;
3734
3735 /*
3736 * If pairing is requested and an existing mbuf is provided, reject
3737 * it if it's already been paired to another cluster. Otherwise,
3738 * allocate a new one or free any existing below.
3739 */
3740 if ((m != NULL && MBUF_IS_PAIRED(m)) ||
3741 (m == NULL && (m = _M_GETHDR(wait, type)) == NULL))
3742 return (NULL);
3743
3744 if (m->m_flags & M_EXT) {
3745 u_int16_t refcnt;
3746 u_int32_t composite;
3747 m_ext_free_func_t m_free_func;
3748
3749 refcnt = m_decref(m);
3750 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3751 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
3752 m_free_func = m_get_ext_free(m);
3753 if (refcnt == MEXT_MINREF(m) && !composite) {
3754 if (m_free_func == NULL) {
3755 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3756 } else if (m_free_func == m_bigfree) {
3757 mcache_free(m_cache(MC_BIGCL),
3758 m->m_ext.ext_buf);
3759 } else if (m_free_func == m_16kfree) {
3760 mcache_free(m_cache(MC_16KCL),
3761 m->m_ext.ext_buf);
3762 } else {
3763 (*m_free_func)(m->m_ext.ext_buf,
3764 m->m_ext.ext_size, m_get_ext_arg(m));
3765 }
3766 /* Re-use the reference structure */
3767 rfa = m_get_rfa(m);
3768 } else if (refcnt == MEXT_MINREF(m) && composite) {
3769 VERIFY(m->m_type != MT_FREE);
3770
3771 mtype_stat_dec(m->m_type);
3772 mtype_stat_inc(MT_FREE);
3773
3774 m->m_type = MT_FREE;
3775 m->m_flags = M_EXT;
3776 m->m_len = 0;
3777 m->m_next = m->m_nextpkt = NULL;
3778
3779 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3780
3781 /* "Free" into the intermediate cache */
3782 if (m_free_func == NULL) {
3783 mcache_free(m_cache(MC_MBUF_CL), m);
3784 } else if (m_free_func == m_bigfree) {
3785 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3786 } else {
3787 VERIFY(m_free_func == m_16kfree);
3788 mcache_free(m_cache(MC_MBUF_16KCL), m);
3789 }
3790 /*
3791 * Allocate a new mbuf, since we didn't divorce
3792 * the composite mbuf + cluster pair above.
3793 */
3794 if ((m = _M_GETHDR(wait, type)) == NULL)
3795 return (NULL);
3796 }
3797 }
3798
3799 if (rfa == NULL &&
3800 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3801 m_free(m);
3802 return (NULL);
3803 }
3804
3805 if (!pair) {
3806 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
3807 0, 1, 0, 0, 0, NULL);
3808 } else {
3809 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
3810 1, 1, 1, EXTF_PAIRED, 0, m);
3811 }
3812
3813 return (m);
3814 }
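
/*
 * Sketch of attaching driver-owned external storage via m_clattach();
 * `example_extfree', `example_attach' and EXAMPLE_EXTSIZE are
 * hypothetical, and the block is guarded out.
 */
#if 0
#define EXAMPLE_EXTSIZE	2048

static void
example_extfree(caddr_t buf, __unused u_int size, __unused caddr_t arg)
{
	/* return `buf' to the driver's private pool here */
}

static struct mbuf *
example_attach(caddr_t buf, int wait)
{
	/* m == NULL: a fresh pkthdr mbuf is allocated; pair == 0 */
	return (m_clattach(NULL, MT_DATA, buf, example_extfree,
	    EXAMPLE_EXTSIZE, NULL, wait, 0));
}
#endif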
3815
3816 /*
3817 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3818 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3819 */
3820 struct mbuf *
3821 m_getcl(int wait, int type, int flags)
3822 {
3823 struct mbuf *m;
3824 int mcflags = MSLEEPF(wait);
3825 int hdr = (flags & M_PKTHDR);
3826
3827 /* Is this due to a non-blocking retry? If so, then try harder */
3828 if (mcflags & MCR_NOSLEEP)
3829 mcflags |= MCR_TRYHARD;
3830
3831 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3832 if (m != NULL) {
3833 u_int16_t flag;
3834 struct ext_ref *rfa;
3835 void *cl;
3836
3837 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3838 cl = m->m_ext.ext_buf;
3839 rfa = m_get_rfa(m);
3840
3841 ASSERT(cl != NULL && rfa != NULL);
3842 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3843
3844 flag = MEXT_FLAGS(m);
3845
3846 MBUF_INIT(m, hdr, type);
3847 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3848
3849 mtype_stat_inc(type);
3850 mtype_stat_dec(MT_FREE);
3851 #if CONFIG_MACF_NET
3852 if (hdr && mac_init_mbuf(m, wait) != 0) {
3853 m_freem(m);
3854 return (NULL);
3855 }
3856 #endif /* MAC_NET */
3857 }
3858 return (m);
3859 }
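
/*
 * Sketch of the common m_getcl() pattern: one call yields an mbuf with
 * a 2KB cluster already attached. `example_getcl' is hypothetical.
 */
#if 0
static struct mbuf *
example_getcl(void)
{
	struct mbuf *m;

	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);

	/* m_data points into the cluster; up to MCLBYTES of room */
	return (m);
}
#endif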
3860
3861 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3862 struct mbuf *
3863 m_mclget(struct mbuf *m, int wait)
3864 {
3865 struct ext_ref *rfa;
3866
3867 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3868 return (m);
3869
3870 m->m_ext.ext_buf = m_mclalloc(wait);
3871 if (m->m_ext.ext_buf != NULL) {
3872 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3873 } else {
3874 mcache_free(ref_cache, rfa);
3875 }
3876 return (m);
3877 }
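
/*
 * Sketch of the two-step alternative to m_getcl(): allocate the mbuf,
 * then attach a cluster and check M_EXT for success. The `example_'
 * name is hypothetical.
 */
#if 0
static struct mbuf *
example_add_cluster(int wait)
{
	struct mbuf *m;

	if ((m = m_gethdr(wait, MT_DATA)) == NULL)
		return (NULL);

	m = m_mclget(m, wait);
	if (!(m->m_flags & M_EXT)) {
		/* no cluster could be attached; the mbuf is still valid */
		m_free(m);
		return (NULL);
	}
	return (m);
}
#endif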
3878
3879 /* Allocate an mbuf cluster */
3880 caddr_t
3881 m_mclalloc(int wait)
3882 {
3883 int mcflags = MSLEEPF(wait);
3884
3885 /* Is this due to a non-blocking retry? If so, then try harder */
3886 if (mcflags & MCR_NOSLEEP)
3887 mcflags |= MCR_TRYHARD;
3888
3889 return (mcache_alloc(m_cache(MC_CL), mcflags));
3890 }
3891
3892 /* Free an mbuf cluster */
3893 void
3894 m_mclfree(caddr_t p)
3895 {
3896 mcache_free(m_cache(MC_CL), p);
3897 }
3898
3899 /*
3900 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
3901 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3902 */
3903 int
3904 m_mclhasreference(struct mbuf *m)
3905 {
3906 if (!(m->m_flags & M_EXT))
3907 return (0);
3908
3909 ASSERT(m_get_rfa(m) != NULL);
3910
3911 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3912 }
3913
3914 __private_extern__ caddr_t
3915 m_bigalloc(int wait)
3916 {
3917 int mcflags = MSLEEPF(wait);
3918
3919 /* Is this due to a non-blocking retry? If so, then try harder */
3920 if (mcflags & MCR_NOSLEEP)
3921 mcflags |= MCR_TRYHARD;
3922
3923 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3924 }
3925
3926 __private_extern__ void
3927 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3928 {
3929 mcache_free(m_cache(MC_BIGCL), p);
3930 }
3931
3932 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3933 __private_extern__ struct mbuf *
3934 m_mbigget(struct mbuf *m, int wait)
3935 {
3936 struct ext_ref *rfa;
3937
3938 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3939 return (m);
3940
3941 m->m_ext.ext_buf = m_bigalloc(wait);
3942 if (m->m_ext.ext_buf != NULL) {
3943 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3944 } else {
3945 mcache_free(ref_cache, rfa);
3946 }
3947 return (m);
3948 }
3949
3950 __private_extern__ caddr_t
3951 m_16kalloc(int wait)
3952 {
3953 int mcflags = MSLEEPF(wait);
3954
3955 /* Is this due to a non-blocking retry? If so, then try harder */
3956 if (mcflags & MCR_NOSLEEP)
3957 mcflags |= MCR_TRYHARD;
3958
3959 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3960 }
3961
3962 __private_extern__ void
3963 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3964 {
3965 mcache_free(m_cache(MC_16KCL), p);
3966 }
3967
3968 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3969 __private_extern__ struct mbuf *
3970 m_m16kget(struct mbuf *m, int wait)
3971 {
3972 struct ext_ref *rfa;
3973
3974 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3975 return (m);
3976
3977 m->m_ext.ext_buf = m_16kalloc(wait);
3978 if (m->m_ext.ext_buf != NULL) {
3979 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3980 } else {
3981 mcache_free(ref_cache, rfa);
3982 }
3983 return (m);
3984 }
3985
3986 /*
3987 * "Move" mbuf pkthdr from "from" to "to".
3988 * "from" must have M_PKTHDR set, and "to" must be empty.
3989 */
3990 void
3991 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3992 {
3993 VERIFY(from->m_flags & M_PKTHDR);
3994
3995 /* Check for scratch area overflow */
3996 m_redzone_verify(from);
3997
3998 if (to->m_flags & M_PKTHDR) {
3999 /* Check for scratch area overflow */
4000 m_redzone_verify(to);
4001 /* We will be taking over the tags of 'to' */
4002 m_tag_delete_chain(to, NULL);
4003 }
4004 to->m_pkthdr = from->m_pkthdr; /* especially tags */
4005 m_classifier_init(from, 0); /* purge classifier info */
4006 m_tag_init(from, 1); /* purge all tags from src */
4007 m_scratch_init(from); /* clear src scratch area */
4008 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4009 if ((to->m_flags & M_EXT) == 0)
4010 to->m_data = to->m_pktdat;
4011 m_redzone_init(to); /* setup red zone on dst */
4012 }
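
/*
 * Sketch of the usual m_copy_pkthdr() scenario, mirroring m_prepend()
 * below: move the packet header (including its tags) onto a new leading
 * mbuf. `example_new_leader' is hypothetical.
 */
#if 0
static struct mbuf *
example_new_leader(struct mbuf *pkt, int wait)
{
	struct mbuf *lead;

	if (!(pkt->m_flags & M_PKTHDR) ||
	    (lead = m_gethdr(wait, pkt->m_type)) == NULL)
		return (pkt);

	m_copy_pkthdr(lead, pkt);	/* tags now belong to `lead' */
	pkt->m_flags &= ~M_PKTHDR;
	lead->m_len = 0;
	lead->m_next = pkt;
	return (lead);
}
#endif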
4013
4014 /*
4015 * Duplicate "from"'s mbuf pkthdr in "to".
4016 * "from" must have M_PKTHDR set, and "to" must be empty.
4017 * In particular, this does a deep copy of the packet tags.
4018 */
4019 static int
4020 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
4021 {
4022 VERIFY(from->m_flags & M_PKTHDR);
4023
4024 /* Check for scratch area overflow */
4025 m_redzone_verify(from);
4026
4027 if (to->m_flags & M_PKTHDR) {
4028 /* Check for scratch area overflow */
4029 m_redzone_verify(to);
4030 /* We will be taking over the tags of 'to' */
4031 m_tag_delete_chain(to, NULL);
4032 }
4033 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4034 if ((to->m_flags & M_EXT) == 0)
4035 to->m_data = to->m_pktdat;
4036 to->m_pkthdr = from->m_pkthdr;
4037 m_redzone_init(to); /* setup red zone on dst */
4038 m_tag_init(to, 0); /* preserve dst static tags */
4039 return (m_tag_copy_chain(to, from, how));
4040 }
4041
4042 void
4043 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4044 {
4045 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4046 #if PF_ECN
4047 m_pftag(to)->pftag_hdr = NULL;
4048 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
4049 #endif /* PF_ECN */
4050 }
4051
4052 void
4053 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4054 {
4055 VERIFY(m->m_flags & M_PKTHDR);
4056
4057 m->m_pkthdr.pkt_proto = 0;
4058 m->m_pkthdr.pkt_flowsrc = 0;
4059 m->m_pkthdr.pkt_flowid = 0;
4060 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
4061 /* preserve service class and interface info for loopback packets */
4062 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
4063 (void) m_set_service_class(m, MBUF_SC_BE);
4064 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4065 m->m_pkthdr.pkt_ifainfo = 0;
4066 /*
4067 * Preserve timestamp if requested
4068 */
4069 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID))
4070 m->m_pkthdr.pkt_timestamp = 0;
4071 }
4072
4073 void
4074 m_copy_classifier(struct mbuf *to, struct mbuf *from)
4075 {
4076 VERIFY(to->m_flags & M_PKTHDR);
4077 VERIFY(from->m_flags & M_PKTHDR);
4078
4079 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4080 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4081 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4082 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4083 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4084 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
4085 }
4086
4087 /*
4088 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4089 * if wantall is not set, return however many are available. Set up the
4090 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4091 * are chained on the m_nextpkt field. Any packets requested beyond this
4092 * are chained onto the last packet header's m_next field. The size of
4093 * the cluster is controlled by the parameter bufsize.
4094 */
4095 __private_extern__ struct mbuf *
4096 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4097 int wait, int wantall, size_t bufsize)
4098 {
4099 struct mbuf *m;
4100 struct mbuf **np, *top;
4101 unsigned int pnum, needed = *num_needed;
4102 mcache_obj_t *mp_list = NULL;
4103 int mcflags = MSLEEPF(wait);
4104 u_int16_t flag;
4105 struct ext_ref *rfa;
4106 mcache_t *cp;
4107 void *cl;
4108
4109 ASSERT(bufsize == m_maxsize(MC_CL) ||
4110 bufsize == m_maxsize(MC_BIGCL) ||
4111 bufsize == m_maxsize(MC_16KCL));
4112
4113 /*
4114 * Caller must first check for njcl because this
4115 * routine is internal and not exposed/used via KPI.
4116 */
4117 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4118
4119 top = NULL;
4120 np = &top;
4121 pnum = 0;
4122
4123 /*
4124 * The caller doesn't want all the requested buffers; only some.
4125 * Try hard to get what we can, but don't block. This effectively
4126 * overrides MCR_SLEEP, since this thread will not go to sleep
4127 * if we can't get all the buffers.
4128 */
4129 if (!wantall || (mcflags & MCR_NOSLEEP))
4130 mcflags |= MCR_TRYHARD;
4131
4132 /* Allocate the composite mbuf + cluster elements from the cache */
4133 if (bufsize == m_maxsize(MC_CL))
4134 cp = m_cache(MC_MBUF_CL);
4135 else if (bufsize == m_maxsize(MC_BIGCL))
4136 cp = m_cache(MC_MBUF_BIGCL);
4137 else
4138 cp = m_cache(MC_MBUF_16KCL);
4139 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4140
4141 for (pnum = 0; pnum < needed; pnum++) {
4142 m = (struct mbuf *)mp_list;
4143 mp_list = mp_list->obj_next;
4144
4145 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4146 cl = m->m_ext.ext_buf;
4147 rfa = m_get_rfa(m);
4148
4149 ASSERT(cl != NULL && rfa != NULL);
4150 VERIFY(MBUF_IS_COMPOSITE(m));
4151
4152 flag = MEXT_FLAGS(m);
4153
4154 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4155 if (bufsize == m_maxsize(MC_16KCL)) {
4156 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4157 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4158 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4159 } else {
4160 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4161 }
4162
4163 if (num_with_pkthdrs > 0) {
4164 --num_with_pkthdrs;
4165 #if CONFIG_MACF_NET
4166 if (mac_mbuf_label_init(m, wait) != 0) {
4167 m_freem(m);
4168 break;
4169 }
4170 #endif /* MAC_NET */
4171 }
4172
4173 *np = m;
4174 if (num_with_pkthdrs > 0)
4175 np = &m->m_nextpkt;
4176 else
4177 np = &m->m_next;
4178 }
4179 ASSERT(pnum != *num_needed || mp_list == NULL);
4180 if (mp_list != NULL)
4181 mcache_free_ext(cp, mp_list);
4182
4183 if (pnum > 0) {
4184 mtype_stat_add(MT_DATA, pnum);
4185 mtype_stat_sub(MT_FREE, pnum);
4186 }
4187
4188 if (wantall && (pnum != *num_needed)) {
4189 if (top != NULL)
4190 m_freem_list(top);
4191 return (NULL);
4192 }
4193
4194 if (pnum > *num_needed) {
4195 printf("%s: File a radar related to <rdar://10146739>. \
4196 needed = %u, pnum = %u, num_needed = %u \n",
4197 __func__, needed, pnum, *num_needed);
4198 }
4199
4200 *num_needed = pnum;
4201 return (top);
4202 }
4203
4204 /*
4205 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
4206 * wantall is not set, return however many are available. The size of
4207 * each mbuf in the list is controlled by the parameter packetlen. Each
4208 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
4209 * in the chain is called a segment. If maxsegments is not NULL and the
4210 * value pointed to is not zero, it specifies the maximum number of segments
4211 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
4212 * is zero, the caller has no restriction on the number of segments.
4213 * The actual number of segments of an mbuf chain is returned in the value
4214 * pointed to by maxsegments.
4215 */
4216 __private_extern__ struct mbuf *
4217 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4218 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4219 {
4220 struct mbuf **np, *top, *first = NULL;
4221 size_t bufsize, r_bufsize;
4222 unsigned int num = 0;
4223 unsigned int nsegs = 0;
4224 unsigned int needed, resid;
4225 int mcflags = MSLEEPF(wait);
4226 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4227 mcache_t *cp = NULL, *rcp = NULL;
4228
4229 if (*numlist == 0)
4230 return (NULL);
4231
4232 top = NULL;
4233 np = &top;
4234
4235 if (wantsize == 0) {
4236 if (packetlen <= MINCLSIZE) {
4237 bufsize = packetlen;
4238 } else if (packetlen > m_maxsize(MC_CL)) {
4239 /* Use 4KB if jumbo cluster pool isn't available */
4240 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
4241 bufsize = m_maxsize(MC_BIGCL);
4242 else
4243 bufsize = m_maxsize(MC_16KCL);
4244 } else {
4245 bufsize = m_maxsize(MC_CL);
4246 }
4247 } else if (wantsize == m_maxsize(MC_CL) ||
4248 wantsize == m_maxsize(MC_BIGCL) ||
4249 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4250 bufsize = wantsize;
4251 } else {
4252 return (NULL);
4253 }
4254
4255 if (bufsize <= MHLEN) {
4256 nsegs = 1;
4257 } else if (bufsize <= MINCLSIZE) {
4258 if (maxsegments != NULL && *maxsegments == 1) {
4259 bufsize = m_maxsize(MC_CL);
4260 nsegs = 1;
4261 } else {
4262 nsegs = 2;
4263 }
4264 } else if (bufsize == m_maxsize(MC_16KCL)) {
4265 VERIFY(njcl > 0);
4266 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4267 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4268 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4269 } else {
4270 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4271 }
4272 if (maxsegments != NULL) {
4273 if (*maxsegments && nsegs > *maxsegments) {
4274 *maxsegments = nsegs;
4275 return (NULL);
4276 }
4277 *maxsegments = nsegs;
4278 }
4279
4280 /*
4281 * The caller doesn't want all the requested buffers; only some.
4282 * Try hard to get what we can, but don't block. This effectively
4283 * overrides MCR_SLEEP, since this thread will not go to sleep
4284 * if we can't get all the buffers.
4285 */
4286 if (!wantall || (mcflags & MCR_NOSLEEP))
4287 mcflags |= MCR_TRYHARD;
4288
4289 /*
4290 * Simple case where all elements in the lists/chains are mbufs.
4291 * Unless bufsize is greater than MHLEN, each segment chain is made
4292 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
4293 * of 2 mbufs; the second one is used for the residual data, i.e.
4294 * the remaining data that cannot fit into the first mbuf.
4295 */
4296 if (bufsize <= MINCLSIZE) {
4297 /* Allocate the elements in one shot from the mbuf cache */
4298 ASSERT(bufsize <= MHLEN || nsegs == 2);
4299 cp = m_cache(MC_MBUF);
4300 needed = mcache_alloc_ext(cp, &mp_list,
4301 (*numlist) * nsegs, mcflags);
4302
4303 /*
4304 * The number of elements must be even if we are to use an
4305 * mbuf (instead of a cluster) to store the residual data.
4306 * If we couldn't allocate the requested number of mbufs,
4307 * trim the number down (if it's odd) in order to avoid
4308 * creating a partial segment chain.
4309 */
4310 if (bufsize > MHLEN && (needed & 0x1))
4311 needed--;
4312
4313 while (num < needed) {
4314 struct mbuf *m;
4315
4316 m = (struct mbuf *)mp_list;
4317 mp_list = mp_list->obj_next;
4318 ASSERT(m != NULL);
4319
4320 MBUF_INIT(m, 1, MT_DATA);
4321 #if CONFIG_MACF_NET
4322 if (mac_init_mbuf(m, wait) != 0) {
4323 m_free(m);
4324 break;
4325 }
4326 #endif /* MAC_NET */
4327 num++;
4328 if (bufsize > MHLEN) {
4329 /* A second mbuf for this segment chain */
4330 m->m_next = (struct mbuf *)mp_list;
4331 mp_list = mp_list->obj_next;
4332 ASSERT(m->m_next != NULL);
4333
4334 MBUF_INIT(m->m_next, 0, MT_DATA);
4335 num++;
4336 }
4337 *np = m;
4338 np = &m->m_nextpkt;
4339 }
4340 ASSERT(num != *numlist || mp_list == NULL);
4341
4342 if (num > 0) {
4343 mtype_stat_add(MT_DATA, num);
4344 mtype_stat_sub(MT_FREE, num);
4345 }
4346 num /= nsegs;
4347
4348 /* We've got them all; return to caller */
4349 if (num == *numlist)
4350 return (top);
4351
4352 goto fail;
4353 }
4354
4355 /*
4356 * Complex cases where elements are made up of one or more composite
4357 * mbufs + cluster, depending on packetlen. Each N-segment chain can
4358 * be illustrated as follows:
4359 *
4360 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4361 *
4362 * Every composite mbuf + cluster element comes from the intermediate
4363 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4364 * the last composite element will come from the MC_MBUF_CL cache,
4365 * unless the residual data is larger than 2KB where we use the
4366 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4367 * data is defined as extra data beyond the first element that cannot
4368 * fit into the previous element, i.e. there is no residual data if
4369 * the chain only has 1 segment.
4370 */
4371 r_bufsize = bufsize;
4372 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4373 if (resid > 0) {
4374 /* There is residual data; figure out the cluster size */
4375 if (wantsize == 0 && packetlen > MINCLSIZE) {
4376 /*
4377 * Caller didn't request that all of the segments
4378 * in the chain use the same cluster size; use the
4379 * smaller of the cluster sizes.
4380 */
4381 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4382 r_bufsize = m_maxsize(MC_16KCL);
4383 else if (resid > m_maxsize(MC_CL))
4384 r_bufsize = m_maxsize(MC_BIGCL);
4385 else
4386 r_bufsize = m_maxsize(MC_CL);
4387 } else {
4388 /* Use the same cluster size as the other segments */
4389 resid = 0;
4390 }
4391 }
4392
4393 needed = *numlist;
4394 if (resid > 0) {
4395 /*
4396 * Attempt to allocate composite mbuf + cluster elements for
4397 * the residual data in each chain; record the number of such
4398 * elements that can be allocated so that we know how many
4399 * segment chains we can afford to create.
4400 */
4401 if (r_bufsize <= m_maxsize(MC_CL))
4402 rcp = m_cache(MC_MBUF_CL);
4403 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4404 rcp = m_cache(MC_MBUF_BIGCL);
4405 else
4406 rcp = m_cache(MC_MBUF_16KCL);
4407 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4408
4409 if (needed == 0)
4410 goto fail;
4411
4412 /* This is temporarily reduced for calculation */
4413 ASSERT(nsegs > 1);
4414 nsegs--;
4415 }
4416
4417 /*
4418 * Attempt to allocate the rest of the composite mbuf + cluster
4419 * elements for the number of segment chains that we need.
4420 */
4421 if (bufsize <= m_maxsize(MC_CL))
4422 cp = m_cache(MC_MBUF_CL);
4423 else if (bufsize <= m_maxsize(MC_BIGCL))
4424 cp = m_cache(MC_MBUF_BIGCL);
4425 else
4426 cp = m_cache(MC_MBUF_16KCL);
4427 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4428
4429 /* Round it down to avoid creating a partial segment chain */
4430 needed = (needed / nsegs) * nsegs;
4431 if (needed == 0)
4432 goto fail;
4433
4434 if (resid > 0) {
4435 /*
4436 * We're about to construct the chain(s); take into account
4437 * the number of segments we have created above to hold the
4438 * residual data for each chain, as well as restore the
4439 * original count of segments per chain.
4440 */
4441 ASSERT(nsegs > 0);
4442 needed += needed / nsegs;
4443 nsegs++;
4444 }
4445
4446 for (;;) {
4447 struct mbuf *m;
4448 u_int16_t flag;
4449 struct ext_ref *rfa;
4450 void *cl;
4451 int pkthdr;
4452 m_ext_free_func_t m_free_func;
4453
4454 ++num;
4455 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4456 m = (struct mbuf *)mp_list;
4457 mp_list = mp_list->obj_next;
4458 } else {
4459 m = (struct mbuf *)rmp_list;
4460 rmp_list = rmp_list->obj_next;
4461 }
4462 m_free_func = m_get_ext_free(m);
4463 ASSERT(m != NULL);
4464 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4465 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4466 m_free_func == m_16kfree);
4467
4468 cl = m->m_ext.ext_buf;
4469 rfa = m_get_rfa(m);
4470
4471 ASSERT(cl != NULL && rfa != NULL);
4472 VERIFY(MBUF_IS_COMPOSITE(m));
4473
4474 flag = MEXT_FLAGS(m);
4475
4476 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4477 if (pkthdr)
4478 first = m;
4479 MBUF_INIT(m, pkthdr, MT_DATA);
4480 if (m_free_func == m_16kfree) {
4481 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4482 } else if (m_free_func == m_bigfree) {
4483 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4484 } else {
4485 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4486 }
4487 #if CONFIG_MACF_NET
4488 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4489 --num;
4490 m_freem(m);
4491 break;
4492 }
4493 #endif /* MAC_NET */
4494
4495 *np = m;
4496 if ((num % nsegs) == 0)
4497 np = &first->m_nextpkt;
4498 else
4499 np = &m->m_next;
4500
4501 if (num == needed)
4502 break;
4503 }
4504
4505 if (num > 0) {
4506 mtype_stat_add(MT_DATA, num);
4507 mtype_stat_sub(MT_FREE, num);
4508 }
4509
4510 num /= nsegs;
4511
4512 /* We've got them all; return to caller */
4513 if (num == *numlist) {
4514 ASSERT(mp_list == NULL && rmp_list == NULL);
4515 return (top);
4516 }
4517
4518 fail:
4519 /* Free up what's left of the above */
4520 if (mp_list != NULL)
4521 mcache_free_ext(cp, mp_list);
4522 if (rmp_list != NULL)
4523 mcache_free_ext(rcp, rmp_list);
4524 if (wantall && top != NULL) {
4525 m_freem(top);
4526 return (NULL);
4527 }
4528 *numlist = num;
4529 return (top);
4530 }
4531
4532 /*
4533 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4534 * packets on the receive ring.
4535 */
4536 __private_extern__ struct mbuf *
4537 m_getpacket_how(int wait)
4538 {
4539 unsigned int num_needed = 1;
4540
4541 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4542 m_maxsize(MC_CL)));
4543 }
4544
4545 /*
4546 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4547 * packets on the receive ring.
4548 */
4549 struct mbuf *
4550 m_getpacket(void)
4551 {
4552 unsigned int num_needed = 1;
4553
4554 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4555 m_maxsize(MC_CL)));
4556 }
4557
4558 /*
4559 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4560 * if this can't be met, return however many are available. Set up the
4561 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4562 * are chained on the m_nextpkt field. Any packets requested beyond this are
4563 * chained onto the last packet header's m_next field.
4564 */
4565 struct mbuf *
4566 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4567 {
4568 unsigned int n = num_needed;
4569
4570 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4571 m_maxsize(MC_CL)));
4572 }
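
/*
 * Sketch of a driver-style use of m_getpackets(): request one 2KB
 * cluster packet per receive slot, accepting fewer under memory
 * pressure. `example_fill_rx_ring' is hypothetical.
 */
#if 0
static struct mbuf *
example_fill_rx_ring(int slots)
{
	/*
	 * All `slots' packets get pkthdrs and are chained on m_nextpkt;
	 * the list may be shorter than requested since wantall is 0.
	 */
	return (m_getpackets(slots, slots, M_DONTWAIT));
}
#endif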
4573
4574 /*
4575 * Return a list of mbuf hdrs set up as packet hdrs chained together
4576 * on the m_nextpkt field
4577 */
4578 struct mbuf *
4579 m_getpackethdrs(int num_needed, int how)
4580 {
4581 struct mbuf *m;
4582 struct mbuf **np, *top;
4583
4584 top = NULL;
4585 np = &top;
4586
4587 while (num_needed--) {
4588 m = _M_RETRYHDR(how, MT_DATA);
4589 if (m == NULL)
4590 break;
4591
4592 *np = m;
4593 np = &m->m_nextpkt;
4594 }
4595
4596 return (top);
4597 }
4598
4599 /*
4600 * Free an mbuf list (m_nextpkt) while following m_next. Returns the number
4601 * of packets freed. Used by the drivers.
4602 */
4603 int
4604 m_freem_list(struct mbuf *m)
4605 {
4606 struct mbuf *nextpkt;
4607 mcache_obj_t *mp_list = NULL;
4608 mcache_obj_t *mcl_list = NULL;
4609 mcache_obj_t *mbc_list = NULL;
4610 mcache_obj_t *m16k_list = NULL;
4611 mcache_obj_t *m_mcl_list = NULL;
4612 mcache_obj_t *m_mbc_list = NULL;
4613 mcache_obj_t *m_m16k_list = NULL;
4614 mcache_obj_t *ref_list = NULL;
4615 int pktcount = 0;
4616 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4617
4618 while (m != NULL) {
4619 pktcount++;
4620
4621 nextpkt = m->m_nextpkt;
4622 m->m_nextpkt = NULL;
4623
4624 while (m != NULL) {
4625 struct mbuf *next = m->m_next;
4626 mcache_obj_t *o, *rfa;
4627 u_int32_t composite;
4628 u_int16_t refcnt;
4629 m_ext_free_func_t m_free_func;
4630
4631 if (m->m_type == MT_FREE)
4632 panic("m_free: freeing an already freed mbuf");
4633
4634 if (m->m_flags & M_PKTHDR) {
4635 /* Check for scratch area overflow */
4636 m_redzone_verify(m);
4637 /* Free the aux data and tags if there is any */
4638 m_tag_delete_chain(m, NULL);
4639 }
4640
4641 if (!(m->m_flags & M_EXT)) {
4642 mt_free++;
4643 goto simple_free;
4644 }
4645
4646 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4647 m = next;
4648 continue;
4649 }
4650
4651 mt_free++;
4652
4653 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4654 refcnt = m_decref(m);
4655 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4656 m_free_func = m_get_ext_free(m);
4657 if (refcnt == MEXT_MINREF(m) && !composite) {
4658 if (m_free_func == NULL) {
4659 o->obj_next = mcl_list;
4660 mcl_list = o;
4661 } else if (m_free_func == m_bigfree) {
4662 o->obj_next = mbc_list;
4663 mbc_list = o;
4664 } else if (m_free_func == m_16kfree) {
4665 o->obj_next = m16k_list;
4666 m16k_list = o;
4667 } else {
4668 (*(m_free_func))((caddr_t)o,
4669 m->m_ext.ext_size,
4670 m_get_ext_arg(m));
4671 }
4672 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4673 rfa->obj_next = ref_list;
4674 ref_list = rfa;
4675 m_set_ext(m, NULL, NULL, NULL);
4676 } else if (refcnt == MEXT_MINREF(m) && composite) {
4677 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4678 VERIFY(m->m_type != MT_FREE);
4679 /*
4680 * Amortize the costs of atomic operations
4681 * by doing them at the end, if possible.
4682 */
4683 if (m->m_type == MT_DATA)
4684 mt_data++;
4685 else if (m->m_type == MT_HEADER)
4686 mt_header++;
4687 else if (m->m_type == MT_SONAME)
4688 mt_soname++;
4689 else if (m->m_type == MT_TAG)
4690 mt_tag++;
4691 else
4692 mtype_stat_dec(m->m_type);
4693
4694 m->m_type = MT_FREE;
4695 m->m_flags = M_EXT;
4696 m->m_len = 0;
4697 m->m_next = m->m_nextpkt = NULL;
4698
4699 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4700
4701 /* "Free" into the intermediate cache */
4702 o = (mcache_obj_t *)m;
4703 if (m_free_func == NULL) {
4704 o->obj_next = m_mcl_list;
4705 m_mcl_list = o;
4706 } else if (m_free_func == m_bigfree) {
4707 o->obj_next = m_mbc_list;
4708 m_mbc_list = o;
4709 } else {
4710 VERIFY(m_free_func == m_16kfree);
4711 o->obj_next = m_m16k_list;
4712 m_m16k_list = o;
4713 }
4714 m = next;
4715 continue;
4716 }
4717 simple_free:
4718 /*
4719 * Amortize the costs of atomic operations
4720 * by doing them at the end, if possible.
4721 */
4722 if (m->m_type == MT_DATA)
4723 mt_data++;
4724 else if (m->m_type == MT_HEADER)
4725 mt_header++;
4726 else if (m->m_type == MT_SONAME)
4727 mt_soname++;
4728 else if (m->m_type == MT_TAG)
4729 mt_tag++;
4730 else if (m->m_type != MT_FREE)
4731 mtype_stat_dec(m->m_type);
4732
4733 m->m_type = MT_FREE;
4734 m->m_flags = m->m_len = 0;
4735 m->m_next = m->m_nextpkt = NULL;
4736
4737 ((mcache_obj_t *)m)->obj_next = mp_list;
4738 mp_list = (mcache_obj_t *)m;
4739
4740 m = next;
4741 }
4742
4743 m = nextpkt;
4744 }
4745
4746 if (mt_free > 0)
4747 mtype_stat_add(MT_FREE, mt_free);
4748 if (mt_data > 0)
4749 mtype_stat_sub(MT_DATA, mt_data);
4750 if (mt_header > 0)
4751 mtype_stat_sub(MT_HEADER, mt_header);
4752 if (mt_soname > 0)
4753 mtype_stat_sub(MT_SONAME, mt_soname);
4754 if (mt_tag > 0)
4755 mtype_stat_sub(MT_TAG, mt_tag);
4756
4757 if (mp_list != NULL)
4758 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4759 if (mcl_list != NULL)
4760 mcache_free_ext(m_cache(MC_CL), mcl_list);
4761 if (mbc_list != NULL)
4762 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4763 if (m16k_list != NULL)
4764 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4765 if (m_mcl_list != NULL)
4766 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4767 if (m_mbc_list != NULL)
4768 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4769 if (m_m16k_list != NULL)
4770 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4771 if (ref_list != NULL)
4772 mcache_free_ext(ref_cache, ref_list);
4773
4774 return (pktcount);
4775 }
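
/*
 * Sketch of batched freeing with m_freem_list(); `example_drop_batch'
 * is hypothetical.
 */
#if 0
static int
example_drop_batch(struct mbuf *pkt_list)
{
	/* frees every packet on the m_nextpkt chain in one pass */
	return (m_freem_list(pkt_list));
}
#endif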
4776
4777 void
4778 m_freem(struct mbuf *m)
4779 {
4780 while (m != NULL)
4781 m = m_free(m);
4782 }
4783
4784 /*
4785 * Mbuffer utility routines.
4786 */
4787
4788 /*
4789 * Compute the amount of space available before the current start
4790 * of data in an mbuf.
4791 */
4792 int
4793 m_leadingspace(struct mbuf *m)
4794 {
4795 if (m->m_flags & M_EXT) {
4796 if (MCLHASREFERENCE(m))
4797 return (0);
4798 return (m->m_data - m->m_ext.ext_buf);
4799 }
4800 if (m->m_flags & M_PKTHDR)
4801 return (m->m_data - m->m_pktdat);
4802 return (m->m_data - m->m_dat);
4803 }
4804
4805 /*
4806 * Compute the amount of space available after the end of data in an mbuf.
4807 */
4808 int
4809 m_trailingspace(struct mbuf *m)
4810 {
4811 if (m->m_flags & M_EXT) {
4812 if (MCLHASREFERENCE(m))
4813 return (0);
4814 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4815 (m->m_data + m->m_len));
4816 }
4817 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4818 }
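
/*
 * Sketch of an in-place append guarded by m_trailingspace(); note that
 * the routine also refuses shared (read-only) clusters, for which
 * m_trailingspace() reports 0. `example_append' is hypothetical.
 */
#if 0
static int
example_append(struct mbuf *m, const void *src, int len)
{
	if (m_trailingspace(m) < len)
		return (0);	/* caller must grow the chain instead */

	bcopy(src, MTOD(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	/* caller adjusts m_pkthdr.len on the leading mbuf if needed */
	return (1);
}
#endif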
4819
4820 /*
4821 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4822 * copy junk along. Does not adjust packet header length.
4823 */
4824 struct mbuf *
4825 m_prepend(struct mbuf *m, int len, int how)
4826 {
4827 struct mbuf *mn;
4828
4829 _MGET(mn, how, m->m_type);
4830 if (mn == NULL) {
4831 m_freem(m);
4832 return (NULL);
4833 }
4834 if (m->m_flags & M_PKTHDR) {
4835 M_COPY_PKTHDR(mn, m);
4836 m->m_flags &= ~M_PKTHDR;
4837 }
4838 mn->m_next = m;
4839 m = mn;
4840 if (m->m_flags & M_PKTHDR) {
4841 VERIFY(len <= MHLEN);
4842 MH_ALIGN(m, len);
4843 } else {
4844 VERIFY(len <= MLEN);
4845 M_ALIGN(m, len);
4846 }
4847 m->m_len = len;
4848 return (m);
4849 }
4850
4851 /*
4852 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4853 * chain, copy junk along, and adjust length.
4854 */
4855 struct mbuf *
4856 m_prepend_2(struct mbuf *m, int len, int how, int align)
4857 {
4858 if (M_LEADINGSPACE(m) >= len &&
4859 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4860 m->m_data -= len;
4861 m->m_len += len;
4862 } else {
4863 m = m_prepend(m, len, how);
4864 }
4865 if ((m) && (m->m_flags & M_PKTHDR))
4866 m->m_pkthdr.len += len;
4867 return (m);
4868 }
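
/*
 * Sketch of prepending a protocol header with m_prepend_2(); hdrlen
 * must fit in an mbuf (<= MHLEN) in case a new mbuf is needed.
 * `example_prepend_hdr' is hypothetical.
 */
#if 0
static struct mbuf *
example_prepend_hdr(struct mbuf *m, int hdrlen, int wait)
{
	/* align == 1 keeps the new data 32-bit aligned */
	m = m_prepend_2(m, hdrlen, wait, 1);
	if (m == NULL)
		return (NULL);	/* the original chain was freed */

	/* write the header at MTOD(m, caddr_t); m_pkthdr.len is updated */
	return (m);
}
#endif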
4869
4870 /*
4871 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4872 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
4873 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4874 */
4875 int MCFail;
4876
4877 struct mbuf *
4878 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4879 {
4880 struct mbuf *n, *mhdr = NULL, **np;
4881 int off = off0;
4882 struct mbuf *top;
4883 int copyhdr = 0;
4884
4885 if (off < 0 || len < 0)
4886 panic("m_copym: invalid offset %d or len %d", off, len);
4887
4888 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4889 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4890
4891 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4892 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4893 mhdr = m;
4894 copyhdr = 1;
4895 }
4896
4897 while (off >= m->m_len) {
4898 if (m->m_next == NULL)
4899 panic("m_copym: invalid mbuf chain");
4900 off -= m->m_len;
4901 m = m->m_next;
4902 }
4903 np = &top;
4904 top = NULL;
4905
4906 while (len > 0) {
4907 if (m == NULL) {
4908 if (len != M_COPYALL)
4909 panic("m_copym: len != M_COPYALL");
4910 break;
4911 }
4912
4913 if (copyhdr)
4914 n = _M_RETRYHDR(wait, m->m_type);
4915 else
4916 n = _M_RETRY(wait, m->m_type);
4917 *np = n;
4918
4919 if (n == NULL)
4920 goto nospace;
4921
4922 if (copyhdr != 0) {
4923 if ((mode == M_COPYM_MOVE_HDR) ||
4924 (mode == M_COPYM_MUST_MOVE_HDR)) {
4925 M_COPY_PKTHDR(n, mhdr);
4926 } else if ((mode == M_COPYM_COPY_HDR) ||
4927 (mode == M_COPYM_MUST_COPY_HDR)) {
4928 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4929 goto nospace;
4930 }
4931 if (len == M_COPYALL)
4932 n->m_pkthdr.len -= off0;
4933 else
4934 n->m_pkthdr.len = len;
4935 copyhdr = 0;
4936 /*
4937 * There is data to copy from the packet header mbuf
4938 * if it is empty or it is before the starting offset
4939 */
4940 if (mhdr != m) {
4941 np = &n->m_next;
4942 continue;
4943 }
4944 }
4945 n->m_len = MIN(len, (m->m_len - off));
4946 if (m->m_flags & M_EXT) {
4947 n->m_ext = m->m_ext;
4948 m_incref(m);
4949 n->m_data = m->m_data + off;
4950 n->m_flags |= M_EXT;
4951 } else {
4952 /*
4953 * Limit to the capacity of the destination
4954 */
4955 if (n->m_flags & M_PKTHDR)
4956 n->m_len = MIN(n->m_len, MHLEN);
4957 else
4958 n->m_len = MIN(n->m_len, MLEN);
4959
4960 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4961 panic("%s n %p copy overflow",
4962 __func__, n);
4963
4964 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4965 (unsigned)n->m_len);
4966 }
4967 if (len != M_COPYALL)
4968 len -= n->m_len;
4969 off = 0;
4970 m = m->m_next;
4971 np = &n->m_next;
4972 }
4973
4974 if (top == NULL)
4975 MCFail++;
4976
4977 return (top);
4978 nospace:
4979
4980 m_freem(top);
4981 MCFail++;
4982 return (NULL);
4983 }
4984
4985
4986 struct mbuf *
4987 m_copym(struct mbuf *m, int off0, int len, int wait)
4988 {
4989 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4990 }
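
/*
 * Sketch of copying a whole chain with m_copym(); cluster data is
 * shared by reference, and with the M_COPYM_MOVE_HDR mode used by this
 * wrapper the pkthdr (and its tags) moves to the copy.
 * `example_copy_chain' is hypothetical.
 */
#if 0
static struct mbuf *
example_copy_chain(struct mbuf *pkt, int wait)
{
	return (m_copym(pkt, 0, M_COPYALL, wait));
}
#endif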
4991
4992 /*
4993 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4994 * within this routine. Also, the last mbuf and offset accessed are passed
4995 * out and can be passed back in to avoid having to rescan the entire mbuf
4996 * list (normally hung off of the socket).
4997 */
4998 struct mbuf *
4999 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
5000 struct mbuf **m_lastm, int *m_off, uint32_t mode)
5001 {
5002 struct mbuf *m = m0, *n, **np = NULL;
5003 int off = off0, len = len0;
5004 struct mbuf *top = NULL;
5005 int mcflags = MSLEEPF(wait);
5006 int copyhdr = 0;
5007 int type = 0;
5008 mcache_obj_t *list = NULL;
5009 int needed = 0;
5010
5011 if (off == 0 && (m->m_flags & M_PKTHDR))
5012 copyhdr = 1;
5013
5014 if (m_lastm != NULL && *m_lastm != NULL) {
5015 m = *m_lastm;
5016 off = *m_off;
5017 } else {
5018 while (off >= m->m_len) {
5019 off -= m->m_len;
5020 m = m->m_next;
5021 }
5022 }
5023
5024 n = m;
5025 while (len > 0) {
5026 needed++;
5027 ASSERT(n != NULL);
5028 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
5029 n = n->m_next;
5030 }
5031 needed++;
5032 len = len0;
5033
5034 /*
5035 * If the caller doesn't want to be put to sleep, mark it with
5036 * MCR_TRYHARD so that we may reclaim buffers from other places
5037 * before giving up.
5038 */
5039 if (mcflags & MCR_NOSLEEP)
5040 mcflags |= MCR_TRYHARD;
5041
5042 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
5043 mcflags) != needed)
5044 goto nospace;
5045
5046 needed = 0;
5047 while (len > 0) {
5048 n = (struct mbuf *)list;
5049 list = list->obj_next;
5050 ASSERT(n != NULL && m != NULL);
5051
5052 type = (top == NULL) ? MT_HEADER : m->m_type;
5053 MBUF_INIT(n, (top == NULL), type);
5054 #if CONFIG_MACF_NET
5055 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
5056 mtype_stat_inc(MT_HEADER);
5057 mtype_stat_dec(MT_FREE);
5058 m_free(n);
5059 goto nospace;
5060 }
5061 #endif /* MAC_NET */
5062
5063 if (top == NULL) {
5064 top = n;
5065 np = &top->m_next;
5066 continue;
5067 } else {
5068 needed++;
5069 *np = n;
5070 }
5071
5072 if (copyhdr) {
5073 if ((mode == M_COPYM_MOVE_HDR) ||
5074 (mode == M_COPYM_MUST_MOVE_HDR)) {
5075 M_COPY_PKTHDR(n, m);
5076 } else if ((mode == M_COPYM_COPY_HDR) ||
5077 (mode == M_COPYM_MUST_COPY_HDR)) {
5078 if (m_dup_pkthdr(n, m, wait) == 0)
5079 goto nospace;
5080 }
5081 n->m_pkthdr.len = len;
5082 copyhdr = 0;
5083 }
5084 n->m_len = MIN(len, (m->m_len - off));
5085
5086 if (m->m_flags & M_EXT) {
5087 n->m_ext = m->m_ext;
5088 m_incref(m);
5089 n->m_data = m->m_data + off;
5090 n->m_flags |= M_EXT;
5091 } else {
5092 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
5093 panic("%s n %p copy overflow",
5094 __func__, n);
5095
5096 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5097 (unsigned)n->m_len);
5098 }
5099 len -= n->m_len;
5100
5101 if (len == 0) {
5102 if (m_lastm != NULL && m_off != NULL) {
5103 if ((off + n->m_len) == m->m_len) {
5104 *m_lastm = m->m_next;
5105 *m_off = 0;
5106 } else {
5107 *m_lastm = m;
5108 *m_off = off + n->m_len;
5109 }
5110 }
5111 break;
5112 }
5113 off = 0;
5114 m = m->m_next;
5115 np = &n->m_next;
5116 }
5117
5118 mtype_stat_inc(MT_HEADER);
5119 mtype_stat_add(type, needed);
5120 mtype_stat_sub(MT_FREE, needed + 1);
5121
5122 ASSERT(list == NULL);
5123 return (top);
5124
5125 nospace:
5126 if (list != NULL)
5127 mcache_free_ext(m_cache(MC_MBUF), list);
5128 if (top != NULL)
5129 m_freem(top);
5130 MCFail++;
5131 return (NULL);
5132 }
5133
5134 /*
5135 * Copy data from an mbuf chain starting "off" bytes from the beginning,
5136 * continuing for "len" bytes, into the indicated buffer.
5137 */
5138 void
5139 m_copydata(struct mbuf *m, int off, int len, void *vp)
5140 {
5141 int off0 = off, len0 = len;
5142 struct mbuf *m0 = m;
5143 unsigned count;
5144 char *cp = vp;
5145
5146 if (__improbable(off < 0 || len < 0)) {
5147 panic("%s: invalid offset %d or len %d", __func__, off, len);
5148 /* NOTREACHED */
5149 }
5150
5151 while (off > 0) {
5152 if (__improbable(m == NULL)) {
5153 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5154 __func__, m0, off0, len0);
5155 /* NOTREACHED */
5156 }
5157 if (off < m->m_len)
5158 break;
5159 off -= m->m_len;
5160 m = m->m_next;
5161 }
5162 while (len > 0) {
5163 if (__improbable(m == NULL)) {
5164 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5165 __func__, m0, off0, len0);
5166 /* NOTREACHED */
5167 }
5168 count = MIN(m->m_len - off, len);
5169 bcopy(MTOD(m, caddr_t) + off, cp, count);
5170 len -= count;
5171 cp += count;
5172 off = 0;
5173 m = m->m_next;
5174 }
5175 }
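
/*
 * Sketch of linearizing part of a chain with m_copydata(); the chain
 * must contain at least off + len bytes or the routine panics.
 * `example_peek' is hypothetical.
 */
#if 0
static void
example_peek(struct mbuf *m, int off, int len, void *dst)
{
	/* copies len bytes starting at off, crossing mbuf boundaries */
	m_copydata(m, off, len, dst);
}
#endif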
5176
5177 /*
5178 * Concatenate mbuf chain n to m. Both chains must be of the same type
5179 * (e.g. MT_DATA). The m_pkthdr, if any, is not updated.
5180 */
5181 void
5182 m_cat(struct mbuf *m, struct mbuf *n)
5183 {
5184 while (m->m_next)
5185 m = m->m_next;
5186 while (n) {
5187 if ((m->m_flags & M_EXT) ||
5188 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5189 /* just join the two chains */
5190 m->m_next = n;
5191 return;
5192 }
5193 /* splat the data from one into the other */
5194 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5195 (u_int)n->m_len);
5196 m->m_len += n->m_len;
5197 n = m_free(n);
5198 }
5199 }
5200
5201 void
5202 m_adj(struct mbuf *mp, int req_len)
5203 {
5204 int len = req_len;
5205 struct mbuf *m;
5206 int count;
5207
5208 if ((m = mp) == NULL)
5209 return;
5210 if (len >= 0) {
5211 /*
5212 * Trim from head.
5213 */
5214 while (m != NULL && len > 0) {
5215 if (m->m_len <= len) {
5216 len -= m->m_len;
5217 m->m_len = 0;
5218 m = m->m_next;
5219 } else {
5220 m->m_len -= len;
5221 m->m_data += len;
5222 len = 0;
5223 }
5224 }
5225 m = mp;
5226 if (m->m_flags & M_PKTHDR)
5227 m->m_pkthdr.len -= (req_len - len);
5228 } else {
5229 /*
5230 * Trim from tail. Scan the mbuf chain,
5231 * calculating its length and finding the last mbuf.
5232 * If the adjustment only affects this mbuf, then just
5233 * adjust and return. Otherwise, rescan and truncate
5234 * after the remaining size.
5235 */
5236 len = -len;
5237 count = 0;
5238 for (;;) {
5239 count += m->m_len;
5240 if (m->m_next == (struct mbuf *)0)
5241 break;
5242 m = m->m_next;
5243 }
5244 if (m->m_len >= len) {
5245 m->m_len -= len;
5246 m = mp;
5247 if (m->m_flags & M_PKTHDR)
5248 m->m_pkthdr.len -= len;
5249 return;
5250 }
5251 count -= len;
5252 if (count < 0)
5253 count = 0;
5254 /*
5255 * Correct length for chain is "count".
5256 * Find the mbuf with last data, adjust its length,
5257 * and toss data from remaining mbufs on chain.
5258 */
5259 m = mp;
5260 if (m->m_flags & M_PKTHDR)
5261 m->m_pkthdr.len = count;
5262 for (; m; m = m->m_next) {
5263 if (m->m_len >= count) {
5264 m->m_len = count;
5265 break;
5266 }
5267 count -= m->m_len;
5268 }
5269 while ((m = m->m_next))
5270 m->m_len = 0;
5271 }
5272 }
5273
5274 /*
5275 * Rearrange an mbuf chain so that len bytes are contiguous
5276 * and in the data area of an mbuf (so that mtod and dtom
5277 * will work for a structure of size len). Returns the resulting
5278 * mbuf chain on success, frees it and returns null on failure.
5279 * If there is room, it will add up to max_protohdr-len extra bytes to the
5280 * contiguous region in an attempt to avoid being called next time.
5281 */
5282 int MPFail;
5283
5284 struct mbuf *
5285 m_pullup(struct mbuf *n, int len)
5286 {
5287 struct mbuf *m;
5288 int count;
5289 int space;
5290
5291 /*
5292 * If first mbuf has no cluster, and has room for len bytes
5293 * without shifting current data, pullup into it,
5294 * otherwise allocate a new mbuf to prepend to the chain.
5295 */
5296 if ((n->m_flags & M_EXT) == 0 &&
5297 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
5298 if (n->m_len >= len)
5299 return (n);
5300 m = n;
5301 n = n->m_next;
5302 len -= m->m_len;
5303 } else {
5304 if (len > MHLEN)
5305 goto bad;
5306 _MGET(m, M_DONTWAIT, n->m_type);
5307 if (m == 0)
5308 goto bad;
5309 m->m_len = 0;
5310 if (n->m_flags & M_PKTHDR) {
5311 M_COPY_PKTHDR(m, n);
5312 n->m_flags &= ~M_PKTHDR;
5313 }
5314 }
5315 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5316 do {
5317 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5318 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5319 (unsigned)count);
5320 len -= count;
5321 m->m_len += count;
5322 n->m_len -= count;
5323 space -= count;
5324 if (n->m_len)
5325 n->m_data += count;
5326 else
5327 n = m_free(n);
5328 } while (len > 0 && n);
5329 if (len > 0) {
5330 (void) m_free(m);
5331 goto bad;
5332 }
5333 m->m_next = n;
5334 return (m);
5335 bad:
5336 m_freem(n);
5337 MPFail++;
5338 return (0);
5339 }
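
/*
 * Sketch of the classic m_pullup() pattern used before casting the
 * front of a chain to a header structure; len must not exceed MHLEN.
 * `example_pullup_hdr' is hypothetical.
 */
#if 0
static struct mbuf *
example_pullup_hdr(struct mbuf *m, int hdrlen)
{
	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
		return (NULL);	/* chain already freed on failure */

	/* the first hdrlen bytes are now contiguous at MTOD(m, ...) */
	return (m);
}
#endif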
5340
5341 /*
5342 * Like m_pullup(), except a new mbuf is always allocated, and we allow
5343 * the amount of empty space before the data in the new mbuf to be specified
5344 * (in the event that the caller expects to prepend later).
5345 */
5346 __private_extern__ int MSFail = 0;
5347
5348 __private_extern__ struct mbuf *
5349 m_copyup(struct mbuf *n, int len, int dstoff)
5350 {
5351 struct mbuf *m;
5352 int count, space;
5353
5354 if (len > (MHLEN - dstoff))
5355 goto bad;
5356 MGET(m, M_DONTWAIT, n->m_type);
5357 if (m == NULL)
5358 goto bad;
5359 m->m_len = 0;
5360 if (n->m_flags & M_PKTHDR) {
5361 m_copy_pkthdr(m, n);
5362 n->m_flags &= ~M_PKTHDR;
5363 }
5364 m->m_data += dstoff;
5365 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5366 do {
5367 count = min(min(max(len, max_protohdr), space), n->m_len);
5368 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5369 (unsigned)count);
5370 len -= count;
5371 m->m_len += count;
5372 n->m_len -= count;
5373 space -= count;
5374 if (n->m_len)
5375 n->m_data += count;
5376 else
5377 n = m_free(n);
5378 } while (len > 0 && n);
5379 if (len > 0) {
5380 (void) m_free(m);
5381 goto bad;
5382 }
5383 m->m_next = n;
5384 return (m);
5385 bad:
5386 m_freem(n);
5387 MSFail++;
5388 return (NULL);
5389 }
5390
5391 /*
5392 * Partition an mbuf chain in two pieces, returning the tail --
5393 * all but the first len0 bytes. In case of failure, it returns NULL and
5394 * attempts to restore the chain to its original state.
5395 */
5396 struct mbuf *
5397 m_split(struct mbuf *m0, int len0, int wait)
5398 {
5399 return (m_split0(m0, len0, wait, 1));
5400 }
5401
5402 static struct mbuf *
5403 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5404 {
5405 struct mbuf *m, *n;
5406 unsigned len = len0, remain;
5407
5408 for (m = m0; m && len > m->m_len; m = m->m_next)
5409 len -= m->m_len;
5410 if (m == NULL)
5411 return (NULL);
5412 remain = m->m_len - len;
5413 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5414 _MGETHDR(n, wait, m0->m_type);
5415 if (n == NULL)
5416 return (NULL);
5417 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5418 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5419 m0->m_pkthdr.len = len0;
5420 if (m->m_flags & M_EXT)
5421 goto extpacket;
5422 if (remain > MHLEN) {
5423 /* m can't be the lead packet */
5424 MH_ALIGN(n, 0);
5425 n->m_next = m_split(m, len, wait);
5426 if (n->m_next == NULL) {
5427 (void) m_free(n);
5428 return (NULL);
5429 } else
5430 return (n);
5431 } else
5432 MH_ALIGN(n, remain);
5433 } else if (remain == 0) {
5434 n = m->m_next;
5435 m->m_next = NULL;
5436 return (n);
5437 } else {
5438 _MGET(n, wait, m->m_type);
5439 if (n == NULL)
5440 return (NULL);
5441 M_ALIGN(n, remain);
5442 }
5443 extpacket:
5444 if (m->m_flags & M_EXT) {
5445 n->m_flags |= M_EXT;
5446 n->m_ext = m->m_ext;
5447 m_incref(m);
5448 n->m_data = m->m_data + len;
5449 } else {
5450 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5451 }
5452 n->m_len = remain;
5453 m->m_len = len;
5454 n->m_next = m->m_next;
5455 m->m_next = NULL;
5456 return (n);
5457 }
5458
5459 /*
5460 * Routine to copy from device local memory into mbufs.
5461 */
5462 struct mbuf *
5463 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5464 void (*copy)(const void *, void *, size_t))
5465 {
5466 struct mbuf *m;
5467 struct mbuf *top = NULL, **mp = &top;
5468 int off = off0, len;
5469 char *cp;
5470 char *epkt;
5471
5472 cp = buf;
5473 epkt = cp + totlen;
5474 if (off) {
5475 /*
5476 * If 'off' is non-zero, packet is trailer-encapsulated,
5477 * so we have to skip the type and length fields.
5478 */
5479 cp += off + 2 * sizeof (u_int16_t);
5480 totlen -= 2 * sizeof (u_int16_t);
5481 }
5482 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5483 if (m == NULL)
5484 return (NULL);
5485 m->m_pkthdr.rcvif = ifp;
5486 m->m_pkthdr.len = totlen;
5487 m->m_len = MHLEN;
5488
5489 while (totlen > 0) {
5490 if (top != NULL) {
5491 _MGET(m, M_DONTWAIT, MT_DATA);
5492 if (m == NULL) {
5493 m_freem(top);
5494 return (NULL);
5495 }
5496 m->m_len = MLEN;
5497 }
5498 len = MIN(totlen, epkt - cp);
5499 if (len >= MINCLSIZE) {
5500 MCLGET(m, M_DONTWAIT);
5501 if (m->m_flags & M_EXT) {
5502 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5503 } else {
5504 /* give up when it's out of cluster mbufs */
5505 if (top != NULL)
5506 m_freem(top);
5507 m_freem(m);
5508 return (NULL);
5509 }
5510 } else {
5511 /*
5512 * Place initial small packet/header at end of mbuf.
5513 */
5514 if (len < m->m_len) {
5515 if (top == NULL &&
5516 len + max_linkhdr <= m->m_len)
5517 m->m_data += max_linkhdr;
5518 m->m_len = len;
5519 } else {
5520 len = m->m_len;
5521 }
5522 }
5523 if (copy)
5524 copy(cp, MTOD(m, caddr_t), (unsigned)len);
5525 else
5526 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5527 cp += len;
5528 *mp = m;
5529 mp = &m->m_next;
5530 totlen -= len;
5531 if (cp == epkt)
5532 cp = buf;
5533 }
5534 return (top);
5535 }
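
/*
 * Editor's sketch (not part of the original source): how a driver might
 * wrap a contiguous receive buffer with m_devget(); the example_* name
 * and the calling context are hypothetical.
 */
#if 0
static struct mbuf *
example_wrap_rx_frame(struct ifnet *ifp, char *dma_buf, int frame_len)
{
	/*
	 * off0 == 0 means there is no trailer encapsulation to skip; a
	 * NULL copy function makes m_devget() fall back to bcopy().
	 */
	return (m_devget(dma_buf, frame_len, 0, ifp, NULL));
}
#endif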
5536
5537 #ifndef MBUF_GROWTH_NORMAL_THRESH
5538 #define MBUF_GROWTH_NORMAL_THRESH 25
5539 #endif
5540
5541 /*
5542 * Cluster freelist allocation check.
5543 */
5544 static int
5545 m_howmany(int num, size_t bufsize)
5546 {
5547 int i = 0, j = 0;
5548 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5549 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5550 u_int32_t sumclusters, freeclusters;
5551 u_int32_t percent_pool, percent_kmem;
5552 u_int32_t mb_growth, mb_growth_thresh;
5553
5554 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5555 bufsize == m_maxsize(MC_16KCL));
5556
5557 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5558
5559 /* Numbers in 2K cluster units */
5560 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5561 m_clusters = m_total(MC_CL);
5562 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5563 m_16kclusters = m_total(MC_16KCL);
5564 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5565
5566 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5567 m_clfree = m_infree(MC_CL);
5568 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5569 m_16kclfree = m_infree(MC_16KCL);
5570 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5571
5572 /* Bail if we've maxed out the mbuf memory map */
5573 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5574 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5575 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5576 return (0);
5577 }
5578
5579 if (bufsize == m_maxsize(MC_BIGCL)) {
5580 /* Under minimum */
5581 if (m_bigclusters < m_minlimit(MC_BIGCL))
5582 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5583
5584 percent_pool =
5585 ((sumclusters - freeclusters) * 100) / sumclusters;
5586 percent_kmem = (sumclusters * 100) / nclusters;
5587
5588 /*
5589		 * If a light/normal user, grow conservatively (only above 75%
5590		 * pool utilization); if a heavy user, grow aggressively (above 50%)
5591 */
5592 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5593 mb_growth = MB_GROWTH_NORMAL;
5594 else
5595 mb_growth = MB_GROWTH_AGGRESSIVE;
5596
5597 if (percent_kmem < 5) {
5598 /* For initial allocations */
5599 i = num;
5600 } else {
5601 /* Return if >= MBIGCL_LOWAT clusters available */
5602 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5603 m_total(MC_BIGCL) >=
5604 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5605 return (0);
5606
5607 /* Ensure at least num clusters are accessible */
5608 if (num >= m_infree(MC_BIGCL))
5609 i = num - m_infree(MC_BIGCL);
5610 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5611 j = num - (m_total(MC_BIGCL) -
5612 m_minlimit(MC_BIGCL));
5613
5614 i = MAX(i, j);
5615
5616 /*
5617 * Grow pool if percent_pool > 75 (normal growth)
5618 * or percent_pool > 50 (aggressive growth).
5619 */
5620 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5621 if (percent_pool > mb_growth_thresh)
5622 j = ((sumclusters + num) >> mb_growth) -
5623 freeclusters;
5624 i = MAX(i, j);
5625 }
5626
5627 /* Check to ensure we didn't go over limits */
5628 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5629 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5630 if ((i << 1) + sumclusters >= nclusters)
5631 i = (nclusters - sumclusters) >> 1;
5632 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5633 VERIFY(sumclusters + (i << 1) <= nclusters);
5634
5635 } else { /* 16K CL */
5636 VERIFY(njcl > 0);
5637 /* Ensure at least num clusters are available */
5638 if (num >= m_16kclfree)
5639 i = num - m_16kclfree;
5640
5641 /* Always grow 16KCL pool aggressively */
5642 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5643 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5644 i = MAX(i, j);
5645
5646 /* Check to ensure we don't go over limit */
5647 if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL))
5648 i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5649 }
5650 return (i);
5651 }
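
/*
 * Editor's note (worked example, inferred from the code above rather than
 * stated there): mb_growth_thresh = 100 - (100 / (1 << mb_growth)), so the
 * 75%/50% figures in the comment imply MB_GROWTH_NORMAL == 2 and
 * MB_GROWTH_AGGRESSIVE == 1 (both constants are defined elsewhere in this
 * file).  With normal growth the pool is therefore grown only once more
 * than 75% of the 2K-cluster-equivalents are in use, and then by roughly
 * ((sumclusters + num) >> 2) - freeclusters additional 2K units.
 */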
5652 /*
5653 * Return the number of bytes in the mbuf chain, m.
5654 */
5655 unsigned int
5656 m_length(struct mbuf *m)
5657 {
5658 struct mbuf *m0;
5659 unsigned int pktlen;
5660
5661 if (m->m_flags & M_PKTHDR)
5662 return (m->m_pkthdr.len);
5663
5664 pktlen = 0;
5665 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5666 pktlen += m0->m_len;
5667 return (pktlen);
5668 }
5669
5670 /*
5671 * Copy data from a buffer back into the indicated mbuf chain,
5672 * starting "off" bytes from the beginning, extending the mbuf
5673 * chain if necessary.
5674 */
5675 void
5676 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5677 {
5678 #if DEBUG
5679 struct mbuf *origm = m0;
5680 int error;
5681 #endif /* DEBUG */
5682
5683 if (m0 == NULL)
5684 return;
5685
5686 #if DEBUG
5687 error =
5688 #endif /* DEBUG */
5689 m_copyback0(&m0, off, len, cp,
5690 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5691
5692 #if DEBUG
5693 if (error != 0 || (m0 != NULL && origm != m0))
5694 panic("m_copyback");
5695 #endif /* DEBUG */
5696 }
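
/*
 * Editor's sketch (not part of the original source): writing into a chain
 * with m_copyback(); the example_* name is invented for illustration.
 */
#if 0
static void
example_patch_word(struct mbuf *m, int off, uint32_t word)
{
	/*
	 * Overwrites 4 bytes at 'off', growing the chain if it is shorter
	 * than off + sizeof (word).  m_copyback() returns void, so callers
	 * cannot observe an allocation failure here.
	 */
	m_copyback(m, off, sizeof (word), &word);
}
#endif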
5697
5698 struct mbuf *
5699 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5700 {
5701 int error;
5702
5703 /* don't support chain expansion */
5704 VERIFY(off + len <= m_length(m0));
5705
5706 error = m_copyback0(&m0, off, len, cp,
5707 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5708 if (error) {
5709 /*
5710 * no way to recover from partial success.
5711 * just free the chain.
5712 */
5713 m_freem(m0);
5714 return (NULL);
5715 }
5716 return (m0);
5717 }
5718
5719 /*
5720 * m_makewritable: ensure the specified range writable.
5721 */
5722 int
5723 m_makewritable(struct mbuf **mp, int off, int len, int how)
5724 {
5725 int error;
5726 #if DEBUG
5727 struct mbuf *n;
5728 int origlen, reslen;
5729
5730 origlen = m_length(*mp);
5731 #endif /* DEBUG */
5732
5733 #if 0 /* M_COPYALL is large enough */
5734 if (len == M_COPYALL)
5735 len = m_length(*mp) - off; /* XXX */
5736 #endif
5737
5738 error = m_copyback0(mp, off, len, NULL,
5739 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5740
5741 #if DEBUG
5742 reslen = 0;
5743 for (n = *mp; n; n = n->m_next)
5744 reslen += n->m_len;
5745 if (origlen != reslen)
5746 panic("m_makewritable: length changed");
5747 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5748 panic("m_makewritable: inconsist");
5749 #endif /* DEBUG */
5750
5751 return (error);
5752 }
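
/*
 * Editor's sketch (not part of the original source): typical use of
 * m_makewritable() before modifying a header in place; the example_* name
 * is hypothetical.
 */
#if 0
static int
example_make_header_writable(struct mbuf **mp, int hdrlen)
{
	/*
	 * On success the first hdrlen bytes of *mp can be written in place
	 * even if the chain originally shared a read-only cluster; the
	 * chain head may be replaced, hence the struct mbuf **.
	 */
	return (m_makewritable(mp, 0, hdrlen, M_DONTWAIT));
}
#endif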
5753
5754 static int
5755 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5756 int how)
5757 {
5758 int mlen;
5759 struct mbuf *m, *n;
5760 struct mbuf **mp;
5761 int totlen = 0;
5762 const char *cp = vp;
5763
5764 VERIFY(mp0 != NULL);
5765 VERIFY(*mp0 != NULL);
5766 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5767 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5768
5769 /*
5770 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5771 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5772 */
5773
5774 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5775
5776 mp = mp0;
5777 m = *mp;
5778 while (off > (mlen = m->m_len)) {
5779 off -= mlen;
5780 totlen += mlen;
5781 if (m->m_next == NULL) {
5782 int tspace;
5783 extend:
5784 if (!(flags & M_COPYBACK0_EXTEND))
5785 goto out;
5786
5787 /*
5788 * try to make some space at the end of "m".
5789 */
5790
5791 mlen = m->m_len;
5792 if (off + len >= MINCLSIZE &&
5793 !(m->m_flags & M_EXT) && m->m_len == 0) {
5794 MCLGET(m, how);
5795 }
5796 tspace = M_TRAILINGSPACE(m);
5797 if (tspace > 0) {
5798 tspace = MIN(tspace, off + len);
5799 VERIFY(tspace > 0);
5800 bzero(mtod(m, char *) + m->m_len,
5801 MIN(off, tspace));
5802 m->m_len += tspace;
5803 off += mlen;
5804 totlen -= mlen;
5805 continue;
5806 }
5807
5808 /*
5809 * need to allocate an mbuf.
5810 */
5811
5812 if (off + len >= MINCLSIZE) {
5813 n = m_getcl(how, m->m_type, 0);
5814 } else {
5815 n = _M_GET(how, m->m_type);
5816 }
5817 if (n == NULL) {
5818 goto out;
5819 }
5820 n->m_len = 0;
5821 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5822 bzero(mtod(n, char *), MIN(n->m_len, off));
5823 m->m_next = n;
5824 }
5825 mp = &m->m_next;
5826 m = m->m_next;
5827 }
5828 while (len > 0) {
5829 mlen = m->m_len - off;
5830 if (mlen != 0 && m_mclhasreference(m)) {
5831 char *datap;
5832 int eatlen;
5833
5834 /*
5835 * this mbuf is read-only.
5836 * allocate a new writable mbuf and try again.
5837 */
5838
5839 #if DIAGNOSTIC
5840 if (!(flags & M_COPYBACK0_COW))
5841 panic("m_copyback0: read-only");
5842 #endif /* DIAGNOSTIC */
5843
5844 /*
5845 * if we're going to write into the middle of
5846			 * an mbuf, split it first.
5847 */
5848 if (off > 0 && len < mlen) {
5849 n = m_split0(m, off, how, 0);
5850 if (n == NULL)
5851 goto enobufs;
5852 m->m_next = n;
5853 mp = &m->m_next;
5854 m = n;
5855 off = 0;
5856 continue;
5857 }
5858
5859 /*
5860 * XXX TODO coalesce into the trailingspace of
5861 * the previous mbuf when possible.
5862 */
5863
5864 /*
5865 * allocate a new mbuf. copy packet header if needed.
5866 */
5867 n = _M_GET(how, m->m_type);
5868 if (n == NULL)
5869 goto enobufs;
5870 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5871 M_COPY_PKTHDR(n, m);
5872 n->m_len = MHLEN;
5873 } else {
5874 if (len >= MINCLSIZE)
5875 MCLGET(n, M_DONTWAIT);
5876 n->m_len =
5877 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5878 }
5879 if (n->m_len > len)
5880 n->m_len = len;
5881
5882 /*
5883			 * free the region which has been overwritten,
5884			 * copying data from old mbufs if requested.
5885 */
5886 if (flags & M_COPYBACK0_PRESERVE)
5887 datap = mtod(n, char *);
5888 else
5889 datap = NULL;
5890 eatlen = n->m_len;
5891 VERIFY(off == 0 || eatlen >= mlen);
5892 if (off > 0) {
5893 VERIFY(len >= mlen);
5894 m->m_len = off;
5895 m->m_next = n;
5896 if (datap) {
5897 m_copydata(m, off, mlen, datap);
5898 datap += mlen;
5899 }
5900 eatlen -= mlen;
5901 mp = &m->m_next;
5902 m = m->m_next;
5903 }
5904 while (m != NULL && m_mclhasreference(m) &&
5905 n->m_type == m->m_type && eatlen > 0) {
5906 mlen = MIN(eatlen, m->m_len);
5907 if (datap) {
5908 m_copydata(m, 0, mlen, datap);
5909 datap += mlen;
5910 }
5911 m->m_data += mlen;
5912 m->m_len -= mlen;
5913 eatlen -= mlen;
5914 if (m->m_len == 0)
5915 *mp = m = m_free(m);
5916 }
5917 if (eatlen > 0)
5918 n->m_len -= eatlen;
5919 n->m_next = m;
5920 *mp = m = n;
5921 continue;
5922 }
5923 mlen = MIN(mlen, len);
5924 if (flags & M_COPYBACK0_COPYBACK) {
5925 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5926 cp += mlen;
5927 }
5928 len -= mlen;
5929 mlen += off;
5930 off = 0;
5931 totlen += mlen;
5932 if (len == 0)
5933 break;
5934 if (m->m_next == NULL) {
5935 goto extend;
5936 }
5937 mp = &m->m_next;
5938 m = m->m_next;
5939 }
5940 out:
5941 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5942 VERIFY(flags & M_COPYBACK0_EXTEND);
5943 m->m_pkthdr.len = totlen;
5944 }
5945
5946 return (0);
5947
5948 enobufs:
5949 return (ENOBUFS);
5950 }
5951
5952 uint64_t
5953 mcl_to_paddr(char *addr)
5954 {
5955 vm_offset_t base_phys;
5956
5957 if (!MBUF_IN_MAP(addr))
5958 return (0);
5959 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5960
5961 if (base_phys == 0)
5962 return (0);
5963 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5964 }
5965
5966 /*
5967 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5968 * And really copy the thing. That way, we don't "precompute" checksums
5969 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5970 * small packets, don't dup into a cluster. That way received packets
5971 * don't take up too much room in the sockbuf (cf. sbspace()).
5972 */
5973 int MDFail;
5974
5975 struct mbuf *
5976 m_dup(struct mbuf *m, int how)
5977 {
5978 struct mbuf *n, **np;
5979 struct mbuf *top;
5980 int copyhdr = 0;
5981
5982 np = &top;
5983 top = NULL;
5984 if (m->m_flags & M_PKTHDR)
5985 copyhdr = 1;
5986
5987 /*
5988 * Quick check: if we have one mbuf and its data fits in an
5989 * mbuf with packet header, just copy and go.
5990 */
5991 if (m->m_next == NULL) {
5992 /* Then just move the data into an mbuf and be done... */
5993 if (copyhdr) {
5994 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5995 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5996 return (NULL);
5997 n->m_len = m->m_len;
5998 m_dup_pkthdr(n, m, how);
5999 bcopy(m->m_data, n->m_data, m->m_len);
6000 return (n);
6001 }
6002 } else if (m->m_len <= MLEN) {
6003 if ((n = _M_GET(how, m->m_type)) == NULL)
6004 return (NULL);
6005 bcopy(m->m_data, n->m_data, m->m_len);
6006 n->m_len = m->m_len;
6007 return (n);
6008 }
6009 }
6010 while (m != NULL) {
6011 #if BLUE_DEBUG
6012 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
6013 m->m_data);
6014 #endif
6015 if (copyhdr)
6016 n = _M_GETHDR(how, m->m_type);
6017 else
6018 n = _M_GET(how, m->m_type);
6019 if (n == NULL)
6020 goto nospace;
6021 if (m->m_flags & M_EXT) {
6022 if (m->m_len <= m_maxsize(MC_CL))
6023 MCLGET(n, how);
6024 else if (m->m_len <= m_maxsize(MC_BIGCL))
6025 n = m_mbigget(n, how);
6026 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
6027 n = m_m16kget(n, how);
6028 if (!(n->m_flags & M_EXT)) {
6029 (void) m_free(n);
6030 goto nospace;
6031 }
6032 }
6033 *np = n;
6034 if (copyhdr) {
6035 /* Don't use M_COPY_PKTHDR: preserve m_data */
6036 m_dup_pkthdr(n, m, how);
6037 copyhdr = 0;
6038 if (!(n->m_flags & M_EXT))
6039 n->m_data = n->m_pktdat;
6040 }
6041 n->m_len = m->m_len;
6042 /*
6043		 * Get the dup on the same boundary as the original.
6044		 * Assume that the two mbufs have the same offset to the data area
6045		 * (up to word boundaries).
6046 */
6047 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
6048 m = m->m_next;
6049 np = &n->m_next;
6050 #if BLUE_DEBUG
6051 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
6052 n->m_data);
6053 #endif
6054 }
6055
6056 if (top == NULL)
6057 MDFail++;
6058 return (top);
6059
6060 nospace:
6061 m_freem(top);
6062 MDFail++;
6063 return (NULL);
6064 }
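
/*
 * Editor's sketch (not part of the original source): contrast between
 * m_dup() and reference-sharing copies; the example_* name is hypothetical.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *pkt)
{
	/*
	 * m_dup() copies the payload bytes, so the duplicate can be
	 * modified without affecting clusters shared with the original.
	 */
	return (m_dup(pkt, M_DONTWAIT));
}
#endif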
6065
6066 #define MBUF_MULTIPAGES(m) \
6067 (((m)->m_flags & M_EXT) && \
6068 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
6069 && (m)->m_len > PAGE_SIZE) || \
6070 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
6071 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6072
6073 static struct mbuf *
6074 m_expand(struct mbuf *m, struct mbuf **last)
6075 {
6076 struct mbuf *top = NULL;
6077 struct mbuf **nm = &top;
6078 uintptr_t data0, data;
6079 unsigned int len0, len;
6080
6081 VERIFY(MBUF_MULTIPAGES(m));
6082 VERIFY(m->m_next == NULL);
6083 data0 = (uintptr_t)m->m_data;
6084 len0 = m->m_len;
6085 *last = top;
6086
6087 for (;;) {
6088 struct mbuf *n;
6089
6090 data = data0;
6091 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
6092 len = PAGE_SIZE;
6093 else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
6094 P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
6095 len = P2ROUNDUP(data, PAGE_SIZE) - data;
6096 else
6097 len = len0;
6098
6099 VERIFY(len > 0);
6100 VERIFY(m->m_flags & M_EXT);
6101 m->m_data = (void *)data;
6102 m->m_len = len;
6103
6104 *nm = *last = m;
6105 nm = &m->m_next;
6106 m->m_next = NULL;
6107
6108 data0 += len;
6109 len0 -= len;
6110 if (len0 == 0)
6111 break;
6112
6113 n = _M_RETRY(M_DONTWAIT, MT_DATA);
6114 if (n == NULL) {
6115 m_freem(top);
6116 top = *last = NULL;
6117 break;
6118 }
6119
6120 n->m_ext = m->m_ext;
6121 m_incref(m);
6122 n->m_flags |= M_EXT;
6123 m = n;
6124 }
6125 return (top);
6126 }
6127
6128 struct mbuf *
6129 m_normalize(struct mbuf *m)
6130 {
6131 struct mbuf *top = NULL;
6132 struct mbuf **nm = &top;
6133 boolean_t expanded = FALSE;
6134
6135 while (m != NULL) {
6136 struct mbuf *n;
6137
6138 n = m->m_next;
6139 m->m_next = NULL;
6140
6141 /* Does the data cross one or more page boundaries? */
6142 if (MBUF_MULTIPAGES(m)) {
6143 struct mbuf *last;
6144 if ((m = m_expand(m, &last)) == NULL) {
6145 m_freem(n);
6146 m_freem(top);
6147 top = NULL;
6148 break;
6149 }
6150 *nm = m;
6151 nm = &last->m_next;
6152 expanded = TRUE;
6153 } else {
6154 *nm = m;
6155 nm = &m->m_next;
6156 }
6157 m = n;
6158 }
6159 if (expanded)
6160 atomic_add_32(&mb_normalized, 1);
6161 return (top);
6162 }
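
/*
 * Editor's note (worked example, assuming PAGE_SIZE == 4096): a
 * cluster-backed mbuf whose m_data is page-aligned and whose m_len is
 * 6000 satisfies MBUF_MULTIPAGES(), so m_normalize()/m_expand() rewrites
 * it as a 4096-byte mbuf followed by a 1904-byte mbuf that references the
 * same m_ext (the refcount is bumped via m_incref()).  If m_data is not
 * page-aligned, the first piece only runs up to the next page boundary.
 */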
6163
6164 /*
6165 * Append the specified data to the indicated mbuf chain,
6166 * extending the mbuf chain if the new data does not fit in
6167 * existing space.
6168 *
6169 * Return 1 if able to complete the job; otherwise 0.
6170 */
6171 int
6172 m_append(struct mbuf *m0, int len, caddr_t cp)
6173 {
6174 struct mbuf *m, *n;
6175 int remainder, space;
6176
6177 for (m = m0; m->m_next != NULL; m = m->m_next)
6178 ;
6179 remainder = len;
6180 space = M_TRAILINGSPACE(m);
6181 if (space > 0) {
6182 /*
6183 * Copy into available space.
6184 */
6185 if (space > remainder)
6186 space = remainder;
6187 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6188 m->m_len += space;
6189 cp += space;
6190 remainder -= space;
6191 }
6192 while (remainder > 0) {
6193 /*
6194 * Allocate a new mbuf; could check space
6195 * and allocate a cluster instead.
6196 */
6197 n = m_get(M_WAITOK, m->m_type);
6198 if (n == NULL)
6199 break;
6200 n->m_len = min(MLEN, remainder);
6201 bcopy(cp, mtod(n, caddr_t), n->m_len);
6202 cp += n->m_len;
6203 remainder -= n->m_len;
6204 m->m_next = n;
6205 m = n;
6206 }
6207 if (m0->m_flags & M_PKTHDR)
6208 m0->m_pkthdr.len += len - remainder;
6209 return (remainder == 0);
6210 }
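
/*
 * Editor's sketch (not part of the original source): appending a small
 * trailer with m_append(); the example_* name is hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *pkt, caddr_t trailer, int len)
{
	/* m_append() returns 1 on success, 0 if it could not allocate */
	if (m_append(pkt, len, trailer) == 0)
		return (ENOBUFS);
	return (0);
}
#endif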
6211
6212 struct mbuf *
6213 m_last(struct mbuf *m)
6214 {
6215 while (m->m_next != NULL)
6216 m = m->m_next;
6217 return (m);
6218 }
6219
6220 unsigned int
6221 m_fixhdr(struct mbuf *m0)
6222 {
6223 u_int len;
6224
6225 VERIFY(m0->m_flags & M_PKTHDR);
6226
6227 len = m_length2(m0, NULL);
6228 m0->m_pkthdr.len = len;
6229 return (len);
6230 }
6231
6232 unsigned int
6233 m_length2(struct mbuf *m0, struct mbuf **last)
6234 {
6235 struct mbuf *m;
6236 u_int len;
6237
6238 len = 0;
6239 for (m = m0; m != NULL; m = m->m_next) {
6240 len += m->m_len;
6241 if (m->m_next == NULL)
6242 break;
6243 }
6244 if (last != NULL)
6245 *last = m;
6246 return (len);
6247 }
6248
6249 /*
6250 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6251 * and clusters. If allocation fails and this cannot be completed, NULL will
6252 * be returned, but the passed in chain will be unchanged. Upon success,
6253 * the original chain will be freed, and the new chain will be returned.
6254 *
6255 * If a non-packet-header mbuf is passed in, the original mbuf (or chain) will
6256 * be returned unharmed.
6257 *
6258 * If offset is specified, the first mbuf in the chain will have a leading
6259 * space of the amount stated by the "off" parameter.
6260 *
6261 * This routine requires that the m_pkthdr.pkt_hdr field of the original
6262 * mbuf chain is cleared by the caller.
6263 */
6264 struct mbuf *
6265 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
6266 {
6267 struct mbuf *m_new = NULL, *m_final = NULL;
6268 int progress = 0, length, pktlen;
6269
6270 if (!(m0->m_flags & M_PKTHDR))
6271 return (m0);
6272
6273 VERIFY(off < MHLEN);
6274 m_fixhdr(m0); /* Needed sanity check */
6275
6276 pktlen = m0->m_pkthdr.len + off;
6277 if (pktlen > MHLEN)
6278 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6279 else
6280 m_final = m_gethdr(how, MT_DATA);
6281
6282 if (m_final == NULL)
6283 goto nospace;
6284
6285 if (off > 0) {
6286 pktlen -= off;
6287 m_final->m_data += off;
6288 }
6289
6290 /*
6291 * Caller must have handled the contents pointed to by this
6292 * pointer before coming here, as otherwise it will point to
6293 * the original mbuf which will get freed upon success.
6294 */
6295 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6296
6297 if (m_dup_pkthdr(m_final, m0, how) == 0)
6298 goto nospace;
6299
6300 m_new = m_final;
6301
6302 while (progress < pktlen) {
6303 length = pktlen - progress;
6304 if (length > MCLBYTES)
6305 length = MCLBYTES;
6306 length -= ((m_new == m_final) ? off : 0);
6307 if (length < 0)
6308 goto nospace;
6309
6310 if (m_new == NULL) {
6311 if (length > MLEN)
6312 m_new = m_getcl(how, MT_DATA, 0);
6313 else
6314 m_new = m_get(how, MT_DATA);
6315 if (m_new == NULL)
6316 goto nospace;
6317 }
6318
6319 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6320 progress += length;
6321 m_new->m_len = length;
6322 if (m_new != m_final)
6323 m_cat(m_final, m_new);
6324 m_new = NULL;
6325 }
6326 m_freem(m0);
6327 m0 = m_final;
6328 return (m0);
6329 nospace:
6330 if (m_final)
6331 m_freem(m_final);
6332 return (NULL);
6333 }
6334
6335 struct mbuf *
6336 m_defrag(struct mbuf *m0, int how)
6337 {
6338 return (m_defrag_offset(m0, 0, how));
6339 }
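
/*
 * Editor's sketch (not part of the original source): compacting a chain
 * before handing it to hardware with a short scatter/gather list; the
 * example_* name and the DMA context are hypothetical.
 */
#if 0
static struct mbuf *
example_defrag_for_dma(struct mbuf *pkt)
{
	struct mbuf *d;

	d = m_defrag(pkt, M_DONTWAIT);
	if (d == NULL)
		return (pkt);	/* failure leaves the original untouched */
	return (d);		/* success freed the original chain */
}
#endif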
6340
6341 void
6342 m_mchtype(struct mbuf *m, int t)
6343 {
6344 mtype_stat_inc(t);
6345 mtype_stat_dec(m->m_type);
6346 (m)->m_type = t;
6347 }
6348
6349 void *
6350 m_mtod(struct mbuf *m)
6351 {
6352 return (MTOD(m, void *));
6353 }
6354
6355 struct mbuf *
6356 m_dtom(void *x)
6357 {
6358 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6359 }
6360
6361 void
6362 m_mcheck(struct mbuf *m)
6363 {
6364 _MCHECK(m);
6365 }
6366
6367 /*
6368 * Return a pointer to mbuf/offset of location in mbuf chain.
6369 */
6370 struct mbuf *
6371 m_getptr(struct mbuf *m, int loc, int *off)
6372 {
6373
6374 while (loc >= 0) {
6375 /* Normal end of search. */
6376 if (m->m_len > loc) {
6377 *off = loc;
6378 return (m);
6379 } else {
6380 loc -= m->m_len;
6381 if (m->m_next == NULL) {
6382 if (loc == 0) {
6383 /* Point at the end of valid data. */
6384 *off = m->m_len;
6385 return (m);
6386 }
6387 return (NULL);
6388 }
6389 m = m->m_next;
6390 }
6391 }
6392 return (NULL);
6393 }
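
/*
 * Editor's sketch (not part of the original source): reading one byte at
 * an arbitrary chain offset via m_getptr(); the example_* name is
 * hypothetical.
 */
#if 0
static int
example_peek_byte(struct mbuf *chain, int loc, uint8_t *out)
{
	struct mbuf *m;
	int off;

	m = m_getptr(chain, loc, &off);
	/* off == m->m_len means 'loc' points just past the valid data */
	if (m == NULL || off >= m->m_len)
		return (ERANGE);
	*out = *(mtod(m, uint8_t *) + off);
	return (0);
}
#endif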
6394
6395 /*
6396 * Inform the corresponding mcache(s) that there's a waiter below.
6397 */
6398 static void
6399 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6400 {
6401 mcache_waiter_inc(m_cache(class));
6402 if (comp) {
6403 if (class == MC_CL) {
6404 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6405 } else if (class == MC_BIGCL) {
6406 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6407 } else if (class == MC_16KCL) {
6408 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6409 } else {
6410 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6411 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6412 }
6413 }
6414 }
6415
6416 /*
6417 * Inform the corresponding mcache(s) that there's no more waiter below.
6418 */
6419 static void
6420 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6421 {
6422 mcache_waiter_dec(m_cache(class));
6423 if (comp) {
6424 if (class == MC_CL) {
6425 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6426 } else if (class == MC_BIGCL) {
6427 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6428 } else if (class == MC_16KCL) {
6429 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6430 } else {
6431 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6432 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6433 }
6434 }
6435 }
6436
6437 /*
6438 * Called during slab (blocking and non-blocking) allocation. If there
6439 * is at least one waiter, and the time since the first waiter blocked
6440 * is greater than the watchdog timeout, panic the system.
6441 */
6442 static void
6443 mbuf_watchdog(void)
6444 {
6445 struct timeval now;
6446 unsigned int since;
6447
6448 if (mb_waiters == 0 || !mb_watchdog)
6449 return;
6450
6451 microuptime(&now);
6452 since = now.tv_sec - mb_wdtstart.tv_sec;
6453 if (since >= MB_WDT_MAXTIME) {
6454 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6455 mb_waiters, since, mbuf_dump());
6456 /* NOTREACHED */
6457 }
6458 }
6459
6460 /*
6461 * Called during blocking allocation. Returns TRUE if one or more objects
6462 * are available at the per-CPU caches layer and that allocation should be
6463 * retried at that level.
6464 */
6465 static boolean_t
6466 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6467 {
6468 boolean_t mcache_retry = FALSE;
6469
6470 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6471
6472 /* Check if there's anything at the cache layer */
6473 if (mbuf_cached_above(class, wait)) {
6474 mcache_retry = TRUE;
6475 goto done;
6476 }
6477
6478 /* Nothing? Then try hard to get it from somewhere */
6479 m_reclaim(class, num, (wait & MCR_COMP));
6480
6481 /* We tried hard and got something? */
6482 if (m_infree(class) > 0) {
6483 mbstat.m_wait++;
6484 goto done;
6485 } else if (mbuf_cached_above(class, wait)) {
6486 mbstat.m_wait++;
6487 mcache_retry = TRUE;
6488 goto done;
6489 } else if (wait & MCR_TRYHARD) {
6490 mcache_retry = TRUE;
6491 goto done;
6492 }
6493
6494 /*
6495 * There's really nothing for us right now; inform the
6496 * cache(s) that there is a waiter below and go to sleep.
6497 */
6498 mbuf_waiter_inc(class, (wait & MCR_COMP));
6499
6500 VERIFY(!(wait & MCR_NOSLEEP));
6501
6502 /*
6503 * If this is the first waiter, arm the watchdog timer. Otherwise
6504 * check if we need to panic the system due to watchdog timeout.
6505 */
6506 if (mb_waiters == 0)
6507 microuptime(&mb_wdtstart);
6508 else
6509 mbuf_watchdog();
6510
6511 mb_waiters++;
6512 m_region_expand(class) += m_total(class) + num;
6513 /* wake up the worker thread */
6514 if (class > MC_MBUF && mbuf_worker_ready &&
6515 mbuf_worker_needs_wakeup) {
6516 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
6517 mbuf_worker_needs_wakeup = FALSE;
6518 }
6519
6520 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6521
6522 /* We are now up; stop getting notified until next round */
6523 mbuf_waiter_dec(class, (wait & MCR_COMP));
6524
6525 /* We waited and got something */
6526 if (m_infree(class) > 0) {
6527 mbstat.m_wait++;
6528 goto done;
6529 } else if (mbuf_cached_above(class, wait)) {
6530 mbstat.m_wait++;
6531 mcache_retry = TRUE;
6532 }
6533 done:
6534 return (mcache_retry);
6535 }
6536
6537 __attribute__((noreturn))
6538 static void
6539 mbuf_worker_thread(void)
6540 {
6541 int mbuf_expand;
6542
6543 while (1) {
6544 lck_mtx_lock(mbuf_mlock);
6545 mbuf_worker_run_cnt++;
6546 mbuf_expand = 0;
6547 if (m_region_expand(MC_CL) > 0) {
6548 int n;
6549 mb_expand_cl_cnt++;
6550			/* Adjust to current number of clusters in use */
6551 n = m_region_expand(MC_CL) -
6552 (m_total(MC_CL) - m_infree(MC_CL));
6553 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6554 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6555 if (n > 0) {
6556 mb_expand_cl_total += n;
6557 }
6558 m_region_expand(MC_CL) = 0;
6559
6560 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6561 mbuf_expand++;
6562 }
6563 if (m_region_expand(MC_BIGCL) > 0) {
6564 int n;
6565 mb_expand_bigcl_cnt++;
6566			/* Adjust to current number of 4 KB clusters in use */
6567 n = m_region_expand(MC_BIGCL) -
6568 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6569 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6570 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6571 if (n > 0) {
6572 mb_expand_bigcl_total += n;
6573 }
6574 m_region_expand(MC_BIGCL) = 0;
6575
6576 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6577 mbuf_expand++;
6578 }
6579 if (m_region_expand(MC_16KCL) > 0) {
6580 int n;
6581 mb_expand_16kcl_cnt++;
6582			/* Adjust to current number of 16 KB clusters in use */
6583 n = m_region_expand(MC_16KCL) -
6584 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6585 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6586 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6587 if (n > 0) {
6588 mb_expand_16kcl_total += n;
6589 }
6590 m_region_expand(MC_16KCL) = 0;
6591
6592 if (n > 0)
6593 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6594 }
6595
6596 /*
6597 * Because we can run out of memory before filling the mbuf
6598		 * map, we should not allocate more clusters than there are
6599 * mbufs -- otherwise we could have a large number of useless
6600 * clusters allocated.
6601 */
6602 if (mbuf_expand) {
6603 while (m_total(MC_MBUF) <
6604 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6605 mb_expand_cnt++;
6606 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6607 break;
6608 }
6609 }
6610
6611 mbuf_worker_needs_wakeup = TRUE;
6612 /*
6613 * If there's a deadlock and we're not sending / receiving
6614 * packets, net_uptime() won't be updated. Update it here
6615 * so we are sure it's correct.
6616 */
6617 net_update_uptime();
6618 mbuf_worker_last_runtime = net_uptime();
6619 assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
6620 THREAD_UNINT);
6621 lck_mtx_unlock(mbuf_mlock);
6622 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6623 }
6624 }
6625
6626 __attribute__((noreturn))
6627 static void
6628 mbuf_worker_thread_init(void)
6629 {
6630 mbuf_worker_ready++;
6631 mbuf_worker_thread();
6632 }
6633
6634 static mcl_slab_t *
6635 slab_get(void *buf)
6636 {
6637 mcl_slabg_t *slg;
6638 unsigned int ix, k;
6639
6640 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6641
6642 VERIFY(MBUF_IN_MAP(buf));
6643 ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6644 VERIFY(ix < maxslabgrp);
6645
6646 if ((slg = slabstbl[ix]) == NULL) {
6647 /*
6648 * In the current implementation, we never shrink the slabs
6649 * table; if we attempt to reallocate a cluster group when
6650 * it's already allocated, panic since this is a sign of a
6651 * memory corruption (slabstbl[ix] got nullified).
6652 */
6653 ++slabgrp;
6654 VERIFY(ix < slabgrp);
6655 /*
6656		 * Slab expansion can only be done single-threaded; when
6657 * we get here, it must be as a result of m_clalloc() which
6658 * is serialized and therefore mb_clalloc_busy must be set.
6659 */
6660 VERIFY(mb_clalloc_busy);
6661 lck_mtx_unlock(mbuf_mlock);
6662
6663 /* This is a new buffer; create the slabs group for it */
6664 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6665 M_WAITOK | M_ZERO);
6666 MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
6667 M_TEMP, M_WAITOK | M_ZERO);
6668 VERIFY(slg != NULL && slg->slg_slab != NULL);
6669
6670 lck_mtx_lock(mbuf_mlock);
6671 /*
6672 * No other thread could have gone into m_clalloc() after
6673 * we dropped the lock above, so verify that it's true.
6674 */
6675 VERIFY(mb_clalloc_busy);
6676
6677 slabstbl[ix] = slg;
6678
6679 /* Chain each slab in the group to its forward neighbor */
6680 for (k = 1; k < NSLABSPMB; k++)
6681 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6682 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6683
6684 /* And chain the last slab in the previous group to this */
6685 if (ix > 0) {
6686 VERIFY(slabstbl[ix - 1]->
6687 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6688 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6689 &slg->slg_slab[0];
6690 }
6691 }
6692
6693 ix = MTOPG(buf) % NSLABSPMB;
6694 VERIFY(ix < NSLABSPMB);
6695
6696 return (&slg->slg_slab[ix]);
6697 }
6698
6699 static void
6700 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6701 void *base, void *head, unsigned int len, int refcnt, int chunks)
6702 {
6703 sp->sl_class = class;
6704 sp->sl_flags = flags;
6705 sp->sl_base = base;
6706 sp->sl_head = head;
6707 sp->sl_len = len;
6708 sp->sl_refcnt = refcnt;
6709 sp->sl_chunks = chunks;
6710 slab_detach(sp);
6711 }
6712
6713 static void
6714 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6715 {
6716 VERIFY(slab_is_detached(sp));
6717 m_slab_cnt(class)++;
6718 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6719 sp->sl_flags &= ~SLF_DETACHED;
6720
6721 /*
6722	 * If a buffer spans multiple contiguous pages then clear the
6723	 * SLF_DETACHED flag on the additional slabs as well
6724 */
6725 if (class == MC_16KCL) {
6726 int k;
6727 for (k = 1; k < NSLABSP16KB; k++) {
6728 sp = sp->sl_next;
6729 /* Next slab must already be present */
6730 VERIFY(sp != NULL && slab_is_detached(sp));
6731 sp->sl_flags &= ~SLF_DETACHED;
6732 }
6733 }
6734 }
6735
6736 static void
6737 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6738 {
6739 int k;
6740 VERIFY(!slab_is_detached(sp));
6741 VERIFY(m_slab_cnt(class) > 0);
6742 m_slab_cnt(class)--;
6743 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6744 slab_detach(sp);
6745 if (class == MC_16KCL) {
6746 for (k = 1; k < NSLABSP16KB; k++) {
6747 sp = sp->sl_next;
6748 /* Next slab must already be present */
6749 VERIFY(sp != NULL);
6750 VERIFY(!slab_is_detached(sp));
6751 slab_detach(sp);
6752 }
6753 }
6754 }
6755
6756 static boolean_t
6757 slab_inrange(mcl_slab_t *sp, void *buf)
6758 {
6759 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6760 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6761 }
6762
6763 #undef panic
6764
6765 static void
6766 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6767 {
6768 int i;
6769 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6770 uintptr_t buf = (uintptr_t)sp->sl_base;
6771
6772 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6773 void *next = ((mcache_obj_t *)buf)->obj_next;
6774 if (next != addr)
6775 continue;
6776 if (!mclverify) {
6777 if (next != NULL && !MBUF_IN_MAP(next)) {
6778 mcache_t *cp = m_cache(sp->sl_class);
6779 panic("%s: %s buffer %p in slab %p modified "
6780 "after free at offset 0: %p out of range "
6781 "[%p-%p)\n", __func__, cp->mc_name,
6782 (void *)buf, sp, next, mbutl, embutl);
6783 /* NOTREACHED */
6784 }
6785 } else {
6786 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6787 (mcache_obj_t *)buf);
6788 mcl_audit_verify_nextptr(next, mca);
6789 }
6790 }
6791 }
6792
6793 static void
6794 slab_detach(mcl_slab_t *sp)
6795 {
6796 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6797 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6798 sp->sl_flags |= SLF_DETACHED;
6799 }
6800
6801 static boolean_t
6802 slab_is_detached(mcl_slab_t *sp)
6803 {
6804 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6805 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6806 (sp->sl_flags & SLF_DETACHED));
6807 }
6808
6809 static void
6810 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6811 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6812 {
6813 mcache_audit_t *mca, *mca_tail;
6814 mcache_obj_t *con = NULL;
6815 boolean_t save_contents = (con_list != NULL);
6816 unsigned int i, ix;
6817
6818 ASSERT(num <= NMBPG);
6819 ASSERT(con_list == NULL || con_size != 0);
6820
6821 ix = MTOPG(buf);
6822 VERIFY(ix < maxclaudit);
6823
6824 /* Make sure we haven't been here before */
6825 for (i = 0; i < num; i++)
6826 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6827
6828 mca = mca_tail = *mca_list;
6829 if (save_contents)
6830 con = *con_list;
6831
6832 for (i = 0; i < num; i++) {
6833 mcache_audit_t *next;
6834
6835 next = mca->mca_next;
6836 bzero(mca, sizeof (*mca));
6837 mca->mca_next = next;
6838 mclaudit[ix].cl_audit[i] = mca;
6839
6840 /* Attach the contents buffer if requested */
6841 if (save_contents) {
6842 mcl_saved_contents_t *msc =
6843 (mcl_saved_contents_t *)(void *)con;
6844
6845 VERIFY(msc != NULL);
6846 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6847 VERIFY(con_size == sizeof (*msc));
6848 mca->mca_contents_size = con_size;
6849 mca->mca_contents = msc;
6850 con = con->obj_next;
6851 bzero(mca->mca_contents, mca->mca_contents_size);
6852 }
6853
6854 mca_tail = mca;
6855 mca = mca->mca_next;
6856 }
6857
6858 if (save_contents)
6859 *con_list = con;
6860
6861 *mca_list = mca_tail->mca_next;
6862 mca_tail->mca_next = NULL;
6863 }
6864
6865 static void
6866 mcl_audit_free(void *buf, unsigned int num)
6867 {
6868 unsigned int i, ix;
6869 mcache_audit_t *mca, *mca_list;
6870
6871 ix = MTOPG(buf);
6872 VERIFY(ix < maxclaudit);
6873
6874 if (mclaudit[ix].cl_audit[0] != NULL) {
6875 mca_list = mclaudit[ix].cl_audit[0];
6876 for (i = 0; i < num; i++) {
6877 mca = mclaudit[ix].cl_audit[i];
6878 mclaudit[ix].cl_audit[i] = NULL;
6879 if (mca->mca_contents)
6880 mcache_free(mcl_audit_con_cache,
6881 mca->mca_contents);
6882 }
6883 mcache_free_ext(mcache_audit_cache,
6884 (mcache_obj_t *)mca_list);
6885 }
6886 }
6887
6888 /*
6889 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6890 * the corresponding audit structure for that buffer.
6891 */
6892 static mcache_audit_t *
6893 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
6894 {
6895 mcache_audit_t *mca = NULL;
6896 int ix = MTOPG(mobj), m_idx = 0;
6897 unsigned char *page_addr;
6898
6899 VERIFY(ix < maxclaudit);
6900 VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
6901
6902 page_addr = PGTOM(ix);
6903
6904 switch (class) {
6905 case MC_MBUF:
6906 /*
6907 * For the mbuf case, find the index of the page
6908 * used by the mbuf and use that index to locate the
6909 * base address of the page. Then find out the
6910 * mbuf index relative to the page base and use
6911 * it to locate the audit structure.
6912 */
6913 m_idx = MBPAGEIDX(page_addr, mobj);
6914 VERIFY(m_idx < (int)NMBPG);
6915 mca = mclaudit[ix].cl_audit[m_idx];
6916 break;
6917
6918 case MC_CL:
6919 /*
6920 * Same thing as above, but for 2KB clusters in a page.
6921 */
6922 m_idx = CLPAGEIDX(page_addr, mobj);
6923 VERIFY(m_idx < (int)NCLPG);
6924 mca = mclaudit[ix].cl_audit[m_idx];
6925 break;
6926
6927 case MC_BIGCL:
6928 m_idx = BCLPAGEIDX(page_addr, mobj);
6929 VERIFY(m_idx < (int)NBCLPG);
6930 mca = mclaudit[ix].cl_audit[m_idx];
6931 break;
6932 case MC_16KCL:
6933 /*
6934 * Same as above, but only return the first element.
6935 */
6936 mca = mclaudit[ix].cl_audit[0];
6937 break;
6938
6939 default:
6940 VERIFY(0);
6941 /* NOTREACHED */
6942 }
6943
6944 return (mca);
6945 }
6946
6947 static void
6948 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6949 boolean_t alloc)
6950 {
6951 struct mbuf *m = addr;
6952 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6953
6954 VERIFY(mca->mca_contents != NULL &&
6955 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6956
6957 if (mclverify)
6958 mcl_audit_verify_nextptr(next, mca);
6959
6960 if (!alloc) {
6961 /* Save constructed mbuf fields */
6962 mcl_audit_save_mbuf(m, mca);
6963 if (mclverify) {
6964 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6965 m_maxsize(MC_MBUF));
6966 }
6967 ((mcache_obj_t *)m)->obj_next = next;
6968 return;
6969 }
6970
6971 /* Check if the buffer has been corrupted while in freelist */
6972 if (mclverify) {
6973 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6974 }
6975 /* Restore constructed mbuf fields */
6976 mcl_audit_restore_mbuf(m, mca, composite);
6977 }
6978
6979 static void
6980 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6981 {
6982 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6983
6984 if (composite) {
6985 struct mbuf *next = m->m_next;
6986 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
6987 MBUF_IS_COMPOSITE(ms));
6988 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6989 /*
6990		 * We could have hand-picked the mbuf fields and restored
6991		 * them individually, but that would be a maintenance
6992 * headache. Instead, restore everything that was saved;
6993 * the mbuf layer will recheck and reinitialize anyway.
6994 */
6995 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6996 m->m_next = next;
6997 } else {
6998 /*
6999 * For a regular mbuf (no cluster attached) there's nothing
7000 * to restore other than the type field, which is expected
7001 * to be MT_FREE.
7002 */
7003 m->m_type = ms->m_type;
7004 }
7005 _MCHECK(m);
7006 }
7007
7008 static void
7009 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
7010 {
7011 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7012 _MCHECK(m);
7013 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
7014 }
7015
7016 static void
7017 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
7018 boolean_t save_next)
7019 {
7020 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
7021
7022 if (!alloc) {
7023 if (mclverify) {
7024 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7025 }
7026 if (save_next) {
7027 mcl_audit_verify_nextptr(next, mca);
7028 ((mcache_obj_t *)addr)->obj_next = next;
7029 }
7030 } else if (mclverify) {
7031 /* Check if the buffer has been corrupted while in freelist */
7032 mcl_audit_verify_nextptr(next, mca);
7033 mcache_audit_free_verify_set(mca, addr, 0, size);
7034 }
7035 }
7036
7037 static void
7038 mcl_audit_scratch(mcache_audit_t *mca)
7039 {
7040 void *stack[MCACHE_STACK_DEPTH + 1];
7041 mcl_scratch_audit_t *msa;
7042 struct timeval now;
7043
7044 VERIFY(mca->mca_contents != NULL);
7045 msa = MCA_SAVED_SCRATCH_PTR(mca);
7046
7047 msa->msa_pthread = msa->msa_thread;
7048 msa->msa_thread = current_thread();
7049 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
7050 msa->msa_pdepth = msa->msa_depth;
7051 bzero(stack, sizeof (stack));
7052 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
7053 bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
7054
7055 msa->msa_ptstamp = msa->msa_tstamp;
7056 microuptime(&now);
7057	/* tstamp is in ms relative to mb_start */
7058 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
7059 if ((now.tv_sec - mb_start.tv_sec) > 0)
7060 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
7061 }
7062
7063 static void
7064 mcl_audit_mcheck_panic(struct mbuf *m)
7065 {
7066 mcache_audit_t *mca;
7067
7068 MRANGE(m);
7069 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7070
7071 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
7072 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
7073 /* NOTREACHED */
7074 }
7075
7076 static void
7077 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7078 {
7079 if (next != NULL && !MBUF_IN_MAP(next) &&
7080 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7081 panic("mcl_audit: buffer %p modified after free at offset 0: "
7082 "%p out of range [%p-%p)\n%s\n",
7083 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
7084 /* NOTREACHED */
7085 }
7086 }
7087
7088 /* This function turns on mbuf leak detection */
7089 static void
7090 mleak_activate(void)
7091 {
7092 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7093 PE_parse_boot_argn("mleak_sample_factor",
7094 &mleak_table.mleak_sample_factor,
7095 sizeof (mleak_table.mleak_sample_factor));
7096
7097 if (mleak_table.mleak_sample_factor == 0)
7098 mclfindleak = 0;
7099
7100 if (mclfindleak == 0)
7101 return;
7102
7103 vm_size_t alloc_size =
7104 mleak_alloc_buckets * sizeof (struct mallocation);
7105 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
7106
7107 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
7108 M_TEMP, M_WAITOK | M_ZERO);
7109 VERIFY(mleak_allocations != NULL);
7110
7111 MALLOC(mleak_traces, struct mtrace *, trace_size,
7112 M_TEMP, M_WAITOK | M_ZERO);
7113 VERIFY(mleak_traces != NULL);
7114
7115 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7116 M_TEMP, M_WAITOK | M_ZERO);
7117 VERIFY(mleak_stat != NULL);
7118 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7119 #ifdef __LP64__
7120 mleak_stat->ml_isaddr64 = 1;
7121 #endif /* __LP64__ */
7122 }
7123
7124 static void
7125 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7126 {
7127 int temp;
7128
7129 if (mclfindleak == 0)
7130 return;
7131
7132 if (!alloc)
7133 return (mleak_free(addr));
7134
7135 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7136
7137 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7138 uintptr_t bt[MLEAK_STACK_DEPTH];
7139 int logged = backtrace(bt, MLEAK_STACK_DEPTH);
7140 mleak_log(bt, addr, logged, num);
7141 }
7142 }
7143
7144 /*
7145 * This function records the allocation in the mleak_allocations table
7146 * and the backtrace in the mleak_traces table.  If the allocation slot is
7147 * in use, replace the old allocation with the new one; if the trace slot is
7148 * in use, bail out (or increment the refcount if it is the same trace).
7149 */
7150 static boolean_t
7151 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7152 {
7153 struct mallocation *allocation;
7154 struct mtrace *trace;
7155 uint32_t trace_index;
7156
7157	/* Quit if someone else is modifying the tables */
7158 if (!lck_mtx_try_lock_spin(mleak_lock)) {
7159 mleak_table.total_conflicts++;
7160 return (FALSE);
7161 }
7162
7163 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7164 mleak_alloc_buckets)];
7165 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7166 trace = &mleak_traces[trace_index];
7167
7168 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7169 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7170
7171 allocation->hitcount++;
7172 trace->hitcount++;
7173
7174 /*
7175 * If the allocation bucket we want is occupied
7176 * and the occupier has the same trace, just bail.
7177 */
7178 if (allocation->element != NULL &&
7179 trace_index == allocation->trace_index) {
7180 mleak_table.alloc_collisions++;
7181 lck_mtx_unlock(mleak_lock);
7182 return (TRUE);
7183 }
7184
7185 /*
7186 * Store the backtrace in the traces array;
7187	 * (allocs == 0 means the trace bucket is free.)
7188 */
7189 if (trace->allocs > 0 &&
7190 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
7191 /* Different, unique trace, but the same hash! Bail out. */
7192 trace->collisions++;
7193 mleak_table.trace_collisions++;
7194 lck_mtx_unlock(mleak_lock);
7195 return (TRUE);
7196 } else if (trace->allocs > 0) {
7197 /* Same trace, already added, so increment refcount */
7198 trace->allocs++;
7199 } else {
7200 /* Found an unused trace bucket, so record the trace here */
7201 if (trace->depth != 0) {
7202 /* this slot previously used but not currently in use */
7203 mleak_table.trace_overwrites++;
7204 }
7205 mleak_table.trace_recorded++;
7206 trace->allocs = 1;
7207 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
7208 trace->depth = depth;
7209 trace->collisions = 0;
7210 }
7211
7212 /* Step 2: Store the allocation record in the allocations array */
7213 if (allocation->element != NULL) {
7214 /*
7215 * Replace an existing allocation. No need to preserve
7216 * because only a subset of the allocations are being
7217 * recorded anyway.
7218 */
7219 mleak_table.alloc_collisions++;
7220 } else if (allocation->trace_index != 0) {
7221 mleak_table.alloc_overwrites++;
7222 }
7223 allocation->element = addr;
7224 allocation->trace_index = trace_index;
7225 allocation->count = num;
7226 mleak_table.alloc_recorded++;
7227 mleak_table.outstanding_allocs++;
7228
7229 lck_mtx_unlock(mleak_lock);
7230 return (TRUE);
7231 }
7232
7233 static void
7234 mleak_free(mcache_obj_t *addr)
7235 {
7236 while (addr != NULL) {
7237 struct mallocation *allocation = &mleak_allocations
7238 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7239
7240 if (allocation->element == addr &&
7241 allocation->trace_index < mleak_trace_buckets) {
7242 lck_mtx_lock_spin(mleak_lock);
7243 if (allocation->element == addr &&
7244 allocation->trace_index < mleak_trace_buckets) {
7245 struct mtrace *trace;
7246 trace = &mleak_traces[allocation->trace_index];
7247 /* allocs = 0 means trace bucket is unused */
7248 if (trace->allocs > 0)
7249 trace->allocs--;
7250 if (trace->allocs == 0)
7251 trace->depth = 0;
7252 /* NULL element means alloc bucket is unused */
7253 allocation->element = NULL;
7254 mleak_table.outstanding_allocs--;
7255 }
7256 lck_mtx_unlock(mleak_lock);
7257 }
7258 addr = addr->obj_next;
7259 }
7260 }
7261
7262 static void
7263 mleak_sort_traces()
7264 {
7265 int i, j, k;
7266 struct mtrace *swap;
7267
7268 for(i = 0; i < MLEAK_NUM_TRACES; i++)
7269 mleak_top_trace[i] = NULL;
7270
7271 for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
7272 {
7273 if (mleak_traces[i].allocs <= 0)
7274 continue;
7275
7276 mleak_top_trace[j] = &mleak_traces[i];
7277 for (k = j; k > 0; k--) {
7278 if (mleak_top_trace[k]->allocs <=
7279 mleak_top_trace[k-1]->allocs)
7280 break;
7281
7282 swap = mleak_top_trace[k-1];
7283 mleak_top_trace[k-1] = mleak_top_trace[k];
7284 mleak_top_trace[k] = swap;
7285 }
7286 j++;
7287 }
7288
7289 j--;
7290 for(; i < mleak_trace_buckets; i++) {
7291 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
7292 continue;
7293
7294 mleak_top_trace[j] = &mleak_traces[i];
7295
7296 for (k = j; k > 0; k--) {
7297 if (mleak_top_trace[k]->allocs <=
7298 mleak_top_trace[k-1]->allocs)
7299 break;
7300
7301 swap = mleak_top_trace[k-1];
7302 mleak_top_trace[k-1] = mleak_top_trace[k];
7303 mleak_top_trace[k] = swap;
7304 }
7305 }
7306 }
7307
7308 static void
7309 mleak_update_stats()
7310 {
7311 mleak_trace_stat_t *mltr;
7312 int i;
7313
7314 VERIFY(mleak_stat != NULL);
7315 #ifdef __LP64__
7316 VERIFY(mleak_stat->ml_isaddr64);
7317 #else
7318 VERIFY(!mleak_stat->ml_isaddr64);
7319 #endif /* !__LP64__ */
7320 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7321
7322 mleak_sort_traces();
7323
7324 mltr = &mleak_stat->ml_trace[0];
7325 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
7326 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7327 int j;
7328
7329 if (mleak_top_trace[i] == NULL ||
7330 mleak_top_trace[i]->allocs == 0)
7331 continue;
7332
7333 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
7334 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
7335 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
7336 mltr->mltr_depth = mleak_top_trace[i]->depth;
7337
7338 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7339 for (j = 0; j < mltr->mltr_depth; j++)
7340 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7341
7342 mltr++;
7343 }
7344 }
7345
7346 static struct mbtypes {
7347 int mt_type;
7348 const char *mt_name;
7349 } mbtypes[] = {
7350 { MT_DATA, "data" },
7351 { MT_OOBDATA, "oob data" },
7352 { MT_CONTROL, "ancillary data" },
7353 { MT_HEADER, "packet headers" },
7354 { MT_SOCKET, "socket structures" },
7355 { MT_PCB, "protocol control blocks" },
7356 { MT_RTABLE, "routing table entries" },
7357 { MT_HTABLE, "IMP host table entries" },
7358 { MT_ATABLE, "address resolution tables" },
7359 { MT_FTABLE, "fragment reassembly queue headers" },
7360 { MT_SONAME, "socket names and addresses" },
7361 { MT_SOOPTS, "socket options" },
7362 { MT_RIGHTS, "access rights" },
7363 { MT_IFADDR, "interface addresses" },
7364 { MT_TAG, "packet tags" },
7365 { 0, NULL }
7366 };
7367
7368 #define MBUF_DUMP_BUF_CHK() { \
7369 clen -= k; \
7370 if (clen < 1) \
7371 goto done; \
7372 c += k; \
7373 }
7374
7375 static char *
7376 mbuf_dump(void)
7377 {
7378 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
7379 totreturned = 0;
7380 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
7381 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
7382 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
7383 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7384 uint8_t seen[256];
7385 struct mbtypes *mp;
7386 mb_class_stat_t *sp;
7387 mleak_trace_stat_t *mltr;
7388 char *c = mbuf_dump_buf;
7389 int i, k, clen = MBUF_DUMP_BUF_SIZE;
7390
7391 mbuf_dump_buf[0] = '\0';
7392
7393 /* synchronize all statistics in the mbuf table */
7394 mbuf_stat_sync();
7395 mbuf_mtypes_sync(TRUE);
7396
7397 sp = &mb_stat->mbs_class[0];
7398 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7399 u_int32_t mem;
7400
7401 if (m_class(i) == MC_MBUF) {
7402 m_mbufs = sp->mbcl_active;
7403 } else if (m_class(i) == MC_CL) {
7404 m_clfree = sp->mbcl_total - sp->mbcl_active;
7405 } else if (m_class(i) == MC_BIGCL) {
7406 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7407 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7408 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7409 m_16kclusters = sp->mbcl_total;
7410 } else if (m_class(i) == MC_MBUF_CL) {
7411 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7412 } else if (m_class(i) == MC_MBUF_BIGCL) {
7413 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7414 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7415 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7416 }
7417
7418 mem = sp->mbcl_ctotal * sp->mbcl_size;
7419 totmem += mem;
7420 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7421 sp->mbcl_size;
7422 totreturned += sp->mbcl_release_cnt;
7423
7424 }
7425
7426 /* adjust free counts to include composite caches */
7427 m_clfree += m_mbufclfree;
7428 m_bigclfree += m_mbufbigclfree;
7429 m_16kclfree += m_mbuf16kclfree;
7430
7431 totmbufs = 0;
7432 for (mp = mbtypes; mp->mt_name != NULL; mp++)
7433 totmbufs += mbstat.m_mtypes[mp->mt_type];
7434 if (totmbufs > m_mbufs)
7435 totmbufs = m_mbufs;
7436 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7437 MBUF_DUMP_BUF_CHK();
7438
7439 bzero(&seen, sizeof (seen));
7440 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7441 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7442 seen[mp->mt_type] = 1;
7443 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7444 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7445 MBUF_DUMP_BUF_CHK();
7446 }
7447 }
7448 seen[MT_FREE] = 1;
7449 for (i = 0; i < nmbtypes; i++)
7450 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7451 k = snprintf(c, clen, "\t%u mbufs allocated to "
7452 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7453 MBUF_DUMP_BUF_CHK();
7454 }
7455 if ((m_mbufs - totmbufs) > 0) {
7456 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7457 m_mbufs - totmbufs);
7458 MBUF_DUMP_BUF_CHK();
7459 }
7460 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7461 "%u/%u mbuf 4KB clusters in use\n",
7462 (unsigned int)(mbstat.m_clusters - m_clfree),
7463 (unsigned int)mbstat.m_clusters,
7464 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7465 (unsigned int)mbstat.m_bigclusters);
7466 MBUF_DUMP_BUF_CHK();
7467
7468 if (njcl > 0) {
7469 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7470 m_16kclusters - m_16kclfree, m_16kclusters,
7471 njclbytes / 1024);
7472 MBUF_DUMP_BUF_CHK();
7473 }
7474 totused = totmem - totfree;
7475 if (totmem == 0) {
7476 totpct = 0;
7477 } else if (totused < (ULONG_MAX / 100)) {
7478 totpct = (totused * 100) / totmem;
7479 } else {
7480 u_long totmem1 = totmem / 100;
7481 u_long totused1 = totused / 100;
7482 totpct = (totused1 * 100) / totmem1;
7483 }
7484 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7485 "in use)\n", totmem / 1024, totpct);
7486 MBUF_DUMP_BUF_CHK();
7487 k = snprintf(c, clen, "%lu KB returned to the system\n",
7488 totreturned / 1024);
7489 MBUF_DUMP_BUF_CHK();
7490
7491 net_update_uptime();
7492 k = snprintf(c, clen,
7493 "VM allocation failures: contiguous %u, normal %u, one page %u\n",
7494 mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
7495 MBUF_DUMP_BUF_CHK();
7496 if (mb_kmem_contig_failed_ts || mb_kmem_failed_ts ||
7497 mb_kmem_one_failed_ts) {
7498 k = snprintf(c, clen,
7499 "VM allocation failure timestamps: contiguous %llu "
7500 "(size %llu), normal %llu (size %llu), one page %llu "
7501 "(now %llu)\n",
7502 mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
7503 mb_kmem_failed_ts, mb_kmem_failed_size,
7504 mb_kmem_one_failed_ts, net_uptime());
7505 MBUF_DUMP_BUF_CHK();
7506 k = snprintf(c, clen,
7507 "VM return codes: ");
7508 MBUF_DUMP_BUF_CHK();
7509 for (i = 0;
7510 i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]);
7511 i++) {
7512 k = snprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
7513 mb_kmem_stats[i]);
7514 MBUF_DUMP_BUF_CHK();
7515 }
7516 k = snprintf(c, clen, "\n");
7517 MBUF_DUMP_BUF_CHK();
7518 }
7519 k = snprintf(c, clen,
7520 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
7521 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
7522 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
7523 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
7524 mb_expand_16kcl_total);
7525 MBUF_DUMP_BUF_CHK();
7526 if (mbuf_worker_last_runtime != 0) {
7527 k = snprintf(c, clen, "worker thread last run time: "
7528 "%llu (%llu seconds ago)\n",
7529 mbuf_worker_last_runtime,
7530 net_uptime() - mbuf_worker_last_runtime);
7531 MBUF_DUMP_BUF_CHK();
7532 }
7533
7534 /* mbuf leak detection statistics */
7535 mleak_update_stats();
7536
7537 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7538 MBUF_DUMP_BUF_CHK();
7539 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7540 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7541 mleak_table.mleak_sample_factor);
7542 MBUF_DUMP_BUF_CHK();
7543 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7544 mleak_table.outstanding_allocs);
7545 MBUF_DUMP_BUF_CHK();
7546 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7547 mleak_table.alloc_recorded, mleak_table.trace_recorded);
7548 MBUF_DUMP_BUF_CHK();
7549 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7550 mleak_table.alloc_collisions, mleak_table.trace_collisions);
7551 MBUF_DUMP_BUF_CHK();
7552 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7553 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7554 MBUF_DUMP_BUF_CHK();
7555 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7556 mleak_table.total_conflicts);
7557 MBUF_DUMP_BUF_CHK();
7558
7559 k = snprintf(c, clen, "top %d outstanding traces:\n",
7560 mleak_stat->ml_cnt);
7561 MBUF_DUMP_BUF_CHK();
7562 for (i = 0; i < mleak_stat->ml_cnt; i++) {
7563 mltr = &mleak_stat->ml_trace[i];
7564 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7565 "%llu hit(s), %llu collision(s)\n", (i + 1),
7566 mltr->mltr_allocs, mltr->mltr_hitcount,
7567 mltr->mltr_collisions);
7568 MBUF_DUMP_BUF_CHK();
7569 }
7570
7571 if (mleak_stat->ml_isaddr64)
7572 k = snprintf(c, clen, MB_LEAK_HDR_64);
7573 else
7574 k = snprintf(c, clen, MB_LEAK_HDR_32);
7575 MBUF_DUMP_BUF_CHK();
7576
7577 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7578 int j;
7579 k = snprintf(c, clen, "%2d: ", (i + 1));
7580 MBUF_DUMP_BUF_CHK();
7581 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7582 mltr = &mleak_stat->ml_trace[j];
7583 if (i < mltr->mltr_depth) {
7584 if (mleak_stat->ml_isaddr64) {
7585 k = snprintf(c, clen, "0x%0llx ",
7586 (uint64_t)VM_KERNEL_UNSLIDE(
7587 mltr->mltr_addr[i]));
7588 } else {
7589 k = snprintf(c, clen,
7590 "0x%08x ",
7591 (uint32_t)VM_KERNEL_UNSLIDE(
7592 mltr->mltr_addr[i]));
7593 }
7594 } else {
7595 if (mleak_stat->ml_isaddr64)
7596 k = snprintf(c, clen,
7597 MB_LEAK_SPACING_64);
7598 else
7599 k = snprintf(c, clen,
7600 MB_LEAK_SPACING_32);
7601 }
7602 MBUF_DUMP_BUF_CHK();
7603 }
7604 k = snprintf(c, clen, "\n");
7605 MBUF_DUMP_BUF_CHK();
7606 }
7607 done:
7608 return (mbuf_dump_buf);
7609 }
7610
7611 #undef MBUF_DUMP_BUF_CHK
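/*
 * Illustrative sketch (not compiled): mbuf_dump() above builds its report by
 * formatting one line at a time into a fixed-size buffer and invoking
 * MBUF_DUMP_BUF_CHK after each snprintf(). The macro itself is defined
 * earlier in this file; the generic shape of this bounded-append idiom is
 * assumed to be roughly the following, not a copy of the actual macro body.
 */
#if 0
	k = snprintf(c, clen, "...");
	if (k < 0 || k >= clen)		/* error, or no room left */
		goto done;
	c += k;				/* advance the write cursor */
	clen -= k;			/* shrink the remaining space */
#endif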
7612
7613 /*
7614 * Convert between a regular and a packet header mbuf. The caller chooses the
7615 * direction via 'hdr'; this routine sets or clears M_PKTHDR and does the rest.
7616 */
7617 int
7618 m_reinit(struct mbuf *m, int hdr)
7619 {
7620 int ret = 0;
7621
7622 if (hdr) {
7623 VERIFY(!(m->m_flags & M_PKTHDR));
7624 if (!(m->m_flags & M_EXT) &&
7625 (m->m_data != m->m_dat || m->m_len > 0)) {
7626 /*
7627 * If there's no external cluster attached and the
7628 * mbuf appears to contain user data, we cannot
7629 * safely convert this to a packet header mbuf,
7630 * as the packet header structure might overlap
7631 * with the data.
7632 */
7633 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7634 "m_data %llx (expected %llx), "
7635 "m_len %d (expected 0)\n",
7636 __func__,
7637 (uint64_t)VM_KERNEL_ADDRPERM(m),
7638 (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7639 (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7640 ret = EBUSY;
7641 } else {
7642 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7643 m->m_flags |= M_PKTHDR;
7644 MBUF_INIT_PKTHDR(m);
7645 }
7646 } else {
7647 /* Check for scratch area overflow */
7648 m_redzone_verify(m);
7649 /* Free the aux data and tags if there is any */
7650 m_tag_delete_chain(m, NULL);
7651 m->m_flags &= ~M_PKTHDR;
7652 }
7653
7654 return (ret);
7655 }
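/*
 * Illustrative sketch (not compiled): a hedged example of promoting a fresh
 * plain mbuf to a packet header mbuf with m_reinit(). The variable name is
 * hypothetical; the mbuf must not already carry M_PKTHDR and must not hold
 * data in m_dat, otherwise m_reinit() returns EBUSY and leaves it untouched.
 */
#if 0
	struct mbuf *m = m_get(M_WAIT, MT_DATA);	/* plain mbuf, no pkthdr */

	if (m != NULL && m_reinit(m, 1) == 0) {
		/* m is now a packet header mbuf with an initialized pkthdr */
		m->m_pkthdr.len = 0;
	} else if (m != NULL) {
		/* conversion refused (EBUSY); just release the mbuf */
		m_free(m);
	}
#endif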
7656
7657 int
7658 m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
7659 {
7660 ASSERT(m->m_flags & M_EXT);
7661 return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
7662 }
7663
7664 uint32_t
7665 m_ext_get_prop(struct mbuf *m)
7666 {
7667 ASSERT(m->m_flags & M_EXT);
7668 return (MEXT_PRIV(m));
7669 }
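/*
 * Illustrative sketch (not compiled): updating the external-buffer private
 * property with a compare-and-swap retry loop built on the two accessors
 * above. This assumes m_ext_set_prop() returns nonzero when the underlying
 * atomic_test_set_32() succeeds; the flag bit used here is hypothetical.
 */
#if 0
	uint32_t oldp, newp;

	VERIFY(m->m_flags & M_EXT);
	do {
		oldp = m_ext_get_prop(m);
		newp = oldp | 0x1;	/* hypothetical property bit */
	} while (!m_ext_set_prop(m, oldp, newp));
#endif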
7670
7671 int
7672 m_ext_paired_is_active(struct mbuf *m)
7673 {
7674 return (MBUF_IS_PAIRED(m) ? (MEXT_PREF(m) > MEXT_MINREF(m)) : 1);
7675 }
7676
7677 void
7678 m_ext_paired_activate(struct mbuf *m)
7679 {
7680 struct ext_ref *rfa;
7681 int hdr, type;
7682 caddr_t extbuf;
7683 m_ext_free_func_t extfree;
7684 u_int extsize;
7685
7686 VERIFY(MBUF_IS_PAIRED(m));
7687 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
7688 VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
7689
7690 hdr = (m->m_flags & M_PKTHDR);
7691 type = m->m_type;
7692 extbuf = m->m_ext.ext_buf;
7693 extfree = m_get_ext_free(m);
7694 extsize = m->m_ext.ext_size;
7695 rfa = m_get_rfa(m);
7696
7697 VERIFY(extbuf != NULL && rfa != NULL);
7698
7699 /*
7700 * Safe to reinitialize the packet header tags, since they
7701 * were already cleaned up at m_free() time. Similar to
7702 * what's done in m_clattach() for the cluster. Bump
7703 * up MEXT_PREF to indicate activation.
7704 */
7705 MBUF_INIT(m, hdr, type);
7706 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
7707 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
7708 }
7709
7710 void
7711 m_scratch_init(struct mbuf *m)
7712 {
7713 struct pkthdr *pkt = &m->m_pkthdr;
7714
7715 VERIFY(m->m_flags & M_PKTHDR);
7716
7717 /* See comments in <rdar://problem/14040693> */
7718 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7719 panic_plain("Invalid attempt to modify guarded module-private "
7720 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7721 /* NOTREACHED */
7722 }
7723
7724 bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7725 }
7726
7727 /*
7728 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7729 * xnu that intend to use the module-private area should directly
7730 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
7731 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7732 * to handing it off to another module, respectively.
7733 */
7734 u_int32_t
7735 m_scratch_get(struct mbuf *m, u_int8_t **p)
7736 {
7737 struct pkthdr *pkt = &m->m_pkthdr;
7738
7739 VERIFY(m->m_flags & M_PKTHDR);
7740
7741 /* See comments in <rdar://problem/14040693> */
7742 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7743 panic_plain("Invalid attempt to access guarded module-private "
7744 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7745 /* NOTREACHED */
7746 }
7747
7748 if (mcltrace) {
7749 mcache_audit_t *mca;
7750
7751 lck_mtx_lock(mbuf_mlock);
7752 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7753 if (mca->mca_uflags & MB_SCVALID)
7754 mcl_audit_scratch(mca);
7755 lck_mtx_unlock(mbuf_mlock);
7756 }
7757
7758 *p = (u_int8_t *)&pkt->pkt_mpriv;
7759 return (sizeof (pkt->pkt_mpriv));
7760 }
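/*
 * Illustrative sketch (not compiled): how an in-xnu client might use the
 * module-private area directly, as described in the comment above. The
 * area is marked guarded while this module owns the packet and unguarded
 * again before the mbuf is handed to another module; the byte written here
 * is purely hypothetical.
 */
#if 0
	struct pkthdr *pkt = &m->m_pkthdr;
	u_int8_t *mp = (u_int8_t *)&pkt->pkt_mpriv;

	pkt->pkt_flags |= PKTF_PRIV_GUARDED;		/* take ownership */
	bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
	mp[0] = 0x5a;					/* hypothetical marker */

	/* ... private state travels with the packet ... */

	pkt->pkt_flags &= ~PKTF_PRIV_GUARDED;		/* release before handoff */
#endif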
7761
7762 static void
7763 m_redzone_init(struct mbuf *m)
7764 {
7765 VERIFY(m->m_flags & M_PKTHDR);
7766 /*
7767 * Each mbuf has a unique red zone pattern, which is an XOR
7768 * of the red zone cookie and the address of the mbuf.
7769 */
7770 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7771 }
7772
7773 static void
7774 m_redzone_verify(struct mbuf *m)
7775 {
7776 u_int32_t mb_redzone;
7777
7778 VERIFY(m->m_flags & M_PKTHDR);
7779
7780 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7781 if (m->m_pkthdr.redzone != mb_redzone) {
7782 panic("mbuf %p redzone violation with value 0x%x "
7783 "(instead of 0x%x, using cookie 0x%x)\n",
7784 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7785 /* NOTREACHED */
7786 }
7787 }
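/*
 * Illustrative note: since XOR is its own inverse, the stored pattern can be
 * checked either by recomputing it (as m_redzone_verify() does above) or,
 * equivalently, by XOR-ing the stored value with the cookie and comparing
 * against the low 32 bits of the mbuf address:
 */
#if 0
	VERIFY((m->m_pkthdr.redzone ^ mb_redzone_cookie) ==
	    (u_int32_t)(uintptr_t)m);
#endif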
7788
7789 __private_extern__ inline void
7790 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
7791 caddr_t ext_arg)
7792 {
7793 VERIFY(m->m_flags & M_EXT);
7794 if (rfa != NULL) {
7795 m->m_ext.ext_refflags =
7796 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
7797 if (ext_free != NULL) {
7798 rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
7799 mb_obscure_extfree;
7800 m->m_ext.ext_free = (m_ext_free_func_t)
7801 (((uintptr_t)ext_free) ^ rfa->ext_token);
7802 if (ext_arg != NULL) {
7803 m->m_ext.ext_arg =
7804 (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
7805 } else {
7806 m->m_ext.ext_arg = NULL;
7807 }
7808 } else {
7809 rfa->ext_token = 0;
7810 m->m_ext.ext_free = NULL;
7811 m->m_ext.ext_arg = NULL;
7812 }
7813 } else {
7814 /*
7815 * If we are going to lose the cookie in ext_token by
7816 * resetting the rfa, we should use the global cookie
7817 * to obscure the ext_free and ext_arg pointers.
7818 */
7819 if (ext_free != NULL) {
7820 m->m_ext.ext_free =
7821 (m_ext_free_func_t)((uintptr_t)ext_free ^
7822 mb_obscure_extfree);
7823 if (ext_arg != NULL) {
7824 m->m_ext.ext_arg =
7825 (caddr_t)((uintptr_t)ext_arg ^
7826 mb_obscure_extfree);
7827 } else {
7828 m->m_ext.ext_arg = NULL;
7829 }
7830 } else {
7831 m->m_ext.ext_free = NULL;
7832 m->m_ext.ext_arg = NULL;
7833 }
7834 m->m_ext.ext_refflags = NULL;
7835 }
7836 }
7837
7838 __private_extern__ inline struct ext_ref *
7839 m_get_rfa(struct mbuf *m)
7840 {
7841 if (m->m_ext.ext_refflags == NULL)
7842 return (NULL);
7843 else
7844 return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
7845 }
7846
7847 __private_extern__ inline m_ext_free_func_t
7848 m_get_ext_free(struct mbuf *m)
7849 {
7850 struct ext_ref *rfa;
7851 if (m->m_ext.ext_free == NULL)
7852 return (NULL);
7853
7854 rfa = m_get_rfa(m);
7855 if (rfa == NULL)
7856 return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
7857 else
7858 return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
7859 ^ rfa->ext_token));
7860 }
7861
7862 __private_extern__ inline caddr_t
7863 m_get_ext_arg(struct mbuf *m)
7864 {
7865 struct ext_ref *rfa;
7866 if (m->m_ext.ext_arg == NULL)
7867 return (NULL);
7868
7869 rfa = m_get_rfa(m);
7870 if (rfa == NULL) {
7871 return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
7872 } else {
7873 return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
7874 rfa->ext_token));
7875 }
7876 }
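/*
 * Illustrative sketch (not compiled): the accessors above undo the obscuring
 * applied by m_set_ext(), so storing and reading back an external free
 * routine and its argument is a round trip. The mbuf is assumed to carry
 * M_EXT with a valid ext_ref; the routine and cookie names are hypothetical.
 */
#if 0
	m_ext_free_func_t my_free = my_driver_free;	/* hypothetical */
	caddr_t my_arg = (caddr_t)my_driver_cookie;	/* hypothetical */

	m_set_ext(m, m_get_rfa(m), my_free, my_arg);
	VERIFY(m_get_ext_free(m) == my_free);
	VERIFY(m_get_ext_arg(m) == my_arg);
#endif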
7877
7878 /*
7879 * Send a report of mbuf usage if the usage is at least 1/16th (~6%) of the
7880 * max limit, or if it has grown by at least 1/32nd (~3%) of the previous
7881 * peak since the last report.
7882 *
7883 * The thresholds are powers of two so they reduce to simple shifts below.
7884 */
7885 static boolean_t
7886 mbuf_report_usage(mbuf_class_t cl)
7887 {
7888 /* if a report is already in progress, nothing to do */
7889 if (mb_peak_newreport)
7890 return (TRUE);
7891
7892 if (m_total(cl) > m_peak(cl) &&
7893 m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7894 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7895 return (TRUE);
7896 return (FALSE);
7897 }
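/*
 * Worked example of the thresholds above (numbers hypothetical): with
 * m_maxlimit(cl) == 32768, a report is requested once m_total(cl) exceeds
 * the previous peak, has reached at least 32768 >> 4 == 2048 objects
 * (6.25% of the limit), and has grown by at least m_peak(cl) >> 5, i.e.
 * 1/32nd (3.125%) of the previous peak.
 */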
7898
7899 __private_extern__ void
7900 mbuf_report_peak_usage(void)
7901 {
7902 int i = 0;
7903 u_int64_t uptime;
7904 struct nstat_sysinfo_data ns_data;
7905 uint32_t memreleased = 0;
7906 static uint32_t prevmemreleased;
7907
7908 uptime = net_uptime();
7909 lck_mtx_lock(mbuf_mlock);
7910
7911 /* Generate an initial report after 1 week of uptime */
7912 if (!mb_peak_firstreport &&
7913 uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7914 mb_peak_newreport = TRUE;
7915 mb_peak_firstreport = TRUE;
7916 }
7917
7918 if (!mb_peak_newreport) {
7919 lck_mtx_unlock(mbuf_mlock);
7920 return;
7921 }
7922
7923 /*
7924 * Since a report is being generated before 1 week of uptime,
7925 * there is no need to force another one later.
7926 */
7927 if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7928 mb_peak_firstreport = TRUE;
7929
7930 for (i = 0; i < NELEM(mbuf_table); i++) {
7931 m_peak(m_class(i)) = m_total(m_class(i));
7932 memreleased += m_release_cnt(i);
7933 }
7934 memreleased = memreleased - prevmemreleased;
7935 prevmemreleased = memreleased;
7936 mb_peak_newreport = FALSE;
7937 lck_mtx_unlock(mbuf_mlock);
7938
7939 bzero(&ns_data, sizeof(ns_data));
7940 ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7941 ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7942 ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7943 ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7944 ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
7945 ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7946 ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7947 ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7948 ns_data.u.mb_stats.memreleased = memreleased;
7949 ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
7950
7951 nstat_sysinfo_send_data(&ns_data);
7952
7953 /*
7954 * Reset the floor whenever we report a new
7955 * peak to track the trend (increased peak usage
7956 * is not a leak if mbufs get released
7957 * between reports and the floor stays low)
7958 */
7959 total_sbmb_cnt_floor = total_sbmb_cnt_peak;
7960 }
7961
7962 /*
7963 * Called by the VM when there's memory pressure.
7964 */
7965 __private_extern__ void
7966 m_drain(void)
7967 {
7968 mbuf_class_t mc;
7969 mcl_slab_t *sp, *sp_tmp, *nsp;
7970 unsigned int num, k, interval, released = 0;
7971 unsigned long total_mem = 0, use_mem = 0;
7972 boolean_t ret, purge_caches = FALSE;
7973 ppnum_t offset;
7974 mcache_obj_t *obj;
7975 unsigned long per;
7976 static uint64_t last_drain = 0;
7977 static unsigned char scratch[32];
7978 static ppnum_t scratch_pa = 0;
7979
7980 if (mb_drain_maxint == 0 || mb_waiters)
7981 return;
7982 if (scratch_pa == 0) {
7983 bzero(scratch, sizeof(scratch));
7984 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
7985 VERIFY(scratch_pa);
7986 } else if (mclverify) {
7987 /*
7988 * Panic if a driver wrote to our scratch memory.
7989 */
7990 for (k = 0; k < sizeof(scratch); k++)
7991 if (scratch[k])
7992 panic("suspect DMA to freed address");
7993 }
7994 /*
7995 * Don't free memory too often as that could cause excessive
7996 * waiting times for mbufs. Purge the caches as well if the last
7997 * drain was within five drain intervals (mb_drain_maxint * 5).
7998 */
7999 lck_mtx_lock(mbuf_mlock);
8000 if (last_drain == 0) {
8001 last_drain = net_uptime();
8002 lck_mtx_unlock(mbuf_mlock);
8003 return;
8004 }
8005 interval = net_uptime() - last_drain;
8006 if (interval <= mb_drain_maxint) {
8007 lck_mtx_unlock(mbuf_mlock);
8008 return;
8009 }
8010 if (interval <= mb_drain_maxint * 5)
8011 purge_caches = TRUE;
8012 last_drain = net_uptime();
8013 /*
8014 * Don't free any memory if we're using 60% or more.
8015 */
8016 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8017 total_mem += m_total(mc) * m_maxsize(mc);
8018 use_mem += m_active(mc) * m_maxsize(mc);
8019 }
8020 per = (use_mem * 100) / total_mem;
8021 if (per >= 60) {
8022 lck_mtx_unlock(mbuf_mlock);
8023 return;
8024 }
8025 /*
8026 * Purge all the caches. This effectively disables
8027 * caching for a few seconds, but the mbuf worker thread will
8028 * re-enable them.
8029 */
8030 if (purge_caches == TRUE)
8031 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8032 if (m_total(mc) < m_avgtotal(mc))
8033 continue;
8034 lck_mtx_unlock(mbuf_mlock);
8035 ret = mcache_purge_cache(m_cache(mc), FALSE);
8036 lck_mtx_lock(mbuf_mlock);
8037 if (ret == TRUE)
8038 m_purge_cnt(mc)++;
8039 }
8040 /*
8041 * Move the objects from the composite class freelist to
8042 * the rudimentary slabs list, but keep at least 10% of the average
8043 * total in the freelist.
8044 */
8045 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8046 while (m_cobjlist(mc) &&
8047 m_total(mc) < m_avgtotal(mc) &&
8048 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8049 obj = m_cobjlist(mc);
8050 m_cobjlist(mc) = obj->obj_next;
8051 obj->obj_next = NULL;
8052 num = cslab_free(mc, obj, 1);
8053 VERIFY(num == 1);
8054 m_free_cnt(mc)++;
8055 m_infree(mc)--;
8056 /* cslab_free() handles m_total */
8057 }
8058 }
8059 /*
8060 * Free unused slabs while a class stays above its average total and its
8061 * free count stays above ~10% of that average (plus the class minimum).
8062 *
8063 * We walk the list backwards in an attempt to reduce fragmentation.
8064 */
8065 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8066 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8067 /*
8068 * Process only unused slabs occupying memory.
8069 */
8070 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8071 sp->sl_base == NULL)
8072 continue;
8073 if (m_total(mc) < m_avgtotal(mc) ||
8074 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
8075 break;
8076 slab_remove(sp, mc);
8077 switch (mc) {
8078 case MC_MBUF:
8079 m_infree(mc) -= NMBPG;
8080 m_total(mc) -= NMBPG;
8081 if (mclaudit != NULL)
8082 mcl_audit_free(sp->sl_base, NMBPG);
8083 break;
8084 case MC_CL:
8085 m_infree(mc) -= NCLPG;
8086 m_total(mc) -= NCLPG;
8087 if (mclaudit != NULL)
8088 mcl_audit_free(sp->sl_base, NMBPG);
8089 break;
8090 case MC_BIGCL:
8091 {
8092 m_infree(mc) -= NBCLPG;
8093 m_total(mc) -= NBCLPG;
8094 if (mclaudit != NULL)
8095 mcl_audit_free(sp->sl_base, NMBPG);
8096 break;
8097 }
8098 case MC_16KCL:
8099 m_infree(mc)--;
8100 m_total(mc)--;
8101 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8102 nsp = nsp->sl_next;
8103 VERIFY(nsp->sl_refcnt == 0 &&
8104 nsp->sl_base != NULL &&
8105 nsp->sl_len == 0);
8106 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8107 0);
8108 nsp->sl_flags = 0;
8109 }
8110 if (mclaudit != NULL) {
8111 if (sp->sl_len == PAGE_SIZE) {
8112 mcl_audit_free(sp->sl_base,
8113 NMBPG);
8114 } else {
8115 mcl_audit_free(sp->sl_base, 1);
8116 }
8117 }
8118 break;
8119 default:
8120 /*
8121 * The composite classes have their own
8122 * freelist (m_cobjlist), so we only
8123 * process rudimentary classes here.
8124 */
8125 VERIFY(0);
8126 }
8127 m_release_cnt(mc) += m_size(mc);
8128 released += m_size(mc);
8129 VERIFY(sp->sl_base != NULL &&
8130 sp->sl_len >= PAGE_SIZE);
8131 offset = MTOPG(sp->sl_base);
8132 /*
8133 * Make sure the IOMapper points to a valid, but
8134 * bogus, address. This should prevent further DMA
8135 * accesses to freed memory.
8136 */
8137 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8138 mcl_paddr[offset] = 0;
8139 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8140 sp->sl_len);
8141 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8142 sp->sl_flags = 0;
8143 }
8144 }
8145 mbstat.m_drain++;
8146 mbstat.m_bigclusters = m_total(MC_BIGCL);
8147 mbstat.m_clusters = m_total(MC_CL);
8148 mbstat.m_mbufs = m_total(MC_MBUF);
8149 mbuf_stat_sync();
8150 mbuf_mtypes_sync(TRUE);
8151 lck_mtx_unlock(mbuf_mlock);
8152 }
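/*
 * Worked example of the utilization check in m_drain() above (numbers
 * hypothetical): with total_mem == 4 MB committed across all classes and
 * use_mem == 2.5 MB active, per == (2621440 * 100) / 4194304 == 62, so the
 * routine returns without freeing anything; only below 60% utilization does
 * it go on to purge the caches and release unused slabs to the system.
 */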
8153
8154 static int
8155 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8156 {
8157 #pragma unused(arg1, arg2)
8158 int val = 0, err;
8159
8160 err = sysctl_handle_int(oidp, &val, 0, req);
8161 if (err != 0 || req->newptr == USER_ADDR_NULL)
8162 return (err);
8163 if (val)
8164 m_drain();
8165
8166 return (err);
8167 }
8168
8169 #if DEBUG || DEVELOPMENT
8170
8171 static int mbtest_val;
8172 static int mbtest_running;
8173
8174 static void mbtest_thread(__unused void *arg)
8175 {
8176 int i;
8177
8178 printf("%s thread starting\n", __func__);
8179
8180 for (i = 0; i < 1000; i++) {
8181 unsigned int needed = 100000;
8182 struct mbuf *m1, *m2, *m3;
8183
8184 if (njcl > 0) {
8185 needed = 100000;
8186 m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
8187 m_freem_list(m3);
8188 }
8189
8190 needed = 100000;
8191 m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
8192 m_freem_list(m2);
8193
8194 m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
8195 m_freem_list(m1);
8196 }
8197
8198 printf("%s thread ending\n", __func__);
8199
8200 OSDecrementAtomic(&mbtest_running);
8201 wakeup_one((caddr_t)&mbtest_running);
8202 }
8203
8204 static void sysctl_mbtest(void)
8205 {
8206 /* We launch three threads - wait for all of them */
8207 OSIncrementAtomic(&mbtest_running);
8208 OSIncrementAtomic(&mbtest_running);
8209 OSIncrementAtomic(&mbtest_running);
8210
8211 thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8212 thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8213 thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8214
8215 while (mbtest_running) {
8216 msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
8217 }
8218 }
8219
8220 static int
8221 mbtest SYSCTL_HANDLER_ARGS
8222 {
8223 #pragma unused(arg1, arg2)
8224 int error = 0, val, oldval = mbtest_val;
8225
8226 val = oldval;
8227 error = sysctl_handle_int(oidp, &val, 0, req);
8228 if (error || !req->newptr)
8229 return (error);
8230
8231 if (val != oldval)
8232 sysctl_mbtest();
8233
8234 mbtest_val = val;
8235
8236 return (error);
8237 }
8238 #endif
8239
8240 SYSCTL_DECL(_kern_ipc);
8241 #if DEBUG || DEVELOPMENT
8242 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
8243 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I",
8244 "Toggle to test mbufs");
8245 #endif
8246 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
8247 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8248 0, 0, mbstat_sysctl, "S,mbstat", "");
8249 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
8250 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8251 0, 0, mb_stat_sysctl, "S,mb_stat", "");
8252 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
8253 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8254 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
8255 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
8256 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8257 0, 0, mleak_table_sysctl, "S,mleak_table", "");
8258 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
8259 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
8260 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
8261 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
8262 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
8263 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
8264 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
8265 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
8266 m_drain_force_sysctl, "I",
8267 "Forces the mbuf garbage collection to run");
8268 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
8269 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
8270 "Minimum time interval between garbage collection");