bsd/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/malloc.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/kernel.h>
  75 #include <sys/sysctl.h>
  76 #include <sys/syslog.h>
  77 #include <sys/protosw.h>
  78 #include <sys/domain.h>
  79 #include <sys/queue.h>
  80 #include <sys/proc.h>
  81
  82 #include <dev/random/randomdev.h>
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/simple_lock.h>
  86 #include <kern/queue.h>
  87 #include <kern/sched_prim.h>
  88 #include <kern/cpu_number.h>
  89 #include <kern/zalloc.h>
  90
  91 #include <libkern/OSAtomic.h>
  92 #include <libkern/OSDebug.h>
  93 #include <libkern/libkern.h>
  94
  95 #include <IOKit/IOMapper.h>
  96
  97 #include <machine/limits.h>
  98 #include <machine/machine_routines.h>
  99
 100 #if CONFIG_MACF_NET
 101 #include <security/mac_framework.h>
 102 #endif /* CONFIG_MACF_NET */
 103
 104 #include <sys/mcache.h>
 105 #include <net/ntstat.h>
 106
 107 /*
 108  * MBUF IMPLEMENTATION NOTES.
 109  *
 110  * There is a total of 5 per-CPU caches:
 111  *
 112  * MC_MBUF:
 113  *      This is a cache of rudimentary objects of MSIZE in size; each
 114  *      object represents an mbuf structure.  This cache preserves only
 115  *      the m_type field of the mbuf during its transactions.
 116  *
 117  * MC_CL:
 118  *      This is a cache of rudimentary objects of MCLBYTES in size; each
 119  *      object represents a mcluster structure.  This cache does not
 120  *      preserve the contents of the objects during its transactions.
 121  *
 122  * MC_BIGCL:
 123  *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 124  *      object represents a mbigcluster structure.  This cache does not
 125  *      preserve the contents of the objects during its transactions.
 126  *
 127  * MC_MBUF_CL:
 128  *      This is a cache of mbufs each having a cluster attached to it.
 129  *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 130  *      fields of the mbuf related to the external cluster are preserved
 131  *      during transactions.
 132  *
 133  * MC_MBUF_BIGCL:
 134  *      This is a cache of mbufs each having a big cluster attached to it.
 135  *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 136  *      fields of the mbuf related to the external cluster are preserved
 137  *      during transactions.
 138  *
 139  * OBJECT ALLOCATION:
 140  *
 141  * Allocation requests are handled first at the per-CPU (mcache) layer
 142  * before falling back to the slab layer.  Performance is optimal when
 143  * the request is satisfied at the CPU layer because global data/lock
 144  * never gets accessed.  When the slab layer is entered for allocation,
 145  * the slab freelist will be checked first for available objects before
 146  * the VM backing store is invoked.  Slab layer operations are serialized
 147  * for all of the caches as the mbuf global lock is held most of the time.
 148  * Allocation paths are different depending on the class of objects:
 149  *
 150  * a. Rudimentary object:
 151  *
 152  *      { m_get_common(), m_clattach(), m_mclget(),
 153  *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 154  *        composite object allocation }
 155  *                      |       ^
 156  *                      |       |
 157  *                      |       +-----------------------+
 158  *                      v                               |
 159  *         mcache_alloc/mcache_alloc_ext()      mbuf_slab_audit()
 160  *                      |                               ^
 161  *                      v                               |
 162  *                 [CPU cache] -------> (found?) -------+
 163  *                      |                               |
 164  *                      v                               |
 165  *               mbuf_slab_alloc()                      |
 166  *                      |                               |
 167  *                      v                               |
 168  *      +---------> [freelist] -------> (found?) -------+
 169  *      |               |
 170  *      |               v
 171  *      |           m_clalloc()
 172  *      |               |
 173  *      |               v
 174  *      +---<<---- kmem_mb_alloc()
 175  *
 176  * b. Composite object:
 177  *
 178  *      { m_getpackets_internal(), m_allocpacket_internal() }
 179  *                      |       ^
 180  *                      |       |
 181  *                      |       +------ (done) ---------+
 182  *                      v                               |
 183  *         mcache_alloc/mcache_alloc_ext()      mbuf_cslab_audit()
 184  *                      |                               ^
 185  *                      v                               |
 186  *                 [CPU cache] -------> (found?) -------+
 187  *                      |                               |
 188  *                      v                               |
 189  *               mbuf_cslab_alloc()                     |
 190  *                      |                               |
 191  *                      v                               |
 192  *                  [freelist] -------> (found?) -------+
 193  *                      |                               |
 194  *                      v                               |
 195  *              (rudimentary object)                    |
 196  *         mcache_alloc/mcache_alloc_ext() ------>>-----+
 197  *
 198  * Auditing notes: If auditing is enabled, buffers will be subjected to
 199  * integrity checks by the audit routine.  This is done by verifying their
 200  * contents against DEADBEEF (free) pattern before returning them to caller.
 201  * As part of this step, the routine will also record the transaction and
 202  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 203  * also restore any constructed data structure fields if necessary.
 204  *
 205  * OBJECT DEALLOCATION:
 206  *
 207  * Freeing an object simply involves placing it into the CPU cache; this
 208  * pollutes the cache to benefit subsequent allocations.  The slab layer
 209  * will only be entered if the object is to be purged out of the cache.
 210  * During normal operations, this happens only when the CPU layer resizes
 211  * its bucket while it's adjusting to the allocation load.  Deallocation
 212  * paths are different depending on the class of objects:
 213  *
 214  * a. Rudimentary object:
 215  *
 216  *      { m_free(), m_freem_list(), composite object deallocation }
 217  *                      |       ^
 218  *                      |       |
 219  *                      |       +------ (done) ---------+
 220  *                      v                               |
 221  *         mcache_free/mcache_free_ext()                |
 222  *                      |                               |
 223  *                      v                               |
 224  *              mbuf_slab_audit()                       |
 225  *                      |                               |
 226  *                      v                               |
 227  *                 [CPU cache] ---> (not purging?) -----+
 228  *                      |                               |
 229  *                      v                               |
 230  *               mbuf_slab_free()                       |
 231  *                      |                               |
 232  *                      v                               |
 233  *                  [freelist] ----------->>------------+
 234  *       (objects get purged to VM only on demand)
 235  *
 236  * b. Composite object:
 237  *
 238  *      { m_free(), m_freem_list() }
 239  *                      |       ^
 240  *                      |       |
 241  *                      |       +------ (done) ---------+
 242  *                      v                               |
 243  *         mcache_free/mcache_free_ext()                |
 244  *                      |                               |
 245  *                      v                               |
 246  *              mbuf_cslab_audit()                      |
 247  *                      |                               |
 248  *                      v                               |
 249  *                 [CPU cache] ---> (not purging?) -----+
 250  *                      |                               |
 251  *                      v                               |
 252  *               mbuf_cslab_free()                      |
 253  *                      |                               |
 254  *                      v                               |
 255  *                  [freelist] ---> (not purging?) -----+
 256  *                      |                               |
 257  *                      v                               |
 258  *              (rudimentary object)                    |
 259  *         mcache_free/mcache_free_ext() ------->>------+
 260  *
 261  * Auditing notes: If auditing is enabled, the audit routine will save
 262  * any constructed data structure fields (if necessary) before filling the
 263  * contents of the buffers with DEADBEEF (free) pattern and recording the
 264  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 265  * expected to contain the free pattern.
 266  *
 267  * DEBUGGING:
 268  *
 269  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 270  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 271  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 272  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 273  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 274  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 275  *
 276  * Each object is associated with exactly one mcache_audit_t structure that
 277  * contains the information related to its last buffer transaction.  Given
 278  * an address of an object, the audit structure can be retrieved by finding
 279  * the position of the object relative to the base address of the cluster:
 280  *
 281  *      +------------+                  +=============+
 282  *      | mbuf addr  |                  | mclaudit[i] |
 283  *      +------------+                  +=============+
 284  *            |                         | cl_audit[0] |
 285  *      i = MTOBG(addr)                 +-------------+
 286  *            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
 287  *      b = BGTOM(i)            |       +-------------+
 288  *            |                 |       |     ...     |
 289  *      x = MCLIDX(b, addr)     |       +-------------+
 290  *            |                 |       | cl_audit[7] |
 291  *            +-----------------+       +-------------+
 292  *               (e.g. x == 1)
 293  *
 294  * The mclaudit[] array is allocated at initialization time, but its contents
 295  * get populated when the corresponding cluster is created.  Because a page
 296  * can be turned into NMBPG number of mbufs, we reserve enough space for the
 297  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 298  * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
 299  * remaining entries unused.  For 16KB cluster, only one entry from the first
 300  * page is allocated and used for the entire object.
 301  */
 302
 303 /* TODO: should be in header file */
 304 /* kernel translator */
 305 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
 306 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 307 extern vm_map_t mb_map;         /* special map */
 308
 309 /* Global mbuf lock; held for most slab layer operations (see notes above) */
 310 decl_lck_mtx_data(static, mbuf_mlock_data);
 311 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
 312 static lck_attr_t *mbuf_mlock_attr;
 313 static lck_grp_t *mbuf_mlock_grp;
 314 static lck_grp_attr_t *mbuf_mlock_grp_attr;
 315
 316 /* Back-end (common) layer */
 317 static void *mbuf_worker_run;   /* wait channel for worker thread */
 318 static int mbuf_worker_ready;   /* worker thread is runnable */
 319 static int mbuf_expand_mcl;     /* number of cluster creation requests */
 320 static int mbuf_expand_big;     /* number of big cluster creation requests */
 321 static int mbuf_expand_16k;     /* number of 16KB cluster creation requests */
 322 static int ncpu;                /* number of CPUs */
 323 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
 324 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
 325 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
 326 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
 327 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
 328 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
 329 static unsigned int mb_normalized; /* number of packets "normalized" */
 330
 331 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
 332 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
 333 /* mbuf object classes; the values are used as indices into mbuf_table[] */
 334 typedef enum {
 335         MC_MBUF = 0,    /* Regular mbuf */
 336         MC_CL,          /* Cluster */
 337         MC_BIGCL,       /* Large (4KB) cluster */
 338         MC_16KCL,       /* Jumbo (16KB) cluster */
 339         MC_MBUF_CL,     /* mbuf + cluster */
 340         MC_MBUF_BIGCL,  /* mbuf + large (4KB) cluster */
 341         MC_MBUF_16KCL   /* mbuf + jumbo (16KB) cluster */
 342 } mbuf_class_t;
 343
 344 #define MBUF_CLASS_MIN          MC_MBUF         /* first valid class */
 345 #define MBUF_CLASS_MAX          MC_MBUF_16KCL   /* last valid class */
 346 #define MBUF_CLASS_LAST         MC_16KCL        /* last non-composite class */
 347 #define MBUF_CLASS_VALID(c) \
 348         ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
 349 #define MBUF_CLASS_COMPOSITE(c) \
 350         ((int)(c) > MBUF_CLASS_LAST)
 351
 352
 353 /*
 354  * mbuf-specific mcache allocation request flags.
 355  */
 356 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
 357
 358 /*
 359  * Per-cluster slab structure.
 360  *
 361  * A slab is a cluster control structure that contains one or more object
 362  * chunks; the available chunks are chained in the slab's freelist (sl_head).
 363  * Each time a chunk is taken out of the slab, the slab's reference count
 364  * gets incremented.  When all chunks have been taken out, the empty slab
 365  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 366  * returned to a slab causes the slab's reference count to be decremented;
 367  * it also causes the slab to be reinserted into the class's slab list, if
 368  * it is not already there.
 369  *
 370  * Compartmentalizing the object chunks into slabs allows us to easily
 371  * merge one or more slabs together when the adjacent slabs are idle, as
 372  * well as to convert or move a slab from one class to another; e.g. the
 373  * mbuf cluster slab can be converted to a regular cluster slab when all
 374  * mbufs in the slab have been freed.
 375  *
 376  * A slab may also span across multiple clusters for chunks larger than
 377  * a cluster's size.  In this case, only the slab of the first cluster is
 378  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 379  * that they are part of the larger slab.
 380  *
 381  * Each slab controls a page of memory.
 382  */
 383 typedef struct mcl_slab {
 384         struct mcl_slab *sl_next;       /* neighboring slab */
 385         u_int8_t        sl_class;       /* controlling mbuf class */
 386         int8_t          sl_refcnt;      /* outstanding allocations */
 387         int8_t          sl_chunks;      /* chunks (bufs) in this slab */
 388         u_int16_t       sl_flags;       /* slab flags (see below) */
 389         u_int16_t       sl_len;         /* slab length */
 390         void            *sl_base;       /* base of allocated memory */
 391         void            *sl_head;       /* first free buffer */
 392         TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
 393 } mcl_slab_t;
 394
 395 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
 396 #define SLF_PARTIAL     0x0002          /* part of another slab */
 397 #define SLF_DETACHED    0x0004          /* not in slab freelist */
 398
 399 /*
 400  * The array of slabs is broken into groups of arrays per 1MB of kernel
 401  * memory to reduce the footprint.  Each group is allocated on demand
 402  * whenever a new piece of memory mapped in from the VM crosses the 1MB
 403  * boundary.
 404  */
 405 #define NSLABSPMB       ((1 << MBSHIFT) >> PAGE_SHIFT) /* slabs (pages) per group */
 406
 407 typedef struct mcl_slabg {
 408         mcl_slab_t      *slg_slab;      /* group of slabs */
 409 } mcl_slabg_t;
 410
 411 /*
 412  * Number of slabs needed to control a 16KB cluster object.
 413  */
 414 #define NSLABSP16KB     (M16KCLBYTES >> PAGE_SHIFT)
 415
 416 /*
 417  * Per-cluster audit structure.
 418  */
 419 typedef struct {
 420         mcache_audit_t  **cl_audit;     /* array of audits */
 421 } mcl_audit_t;
 422 /* Current and previous transaction records (embedded as sc_scratch below) */
 423 typedef struct {
 424         struct thread   *msa_thread;    /* thread doing transaction */
 425         struct thread   *msa_pthread;   /* previous transaction thread */
 426         uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
 427         uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
 428         uint16_t        msa_depth;      /* pc stack depth */
 429         uint16_t        msa_pdepth;     /* previous transaction pc stack depth */
 430         void            *msa_stack[MCACHE_STACK_DEPTH];  /* pc stack */
 431         void            *msa_pstack[MCACHE_STACK_DEPTH]; /* previous pc stack */
 432 } mcl_scratch_audit_t;
 433
 434 typedef struct {
 435         /*
 436          * Size of data from the beginning of an mbuf that covers m_hdr,
 437          * pkthdr and m_ext structures.  If auditing is enabled, we allocate
 438          * a shadow mbuf structure of this size inside each audit structure,
 439          * and the contents of the real mbuf get copied into it when the mbuf
 440          * is freed.  This allows us to pattern-fill the mbuf for integrity
 441          * check, and to preserve any constructed mbuf fields (e.g. mbuf +
 442          * cluster cache case).  Note that we don't save the contents of
 443          * clusters when they are freed; we simply pattern-fill them.
 444          */
 445         u_int8_t                sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
 446         mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8)));
 447 } mcl_saved_contents_t;
 448
 449 #define AUDIT_CONTENTS_SIZE     (sizeof (mcl_saved_contents_t))
 450 /* Accessors that view an audit's mca_contents as mcl_saved_contents_t */
 451 #define MCA_SAVED_MBUF_PTR(_mca)                                        \
 452         ((struct mbuf *)(void *)((mcl_saved_contents_t *)               \
 453         (_mca)->mca_contents)->sc_mbuf)
 454 #define MCA_SAVED_MBUF_SIZE                                             \
 455         (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
 456 #define MCA_SAVED_SCRATCH_PTR(_mca)                                     \
 457         (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
 458
 459 /*
 460  * mbuf-specific mcache audit flags
 461  */
 462 #define MB_INUSE        0x01    /* object has not been returned to slab */
 463 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
 464 #define MB_SCVALID      0x04    /* object has valid saved contents */
 465
 466 /*
 467  * Each of the following two arrays holds up to nmbclusters elements.
 468  */
 469 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
 470 static unsigned int maxclaudit; /* max # of entries in audit table */
 471 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
 472 static unsigned int maxslabgrp; /* max # of entries in slabs table */
 473 static unsigned int slabgrp;    /* # of entries in slabs table */
 474
 475 /* Globals (external linkage) */
 476 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
 477 int njcl;                       /* # of clusters for jumbo sizes */
 478 int njclbytes;                  /* size of a jumbo cluster */
 479 unsigned char *mbutl;           /* first mapped cluster address */
 480 unsigned char *embutl;          /* ending virtual address of mclusters */
 481 int _max_linkhdr;               /* largest link-level header */
 482 int _max_protohdr;              /* largest protocol header */
 483 int max_hdr;                    /* largest link+protocol header */
 484 int max_datalen;                /* MHLEN - max_hdr */
 485 /* File-local debugging switches */
 486 static boolean_t mclverify;     /* debug: pattern-checking */
 487 static boolean_t mcltrace;      /* debug: stack tracing */
 488 static boolean_t mclfindleak;   /* debug: leak detection */
 489 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
 490
 491 static struct timeval mb_start; /* beginning of time */
 492
 493 /* mbuf leak detection variables */
 494 static struct mleak_table mleak_table;
 495 static mleak_stat_t *mleak_stat;
 496 /* Byte offset of ml_trace[n], i.e. size of an mleak_stat_t holding n traces */
 497 #define MLEAK_STAT_SIZE(n) \
 498         ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
 499 /* Element type of the mleak_allocations hash map */
 500 struct mallocation {
 501         mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
 502         u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
 503         u_int32_t count;        /* How many objects were requested */
 504         u_int64_t hitcount;     /* for determining hash effectiveness */
 505 };
 506 /* Element type of the mleak_traces hash map (one recorded backtrace) */
 507 struct mtrace {
 508         u_int64_t       collisions;
 509         u_int64_t       hitcount;
 510         u_int64_t       allocs;
 511         u_int64_t       depth;  /* presumably # of valid addr[] entries; confirm */
 512         uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace addresses */
 513 };
 514
 515 /* Size must be a power of two for the zhash to be able to just mask off bits */
 516 #define MLEAK_ALLOCATION_MAP_NUM        512
 517 #define MLEAK_TRACE_MAP_NUM             256
 518
 519 /*
 520  * Sample factor for how often to record a trace.  This can be overridden
 521  * by the boot-arg mleak_sample_factor.
 522  */
 523 #define MLEAK_SAMPLE_FACTOR             500
 524
 525 /*
 526  * Number of top leakers recorded.
 527  */
 528 #define MLEAK_NUM_TRACES                5
 529
 530 #define MB_LEAK_SPACING_64 "                    "
 531 #define MB_LEAK_SPACING_32 "            "
 532
 533 /* Column headers for the top traces; _32/_64 presumably track pointer width */
 534 #define MB_LEAK_HDR_32  "\n\
 535     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
 536     ----------  ----------  ----------  ----------  ---------- \n\
 537 "
 538
 539 #define MB_LEAK_HDR_64  "\n\
 540     trace [1]           trace [2]           trace [3]       \
 541         trace [4]           trace [5]      \n\
 542     ------------------  ------------------  ------------------  \
 543     ------------------  ------------------ \n\
 544 "
 545
 546 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM; /* power of 2 */
 547 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;      /* power of 2 */
 548
 549 /* Hashmaps of allocations and their corresponding traces */
 550 static struct mallocation *mleak_allocations;
 551 static struct mtrace *mleak_traces;
 552 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 553
 554 /* Lock to protect mleak tables from concurrent modification */
 555 decl_lck_mtx_data(static, mleak_lock_data);
 556 static lck_mtx_t *mleak_lock = &mleak_lock_data;
 557 static lck_attr_t *mleak_lock_attr;
 558 static lck_grp_t *mleak_lock_grp;
 559 static lck_grp_attr_t *mleak_lock_grp_attr;
 560
 561 extern u_int32_t high_sb_max;
 562
 563 /* The minimum number of objects of each kind allocated to start with. */
 564 #define MINCL           32
 565 #define MINBIGCL        (MINCL >> 1)    /* 16 */
 566 #define MIN16KCL        (MINCL >> 2)    /* 8 */
 567
 568 /* Low watermarks (only map in pages once free counts go below) */
 569 #define MBIGCL_LOWAT    MINBIGCL
 570 #define M16KCL_LOWAT    MIN16KCL
 571 /* Per-class control block; mbuf_table[] holds one per mbuf_class_t */
 572 typedef struct {
 573         mbuf_class_t    mtbl_class;     /* class type */
 574         mcache_t        *mtbl_cache;    /* mcache for this buffer class */
 575         TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
 576         mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
 577         mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
 578         u_int32_t       mtbl_maxsize;   /* maximum buffer size */
 579         int             mtbl_minlimit;  /* minimum allowed */
 580         int             mtbl_maxlimit;  /* maximum allowed */
 581         u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
 582         uint32_t        mtbl_avgtotal;  /* average total on iOS */
 583 } mbuf_table_t;
 584 /* Accessors into mbuf_table[c]; those naming mbcl_* go through mtbl_stats */
 585 #define m_class(c)      mbuf_table[c].mtbl_class
 586 #define m_cache(c)      mbuf_table[c].mtbl_cache
 587 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
 588 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
 589 #define m_maxsize(c)    mbuf_table[c].mtbl_maxsize
 590 #define m_minlimit(c)   mbuf_table[c].mtbl_minlimit
 591 #define m_maxlimit(c)   mbuf_table[c].mtbl_maxlimit
 592 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
 593 #define m_avgtotal(c)   mbuf_table[c].mtbl_avgtotal
 594 #define m_cname(c)      mbuf_table[c].mtbl_stats->mbcl_cname
 595 #define m_size(c)       mbuf_table[c].mtbl_stats->mbcl_size
 596 #define m_total(c)      mbuf_table[c].mtbl_stats->mbcl_total
 597 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
 598 #define m_infree(c)     mbuf_table[c].mtbl_stats->mbcl_infree
 599 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
 600 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
 601 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
 602 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
 603 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
 604 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
 605 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
 606 #define m_peak(c)       mbuf_table[c].mtbl_stats->mbcl_peak_reported
 607 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
 608
 609 static mbuf_table_t mbuf_table[] = {
 610         /*
 611          * The caches for mbufs, regular, big and 16KB clusters.
 612          * The average total values were based on data gathered by actual
 613          * usage patterns on iOS.
 614          */
 615         { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
 616             NULL, NULL, 0, 0, 0, 0, 3000 },
 617         { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
 618             NULL, NULL, 0, 0, 0, 0, 2000 },
 619         { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
 620             NULL, NULL, 0, 0, 0, 0, 1000 },
 621         { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
 622             NULL, NULL, 0, 0, 0, 0, 1000 },
 623         /*
 624          * The following are special caches; they serve as intermediate
 625          * caches backed by the above rudimentary caches.  Each object
 626          * in the cache is an mbuf with a cluster attached to it.  Unlike
 627          * the above caches, these intermediate caches do not directly
 628          * deal with the slab structures; instead, the constructed
 629          * cached elements are simply stored in the freelists.
 630          */
 631         { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000 },
 632         { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
 633         { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
 634 };
 635
 636 #define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
 637
/*
 * A single wait channel is shared by every cache; the channel is simply
 * the address of mbuf_table.
 */
static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
static int mb_waiters;                  /* number of waiters */

/* Peak usage reporting flags; both start out FALSE. */
boolean_t mb_peak_newreport = FALSE;
boolean_t mb_peak_firstreport = FALSE;
 643
/*
 * generate a report by default after 1 week of uptime
 * (604800 == 7 * 24 * 60 * 60, i.e. the threshold is in seconds)
 */
#define MBUF_PEAK_FIRST_REPORT_THRESHOLD        604800

#define MB_WDT_MAXTIME  10              /* # of secs before watchdog panic */
static struct timeval mb_wdtstart;      /* watchdog start timestamp */
/* NOTE(review): presumably the buffer mbuf_dump() formats into -- confirm */
static char *mbuf_dump_buf;

/* NOTE(review): presumably the size in bytes of mbuf_dump_buf -- confirm */
#define MBUF_DUMP_BUF_SIZE      2048
 652
/*
 * mbuf watchdog is enabled by default on embedded platforms.  It is
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
 * Garbage collection is also enabled by default on embedded platforms.
 * mb_drain_maxint controls the amount of time to wait (in seconds) before
 * consecutive calls to m_drain().
 *
 * NOTE(review): both variables are statically initialized to 0 here; any
 * per-platform enabling described above is not visible in this part of
 * the file -- confirm where (or whether) it happens.
 */
static unsigned int mb_watchdog = 0;
static unsigned int mb_drain_maxint = 0;
 662
/*
 * Red zone.  m_redzone_init() is invoked for every packet header mbuf by
 * MBUF_INIT_PKTHDR().
 * NOTE(review): mb_redzone_cookie is presumably the value the red zone
 * is derived from and later verified against -- confirm.
 */
static u_int32_t mb_redzone_cookie;
static void m_redzone_init(struct mbuf *);
static void m_redzone_verify(struct mbuf *m);
 667
/*
 * The following are used to serialize m_clalloc(); the wait channel is
 * the address of the busy flag itself.
 */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;
 672
/*
 * Forward declarations of the routines private to this file
 * (mbuf_report_peak_usage() is the one non-static entry).
 */

/* Statistics, sysctl handlers, reference counting and the allocator core */
static void mbuf_mtypes_sync(boolean_t);
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_stat_sync(void);
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
static char *mbuf_dump(void);
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

/* The mcl_audit_*() family */
static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

/* The mleak_*() family (leak tracing) */
static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

/* Helpers operating on mcl_slab_t */
static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

/* Chain manipulation back ends and usage reporting */
static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);
 743
 744 /* flags for m_copyback0 */
 745 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
 746 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
 747 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
 748 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
 749
 750 /*
 751  * This flag is set for all mbufs that come out of and into the composite
 752  * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 753  * are marked with such a flag have clusters attached to them, and will be
 754  * treated differently when they are freed; instead of being placed back
 755  * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 756  * are placed back into the appropriate composite cache's freelist, and the
 757  * actual freeing is deferred until the composite objects are purged.  At
 758  * such a time, this flag will be cleared from the mbufs and the objects
 759  * will be freed into their own separate freelists.
 760  */
 761 #define EXTF_COMPOSITE  0x1
 762
 763 /*
 764  * This flag indicates that the external cluster is read-only, i.e. it is
 765  * or was referred to by more than one mbufs.  Once set, this flag is never
 766  * cleared.
 767  */
 768 #define EXTF_READONLY   0x2
 769 #define EXTF_MASK       (EXTF_COMPOSITE | EXTF_READONLY)
 770
/*
 * Accessors for the reference/flags structure attached to an mbuf's
 * external storage (m_ext.ext_refflags): the structure pointer itself,
 * its reference count and its EXTF_* flags.
 */
#define MEXT_RFA(m)             ((m)->m_ext.ext_refflags)
#define MEXT_REF(m)             (MEXT_RFA(m)->refcnt)
#define MEXT_FLAGS(m)           (MEXT_RFA(m)->flags)
/*
 * True when the reference count is zero and, of the EXTF_MASK bits, only
 * EXTF_COMPOSITE is set.  Note that `m' is evaluated more than once.
 */
#define MBUF_IS_COMPOSITE(m)    \
        (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
 776
/*
 * Macros used to verify the integrity of the mbuf.
 *
 * _MCHECK() panics unless the mbuf's type is MT_FREE: directly when
 * mclaudit is NULL, otherwise by way of mcl_audit_mcheck_panic().
 * The macro expands to a bare brace block (not do/while) and evaluates
 * `m' more than once, so pass a simple, side-effect-free expression.
 */
#define _MCHECK(m) {                                                    \
        if ((m)->m_type != MT_FREE) {                                   \
                if (mclaudit == NULL)                                   \
                        panic("MCHECK: m_type=%d m=%p",                 \
                            (u_int16_t)(m)->m_type, m);                 \
                else                                                    \
                        mcl_audit_mcheck_panic(m);                      \
        }                                                               \
}
 789
/*
 * True when addr lies within the half-open range [mbutl, embutl).
 */
#define MBUF_IN_MAP(addr)                                               \
        ((unsigned char *)(addr) >= mbutl &&                            \
        (unsigned char *)(addr) < embutl)

/*
 * Panic if addr falls outside of the range tested by MBUF_IN_MAP().
 * Expands to a bare brace block.
 */
#define MRANGE(addr) {                                                  \
        if (!MBUF_IN_MAP(addr))                                         \
                panic("MRANGE: address out of range 0x%p", addr);       \
}
 798
/*
 * Macro version of mtod: yields the mbuf's data pointer (m_data) cast to
 * type `t'.
 */
#define MTOD(m, t)      ((t)((m)->m_data))
 803
 804 /*
 805  * Macros to obtain page index given a base cluster address
 806  */
 807 #define MTOPG(x)        (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
 808 #define PGTOM(x)        (mbutl + (x << PAGE_SHIFT))
 809
/*
 * Macro to find the mbuf index relative to a base: the byte distance
 * from `c' to `m', shifted right by MSIZESHIFT.
 */
#define MBPAGEIDX(c, m) \
        (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index (shift by MCLSHIFT).
 */
#define CLPAGEIDX(c, m) \
        (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)

/*
 * Macro to find 4KB cluster index relative to a base (shift by
 * MBIGCLSHIFT).
 */
#define BCLPAGEIDX(c, m) \
        (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
 827
/*
 * Macros used during mbuf and cluster initialization.
 *
 * MBUF_INIT_PKTHDR() resets the packet header fields shown below and then
 * runs the classifier, tag, scratch area and red zone initializers.
 */
#define MBUF_INIT_PKTHDR(m) {                                           \
        (m)->m_pkthdr.rcvif = NULL;                                     \
        (m)->m_pkthdr.pkt_hdr = NULL;                                   \
        (m)->m_pkthdr.len = 0;                                          \
        (m)->m_pkthdr.csum_flags = 0;                                   \
        (m)->m_pkthdr.csum_data = 0;                                    \
        (m)->m_pkthdr.vlan_tag = 0;                                     \
        m_classifier_init(m, 0);                                        \
        m_tag_init(m, 1);                                               \
        m_scratch_init(m);                                              \
        m_redzone_init(m);                                              \
}

/*
 * MBUF_INIT() prepares an mbuf for use.  It first verifies through
 * _MCHECK() that the mbuf is currently of type MT_FREE, then clears the
 * chain links and length and stamps the requested type.  With pkthdr == 0
 * the data pointer is set to m_dat and the flags are cleared; otherwise
 * the data pointer is set to m_pktdat, the flags become M_PKTHDR and the
 * packet header is initialized with MBUF_INIT_PKTHDR().
 */
#define MBUF_INIT(m, pkthdr, type) {                                    \
        _MCHECK(m);                                                     \
        (m)->m_next = (m)->m_nextpkt = NULL;                            \
        (m)->m_len = 0;                                                 \
        (m)->m_type = type;                                             \
        if ((pkthdr) == 0) {                                            \
                (m)->m_data = (m)->m_dat;                               \
                (m)->m_flags = 0;                                       \
        } else {                                                        \
                (m)->m_data = (m)->m_pktdat;                            \
                (m)->m_flags = M_PKTHDR;                                \
                MBUF_INIT_PKTHDR(m);                                    \
        }                                                               \
}
 858
/*
 * Attach external storage to an mbuf: point both m_data and ext_buf at
 * `buf', set M_EXT, record the size, free routine and its argument, make
 * ext_refs a list that refers only to itself, and install the
 * reference/flags structure `rfa'.  MEXT_RFA() must be assigned before
 * MEXT_REF() and MEXT_FLAGS() because both of those dereference it.
 */
#define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {            \
        (m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
        (m)->m_flags |= M_EXT;                                          \
        (m)->m_ext.ext_size = (size);                                   \
        (m)->m_ext.ext_free = (free);                                   \
        (m)->m_ext.ext_arg = (arg);                                     \
        (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =    \
            &(m)->m_ext.ext_refs;                                       \
        MEXT_RFA(m) = (rfa);                                            \
        MEXT_REF(m) = (ref);                                            \
        MEXT_FLAGS(m) = (flag);                                         \
}

/* 2KB cluster: class maximum size, no free routine (NULL) */
#define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
        MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

/* 4KB cluster: class maximum size, freed through m_bigfree */
#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
        MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

/* 16KB cluster: class maximum size, freed through m_16kfree */
#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
        MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
 880
 881 /*
 882  * Macro to convert BSD malloc sleep flag to mcache's
 883  */
 884 #define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
 885
/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 * Both structures are allocated by mbuf_table_init().
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;      /* For backwards compatibility */

/*
 * Size in bytes of a statistics structure carrying `n' class entries,
 * computed as the offset of mbs_class[n] from the start of the structure.
 */
#define MB_STAT_SIZE(n) \
        ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define OMB_STAT_SIZE(n) \
        ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
 899
/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

/* Number of entries in the mbstat.m_mtypes[] array */
#define MBSTAT_MTYPES_MAX \
        (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
 915
/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater
 * than or equal to MT_MAX would be done atomically to the mbstat; this
 * slows down performance but is okay since the kernel uses only up to
 * MT_MAX-1 while anything beyond that (up to type 255) is considered a
 * corner case.
 */
typedef struct {
        unsigned int    cpu_mtypes[MT_MAX];
} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;

/*
 * Container for the per-CPU counters.  mbs_cpu[] is declared with a
 * single element but is indexed by CPU number (see MBUF_MTYPES_SIZE()
 * and mbuf_mtypes_sync()).
 */
typedef struct {
        mtypes_cpu_t    mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
 936
/*
 * Byte offset of mbs_cpu[n] within mbuf_mtypes_t, i.e. the size of a
 * structure holding counters for `n' CPUs.
 */
#define MBUF_MTYPES_SIZE(n) \
        ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

/*
 * Address of the counters belonging to the CPU this code is currently
 * running on (as reported by cpu_number()), given the base pointer `p'.
 */
#define MTYPES_CPU(p) \
        ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
 942
/*
 * Adjust the counter for mbuf type `type' by `n'.  Types below MT_MAX are
 * counted in the current CPU's array with a 32-bit atomic add; types from
 * MT_MAX up to the end of mbstat.m_mtypes[] are updated in mbstat directly
 * with a 16-bit atomic add; anything beyond that is silently ignored.
 */
#define mtype_stat_add(type, n) {                                       \
        if ((unsigned)(type) < MT_MAX) {                                \
                mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
                atomic_add_32(&mbs->cpu_mtypes[type], n);               \
        } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
                atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
        }                                                               \
}

/* Convenience wrappers: subtract `n', add one, subtract one */
#define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
#define mtype_stat_inc(t)       mtype_stat_add(t, 1)
#define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
 955
 956 static void
 957 mbuf_mtypes_sync(boolean_t locked)
 958 {
 959         int m, n;
 960         mtypes_cpu_t mtc;
 961
 962         if (locked)
 963                 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
 964
 965         bzero(&mtc, sizeof (mtc));
 966         for (m = 0; m < ncpu; m++) {
 967                 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
 968                 mtypes_cpu_t temp;
 969
 970                 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
 971                     sizeof (temp.cpu_mtypes));
 972
 973                 for (n = 0; n < MT_MAX; n++)
 974                         mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
 975         }
 976         if (!locked)
 977                 lck_mtx_lock(mbuf_mlock);
 978         for (n = 0; n < MT_MAX; n++)
 979                 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
 980         if (!locked)
 981                 lck_mtx_unlock(mbuf_mlock);
 982 }
 983
 984 static int
 985 mbstat_sysctl SYSCTL_HANDLER_ARGS
 986 {
 987 #pragma unused(oidp, arg1, arg2)
 988         mbuf_mtypes_sync(FALSE);
 989
 990         return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
 991 }
 992
 993 static void
 994 mbuf_stat_sync(void)
 995 {
 996         mb_class_stat_t *sp;
 997         mcache_cpu_t *ccp;
 998         mcache_t *cp;
 999         int k, m, bktsize;
1000
1001         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1002
1003         for (k = 0; k < NELEM(mbuf_table); k++) {
1004                 cp = m_cache(k);
1005                 ccp = &cp->mc_cpu[0];
1006                 bktsize = ccp->cc_bktsize;
1007                 sp = mbuf_table[k].mtbl_stats;
1008
1009                 if (cp->mc_flags & MCF_NOCPUCACHE)
1010                         sp->mbcl_mc_state = MCS_DISABLED;
1011                 else if (cp->mc_purge_cnt > 0)
1012                         sp->mbcl_mc_state = MCS_PURGING;
1013                 else if (bktsize == 0)
1014                         sp->mbcl_mc_state = MCS_OFFLINE;
1015                 else
1016                         sp->mbcl_mc_state = MCS_ONLINE;
1017
1018                 sp->mbcl_mc_cached = 0;
1019                 for (m = 0; m < ncpu; m++) {
1020                         ccp = &cp->mc_cpu[m];
1021                         if (ccp->cc_objs > 0)
1022                                 sp->mbcl_mc_cached += ccp->cc_objs;
1023                         if (ccp->cc_pobjs > 0)
1024                                 sp->mbcl_mc_cached += ccp->cc_pobjs;
1025                 }
1026                 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1027                 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1028                     sp->mbcl_infree;
1029
1030                 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1031                 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1032                 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1033
1034                 /* Calculate total count specific to each class */
1035                 sp->mbcl_ctotal = sp->mbcl_total;
1036                 switch (m_class(k)) {
1037                 case MC_MBUF:
1038                         /* Deduct mbufs used in composite caches */
1039                         sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1040                             m_total(MC_MBUF_BIGCL));
1041                         break;
1042
1043                 case MC_CL:
1044                         /* Deduct clusters used in composite cache */
1045                         sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1046                         break;
1047
1048                 case MC_BIGCL:
1049                         /* Deduct clusters used in composite cache */
1050                         sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1051                         break;
1052
1053                 case MC_16KCL:
1054                         /* Deduct clusters used in composite cache */
1055                         sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1056                         break;
1057
1058                 default:
1059                         break;
1060                 }
1061         }
1062 }
1063
1064 static int
1065 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1066 {
1067 #pragma unused(oidp, arg1, arg2)
1068         void *statp;
1069         int k, statsz, proc64 = proc_is64bit(req->p);
1070
1071         lck_mtx_lock(mbuf_mlock);
1072         mbuf_stat_sync();
1073
1074         if (!proc64) {
1075                 struct omb_class_stat *oc;
1076                 struct mb_class_stat *c;
1077
1078                 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1079                 oc = &omb_stat->mbs_class[0];
1080                 c = &mb_stat->mbs_class[0];
1081                 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1082                         (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1083                             "%s", c->mbcl_cname);
1084                         oc->mbcl_size = c->mbcl_size;
1085                         oc->mbcl_total = c->mbcl_total;
1086                         oc->mbcl_active = c->mbcl_active;
1087                         oc->mbcl_infree = c->mbcl_infree;
1088                         oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1089                         oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1090                         oc->mbcl_free_cnt = c->mbcl_free_cnt;
1091                         oc->mbcl_notified = c->mbcl_notified;
1092                         oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1093                         oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1094                         oc->mbcl_ctotal = c->mbcl_ctotal;
1095                         oc->mbcl_release_cnt = c->mbcl_release_cnt;
1096                         oc->mbcl_mc_state = c->mbcl_mc_state;
1097                         oc->mbcl_mc_cached = c->mbcl_mc_cached;
1098                         oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1099                         oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1100                         oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1101                 }
1102                 statp = omb_stat;
1103                 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1104         } else {
1105                 statp = mb_stat;
1106                 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1107         }
1108
1109         lck_mtx_unlock(mbuf_mlock);
1110
1111         return (SYSCTL_OUT(req, statp, statsz));
1112 }
1113
1114 static int
1115 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1116 {
1117 #pragma unused(oidp, arg1, arg2)
1118         int i;
1119
1120         /* Ensure leak tracing turned on */
1121         if (!mclfindleak || !mclexpleak)
1122                 return (ENXIO);
1123
1124         lck_mtx_lock(mleak_lock);
1125         mleak_update_stats();
1126         i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1127         lck_mtx_unlock(mleak_lock);
1128
1129         return (i);
1130 }
1131
1132 static int
1133 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1134 {
1135 #pragma unused(oidp, arg1, arg2)
1136         int i = 0;
1137
1138         /* Ensure leak tracing turned on */
1139         if (!mclfindleak || !mclexpleak)
1140                 return (ENXIO);
1141
1142         lck_mtx_lock(mleak_lock);
1143         i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1144         lck_mtx_unlock(mleak_lock);
1145
1146         return (i);
1147 }
1148
1149 static inline void
1150 m_incref(struct mbuf *m)
1151 {
1152         UInt32 old, new;
1153         volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1154
1155         do {
1156                 old = *addr;
1157                 new = old + 1;
1158                 ASSERT(new != 0);
1159         } while (!OSCompareAndSwap(old, new, addr));
1160
1161         /*
1162          * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1163          * we don't clear the flag when the refcount goes back to 1
1164          * to simplify code calling m_mclhasreference().
1165          */
1166         if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1167                 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1168 }
1169
1170 static inline u_int32_t
1171 m_decref(struct mbuf *m)
1172 {
1173         UInt32 old, new;
1174         volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1175
1176         do {
1177                 old = *addr;
1178                 new = old - 1;
1179                 ASSERT(old != 0);
1180         } while (!OSCompareAndSwap(old, new, addr));
1181
1182         return (new);
1183 }
1184
/*
 * Allocate the statistics structures exported via sysctl, hook each
 * mbuf_table[] row up to its statistics entry, carve the cluster map
 * (nmbclusters) into the region shared by 2KB/4KB clusters and the
 * optional 16KB jumbo region, and then record the name, object size and
 * min/max limits of every class.  Finally the legacy mbstat structure is
 * zeroed and its constant fields are filled in.
 */
static void
mbuf_table_init(void)
{
        unsigned int b, c, s;
        int m, config_mbuf_jumbo = 0;

        MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
            M_TEMP, M_WAITOK | M_ZERO);
        VERIFY(omb_stat != NULL);

        MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
            M_TEMP, M_WAITOK | M_ZERO);
        VERIFY(mb_stat != NULL);

        /* Point every table row at its own statistics entry */
        mb_stat->mbs_cnt = NELEM(mbuf_table);
        for (m = 0; m < NELEM(mbuf_table); m++)
                mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
        config_mbuf_jumbo = 1;
#endif /* CONFIG_MBUF_JUMBO */

        if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
                /*
                 * Set aside 1/3 of the mbuf cluster map for jumbo
                 * clusters; we do this only on platforms where jumbo
                 * cluster pool is enabled.
                 */
                njcl = nmbclusters / 3;
                njclbytes = M16KCLBYTES;
        }

        /*
         * nclusters holds both the 2KB and 4KB pools, so ensure it's
         * a multiple of 4KB clusters.
         */
        nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
        if (njcl > 0) {
                /*
                 * Each jumbo cluster takes 8 2KB clusters, so make
                 * sure that the pool size is evenly divisible by 8;
                 * njcl is in 2KB unit, hence treated as such.
                 */
                njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);

                /* Update nclusters with rounded down value of njcl */
                nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
        }

        /*
         * njcl is valid only on platforms with 16KB jumbo clusters or
         * with 16KB pages, where it is configured to 1/3 of the pool
         * size.  On these platforms, the remaining is used for 2KB
         * and 4KB clusters.  On platforms without 16KB jumbo clusters,
         * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
         * cluster can either be split into 16 mbufs, or into 2 2KB
         * clusters.
         *
         *  +---+---+------------ ... -----------+------- ... -------+
         *  | c | b |              s             |        njcl       |
         *  +---+---+------------ ... -----------+------- ... -------+
         *
         * 1/32th of the shared region is reserved for pure 2KB and 4KB
         * clusters (1/64th each.)
         */
        c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
        b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
        s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */

        /*
         * 1/64th (c) is reserved for 2KB clusters.
         */
        m_minlimit(MC_CL) = c;
        m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
        m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
        (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

        /*
         * Another 1/64th (b) of the map is reserved for 4KB clusters.
         * It cannot be turned into 2KB clusters or mbufs.
         */
        m_minlimit(MC_BIGCL) = b;
        m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
        m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
        (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

        /*
         * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
         */
        m_minlimit(MC_MBUF) = 0;
        m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
        m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
        (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

        /*
         * Set limits for the composite classes.  Each composite class
         * inherits the maximum limit of its cluster class; its maximum
         * size is the cluster size while its object size is that of an
         * mbuf plus the cluster.
         */
        m_minlimit(MC_MBUF_CL) = 0;
        m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
        m_maxsize(MC_MBUF_CL) = MCLBYTES;
        m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
        (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

        m_minlimit(MC_MBUF_BIGCL) = 0;
        m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
        m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
        m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
        (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

        /*
         * And for jumbo classes.
         */
        m_minlimit(MC_16KCL) = 0;
        m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
        m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
        (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

        m_minlimit(MC_MBUF_16KCL) = 0;
        m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
        m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
        m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
        (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

        /*
         * Initialize the legacy mbstat structure.
         */
        bzero(&mbstat, sizeof (mbstat));
        mbstat.m_msize = m_maxsize(MC_MBUF);
        mbstat.m_mclbytes = m_maxsize(MC_CL);
        mbstat.m_minclsize = MINCLSIZE;
        mbstat.m_mlen = MLEN;
        mbstat.m_mhlen = MHLEN;
        mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}
1319
#if defined(__LP64__)
/*
 * One row of the tables consulted by mbuf_default_ncl() on 64-bit
 * kernels: a memory size threshold and the mbuf pool size (in bytes) to
 * use once the machine has at least that much memory.  Rows are sorted
 * by ascending nt_maxmem and each table ends with an all-zero row.
 */
typedef struct ncl_tbl {
        uint64_t nt_maxmem;     /* memory (sane) size */
        uint32_t nt_mbpool;     /* mbuf pool size */
} ncl_tbl_t;

/* Non-server */
static ncl_tbl_t ncl_table[] = {
        { (1ULL << GBSHIFT)       /*  1 GB */,  (64 << MBSHIFT)  /*  64 MB */ },
        { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
        { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (128 << MBSHIFT) /* 128 MB */ },
        { 0, 0 }
};

/* Server */
static ncl_tbl_t ncl_table_srv[] = {
        { (1ULL << GBSHIFT)       /*  1 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
        { (1ULL << (GBSHIFT + 2)) /*  4 GB */,  (128 << MBSHIFT) /* 128 MB */ },
        { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (160 << MBSHIFT) /* 160 MB */ },
        { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (192 << MBSHIFT) /* 192 MB */ },
        { (1ULL << (GBSHIFT + 5)) /* 32 GB */,  (256 << MBSHIFT) /* 256 MB */ },
        { (1ULL << (GBSHIFT + 6)) /* 64 GB */,  (384 << MBSHIFT) /* 384 MB */ },
        { 0, 0 }
};
#endif /* __LP64__ */
1345
1346 __private_extern__ unsigned int
1347 mbuf_default_ncl(int server, uint64_t mem)
1348 {
1349 #if !defined(__LP64__)
1350 #pragma unused(server)
1351         unsigned int n;
1352         /*
1353          * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1354          */
1355         if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1356                 n = 32768;
1357 #else
1358         unsigned int n, i;
1359         ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1360         /*
1361          * 64-bit kernel (mbuf pool size based on table).
1362          */
1363         n = tbl[0].nt_mbpool;
1364         for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1365                 if (mem < tbl[i].nt_maxmem)
1366                         break;
1367                 n = tbl[i].nt_mbpool;
1368         }
1369         n >>= MCLSHIFT;
1370 #endif /* !__LP64__ */
1371         return (n);
1372 }
1373
1374 __private_extern__ void
1375 mbinit(void)
1376 {
1377         unsigned int m;
1378         unsigned int initmcl = 0;
1379         void *buf;
1380         thread_t thread = THREAD_NULL;
1381
1382         microuptime(&mb_start);
1383
1384         /*
1385          * These MBUF_ values must be equal to their private counterparts.
1386          */
1387         _CASSERT(MBUF_EXT == M_EXT);
1388         _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1389         _CASSERT(MBUF_EOR == M_EOR);
1390         _CASSERT(MBUF_LOOP == M_LOOP);
1391         _CASSERT(MBUF_BCAST == M_BCAST);
1392         _CASSERT(MBUF_MCAST == M_MCAST);
1393         _CASSERT(MBUF_FRAG == M_FRAG);
1394         _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1395         _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1396         _CASSERT(MBUF_PROMISC == M_PROMISC);
1397         _CASSERT(MBUF_HASFCS == M_HASFCS);
1398
1399         _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1400         _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1401         _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1402         _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1403         _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1404         _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1405         _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1406         _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1407         _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1408         _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1409         _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1410         _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1411         _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1412         _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1413         _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1414
1415         _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1416         _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1417         _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1418         _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1419         _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1420         _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1421         _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1422         _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1423         _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1424         _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1425         _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1426         _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1427         _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1428
1429         _CASSERT(MBUF_WAITOK == M_WAIT);
1430         _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1431         _CASSERT(MBUF_COPYALL == M_COPYALL);
1432
1433         _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1434         _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1435         _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1436         _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1437         _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1438         _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1439         _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1440         _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1441         _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1442         _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1443
1444         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1445         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1446         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1447         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1448
1449         /* Module specific scratch space (32-bit alignment requirement) */
1450         _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1451             sizeof (uint32_t)));
1452
1453         /* Initialize random red zone cookie value */
1454         _CASSERT(sizeof (mb_redzone_cookie) ==
1455             sizeof (((struct pkthdr *)0)->redzone));
1456         read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1457
1458         /* Make sure we don't save more than we should */
1459         _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1460
1461         if (nmbclusters == 0)
1462                 nmbclusters = NMBCLUSTERS;
1463
1464         /* This should be a sane (at least even) value by now */
1465         VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1466
1467         /* Setup the mbuf table */
1468         mbuf_table_init();
1469
1470         /* Global lock for common layer */
1471         mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1472         mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1473         mbuf_mlock_attr = lck_attr_alloc_init();
1474         lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1475
1476         /*
1477          * Allocate cluster slabs table:
1478          *
1479          *      maxslabgrp = (N * 2048) / (1024 * 1024)
1480          *
1481          * Where N is nmbclusters rounded up to the nearest 512.  This yields
1482          * mcl_slab_g_t units, each one representing a MB of memory.
1483          */
1484         maxslabgrp =
1485             (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1486         MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1487             M_TEMP, M_WAITOK | M_ZERO);
1488         VERIFY(slabstbl != NULL);
1489
1490         /*
1491          * Allocate audit structures, if needed:
1492          *
1493          *      maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1494          *
1495          * This yields mcl_audit_t units, each one representing a page.
1496          */
1497         PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1498         mbuf_debug |= mcache_getflags();
1499         if (mbuf_debug & MCF_DEBUG) {
1500                 int l;
1501                 mcl_audit_t *mclad;
1502                 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1503                 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1504                     M_TEMP, M_WAITOK | M_ZERO);
1505                 VERIFY(mclaudit != NULL);
1506                 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1507                         MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1508                             NMBPG * sizeof(mcache_audit_t *),
1509                             M_TEMP, M_WAITOK | M_ZERO);
1510                         VERIFY(mclad[l].cl_audit != NULL);
1511                 }
1512
1513                 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1514                     AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1515                 VERIFY(mcl_audit_con_cache != NULL);
1516         }
1517         mclverify = (mbuf_debug & MCF_VERIFY);
1518         mcltrace = (mbuf_debug & MCF_TRACE);
1519         mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1520         mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1521
1522         /* Enable mbuf leak logging, with a lock to protect the tables */
1523
1524         mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1525         mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1526         mleak_lock_attr = lck_attr_alloc_init();
1527         lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1528
1529         mleak_activate();
1530
1531         /* Calculate the number of pages assigned to the cluster pool */
1532         mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1533         MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1534             M_TEMP, M_WAITOK);
1535         VERIFY(mcl_paddr != NULL);
1536
1537         /* Register with the I/O Bus mapper */
1538         mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1539         bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1540
1541         embutl = (mbutl + (nmbclusters * MCLBYTES));
1542         VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1543
1544         /* Prime up the freelist */
1545         PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1546         if (initmcl != 0) {
1547                 initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
1548                 if (initmcl > m_maxlimit(MC_BIGCL))
1549                         initmcl = m_maxlimit(MC_BIGCL);
1550         }
1551         if (initmcl < m_minlimit(MC_BIGCL))
1552                 initmcl = m_minlimit(MC_BIGCL);
1553
1554         lck_mtx_lock(mbuf_mlock);
1555
1556         /*
1557          * For classes with non-zero minimum limits, populate their freelists
1558          * so that m_total(class) is at least m_minlimit(class).
1559          */
1560         VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1561         freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1562         VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1563         freelist_init(m_class(MC_CL));
1564
1565         for (m = 0; m < NELEM(mbuf_table); m++) {
1566                 /* Make sure we didn't miss any */
1567                 VERIFY(m_minlimit(m_class(m)) == 0 ||
1568                     m_total(m_class(m)) >= m_minlimit(m_class(m)));
1569
1570                 /* populate the initial sizes and report from there on */
1571                 m_peak(m_class(m)) = m_total(m_class(m));
1572         }
1573         mb_peak_newreport = FALSE;
1574
1575         lck_mtx_unlock(mbuf_mlock);
1576
1577         (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1578             NULL, &thread);
1579         thread_deallocate(thread);
1580
1581         ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1582             0, 0, MCR_SLEEP);
1583
1584         /* Create the cache for each class */
1585         for (m = 0; m < NELEM(mbuf_table); m++) {
1586                 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1587                 u_int32_t flags;
1588
1589                 flags = mbuf_debug;
1590                 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1591                     m_class(m) == MC_MBUF_16KCL) {
1592                         allocfunc = mbuf_cslab_alloc;
1593                         freefunc = mbuf_cslab_free;
1594                         auditfunc = mbuf_cslab_audit;
1595                         logfunc = mleak_logger;
1596                 } else {
1597                         allocfunc = mbuf_slab_alloc;
1598                         freefunc = mbuf_slab_free;
1599                         auditfunc = mbuf_slab_audit;
1600                         logfunc = mleak_logger;
1601                 }
1602
1603                 /*
1604                  * Disable per-CPU caches for jumbo classes if there
1605                  * is no jumbo cluster pool available in the system.
1606                  * The cache itself is still created (but will never
1607                  * be populated) since it simplifies the code.
1608                  */
1609                 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1610                     njcl == 0)
1611                         flags |= MCF_NOCPUCACHE;
1612
1613                 if (!mclfindleak)
1614                         flags |= MCF_NOLEAKLOG;
1615
1616                 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1617                     allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1618                     (void *)(uintptr_t)m, flags, MCR_SLEEP);
1619         }
1620
1621         /*
1622          * Allocate structure for per-CPU statistics that's aligned
1623          * on the CPU cache boundary; this code assumes that we never
1624          * uninitialize this framework, since the original address
1625          * before alignment is not saved.
1626          */
1627         ncpu = ml_get_max_cpus();
1628         MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1629             M_TEMP, M_WAITOK);
1630         VERIFY(buf != NULL);
1631
1632         mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1633             CPU_CACHE_LINE_SIZE);
1634         bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1635
1636         /*
1637          * Set the max limit on sb_max to be 1/16 th of the size of
1638          * memory allocated for mbuf clusters.
1639          */
1640         high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1641         if (high_sb_max < sb_max) {
1642                 /* sb_max is too large for this configuration, scale it down */
1643                 if (high_sb_max > (1 << MBSHIFT)) {
1644                         /* We have atleast 16 M of mbuf pool */
1645                         sb_max = high_sb_max;
1646                 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1647                         /*
1648                          * If we have more than 1M of mbufpool, cap the size of
1649                          * max sock buf at 1M
1650                          */
1651                         sb_max = high_sb_max = (1 << MBSHIFT);
1652                 } else {
1653                         sb_max = high_sb_max;
1654                 }
1655         }
1656
1657         /* allocate space for mbuf_dump_buf */
1658         MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1659         VERIFY(mbuf_dump_buf != NULL);
1660
1661         if (mbuf_debug & MCF_DEBUG) {
1662                 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1663                     (int)_MLEN, (int)_MHLEN);
1664         }
1665
1666         printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1667             (nmbclusters << MCLSHIFT) >> MBSHIFT,
1668             (nclusters << MCLSHIFT) >> MBSHIFT,
1669             (njcl << MCLSHIFT) >> MBSHIFT);
1670 }
1671
1672 /*
1673  * Obtain a slab of object(s) from the class's freelist.
1674  */
1675 static mcache_obj_t *
1676 slab_alloc(mbuf_class_t class, int wait)
1677 {
1678         mcl_slab_t *sp;
1679         mcache_obj_t *buf;
1680
1681         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1682
1683         /* This should always be NULL for us */
1684         VERIFY(m_cobjlist(class) == NULL);
1685
1686         /*
1687          * Treat composite objects as having longer lifespan by using
1688          * a slab from the reverse direction, in hoping that this could
1689          * reduce the probability of fragmentation for slabs that hold
1690          * more than one buffer chunks (e.g. mbuf slabs).  For other
1691          * slabs, this probably doesn't make much of a difference.
1692          */
1693         if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1694             && (wait & MCR_COMP))
1695                 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1696         else
1697                 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1698
1699         if (sp == NULL) {
1700                 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1701                 /* The slab list for this class is empty */
1702                 return (NULL);
1703         }
1704
1705         VERIFY(m_infree(class) > 0);
1706         VERIFY(!slab_is_detached(sp));
1707         VERIFY(sp->sl_class == class &&
1708             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1709         buf = sp->sl_head;
1710         VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1711         sp->sl_head = buf->obj_next;
1712         /* Increment slab reference */
1713         sp->sl_refcnt++;
1714
1715         VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1716
1717         if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1718                 slab_nextptr_panic(sp, sp->sl_head);
1719                 /* In case sl_head is in the map but not in the slab */
1720                 VERIFY(slab_inrange(sp, sp->sl_head));
1721                 /* NOTREACHED */
1722         }
1723
1724         if (mclaudit != NULL) {
1725                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1726                 mca->mca_uflags = 0;
1727                 /* Save contents on mbuf objects only */
1728                 if (class == MC_MBUF)
1729                         mca->mca_uflags |= MB_SCVALID;
1730         }
1731
1732         if (class == MC_CL) {
1733                 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1734                 /*
1735                  * A 2K cluster slab can have at most NCLPG references.
1736                  */
1737                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1738                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1739                 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1740         } else if (class == MC_BIGCL) {
1741                 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1742                     m_infree(MC_MBUF_BIGCL);
1743                 /*
1744                  * A 4K cluster slab can have NBCLPG references.
1745                  */
1746                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1747                     sp->sl_len == PAGE_SIZE &&
1748                     (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1749         } else if (class == MC_16KCL) {
1750                 mcl_slab_t *nsp;
1751                 int k;
1752
1753                 --m_infree(MC_16KCL);
1754                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1755                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1756                 /*
1757                  * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1758                  * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1759                  * most 1 reference.
1760                  */
1761                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1762                         nsp = nsp->sl_next;
1763                         /* Next slab must already be present */
1764                         VERIFY(nsp != NULL);
1765                         nsp->sl_refcnt++;
1766                         VERIFY(!slab_is_detached(nsp));
1767                         VERIFY(nsp->sl_class == MC_16KCL &&
1768                             nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1769                             nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1770                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1771                             nsp->sl_head == NULL);
1772                 }
1773         } else {
1774                 VERIFY(class == MC_MBUF);
1775                 --m_infree(MC_MBUF);
1776                 /*
1777                  * If auditing is turned on, this check is
1778                  * deferred until later in mbuf_slab_audit().
1779                  */
1780                 if (mclaudit == NULL)
1781                         _MCHECK((struct mbuf *)buf);
1782                 /*
1783                  * Since we have incremented the reference count above,
1784                  * an mbuf slab (formerly a 4KB cluster slab that was cut
1785                  * up into mbufs) must have a reference count between 1
1786                  * and NMBPG at this point.
1787                  */
1788                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1789                     sp->sl_chunks == NMBPG &&
1790                     sp->sl_len == PAGE_SIZE);
1791                 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1792         }
1793
1794         /* If empty, remove this slab from the class's freelist */
1795         if (sp->sl_head == NULL) {
1796                 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1797                 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1798                 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1799                 slab_remove(sp, class);
1800         }
1801
1802         return (buf);
1803 }
1804
1805 /*
1806  * Place a slab of object(s) back into a class's slab list.
1807  */
1808 static void
1809 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1810 {
1811         mcl_slab_t *sp;
1812         boolean_t reinit_supercl = false;
1813         mbuf_class_t super_class;
1814
1815         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1816
1817         VERIFY(class != MC_16KCL || njcl > 0);
1818         VERIFY(buf->obj_next == NULL);
1819
1820         sp = slab_get(buf);
1821         VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1822             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1823
1824         /* Decrement slab reference */
1825         sp->sl_refcnt--;
1826
1827         if (class == MC_CL) {
1828                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1829                 /*
1830                  * A slab that has been splitted for 2KB clusters can have
1831                  * at most 1 outstanding reference at this point.
1832                  */
1833                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1834                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1835                 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1836                     (slab_is_detached(sp) && sp->sl_head == NULL));
1837         } else if (class == MC_BIGCL) {
1838                 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1839
1840                 /* A 4KB cluster slab can have NBCLPG references at most */
1841                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1842                 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1843                     (slab_is_detached(sp) && sp->sl_head == NULL));
1844         } else if (class == MC_16KCL) {
1845                 mcl_slab_t *nsp;
1846                 int k;
1847                 /*
1848                  * A 16KB cluster takes NSLABSP16KB slabs, all must
1849                  * now have 0 reference.
1850                  */
1851                 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1852                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1853                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1854                 VERIFY(slab_is_detached(sp));
1855                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1856                         nsp = nsp->sl_next;
1857                         /* Next slab must already be present */
1858                         VERIFY(nsp != NULL);
1859                         nsp->sl_refcnt--;
1860                         VERIFY(slab_is_detached(nsp));
1861                         VERIFY(nsp->sl_class == MC_16KCL &&
1862                             (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1863                             nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1864                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1865                             nsp->sl_head == NULL);
1866                 }
1867         } else {
1868                 /*
1869                  * A slab that has been splitted for mbufs has at most
1870                  * NMBPG reference counts.  Since we have decremented
1871                  * one reference above, it must now be between 0 and
1872                  * NMBPG-1.
1873                  */
1874                 VERIFY(class == MC_MBUF);
1875                 VERIFY(sp->sl_refcnt >= 0 &&
1876                     sp->sl_refcnt <= (NMBPG - 1) &&
1877                     sp->sl_chunks == NMBPG &&
1878                     sp->sl_len == PAGE_SIZE);
1879                 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
1880                     (slab_is_detached(sp) && sp->sl_head == NULL));
1881         }
1882
1883         /*
1884          * When auditing is enabled, ensure that the buffer still
1885          * contains the free pattern.  Otherwise it got corrupted
1886          * while at the CPU cache layer.
1887          */
1888         if (mclaudit != NULL) {
1889                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1890                 if (mclverify) {
1891                         mcache_audit_free_verify(mca, buf, 0,
1892                             m_maxsize(class));
1893                 }
1894                 mca->mca_uflags &= ~MB_SCVALID;
1895         }
1896
1897         if (class == MC_CL) {
1898                 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1899                 buf->obj_next = sp->sl_head;
1900         } else if (class == MC_BIGCL) {
1901                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1902                     m_infree(MC_MBUF_BIGCL);
1903                 buf->obj_next = sp->sl_head;
1904         } else if (class == MC_16KCL) {
1905                 ++m_infree(MC_16KCL);
1906         } else {
1907                 ++m_infree(MC_MBUF);
1908                 buf->obj_next = sp->sl_head;
1909         }
1910         sp->sl_head = buf;
1911
1912         /*
1913          * If a slab has been split to either one which holds 2KB clusters,
1914          * or one which holds mbufs, turn it back to one which holds a
1915          * 4 or 16 KB cluster depending on the page size.
1916          */
1917         if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
1918                 super_class = MC_BIGCL;
1919         } else {
1920                 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
1921                 super_class = MC_16KCL;
1922         }
1923         if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1924             m_total(class) >= (m_minlimit(class) + NMBPG) &&
1925             m_total(super_class) < m_maxlimit(super_class)) {
1926                 int i = NMBPG;
1927
1928                 m_total(MC_MBUF) -= NMBPG;
1929                 mbstat.m_mbufs = m_total(MC_MBUF);
1930                 m_infree(MC_MBUF) -= NMBPG;
1931                 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
1932
1933                 while (i--) {
1934                         struct mbuf *m = sp->sl_head;
1935                         VERIFY(m != NULL);
1936                         sp->sl_head = m->m_next;
1937                         m->m_next = NULL;
1938                 }
1939                 reinit_supercl = true;
1940         } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1941             m_total(class) >=  (m_minlimit(class) + NCLPG) &&
1942             m_total(super_class) < m_maxlimit(super_class)) {
1943                 int i = NCLPG;
1944
1945                 m_total(MC_CL) -= NCLPG;
1946                 mbstat.m_clusters = m_total(MC_CL);
1947                 m_infree(MC_CL) -= NCLPG;
1948
1949                 while (i--) {
1950                         union mcluster *c = sp->sl_head;
1951                         VERIFY(c != NULL);
1952                         sp->sl_head = c->mcl_next;
1953                         c->mcl_next = NULL;
1954                 }
1955                 reinit_supercl = true;
1956         } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
1957             sp->sl_refcnt == 0 &&
1958             m_total(class) >= (m_minlimit(class) + NBCLPG) &&
1959             m_total(super_class) < m_maxlimit(super_class)) {
1960                 int i = NBCLPG;
1961
1962                 VERIFY(super_class == MC_16KCL);
1963                 m_total(MC_BIGCL) -= NBCLPG;
1964                 mbstat.m_bigclusters = m_total(MC_BIGCL);
1965                 m_infree(MC_BIGCL) -= NBCLPG;
1966
1967                 while (i--) {
1968                         union mbigcluster *bc = sp->sl_head;
1969                         VERIFY(bc != NULL);
1970                         sp->sl_head = bc->mbc_next;
1971                         bc->mbc_next = NULL;
1972                 }
1973                 reinit_supercl = true;
1974         }
1975
1976         if (reinit_supercl) {
1977                 VERIFY(sp->sl_head == NULL);
1978                 VERIFY(m_total(class) >= m_minlimit(class));
1979                 slab_remove(sp, class);
1980
1981                 /* Reinitialize it as a cluster for the super class */
1982                 m_total(super_class)++;
1983                 m_infree(super_class)++;
1984                 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
1985                     sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
1986
1987                 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
1988                     sp->sl_base, PAGE_SIZE, 0, 1);
1989                 if (mclverify)
1990                         mcache_set_pattern(MCACHE_FREE_PATTERN,
1991                             (caddr_t)sp->sl_base, sp->sl_len);
1992                 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
1993
1994                 if (super_class == MC_BIGCL) {
1995                         mbstat.m_bigclusters = m_total(MC_BIGCL);
1996                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
1997                             m_infree(MC_MBUF_BIGCL);
1998                 }
1999
2000                 VERIFY(slab_is_detached(sp));
2001                 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2002
2003                 /* And finally switch class */
2004                 class = super_class;
2005         }
2006
2007         /* Reinsert the slab to the class's slab list */
2008         if (slab_is_detached(sp))
2009                 slab_insert(sp, class);
2010 }
2011
2012 /*
2013  * Common allocator for rudimentary objects called by the CPU cache layer
2014  * during an allocation request whenever there is no available element in the
2015  * bucket layer.  It returns one or more elements from the appropriate global
2016  * freelist.  If the freelist is empty, it will attempt to populate it and
2017  * retry the allocation.
2018  */
2019 static unsigned int
2020 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2021 {
2022         mbuf_class_t class = (mbuf_class_t)arg;
2023         unsigned int need = num;
2024         mcache_obj_t **list = *plist;
2025
2026         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2027         ASSERT(need > 0);
2028
2029         lck_mtx_lock(mbuf_mlock);
2030
2031         for (;;) {
2032                 if ((*list = slab_alloc(class, wait)) != NULL) {
2033                         (*list)->obj_next = NULL;
2034                         list = *plist = &(*list)->obj_next;
2035
2036                         if (--need == 0) {
2037                                 /*
2038                                  * If the number of elements in freelist has
2039                                  * dropped below low watermark, asynchronously
2040                                  * populate the freelist now rather than doing
2041                                  * it later when we run out of elements.
2042                                  */
2043                                 if (!mbuf_cached_above(class, wait) &&
2044                                     m_infree(class) < (m_total(class) >> 5)) {
2045                                         (void) freelist_populate(class, 1,
2046                                             M_DONTWAIT);
2047                                 }
2048                                 break;
2049                         }
2050                 } else {
2051                         VERIFY(m_infree(class) == 0 || class == MC_CL);
2052
2053                         (void) freelist_populate(class, 1,
2054                             (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2055
2056                         if (m_infree(class) > 0)
2057                                 continue;
2058
2059                         /* Check if there's anything at the cache layer */
2060                         if (mbuf_cached_above(class, wait))
2061                                 break;
2062
2063                         /* watchdog checkpoint */
2064                         mbuf_watchdog();
2065
2066                         /* We have nothing and cannot block; give up */
2067                         if (wait & MCR_NOSLEEP) {
2068                                 if (!(wait & MCR_TRYHARD)) {
2069                                         m_fail_cnt(class)++;
2070                                         mbstat.m_drops++;
2071                                         break;
2072                                 }
2073                         }
2074
2075                         /*
2076                          * If the freelist is still empty and the caller is
2077                          * willing to be blocked, sleep on the wait channel
2078                          * until an element is available.  Otherwise, if
2079                          * MCR_TRYHARD is set, do our best to satisfy the
2080                          * request without having to go to sleep.
2081                          */
2082                         if (mbuf_worker_ready &&
2083                             mbuf_sleep(class, need, wait))
2084                                 break;
2085
2086                         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2087                 }
2088         }
2089
2090         m_alloc_cnt(class) += num - need;
2091         lck_mtx_unlock(mbuf_mlock);
2092
2093         return (num - need);
2094 }
2095
2096 /*
2097  * Common de-allocator for rudimentary objects called by the CPU cache
2098  * layer when one or more elements need to be returned to the appropriate
2099  * global freelist.
2100  */
2101 static void
2102 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2103 {
2104         mbuf_class_t class = (mbuf_class_t)arg;
2105         mcache_obj_t *nlist;
2106         unsigned int num = 0;
2107         int w;
2108
2109         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2110
2111         lck_mtx_lock(mbuf_mlock);
2112
2113         for (;;) {
2114                 nlist = list->obj_next;
2115                 list->obj_next = NULL;
2116                 slab_free(class, list);
2117                 ++num;
2118                 if ((list = nlist) == NULL)
2119                         break;
2120         }
2121         m_free_cnt(class) += num;
2122
2123         if ((w = mb_waiters) > 0)
2124                 mb_waiters = 0;
2125
2126         lck_mtx_unlock(mbuf_mlock);
2127
2128         if (w != 0)
2129                 wakeup(mb_waitchan);
2130 }
2131
2132 /*
2133  * Common auditor for rudimentary objects called by the CPU cache layer
2134  * during an allocation or free request.  For the former, this is called
2135  * after the objects are obtained from either the bucket or slab layer
2136  * and before they are returned to the caller.  For the latter, this is
2137  * called immediately during free and before placing the objects into
2138  * the bucket or slab layer.
2139  */
2140 static void
2141 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2142 {
2143         mbuf_class_t class = (mbuf_class_t)arg;
2144         mcache_audit_t *mca;
2145
2146         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2147
2148         while (list != NULL) {
2149                 lck_mtx_lock(mbuf_mlock);
2150                 mca = mcl_audit_buf2mca(class, list);
2151
2152                 /* Do the sanity checks */
2153                 if (class == MC_MBUF) {
2154                         mcl_audit_mbuf(mca, list, FALSE, alloc);
2155                         ASSERT(mca->mca_uflags & MB_SCVALID);
2156                 } else {
2157                         mcl_audit_cluster(mca, list, m_maxsize(class),
2158                             alloc, TRUE);
2159                         ASSERT(!(mca->mca_uflags & MB_SCVALID));
2160                 }
2161                 /* Record this transaction */
2162                 if (mcltrace)
2163                         mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2164
2165                 if (alloc)
2166                         mca->mca_uflags |= MB_INUSE;
2167                 else
2168                         mca->mca_uflags &= ~MB_INUSE;
2169                 /* Unpair the object (unconditionally) */
2170                 mca->mca_uptr = NULL;
2171                 lck_mtx_unlock(mbuf_mlock);
2172
2173                 list = list->obj_next;
2174         }
2175 }
2176
2177 /*
2178  * Common notify routine for all caches.  It is called by mcache when
2179  * one or more objects get freed.  We use this indication to trigger
2180  * the wakeup of any sleeping threads so that they can retry their
2181  * allocation requests.
2182  */
2183 static void
2184 mbuf_slab_notify(void *arg, u_int32_t reason)
2185 {
2186         mbuf_class_t class = (mbuf_class_t)arg;
2187         int w;
2188
2189         ASSERT(MBUF_CLASS_VALID(class));
2190
2191         if (reason != MCN_RETRYALLOC)
2192                 return;
2193
2194         lck_mtx_lock(mbuf_mlock);
2195         if ((w = mb_waiters) > 0) {
2196                 m_notified(class)++;
2197                 mb_waiters = 0;
2198         }
2199         lck_mtx_unlock(mbuf_mlock);
2200
2201         if (w != 0)
2202                 wakeup(mb_waitchan);
2203 }
2204
2205 /*
2206  * Obtain object(s) from the composite class's freelist.
      *
      * Pops up to "num" objects off m_cobjlist(class), sanity-checking each
      * one, and chains them onto the caller's list through *plist.  On
      * return *plist points at the obj_next slot of the last object handed
      * out, so the caller can keep appending from there.  The mbuf lock
      * must be held by the caller (asserted below).
      *
      * Returns the number of objects obtained, which may be fewer than
      * "num" (including zero) when the freelist runs out; m_infree(class)
      * is reduced by the same amount.
2207  */
2208 static unsigned int
2209 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2210 {
2211         unsigned int need = num;
2212         mcl_slab_t *sp, *clsp, *nsp;
2213         struct mbuf *m;
2214         mcache_obj_t **list = *plist;
2215         void *cl;
2216
2217         VERIFY(need > 0);
2218         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2219         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2220
2221         /* Get what we can from the freelist */
             /*
              * The loop condition stores the current freelist head into the
              * tail slot of the caller's list, so an empty freelist leaves
              * that slot set to NULL.
              */
2222         while ((*list = m_cobjlist(class)) != NULL) {
2223                 MRANGE(*list);
2224
                     /* The composite's mbuf, its attached cluster and their slabs */
2225                 m = (struct mbuf *)*list;
2226                 sp = slab_get(m);
2227                 cl = m->m_ext.ext_buf;
2228                 clsp = slab_get(cl);
2229                 VERIFY(m->m_flags == M_EXT && cl != NULL);
2230                 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2231
                     /*
                      * The cluster slab's reference count must lie within
                      * [1, NCLPG] for MC_MBUF_CL, or [1, NBCLPG] otherwise.
                      */
2232                 if (class == MC_MBUF_CL) {
2233                         VERIFY(clsp->sl_refcnt >= 1 &&
2234                             clsp->sl_refcnt <= NCLPG);
2235                 } else {
2236                         VERIFY(clsp->sl_refcnt >= 1 &&
2237                             clsp->sl_refcnt <= NBCLPG);
2238                 }
2239
                     /*
                      * For 16KB composites, the NSLABSP16KB - 1 slabs that
                      * follow the cluster's slab on the sl_next chain must
                      * be present, each with a reference count of exactly 1.
                      */
2240                 if (class == MC_MBUF_16KCL) {
2241                         int k;
2242                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2243                                 nsp = nsp->sl_next;
2244                                 /* Next slab must already be present */
2245                                 VERIFY(nsp != NULL);
2246                                 VERIFY(nsp->sl_refcnt == 1);
2247                         }
2248                 }
2249
                     /*
                      * Unlink the object by advancing the freelist head to its
                      * successor.  A non-NULL successor that fails MBUF_IN_MAP
                      * is handed to slab_nextptr_panic().
                      */
2250                 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2251                     !MBUF_IN_MAP(m_cobjlist(class))) {
2252                         slab_nextptr_panic(sp, m_cobjlist(class));
2253                         /* NOTREACHED */
2254                 }
                     /*
                      * Terminate the object just taken and move both the local
                      * cursor and the caller's *plist to its obj_next slot.
                      */
2255                 (*list)->obj_next = NULL;
2256                 list = *plist = &(*list)->obj_next;
2257
2258                 if (--need == 0)
2259                         break;
2260         }
             /* Account for the objects that left the freelist */
2261         m_infree(class) -= (num - need);
2262
2263         return (num - need);
2264 }
2265
2266 /*
2267  * Place object(s) back into a composite class's freelist.
      *
      * Walks the chain given in "list", sanity-checking every composite
      * object (both the mbuf and its attached cluster).  The mbuf lock
      * must be held by the caller (asserted below).
      *
      * When "purged" is zero, the objects stay constructed: the whole chain
      * is pushed onto the head of m_cobjlist(class) and m_infree(class)
      * grows by the number of objects.
      *
      * When "purged" is non-zero, nothing goes onto the composite freelist.
      * Each object is dismantled instead: its reference count and flags are
      * cleared, its ext_ref structure is collected on a local list that is
      * returned to ref_cache at the end, the mbuf is reset and passed to
      * slab_free() for MC_MBUF, the cluster is passed to slab_free() for
      * its own class, and m_total(class) is decremented.
      *
      * Returns the number of objects walked.
2268  */
2269 static unsigned int
2270 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2271 {
2272         mcache_obj_t *o, *tail;
2273         unsigned int num = 0;
2274         struct mbuf *m, *ms;
2275         mcache_audit_t *mca = NULL;
2276         mcache_obj_t *ref_list = NULL;
2277         mcl_slab_t *clsp, *nsp;
2278         void *cl;
2279         mbuf_class_t cl_class;
2280
2281         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2282         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2283         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2284
             /* Map the composite class to the class of its attached cluster */
2285         if (class == MC_MBUF_CL) {
2286                 cl_class = MC_CL;
2287         } else if (class == MC_MBUF_BIGCL) {
2288                 cl_class = MC_BIGCL;
2289         } else {
2290                 VERIFY(class == MC_MBUF_16KCL);
2291                 cl_class = MC_16KCL;
2292         }
2293
2294         o = tail = list;
2295
             /*
              * m is always the actual mbuf; ms is the copy whose fields get
              * checked, which is the audit structure's saved mbuf when
              * auditing is enabled and the actual mbuf otherwise.
              */
2296         while ((m = ms = (struct mbuf *)o) != NULL) {
                     /*
                      * Remember the successor now; the purge path below
                      * overwrites o->obj_next before the walk advances.
                      */
2297                 mcache_obj_t *rfa, *nexto = o->obj_next;
2298
2299                 /* Do the mbuf sanity checks */
2300                 if (mclaudit != NULL) {
2301                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2302                         if (mclverify) {
2303                                 mcache_audit_free_verify(mca, m, 0,
2304                                     m_maxsize(MC_MBUF));
2305                         }
2306                         ms = MCA_SAVED_MBUF_PTR(mca);
2307                 }
2308
2309                 /* Do the cluster sanity checks */
2310                 cl = ms->m_ext.ext_buf;
2311                 clsp = slab_get(cl);
2312                 if (mclverify) {
2313                         size_t size = m_maxsize(cl_class);
2314                         mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2315                             (mcache_obj_t *)cl), cl, 0, size);
2316                 }
2317                 VERIFY(ms->m_type == MT_FREE);
2318                 VERIFY(ms->m_flags == M_EXT);
2319                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
                     /*
                      * The cluster slab's reference count must lie within
                      * [1, NCLPG] for MC_CL, or [1, NBCLPG] otherwise.
                      */
2320                 if (cl_class == MC_CL) {
2321                         VERIFY(clsp->sl_refcnt >= 1 &&
2322                             clsp->sl_refcnt <= NCLPG);
2323                 } else {
2324                         VERIFY(clsp->sl_refcnt >= 1 &&
2325                             clsp->sl_refcnt <= NBCLPG);
2326                 }
                     /*
                      * For 16KB clusters, the NSLABSP16KB - 1 slabs that
                      * follow the cluster's slab on the sl_next chain must
                      * be present, each with a reference count of exactly 1.
                      */
2327                 if (cl_class == MC_16KCL) {
2328                         int k;
2329                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2330                                 nsp = nsp->sl_next;
2331                                 /* Next slab must already be present */
2332                                 VERIFY(nsp != NULL);
2333                                 VERIFY(nsp->sl_refcnt == 1);
2334                         }
2335                 }
2336
2337                 /*
2338                  * If we're asked to purge, restore the actual mbuf using
2339                  * contents of the shadow structure (if auditing is enabled)
2340                  * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2341                  * about to free it and the attached cluster into their caches.
2342                  */
2343                 if (purged) {
2344                         /* Restore constructed mbuf fields */
2345                         if (mclaudit != NULL)
2346                                 mcl_audit_restore_mbuf(m, mca, TRUE);
2347
2348                         MEXT_REF(m) = 0;
2349                         MEXT_FLAGS(m) = 0;
2350
                             /*
                              * Detach the ext_ref structure and push it onto
                              * ref_list; the list goes back to ref_cache in
                              * one call after the walk.
                              */
2351                         rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2352                         rfa->obj_next = ref_list;
2353                         ref_list = rfa;
2354                         MEXT_RFA(m) = NULL;
2355
2356                         m->m_type = MT_FREE;
2357                         m->m_flags = m->m_len = 0;
2358                         m->m_next = m->m_nextpkt = NULL;
2359
2360                         /* Save mbuf fields and make auditing happy */
2361                         if (mclaudit != NULL)
2362                                 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2363
                             /* One fewer constructed composite in this class */
2364                         VERIFY(m_total(class) > 0);
2365                         m_total(class)--;
2366
2367                         /* Free the mbuf */
2368                         o->obj_next = NULL;
2369                         slab_free(MC_MBUF, o);
2370
2371                         /* And free the cluster */
                             /*
                              * NOTE(review): the chain below selects the same
                              * class that is already held in cl_class.
                              */
2372                         ((mcache_obj_t *)cl)->obj_next = NULL;
2373                         if (class == MC_MBUF_CL)
2374                                 slab_free(MC_CL, cl);
2375                         else if (class == MC_MBUF_BIGCL)
2376                                 slab_free(MC_BIGCL, cl);
2377                         else
2378                                 slab_free(MC_16KCL, cl);
2379                 }
2380
2381                 ++num;
2382                 tail = o;
2383                 o = nexto;
2384         }
2385
             /*
              * Not purging: splice the still-intact chain in front of the
              * composite freelist.  Purging: return the collected ext_ref
              * structures to their cache.
              *
              * NOTE(review): with an empty "list" and purged == 0, tail would
              * still be NULL at the dereference below; callers presumably
              * never pass an empty chain -- confirm.
              */
2386         if (!purged) {
2387                 tail->obj_next = m_cobjlist(class);
2388                 m_cobjlist(class) = list;
2389                 m_infree(class) += num;
2390         } else if (ref_list != NULL) {
2391                 mcache_free_ext(ref_cache, ref_list);
2392         }
2393
2394         return (num);
2395 }
2396
2397 /*
2398  * Common allocator for composite objects called by the CPU cache layer
2399  * during an allocation request whenever there is no available element in
2400  * the bucket layer.  It returns one or more composite elements from the
2401  * appropriate global freelist.  If the freelist is empty, it will attempt
2402  * to obtain the rudimentary objects from their caches and construct them
2403  * into composite mbuf + cluster objects.
      *
      * Parameters:
      *   arg     the composite mbuf class, passed as an opaque pointer
      *   plist   in/out; *plist is the tail slot of the caller's object
      *           list and is advanced past every object appended here
      *   needed  number of objects wanted (must be greater than zero)
      *   wait    MCR_* flags; MCR_COMP is always added for the requests
      *           made to the rudimentary caches, and MCR_FAILOK is added
      *           when MCR_NOSLEEP is not set
      *
      * Returns the number of objects appended to the caller's list: those
      * taken from the composite freelist plus those newly constructed.
      * This may be fewer than requested; the shortfall is added to the
      * class's failure counter.
2404  */
2405 static unsigned int
2406 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2407     int wait)
2408 {
2409         mbuf_class_t class = (mbuf_class_t)arg;
2410         mbuf_class_t cl_class = 0;
2411         unsigned int num = 0, cnum = 0, want = needed;
2412         mcache_obj_t *ref_list = NULL;
2413         mcache_obj_t *mp_list = NULL;
2414         mcache_obj_t *clp_list = NULL;
2415         mcache_obj_t **list;
2416         struct ext_ref *rfa;
2417         struct mbuf *m;
2418         void *cl;
2419
2420         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2421         ASSERT(needed > 0);
2422
2423         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2424
2425         /* There should not be any slab for this class */
2426         VERIFY(m_slab_cnt(class) == 0 &&
2427             m_slablist(class).tqh_first == NULL &&
2428             m_slablist(class).tqh_last == NULL);
2429
2430         lck_mtx_lock(mbuf_mlock);
2431
2432         /* Try using the freelist first */
2433         num = cslab_alloc(class, plist, needed);
             /* cslab_alloc() advanced *plist; keep appending at the new tail */
2434         list = *plist;
2435         if (num == needed) {
2436                 m_alloc_cnt(class) += num;
2437                 lck_mtx_unlock(mbuf_mlock);
2438                 return (needed);
2439         }
2440
2441         lck_mtx_unlock(mbuf_mlock);
2442
2443         /*
2444          * We could not satisfy the request using the freelist alone;
2445          * allocate from the appropriate rudimentary caches and use
2446          * whatever we can get to construct the composite objects.
2447          */
2448         needed -= num;
2449
2450         /*
2451          * Mark these allocation requests as coming from a composite cache.
2452          * Also, if the caller is willing to be blocked, mark the request
2453          * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2454          * slab layer waiting for the individual object when one or more
2455          * of the already-constructed composite objects are available.
2456          */
2457         wait |= MCR_COMP;
2458         if (!(wait & MCR_NOSLEEP))
2459                 wait |= MCR_FAILOK;
2460
             /*
              * Each request below overwrites "needed" with the number of
              * objects actually obtained, so every later request asks for
              * no more than the previous one delivered.
              */
2461         /* allocate mbufs */
2462         needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2463         if (needed == 0) {
2464                 ASSERT(mp_list == NULL);
2465                 goto fail;
2466         }
2467
2468         /* allocate clusters */
2469         if (class == MC_MBUF_CL) {
2470                 cl_class = MC_CL;
2471         } else if (class == MC_MBUF_BIGCL) {
2472                 cl_class = MC_BIGCL;
2473         } else {
2474                 VERIFY(class == MC_MBUF_16KCL);
2475                 cl_class = MC_16KCL;
2476         }
2477         needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2478         if (needed == 0) {
2479                 ASSERT(clp_list == NULL);
2480                 goto fail;
2481         }
2482
             /* allocate the external reference structures */
2483         needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2484         if (needed == 0) {
2485                 ASSERT(ref_list == NULL);
2486                 goto fail;
2487         }
2488
2489         /*
2490          * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2491          * overs will get freed accordingly before we return to caller.
2492          */
2493         for (cnum = 0; cnum < needed; cnum++) {
2494                 struct mbuf *ms;
2495
                     /*
                      * Take one mbuf, one cluster and one ext_ref structure
                      * off the heads of their lists.  m is the actual mbuf;
                      * ms is where the fields get constructed.
                      */
2496                 m = ms = (struct mbuf *)mp_list;
2497                 mp_list = mp_list->obj_next;
2498
2499                 cl = clp_list;
2500                 clp_list = clp_list->obj_next;
2501                 ((mcache_obj_t *)cl)->obj_next = NULL;
2502
2503                 rfa = (struct ext_ref *)ref_list;
2504                 ref_list = ref_list->obj_next;
2505                 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2506
2507                 /*
2508                  * If auditing is enabled, construct the shadow mbuf
2509                  * in the audit structure instead of in the actual one.
2510                  * mbuf_cslab_audit() will take care of restoring the
2511                  * contents after the integrity check.
2512                  */
2513                 if (mclaudit != NULL) {
2514                         mcache_audit_t *mca, *cl_mca;
2515
                             /* Audit lookups and pairing are done under the lock */
2516                         lck_mtx_lock(mbuf_mlock);
2517                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2518                         ms = MCA_SAVED_MBUF_PTR(mca);
2519                         cl_mca = mcl_audit_buf2mca(cl_class,
2520                             (mcache_obj_t *)cl);
2521
2522                         /*
2523                          * Pair them up.  Note that this is done at the time
2524                          * the mbuf+cluster objects are constructed.  This
2525                          * information should be treated as "best effort"
2526                          * debugging hint since more than one mbufs can refer
2527                          * to a cluster.  In that case, the cluster might not
2528                          * be freed along with the mbuf it was paired with.
2529                          */
2530                         mca->mca_uptr = cl_mca;
2531                         cl_mca->mca_uptr = mca;
2532
2533                         ASSERT(mca->mca_uflags & MB_SCVALID);
2534                         ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2535                         lck_mtx_unlock(mbuf_mlock);
2536
2537                         /* Technically, they are in the freelist */
2538                         if (mclverify) {
2539                                 size_t size;
2540
2541                                 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2542                                     m_maxsize(MC_MBUF));
2543
                                     /*
                                      * NOTE(review): this chain picks the
                                      * maximum size of the class already
                                      * held in cl_class.
                                      */
2544                                 if (class == MC_MBUF_CL)
2545                                         size = m_maxsize(MC_CL);
2546                                 else if (class == MC_MBUF_BIGCL)
2547                                         size = m_maxsize(MC_BIGCL);
2548                                 else
2549                                         size = m_maxsize(MC_16KCL);
2550
2551                                 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2552                                     size);
2553                         }
2554                 }
2555
                     /*
                      * Construct the composite in ms: a free mbuf with the
                      * cluster attached, a zero reference count argument and
                      * the EXTF_COMPOSITE flag.
                      */
2556                 MBUF_INIT(ms, 0, MT_FREE);
2557                 if (class == MC_MBUF_16KCL) {
2558                         MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2559                 } else if (class == MC_MBUF_BIGCL) {
2560                         MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2561                 } else {
2562                         MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2563                 }
2564                 VERIFY(ms->m_flags == M_EXT);
2565                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2566
                     /*
                      * Append the actual mbuf (m, not ms) to the caller's
                      * list and advance the tail slot past it.
                      */
2567                 *list = (mcache_obj_t *)m;
2568                 (*list)->obj_next = NULL;
2569                 list = *plist = &(*list)->obj_next;
2570         }
2571
             /*
              * This label is reached both by the early exits above and by
              * falling out of the construction loop.
              */
2572 fail:
2573         /*
2574          * Free up what's left of the above.
2575          */
2576         if (mp_list != NULL)
2577                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2578         if (clp_list != NULL)
2579                 mcache_free_ext(m_cache(cl_class), clp_list);
2580         if (ref_list != NULL)
2581                 mcache_free_ext(ref_cache, ref_list);
2582
             /*
              * Update the statistics under the lock: only the newly
              * constructed objects (cnum) add to the class total, everything
              * handed back counts as allocated, and the shortfall against
              * the original request is recorded as failures.
              */
2583         lck_mtx_lock(mbuf_mlock);
2584         if (num > 0 || cnum > 0) {
2585                 m_total(class) += cnum;
2586                 VERIFY(m_total(class) <= m_maxlimit(class));
2587                 m_alloc_cnt(class) += num + cnum;
2588         }
2589         if ((num + cnum) < want)
2590                 m_fail_cnt(class) += (want - (num + cnum));
2591         lck_mtx_unlock(mbuf_mlock);
2592
2593         return (num + cnum);
2594 }
2595
2596 /*
2597  * Common de-allocator for composite objects called by the CPU cache
2598  * layer when one or more elements need to be returned to the appropriate
2599  * global freelist.
2600  */
2601 static void
2602 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2603 {
2604         mbuf_class_t class = (mbuf_class_t)arg;
2605         unsigned int num;
2606         int w;
2607
2608         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2609
2610         lck_mtx_lock(mbuf_mlock);
2611
2612         num = cslab_free(class, list, purged);
2613         m_free_cnt(class) += num;
2614
2615         if ((w = mb_waiters) > 0)
2616                 mb_waiters = 0;
2617
2618         lck_mtx_unlock(mbuf_mlock);
2619
2620         if (w != 0)
2621                 wakeup(mb_waitchan);
2622 }
2623
2624 /*
2625  * Common auditor for composite objects called by the CPU cache layer
2626  * during an allocation or free request.  For the former, this is called
2627  * after the objects are obtained from either the bucket or slab layer
2628  * and before they are returned to the caller.  For the latter, this is
2629  * called immediately during free and before placing the objects into
2630  * the bucket or slab layer.
      *
      * Parameters:
      *   arg    the composite mbuf class, passed as an opaque pointer
      *   list   chain of composite objects linked through obj_next
      *   alloc  non-zero when auditing an allocation, zero for a free
      *
      * For every object, both the mbuf's and the attached cluster's audit
      * records are checked, optionally logged, and have MB_COMP_INUSE set
      * (allocation) or cleared (free).  The mbuf lock is taken and dropped
      * once per object.
2631  */
2632 static void
2633 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2634 {
2635         mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2636         mcache_audit_t *mca;
2637         struct mbuf *m, *ms;
2638         mcl_slab_t *clsp, *nsp;
2639         size_t cl_size;
2640         void *cl;
2641
2642         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
             /* Map the composite class to the class of its attached cluster */
2643         if (class == MC_MBUF_CL)
2644                 cl_class = MC_CL;
2645         else if (class == MC_MBUF_BIGCL)
2646                 cl_class = MC_BIGCL;
2647         else
2648                 cl_class = MC_16KCL;
2649         cl_size = m_maxsize(cl_class);
2650
             /*
              * m is always the actual mbuf; ms starts out as the same
              * pointer and is switched to the saved copy further down
              * when that is the one holding valid contents.
              */
2651         while ((m = ms = (struct mbuf *)list) != NULL) {
2652                 lck_mtx_lock(mbuf_mlock);
2653                 /* Do the mbuf sanity checks and record its transaction */
2654                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2655                 mcl_audit_mbuf(mca, m, TRUE, alloc);
2656                 if (mcltrace)
2657                         mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2658
2659                 if (alloc)
2660                         mca->mca_uflags |= MB_COMP_INUSE;
2661                 else
2662                         mca->mca_uflags &= ~MB_COMP_INUSE;
2663
2664                 /*
2665                  * Use the shadow mbuf in the audit structure if we are
2666                  * freeing, since the contents of the actual mbuf has been
2667                  * pattern-filled by the above call to mcl_audit_mbuf().
2668                  */
2669                 if (!alloc && mclverify)
2670                         ms = MCA_SAVED_MBUF_PTR(mca);
2671
2672                 /* Do the cluster sanity checks and record its transaction */
2673                 cl = ms->m_ext.ext_buf;
2674                 clsp = slab_get(cl);
2675                 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2676                 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
                     /*
                      * The cluster slab's reference count must lie within
                      * [1, NCLPG] for MC_MBUF_CL, or [1, NBCLPG] otherwise.
                      */
2677                 if (class == MC_MBUF_CL)
2678                         VERIFY(clsp->sl_refcnt >= 1 &&
2679                             clsp->sl_refcnt <= NCLPG);
2680                 else
2681                         VERIFY(clsp->sl_refcnt >= 1 &&
2682                             clsp->sl_refcnt <= NBCLPG);
2683
                     /*
                      * For 16KB composites, the NSLABSP16KB - 1 slabs that
                      * follow the cluster's slab on the sl_next chain must
                      * be present, each with a reference count of exactly 1.
                      */
2684                 if (class == MC_MBUF_16KCL) {
2685                         int k;
2686                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2687                                 nsp = nsp->sl_next;
2688                                 /* Next slab must already be present */
2689                                 VERIFY(nsp != NULL);
2690                                 VERIFY(nsp->sl_refcnt == 1);
2691                         }
2692                 }
2693
2694
                     /*
                      * From here on mca refers to the cluster's audit record.
                      * The transaction is logged against the composite
                      * class's cache, same as for the mbuf above.
                      */
2695                 mca = mcl_audit_buf2mca(cl_class, cl);
2696                 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2697                 if (mcltrace)
2698                         mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2699
2700                 if (alloc)
2701                         mca->mca_uflags |= MB_COMP_INUSE;
2702                 else
2703                         mca->mca_uflags &= ~MB_COMP_INUSE;
2704                 lck_mtx_unlock(mbuf_mlock);
2705
                     /* Advance via the actual object's link, not the shadow's */
2706                 list = list->obj_next;
2707         }
2708 }
2709
2710 /*
2711  * Allocate some number of mbuf clusters and place on cluster freelist.
2712  */
2713 static int
2714 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2715 {
2716         int i, count = 0;
2717         vm_size_t size = 0;
2718         int numpages = 0, large_buffer;
2719         vm_offset_t page = 0;
2720         mcache_audit_t *mca_list = NULL;
2721         mcache_obj_t *con_list = NULL;
2722         mcl_slab_t *sp;
2723         mbuf_class_t class;
2724
2725         /* Set if a buffer allocation needs allocation of multiple pages */
2726         large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2727                 PAGE_SIZE < M16KCLBYTES);
2728         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2729             bufsize == m_maxsize(MC_16KCL));
2730
2731         VERIFY((bufsize == PAGE_SIZE) ||
2732             (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2733
2734         if (bufsize == m_size(MC_BIGCL))
2735                 class = MC_BIGCL;
2736         else
2737                 class = MC_16KCL;
2738
2739         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2740
2741         /*
2742          * Multiple threads may attempt to populate the cluster map one
2743          * after another.  Since we drop the lock below prior to acquiring
2744          * the physical page(s), our view of the cluster map may no longer
2745          * be accurate, and we could end up over-committing the pages beyond
2746          * the maximum allowed for each class.  To prevent it, this entire
2747          * operation (including the page mapping) is serialized.
2748          */
2749         while (mb_clalloc_busy) {
2750                 mb_clalloc_waiters++;
2751                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2752                     (PZERO-1), "m_clalloc", NULL);
2753                 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2754         }
2755
2756         /* We are busy now; tell everyone else to go away */
2757         mb_clalloc_busy = TRUE;
2758
2759         /*
2760          * Honor the caller's wish to block or not block.  We have a way
2761          * to grow the pool asynchronously using the mbuf worker thread.
2762          */
2763         i = m_howmany(num, bufsize);
2764         if (i == 0 || (wait & M_DONTWAIT))
2765                 goto out;
2766
2767         lck_mtx_unlock(mbuf_mlock);
2768
2769         size = round_page(i * bufsize);
2770         page = kmem_mb_alloc(mb_map, size, large_buffer);
2771
2772         /*
2773          * If we did ask for "n" 16KB physically contiguous chunks
2774          * and didn't get them, then please try again without this
2775          * restriction.
2776          */
2777         if (large_buffer && page == 0)
2778                 page = kmem_mb_alloc(mb_map, size, 0);
2779
2780         if (page == 0) {
2781                 if (bufsize == m_maxsize(MC_BIGCL)) {
2782                         /* Try for 1 page if failed */
2783                         size = PAGE_SIZE;
2784                         page = kmem_mb_alloc(mb_map, size, 0);
2785                 }
2786
2787                 if (page == 0) {
2788                         lck_mtx_lock(mbuf_mlock);
2789                         goto out;
2790                 }
2791         }
2792
2793         VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2794         numpages = size / PAGE_SIZE;
2795
2796         /* If auditing is enabled, allocate the audit structures now */
2797         if (mclaudit != NULL) {
2798                 int needed;
2799
2800                 /*
2801                  * Yes, I realize this is a waste of memory for clusters
2802                  * that never get transformed into mbufs, as we may end
2803                  * up with NMBPG-1 unused audit structures per cluster.
2804                  * But doing so tremendously simplifies the allocation
2805                  * strategy, since at this point we are not holding the
2806                  * mbuf lock and the caller is okay to be blocked.
2807                  */
2808                 if (bufsize == PAGE_SIZE) {
2809                         needed = numpages * NMBPG;
2810
2811                         i = mcache_alloc_ext(mcl_audit_con_cache,
2812                             &con_list, needed, MCR_SLEEP);
2813
2814                         VERIFY(con_list != NULL && i == needed);
2815                 } else {
2816                         /*
2817                          * if multiple 4K pages are being used for a
2818                          * 16K cluster
2819                          */
2820                         needed = numpages / NSLABSP16KB;
2821                 }
2822
2823                 i = mcache_alloc_ext(mcache_audit_cache,
2824                     (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2825
2826                 VERIFY(mca_list != NULL && i == needed);
2827         }
2828
2829         lck_mtx_lock(mbuf_mlock);
2830
2831         for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
2832                 ppnum_t offset =
2833                     ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
2834                 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2835
2836                 /*
2837                  * If there is a mapper the appropriate I/O page is
2838                  * returned; zero out the page to discard its past
2839                  * contents to prevent exposing leftover kernel memory.
2840                  */
2841                 VERIFY(offset < mcl_pages);
2842                 if (mcl_paddr_base != 0) {
2843                         bzero((void *)(uintptr_t) page, PAGE_SIZE);
2844                         new_page = IOMapperInsertPage(mcl_paddr_base,
2845                             offset, new_page);
2846                 }
2847                 mcl_paddr[offset] = new_page;
2848
2849                 /* Pattern-fill this fresh page */
2850                 if (mclverify) {
2851                         mcache_set_pattern(MCACHE_FREE_PATTERN,
2852                             (caddr_t)page, PAGE_SIZE);
2853                 }
2854                 if (bufsize == PAGE_SIZE) {
2855                         mcache_obj_t *buf;
2856                         /* One for the entire page */
2857                         sp = slab_get((void *)page);
2858                         if (mclaudit != NULL) {
2859                                 mcl_audit_init((void *)page,
2860                                     &mca_list, &con_list,
2861                                     AUDIT_CONTENTS_SIZE, NMBPG);
2862                         }
2863                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2864                         slab_init(sp, class, SLF_MAPPED, (void *)page,
2865                             (void *)page, PAGE_SIZE, 0, 1);
2866                         buf = (mcache_obj_t *)page;
2867                         buf->obj_next = NULL;
2868
2869                         /* Insert this slab */
2870                         slab_insert(sp, class);
2871
2872                         /* Update stats now since slab_get drops the lock */
2873                         ++m_infree(class);
2874                         ++m_total(class);
2875                         VERIFY(m_total(class) <= m_maxlimit(class));
2876                         if (class == MC_BIGCL) {
2877                                 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2878                                     m_infree(MC_MBUF_BIGCL);
2879                                 mbstat.m_bigclusters = m_total(MC_BIGCL);
2880                         }
2881                         ++count;
2882                 } else if ((bufsize > PAGE_SIZE) &&
2883                     (i % NSLABSP16KB) == 0) {
2884                         union m16kcluster *m16kcl = (union m16kcluster *)page;
2885                         mcl_slab_t *nsp;
2886                         int k;
2887
2888                         /* One for the entire 16KB */
2889                         sp = slab_get(m16kcl);
2890                         if (mclaudit != NULL)
2891                                 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2892
2893                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2894                         slab_init(sp, MC_16KCL, SLF_MAPPED,
2895                             m16kcl, m16kcl, bufsize, 0, 1);
2896                         m16kcl->m16kcl_next = NULL;
2897
2898                         /*
2899                          * 2nd-Nth page's slab is part of the first one,
2900                          * where N is NSLABSP16KB.
2901                          */
2902                         for (k = 1; k < NSLABSP16KB; k++) {
2903                                 nsp = slab_get(((union mbigcluster *)page) + k);
2904                                 VERIFY(nsp->sl_refcnt == 0 &&
2905                                     nsp->sl_flags == 0);
2906                                 slab_init(nsp, MC_16KCL,
2907                                     SLF_MAPPED | SLF_PARTIAL,
2908                                     m16kcl, NULL, 0, 0, 0);
2909                         }
2910                         /* Insert this slab */
2911                         slab_insert(sp, MC_16KCL);
2912
2913                         /* Update stats now since slab_get drops the lock */
2914                         ++m_infree(MC_16KCL);
2915                         ++m_total(MC_16KCL);
2916                         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2917                         ++count;
2918                 }
2919         }
2920         VERIFY(mca_list == NULL && con_list == NULL);
2921
2922         if (!mb_peak_newreport && mbuf_report_usage(class))
2923                 mb_peak_newreport = TRUE;
2924
2925         /* We're done; let others enter */
2926         mb_clalloc_busy = FALSE;
2927         if (mb_clalloc_waiters > 0) {
2928                 mb_clalloc_waiters = 0;
2929                 wakeup(mb_clalloc_waitchan);
2930         }
2931
2932         return (count);
2933 out:
2934         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2935
2936         /* We're done; let others enter */
2937         mb_clalloc_busy = FALSE;
2938         if (mb_clalloc_waiters > 0) {
2939                 mb_clalloc_waiters = 0;
2940                 wakeup(mb_clalloc_waitchan);
2941         }
2942
2943         /*
2944          * When non-blocking we kick a thread if we have to grow the
2945          * pool or if the number of free clusters is less than requested.
2946          */
2947         if (class == MC_BIGCL) {
2948                 if (i > 0) {
2949                         /*
2950                          * Remember total number of 4KB clusters needed
2951                          * at this time.
2952                          */
2953                         i += m_total(MC_BIGCL);
2954                         if (i > mbuf_expand_big) {
2955                                 mbuf_expand_big = i;
2956                                 if (mbuf_worker_ready)
2957                                         wakeup((caddr_t)&mbuf_worker_run);
2958                         }
2959                 }
2960
2961                 if (m_infree(MC_BIGCL) >= num)
2962                         return (1);
2963         } else {
2964                 if (i > 0) {
2965                         /*
2966                          * Remember total number of 16KB clusters needed
2967                          * at this time.
2968                          */
2969                         i += m_total(MC_16KCL);
2970                         if (i > mbuf_expand_16k) {
2971                                 mbuf_expand_16k = i;
2972                                 if (mbuf_worker_ready)
2973                                         wakeup((caddr_t)&mbuf_worker_run);
2974                         }
2975                 }
2976
2977                 if (m_infree(MC_16KCL) >= num)
2978                         return (1);
2979         }
2980         return (0);
2981 }
2982
2983 /*
2984  * Populate the global freelist of the corresponding buffer class.
2985  */
2986 static int
2987 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2988 {
2989         mcache_obj_t *o = NULL;
2990         int i, numpages = 0, count;
2991         mbuf_class_t super_class;
2992
2993         VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2994             class == MC_16KCL);
2995
2996         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2997
2998         VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
2999             PAGE_SIZE == m_maxsize(MC_16KCL));
3000
3001         if (m_maxsize(class) >= PAGE_SIZE)
3002                 return(m_clalloc(num, wait, m_maxsize(class)) != 0);
3003
3004         /*
3005          * The rest of the function will allocate pages and will slice
3006          * them up into the right size
3007          */
3008
3009         numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
3010
3011         /* Currently assume that pages are 4K or 16K */
3012         if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3013                 super_class = MC_BIGCL;
3014         else
3015                 super_class = MC_16KCL;
3016
3017         i = m_clalloc(numpages, wait, m_maxsize(super_class));
3018
3019         /* Respect the minimum limit  of super class */
3020         if (m_total(super_class) == m_maxlimit(super_class) &&
3021             m_infree(super_class) <= m_minlimit(super_class))
3022                 if (wait & MCR_COMP)
3023                                 return (0);
3024
3025         /* how many objects will we cut the page into? */
3026         int numobj = PAGE_SIZE / m_maxsize(class);
3027
3028         for (count = 0; count < numpages; count++) {
3029                 /* respect totals, minlimit, maxlimit */
3030                 if (m_total(super_class) <= m_minlimit(super_class) ||
3031                     m_total(class) >= m_maxlimit(class))
3032                         break;
3033
3034                 if ((o = slab_alloc(super_class, wait)) == NULL)
3035                         break;
3036
3037                 struct mbuf *m = (struct mbuf *)o;
3038                 union mcluster *c = (union mcluster *)o;
3039                 union mbigcluster *mbc = (union mbigcluster *)o;
3040                 mcl_slab_t *sp = slab_get(o);
3041                 mcache_audit_t *mca = NULL;
3042
3043                 /*
3044                  * since one full page will be converted to MC_MBUF or
3045                  * MC_CL, verify that the reference count will match that
3046                  * assumption
3047                  */
3048                 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
3049                 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
3050                 /*
3051                  * Make sure that the cluster is unmolested
3052                  * while in freelist
3053                  */
3054                 if (mclverify) {
3055                         mca = mcl_audit_buf2mca(super_class,
3056                             (mcache_obj_t *)o);
3057                         mcache_audit_free_verify(mca,
3058                             (mcache_obj_t *)o, 0, m_maxsize(super_class));
3059                 }
3060
3061                 /* Reinitialize it as an mbuf or 2K or 4K slab */
3062                 slab_init(sp, class, sp->sl_flags,
3063                     sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
3064
3065                 VERIFY(sp->sl_head == NULL);
3066
3067                 VERIFY(m_total(super_class) >= 1);
3068                 m_total(super_class)--;
3069
3070                 if (super_class == MC_BIGCL)
3071                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3072
3073                 m_total(class) += numobj;
3074                 m_infree(class) += numobj;
3075
3076                 if (!mb_peak_newreport && mbuf_report_usage(class))
3077                         mb_peak_newreport = TRUE;
3078
3079                 i = numobj;
3080                 if (class == MC_MBUF) {
3081                         mbstat.m_mbufs = m_total(MC_MBUF);
3082                         mtype_stat_add(MT_FREE, NMBPG);
3083                         while (i--) {
3084                                 /*
3085                                  * If auditing is enabled, construct the
3086                                  * shadow mbuf in the audit structure
3087                                  * instead of the actual one.
3088                                  * mbuf_slab_audit() will take care of
3089                                  * restoring the contents after the
3090                                  * integrity check.
3091                                  */
3092                                 if (mclaudit != NULL) {
3093                                         struct mbuf *ms;
3094                                         mca = mcl_audit_buf2mca(MC_MBUF,
3095                                             (mcache_obj_t *)m);
3096                                         ms = MCA_SAVED_MBUF_PTR(mca);
3097                                         ms->m_type = MT_FREE;
3098                                 } else {
3099                                         m->m_type = MT_FREE;
3100                                 }
3101                                 m->m_next = sp->sl_head;
3102                                 sp->sl_head = (void *)m++;
3103                         }
3104                 } else if (class == MC_CL) { /* MC_CL */
3105                         mbstat.m_clfree =
3106                             m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3107                         mbstat.m_clusters = m_total(MC_CL);
3108                         while (i--) {
3109                                 c->mcl_next = sp->sl_head;
3110                                 sp->sl_head = (void *)c++;
3111                         }
3112                 } else {
3113                         VERIFY(class == MC_BIGCL);
3114                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3115                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3116                             m_infree(MC_MBUF_BIGCL);
3117                         while (i--) {
3118                                 mbc->mbc_next = sp->sl_head;
3119                                 sp->sl_head = (void *)mbc++;
3120                         }
3121                 }
3122
3123                 /* Insert into the mbuf or 2k or 4k slab list */
3124                 slab_insert(sp, class);
3125
3126                 if ((i = mb_waiters) > 0)
3127                         mb_waiters = 0;
3128                 if (i != 0)
3129                         wakeup(mb_waitchan);
3130         }
3131         return (count != 0);
3132 }
3133
3134 /*
3135  * For each class, initialize the freelist to hold m_minlimit() objects.
3136  */
3137 static void
3138 freelist_init(mbuf_class_t class)
3139 {
3140         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3141
3142         VERIFY(class == MC_CL || class == MC_BIGCL);
3143         VERIFY(m_total(class) == 0);
3144         VERIFY(m_minlimit(class) > 0);
3145
3146         while (m_total(class) < m_minlimit(class))
3147                 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3148
3149         VERIFY(m_total(class) >= m_minlimit(class));
3150 }
3151
3152 /*
3153  * (Inaccurately) check if it might be worth a trip back to the
3154  * mcache layer due the availability of objects there.  We'll
3155  * end up back here if there's nothing up there.
3156  */
3157 static boolean_t
3158 mbuf_cached_above(mbuf_class_t class, int wait)
3159 {
3160         switch (class) {
3161         case MC_MBUF:
3162                 if (wait & MCR_COMP)
3163                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3164                             !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3165                 break;
3166
3167         case MC_CL:
3168                 if (wait & MCR_COMP)
3169                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3170                 break;
3171
3172         case MC_BIGCL:
3173                 if (wait & MCR_COMP)
3174                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3175                 break;
3176
3177         case MC_16KCL:
3178                 if (wait & MCR_COMP)
3179                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3180                 break;
3181
3182         case MC_MBUF_CL:
3183         case MC_MBUF_BIGCL:
3184         case MC_MBUF_16KCL:
3185                 break;
3186
3187         default:
3188                 VERIFY(0);
3189                 /* NOTREACHED */
3190         }
3191
3192         return (!mcache_bkt_isempty(m_cache(class)));
3193 }
3194
3195 /*
3196  * If possible, convert constructed objects to raw ones.
3197  */
3198 static boolean_t
3199 mbuf_steal(mbuf_class_t class, unsigned int num)
3200 {
3201         mcache_obj_t *top = NULL;
3202         mcache_obj_t **list = &top;
3203         unsigned int tot = 0;
3204
3205         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3206
3207         switch (class) {
3208         case MC_MBUF:
3209         case MC_CL:
3210         case MC_BIGCL:
3211         case MC_16KCL:
3212                 return (FALSE);
3213
3214         case MC_MBUF_CL:
3215         case MC_MBUF_BIGCL:
3216         case MC_MBUF_16KCL:
3217                 /* Get the required number of constructed objects if possible */
3218                 if (m_infree(class) > m_minlimit(class)) {
3219                         tot = cslab_alloc(class, &list,
3220                             MIN(num, m_infree(class)));
3221                 }
3222
3223                 /* And destroy them to get back the raw objects */
3224                 if (top != NULL)
3225                         (void) cslab_free(class, top, 1);
3226                 break;
3227
3228         default:
3229                 VERIFY(0);
3230                 /* NOTREACHED */
3231         }
3232
3233         return (tot == num);
3234 }
3235
3236 static void
3237 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3238 {
3239         int m, bmap = 0;
3240
3241         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3242
3243         VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3244         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3245         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3246
3247         /*
3248          * This logic can be made smarter; for now, simply mark
3249          * all other related classes as potential victims.
3250          */
3251         switch (class) {
3252         case MC_MBUF:
3253                 m_wantpurge(MC_CL)++;
3254                 m_wantpurge(MC_BIGCL)++;
3255                 m_wantpurge(MC_MBUF_CL)++;
3256                 m_wantpurge(MC_MBUF_BIGCL)++;
3257                 break;
3258
3259         case MC_CL:
3260                 m_wantpurge(MC_MBUF)++;
3261                 m_wantpurge(MC_BIGCL)++;
3262                 m_wantpurge(MC_MBUF_BIGCL)++;
3263                 if (!comp)
3264                         m_wantpurge(MC_MBUF_CL)++;
3265                 break;
3266
3267         case MC_BIGCL:
3268                 m_wantpurge(MC_MBUF)++;
3269                 m_wantpurge(MC_CL)++;
3270                 m_wantpurge(MC_MBUF_CL)++;
3271                 if (!comp)
3272                         m_wantpurge(MC_MBUF_BIGCL)++;
3273                 break;
3274
3275         case MC_16KCL:
3276                 if (!comp)
3277                         m_wantpurge(MC_MBUF_16KCL)++;
3278                 break;
3279
3280         default:
3281                 VERIFY(0);
3282                 /* NOTREACHED */
3283         }
3284
3285         /*
3286          * Run through each marked class and check if we really need to
3287          * purge (and therefore temporarily disable) the per-CPU caches
3288          * layer used by the class.  If so, remember the classes since
3289          * we are going to drop the lock below prior to purging.
3290          */
3291         for (m = 0; m < NELEM(mbuf_table); m++) {
3292                 if (m_wantpurge(m) > 0) {
3293                         m_wantpurge(m) = 0;
3294                         /*
3295                          * Try hard to steal the required number of objects
3296                          * from the freelist of other mbuf classes.  Only
3297                          * purge and disable the per-CPU caches layer when
3298                          * we don't have enough; it's the last resort.
3299                          */
3300                         if (!mbuf_steal(m, num))
3301                                 bmap |= (1 << m);
3302                 }
3303         }
3304
3305         lck_mtx_unlock(mbuf_mlock);
3306
3307         if (bmap != 0) {
3308                 /* signal the domains to drain */
3309                 net_drain_domains();
3310
3311                 /* Sigh; we have no other choices but to ask mcache to purge */
3312                 for (m = 0; m < NELEM(mbuf_table); m++) {
3313                         if ((bmap & (1 << m)) &&
3314                             mcache_purge_cache(m_cache(m), TRUE)) {
3315                                 lck_mtx_lock(mbuf_mlock);
3316                                 m_purge_cnt(m)++;
3317                                 mbstat.m_drain++;
3318                                 lck_mtx_unlock(mbuf_mlock);
3319                         }
3320                 }
3321         } else {
3322                 /*
3323                  * Request mcache to reap extra elements from all of its caches;
3324                  * note that all reaps are serialized and happen only at a fixed
3325                  * interval.
3326                  */
3327                 mcache_reap();
3328         }
3329         lck_mtx_lock(mbuf_mlock);
3330 }
3331
3332 static inline struct mbuf *
3333 m_get_common(int wait, short type, int hdr)
3334 {
3335         struct mbuf *m;
3336         int mcflags = MSLEEPF(wait);
3337
3338         /* Is this due to a non-blocking retry?  If so, then try harder */
3339         if (mcflags & MCR_NOSLEEP)
3340                 mcflags |= MCR_TRYHARD;
3341
3342         m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3343         if (m != NULL) {
3344                 MBUF_INIT(m, hdr, type);
3345                 mtype_stat_inc(type);
3346                 mtype_stat_dec(MT_FREE);
3347 #if CONFIG_MACF_NET
3348                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3349                         m_free(m);
3350                         return (NULL);
3351                 }
3352 #endif /* MAC_NET */
3353         }
3354         return (m);
3355 }
3356
/*
 * Space allocation shorthands built on m_get_common(); the same requests
 * are also provided as functions below.  The "retry" forms expand to
 * exactly the same calls as the regular ones, and the _MGET/_MGETHDR
 * forms additionally assign the result to `m'.
 */
#define _M_GET(wait, type)      m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
#define _M_RETRY(wait, type)    m_get_common(wait, type, 0)
#define _M_RETRYHDR(wait, type) m_get_common(wait, type, 1)
#define _MGET(m, how, type)     ((m) = m_get_common(how, type, 0))
#define _MGETHDR(m, how, type)  ((m) = m_get_common(how, type, 1))
3367
/* Function form of _MGET(): allocate an mbuf of the given type */
struct mbuf *
m_get(int wait, int type)
{
        struct mbuf *m;

        _MGET(m, wait, type);
        return (m);
}
3373
/* Function form of _MGETHDR(): same as m_get(), with `hdr' set to 1 */
struct mbuf *
m_gethdr(int wait, int type)
{
        struct mbuf *m;

        _MGETHDR(m, wait, type);
        return (m);
}
3379
/* Function form of _M_RETRY(), which is the same request as _M_GET() */
struct mbuf *
m_retry(int wait, int type)
{
        struct mbuf *m = _M_RETRY(wait, type);

        return (m);
}
3385
/* Function form of _M_RETRYHDR(), the same request as _M_GETHDR() */
struct mbuf *
m_retryhdr(int wait, int type)
{
        struct mbuf *m = _M_RETRYHDR(wait, type);

        return (m);
}
3391
3392 struct mbuf *
3393 m_getclr(int wait, int type)
3394 {
3395         struct mbuf *m;
3396
3397         _MGET(m, wait, type);
3398         if (m != NULL)
3399                 bzero(MTOD(m, caddr_t), MLEN);
3400         return (m);
3401 }
3402
3403 struct mbuf *
3404 m_free(struct mbuf *m)
3405 {
3406         struct mbuf *n = m->m_next;
3407
3408         if (m->m_type == MT_FREE)
3409                 panic("m_free: freeing an already freed mbuf");
3410
3411         if (m->m_flags & M_PKTHDR) {
3412                 /* Check for scratch area overflow */
3413                 m_redzone_verify(m);
3414                 /* Free the aux data and tags if there is any */
3415                 m_tag_delete_chain(m, NULL);
3416         }
3417
3418         if (m->m_flags & M_EXT) {
3419                 u_int32_t refcnt;
3420                 u_int32_t composite;
3421
3422                 refcnt = m_decref(m);
3423                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3424                 if (refcnt == 0 && !composite) {
3425                         if (m->m_ext.ext_free == NULL) {
3426                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3427                         } else if (m->m_ext.ext_free == m_bigfree) {
3428                                 mcache_free(m_cache(MC_BIGCL),
3429                                     m->m_ext.ext_buf);
3430                         } else if (m->m_ext.ext_free == m_16kfree) {
3431                                 mcache_free(m_cache(MC_16KCL),
3432                                     m->m_ext.ext_buf);
3433                         } else {
3434                                 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3435                                     m->m_ext.ext_size, m->m_ext.ext_arg);
3436                         }
3437                         mcache_free(ref_cache, MEXT_RFA(m));
3438                         MEXT_RFA(m) = NULL;
3439                 } else if (refcnt == 0 && composite) {
3440                         VERIFY(m->m_type != MT_FREE);
3441
3442                         mtype_stat_dec(m->m_type);
3443                         mtype_stat_inc(MT_FREE);
3444
3445                         m->m_type = MT_FREE;
3446                         m->m_flags = M_EXT;
3447                         m->m_len = 0;
3448                         m->m_next = m->m_nextpkt = NULL;
3449
3450                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3451
3452                         /* "Free" into the intermediate cache */
3453                         if (m->m_ext.ext_free == NULL) {
3454                                 mcache_free(m_cache(MC_MBUF_CL), m);
3455                         } else if (m->m_ext.ext_free == m_bigfree) {
3456                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3457                         } else {
3458                                 VERIFY(m->m_ext.ext_free == m_16kfree);
3459                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3460                         }
3461                         return (n);
3462                 }
3463         }
3464
3465         if (m->m_type != MT_FREE) {
3466                 mtype_stat_dec(m->m_type);
3467                 mtype_stat_inc(MT_FREE);
3468         }
3469
3470         m->m_type = MT_FREE;
3471         m->m_flags = m->m_len = 0;
3472         m->m_next = m->m_nextpkt = NULL;
3473
3474         mcache_free(m_cache(MC_MBUF), m);
3475
3476         return (n);
3477 }
3478
3479 __private_extern__ struct mbuf *
3480 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3481     void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3482     int wait)
3483 {
3484         struct ext_ref *rfa = NULL;
3485
3486         if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3487                 return (NULL);
3488
3489         if (m->m_flags & M_EXT) {
3490                 u_int32_t refcnt;
3491                 u_int32_t composite;
3492
3493                 refcnt = m_decref(m);
3494                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3495                 if (refcnt == 0 && !composite) {
3496                         if (m->m_ext.ext_free == NULL) {
3497                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3498                         } else if (m->m_ext.ext_free == m_bigfree) {
3499                                 mcache_free(m_cache(MC_BIGCL),
3500                                     m->m_ext.ext_buf);
3501                         } else if (m->m_ext.ext_free == m_16kfree) {
3502                                 mcache_free(m_cache(MC_16KCL),
3503                                     m->m_ext.ext_buf);
3504                         } else {
3505                                 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3506                                     m->m_ext.ext_size, m->m_ext.ext_arg);
3507                         }
3508                         /* Re-use the reference structure */
3509                         rfa = MEXT_RFA(m);
3510                 } else if (refcnt == 0 && composite) {
3511                         VERIFY(m->m_type != MT_FREE);
3512
3513                         mtype_stat_dec(m->m_type);
3514                         mtype_stat_inc(MT_FREE);
3515
3516                         m->m_type = MT_FREE;
3517                         m->m_flags = M_EXT;
3518                         m->m_len = 0;
3519                         m->m_next = m->m_nextpkt = NULL;
3520
3521                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3522
3523                         /* "Free" into the intermediate cache */
3524                         if (m->m_ext.ext_free == NULL) {
3525                                 mcache_free(m_cache(MC_MBUF_CL), m);
3526                         } else if (m->m_ext.ext_free == m_bigfree) {
3527                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3528                         } else {
3529                                 VERIFY(m->m_ext.ext_free == m_16kfree);
3530                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3531                         }
3532                         /*
3533                          * Allocate a new mbuf, since we didn't divorce
3534                          * the composite mbuf + cluster pair above.
3535                          */
3536                         if ((m = _M_GETHDR(wait, type)) == NULL)
3537                                 return (NULL);
3538                 }
3539         }
3540
3541         if (rfa == NULL &&
3542             (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3543                 m_free(m);
3544                 return (NULL);
3545         }
3546
3547         MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3548
3549         return (m);
3550 }
3551
3552 /*
3553  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3554  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3555  */
3556 struct mbuf *
3557 m_getcl(int wait, int type, int flags)
3558 {
3559         struct mbuf *m;
3560         int mcflags = MSLEEPF(wait);
3561         int hdr = (flags & M_PKTHDR);
3562
3563         /* Is this due to a non-blocking retry?  If so, then try harder */
3564         if (mcflags & MCR_NOSLEEP)
3565                 mcflags |= MCR_TRYHARD;
3566
3567         m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3568         if (m != NULL) {
3569                 u_int32_t flag;
3570                 struct ext_ref *rfa;
3571                 void *cl;
3572
3573                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3574                 cl = m->m_ext.ext_buf;
3575                 rfa = MEXT_RFA(m);
3576
3577                 ASSERT(cl != NULL && rfa != NULL);
3578                 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3579
3580                 flag = MEXT_FLAGS(m);
3581
3582                 MBUF_INIT(m, hdr, type);
3583                 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3584
3585                 mtype_stat_inc(type);
3586                 mtype_stat_dec(MT_FREE);
3587 #if CONFIG_MACF_NET
3588                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3589                         m_freem(m);
3590                         return (NULL);
3591                 }
3592 #endif /* MAC_NET */
3593         }
3594         return (m);
3595 }
3596
3597 /* m_mclget() add an mbuf cluster to a normal mbuf */
3598 struct mbuf *
3599 m_mclget(struct mbuf *m, int wait)
3600 {
3601         struct ext_ref *rfa;
3602
3603         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3604                 return (m);
3605
3606         m->m_ext.ext_buf = m_mclalloc(wait);
3607         if (m->m_ext.ext_buf != NULL) {
3608                 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3609         } else {
3610                 mcache_free(ref_cache, rfa);
3611         }
3612         return (m);
3613 }
3614
3615 /* Allocate an mbuf cluster */
3616 caddr_t
3617 m_mclalloc(int wait)
3618 {
3619         int mcflags = MSLEEPF(wait);
3620
3621         /* Is this due to a non-blocking retry?  If so, then try harder */
3622         if (mcflags & MCR_NOSLEEP)
3623                 mcflags |= MCR_TRYHARD;
3624
3625         return (mcache_alloc(m_cache(MC_CL), mcflags));
3626 }
3627
3628 /* Free an mbuf cluster */
3629 void
3630 m_mclfree(caddr_t p)
3631 {
3632         mcache_free(m_cache(MC_CL), p);
3633 }
3634
3635 /*
3636  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3637  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3638  */
3639 int
3640 m_mclhasreference(struct mbuf *m)
3641 {
3642         if (!(m->m_flags & M_EXT))
3643                 return (0);
3644
3645         ASSERT(MEXT_RFA(m) != NULL);
3646
3647         return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3648 }
3649
3650 __private_extern__ caddr_t
3651 m_bigalloc(int wait)
3652 {
3653         int mcflags = MSLEEPF(wait);
3654
3655         /* Is this due to a non-blocking retry?  If so, then try harder */
3656         if (mcflags & MCR_NOSLEEP)
3657                 mcflags |= MCR_TRYHARD;
3658
3659         return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3660 }
3661
3662 __private_extern__ void
3663 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3664 {
3665         mcache_free(m_cache(MC_BIGCL), p);
3666 }
3667
3668 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3669 __private_extern__ struct mbuf *
3670 m_mbigget(struct mbuf *m, int wait)
3671 {
3672         struct ext_ref *rfa;
3673
3674         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3675                 return (m);
3676
3677         m->m_ext.ext_buf =  m_bigalloc(wait);
3678         if (m->m_ext.ext_buf != NULL) {
3679                 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3680         } else {
3681                 mcache_free(ref_cache, rfa);
3682         }
3683         return (m);
3684 }
3685
3686 __private_extern__ caddr_t
3687 m_16kalloc(int wait)
3688 {
3689         int mcflags = MSLEEPF(wait);
3690
3691         /* Is this due to a non-blocking retry?  If so, then try harder */
3692         if (mcflags & MCR_NOSLEEP)
3693                 mcflags |= MCR_TRYHARD;
3694
3695         return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3696 }
3697
3698 __private_extern__ void
3699 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3700 {
3701         mcache_free(m_cache(MC_16KCL), p);
3702 }
3703
3704 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3705 __private_extern__ struct mbuf *
3706 m_m16kget(struct mbuf *m, int wait)
3707 {
3708         struct ext_ref *rfa;
3709
3710         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3711                 return (m);
3712
3713         m->m_ext.ext_buf =  m_16kalloc(wait);
3714         if (m->m_ext.ext_buf != NULL) {
3715                 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3716         } else {
3717                 mcache_free(ref_cache, rfa);
3718         }
3719         return (m);
3720 }
3721
3722 /*
3723  * "Move" mbuf pkthdr from "from" to "to".
3724  * "from" must have M_PKTHDR set, and "to" must be empty.
3725  */
3726 void
3727 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3728 {
3729         VERIFY(from->m_flags & M_PKTHDR);
3730
3731         /* Check for scratch area overflow */
3732         m_redzone_verify(from);
3733
3734         if (to->m_flags & M_PKTHDR) {
3735                 /* Check for scratch area overflow */
3736                 m_redzone_verify(to);
3737                 /* We will be taking over the tags of 'to' */
3738                 m_tag_delete_chain(to, NULL);
3739         }
3740         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
3741         m_classifier_init(from, 0);             /* purge classifier info */
3742         m_tag_init(from, 1);                    /* purge all tags from src */
3743         m_scratch_init(from);                   /* clear src scratch area */
3744         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3745         if ((to->m_flags & M_EXT) == 0)
3746                 to->m_data = to->m_pktdat;
3747         m_redzone_init(to);                     /* setup red zone on dst */
3748 }
3749
3750 /*
3751  * Duplicate "from"'s mbuf pkthdr in "to".
3752  * "from" must have M_PKTHDR set, and "to" must be empty.
3753  * In particular, this does a deep copy of the packet tags.
3754  */
3755 static int
3756 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3757 {
3758         VERIFY(from->m_flags & M_PKTHDR);
3759
3760         /* Check for scratch area overflow */
3761         m_redzone_verify(from);
3762
3763         if (to->m_flags & M_PKTHDR) {
3764                 /* Check for scratch area overflow */
3765                 m_redzone_verify(to);
3766                 /* We will be taking over the tags of 'to' */
3767                 m_tag_delete_chain(to, NULL);
3768         }
3769         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3770         if ((to->m_flags & M_EXT) == 0)
3771                 to->m_data = to->m_pktdat;
3772         to->m_pkthdr = from->m_pkthdr;
3773         m_redzone_init(to);                     /* setup red zone on dst */
3774         m_tag_init(to, 0);                      /* preserve dst static tags */
3775         return (m_tag_copy_chain(to, from, how));
3776 }
3777
3778 void
3779 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3780 {
3781         to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3782 #if PF_ECN
3783         to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3784         to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3785 #endif /* PF_ECN */
3786 }
3787
3788 void
3789 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3790 {
3791         VERIFY(m->m_flags & M_PKTHDR);
3792
3793         m->m_pkthdr.pkt_proto = 0;
3794         m->m_pkthdr.pkt_flowsrc = 0;
3795         m->m_pkthdr.pkt_flowid = 0;
3796         m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
3797         /* preserve service class and interface info for loopback packets */
3798         if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3799                 (void) m_set_service_class(m, MBUF_SC_BE);
3800         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3801                 m->m_pkthdr.pkt_ifainfo = 0;
3802 #if MEASURE_BW
3803         m->m_pkthdr.pkt_bwseq  = 0;
3804 #endif /* MEASURE_BW */
3805         m->m_pkthdr.pkt_enqueue_ts = 0;
3806 }
3807
3808 void
3809 m_copy_classifier(struct mbuf *to, struct mbuf *from)
3810 {
3811         VERIFY(to->m_flags & M_PKTHDR);
3812         VERIFY(from->m_flags & M_PKTHDR);
3813
3814         to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3815         to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3816         to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3817         to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3818         (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3819         to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
3820 #if MEASURE_BW
3821         to->m_pkthdr.pkt_bwseq  = from->m_pkthdr.pkt_bwseq;
3822 #endif /* MEASURE_BW */
3823 }
3824
3825 /*
3826  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
3827  * if wantall is not set, return whatever number were available.  Set up the
3828  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3829  * are chained on the m_nextpkt field.  Any packets requested beyond this
3830  * are chained onto the last packet header's m_next field.  The size of
3831  * the cluster is controlled by the parameter bufsize.
3832  */
3833 __private_extern__ struct mbuf *
3834 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3835     int wait, int wantall, size_t bufsize)
3836 {
3837         struct mbuf *m;
3838         struct mbuf **np, *top;
3839         unsigned int pnum, needed = *num_needed;
3840         mcache_obj_t *mp_list = NULL;
3841         int mcflags = MSLEEPF(wait);
3842         u_int32_t flag;
3843         struct ext_ref *rfa;
3844         mcache_t *cp;
3845         void *cl;
3846
3847         ASSERT(bufsize == m_maxsize(MC_CL) ||
3848             bufsize == m_maxsize(MC_BIGCL) ||
3849             bufsize == m_maxsize(MC_16KCL));
3850
3851         /*
3852          * Caller must first check for njcl because this
3853          * routine is internal and not exposed/used via KPI.
3854          */
3855         VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3856
3857         top = NULL;
3858         np = &top;
3859         pnum = 0;
3860
3861         /*
3862          * The caller doesn't want all the requested buffers; only some.
3863          * Try hard to get what we can, but don't block.  This effectively
3864          * overrides MCR_SLEEP, since this thread will not go to sleep
3865          * if we can't get all the buffers.
3866          */
3867         if (!wantall || (mcflags & MCR_NOSLEEP))
3868                 mcflags |= MCR_TRYHARD;
3869
3870         /* Allocate the composite mbuf + cluster elements from the cache */
3871         if (bufsize == m_maxsize(MC_CL))
3872                 cp = m_cache(MC_MBUF_CL);
3873         else if (bufsize == m_maxsize(MC_BIGCL))
3874                 cp = m_cache(MC_MBUF_BIGCL);
3875         else
3876                 cp = m_cache(MC_MBUF_16KCL);
3877         needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3878
3879         for (pnum = 0; pnum < needed; pnum++) {
3880                 m = (struct mbuf *)mp_list;
3881                 mp_list = mp_list->obj_next;
3882
3883                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3884                 cl = m->m_ext.ext_buf;
3885                 rfa = MEXT_RFA(m);
3886
3887                 ASSERT(cl != NULL && rfa != NULL);
3888                 VERIFY(MBUF_IS_COMPOSITE(m));
3889
3890                 flag = MEXT_FLAGS(m);
3891
3892                 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3893                 if (bufsize == m_maxsize(MC_16KCL)) {
3894                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3895                 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3896                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3897                 } else {
3898                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
3899                 }
3900
3901                 if (num_with_pkthdrs > 0) {
3902                         --num_with_pkthdrs;
3903 #if CONFIG_MACF_NET
3904                         if (mac_mbuf_label_init(m, wait) != 0) {
3905                                 m_freem(m);
3906                                 break;
3907                         }
3908 #endif /* MAC_NET */
3909                 }
3910
3911                 *np = m;
3912                 if (num_with_pkthdrs > 0)
3913                         np = &m->m_nextpkt;
3914                 else
3915                         np = &m->m_next;
3916         }
3917         ASSERT(pnum != *num_needed || mp_list == NULL);
3918         if (mp_list != NULL)
3919                 mcache_free_ext(cp, mp_list);
3920
3921         if (pnum > 0) {
3922                 mtype_stat_add(MT_DATA, pnum);
3923                 mtype_stat_sub(MT_FREE, pnum);
3924         }
3925
3926         if (wantall && (pnum != *num_needed)) {
3927                 if (top != NULL)
3928                         m_freem_list(top);
3929                 return (NULL);
3930         }
3931
3932         if (pnum > *num_needed) {
3933                 printf("%s: File a radar related to <rdar://10146739>. \
3934                         needed = %u, pnum = %u, num_needed = %u \n",
3935                         __func__, needed, pnum, *num_needed);
3936         }
3937
3938         *num_needed = pnum;
3939         return (top);
3940 }
3941
3942 /*
3943  * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
3944  * wantall is not set, return whatever number were available.  The size of
3945  * each mbuf in the list is controlled by the parameter packetlen.  Each
3946  * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
3947  * in the chain is called a segment.  If maxsegments is not null and the
3948  * value pointed to is not null, this specify the maximum number of segments
3949  * for a chain of mbufs.  If maxsegments is zero or the value pointed to
3950  * is zero the caller does not have any restriction on the number of segments.
3951  * The actual  number of segments of a mbuf chain is return in the value
3952  * pointed to by maxsegments.
3953  */
3954 __private_extern__ struct mbuf *
3955 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3956     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3957 {
3958         struct mbuf **np, *top, *first = NULL;
3959         size_t bufsize, r_bufsize;
3960         unsigned int num = 0;
3961         unsigned int nsegs = 0;
3962         unsigned int needed, resid;
3963         int mcflags = MSLEEPF(wait);
3964         mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3965         mcache_t *cp = NULL, *rcp = NULL;
3966
3967         if (*numlist == 0)
3968                 return (NULL);
3969
3970         top = NULL;
3971         np = &top;
3972
3973         if (wantsize == 0) {
3974                 if (packetlen <= MINCLSIZE) {
3975                         bufsize = packetlen;
3976                 } else if (packetlen > m_maxsize(MC_CL)) {
3977                         /* Use 4KB if jumbo cluster pool isn't available */
3978                         if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3979                                 bufsize = m_maxsize(MC_BIGCL);
3980                         else
3981                                 bufsize = m_maxsize(MC_16KCL);
3982                 } else {
3983                         bufsize = m_maxsize(MC_CL);
3984                 }
3985         } else if (wantsize == m_maxsize(MC_CL) ||
3986             wantsize == m_maxsize(MC_BIGCL) ||
3987             (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3988                 bufsize = wantsize;
3989         } else {
3990                 return (NULL);
3991         }
3992
3993         if (bufsize <= MHLEN) {
3994                 nsegs = 1;
3995         } else if (bufsize <= MINCLSIZE) {
3996                 if (maxsegments != NULL && *maxsegments == 1) {
3997                         bufsize = m_maxsize(MC_CL);
3998                         nsegs = 1;
3999                 } else {
4000                         nsegs = 2;
4001                 }
4002         } else if (bufsize == m_maxsize(MC_16KCL)) {
4003                 VERIFY(njcl > 0);
4004                 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4005         } else if (bufsize == m_maxsize(MC_BIGCL)) {
4006                 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4007         } else {
4008                 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4009         }
4010         if (maxsegments != NULL) {
4011                 if (*maxsegments && nsegs > *maxsegments) {
4012                         *maxsegments = nsegs;
4013                         return (NULL);
4014                 }
4015                 *maxsegments = nsegs;
4016         }
4017
4018         /*
4019          * The caller doesn't want all the requested buffers; only some.
4020          * Try hard to get what we can, but don't block.  This effectively
4021          * overrides MCR_SLEEP, since this thread will not go to sleep
4022          * if we can't get all the buffers.
4023          */
4024         if (!wantall || (mcflags & MCR_NOSLEEP))
4025                 mcflags |= MCR_TRYHARD;
4026
4027         /*
4028          * Simple case where all elements in the lists/chains are mbufs.
4029          * Unless bufsize is greater than MHLEN, each segment chain is made
4030          * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
4031          * of 2 mbufs; the second one is used for the residual data, i.e.
4032          * the remaining data that cannot fit into the first mbuf.
4033          */
4034         if (bufsize <= MINCLSIZE) {
4035                 /* Allocate the elements in one shot from the mbuf cache */
4036                 ASSERT(bufsize <= MHLEN || nsegs == 2);
4037                 cp = m_cache(MC_MBUF);
4038                 needed = mcache_alloc_ext(cp, &mp_list,
4039                     (*numlist) * nsegs, mcflags);
4040
4041                 /*
4042                  * The number of elements must be even if we are to use an
4043                  * mbuf (instead of a cluster) to store the residual data.
4044                  * If we couldn't allocate the requested number of mbufs,
4045                  * trim the number down (if it's odd) in order to avoid
4046                  * creating a partial segment chain.
4047                  */
4048                 if (bufsize > MHLEN && (needed & 0x1))
4049                         needed--;
4050
4051                 while (num < needed) {
4052                         struct mbuf *m;
4053
4054                         m = (struct mbuf *)mp_list;
4055                         mp_list = mp_list->obj_next;
4056                         ASSERT(m != NULL);
4057
4058                         MBUF_INIT(m, 1, MT_DATA);
4059 #if CONFIG_MACF_NET
4060                         if (mac_init_mbuf(m, wait) != 0) {
4061                                 m_free(m);
4062                                 break;
4063                         }
4064 #endif /* MAC_NET */
4065                         num++;
4066                         if (bufsize > MHLEN) {
4067                                 /* A second mbuf for this segment chain */
4068                                 m->m_next = (struct mbuf *)mp_list;
4069                                 mp_list = mp_list->obj_next;
4070                                 ASSERT(m->m_next != NULL);
4071
4072                                 MBUF_INIT(m->m_next, 0, MT_DATA);
4073                                 num++;
4074                         }
4075                         *np = m;
4076                         np = &m->m_nextpkt;
4077                 }
4078                 ASSERT(num != *numlist || mp_list == NULL);
4079
4080                 if (num > 0) {
4081                         mtype_stat_add(MT_DATA, num);
4082                         mtype_stat_sub(MT_FREE, num);
4083                 }
4084                 num /= nsegs;
4085
4086                 /* We've got them all; return to caller */
4087                 if (num == *numlist)
4088                         return (top);
4089
4090                 goto fail;
4091         }
4092
4093         /*
4094          * Complex cases where elements are made up of one or more composite
4095          * mbufs + cluster, depending on packetlen.  Each N-segment chain can
4096          * be illustrated as follows:
4097          *
4098          * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4099          *
4100          * Every composite mbuf + cluster element comes from the intermediate
4101          * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
4102          * the last composite element will come from the MC_MBUF_CL cache,
4103          * unless the residual data is larger than 2KB where we use the
4104          * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4105          * data is defined as extra data beyond the first element that cannot
4106          * fit into the previous element, i.e. there is no residual data if
4107          * the chain only has 1 segment.
4108          */
4109         r_bufsize = bufsize;
4110         resid = packetlen > bufsize ? packetlen % bufsize : 0;
4111         if (resid > 0) {
4112                 /* There is residual data; figure out the cluster size */
4113                 if (wantsize == 0 && packetlen > MINCLSIZE) {
4114                         /*
4115                          * Caller didn't request that all of the segments
4116                          * in the chain use the same cluster size; use the
4117                          * smaller of the cluster sizes.
4118                          */
4119                         if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4120                                 r_bufsize = m_maxsize(MC_16KCL);
4121                         else if (resid > m_maxsize(MC_CL))
4122                                 r_bufsize = m_maxsize(MC_BIGCL);
4123                         else
4124                                 r_bufsize = m_maxsize(MC_CL);
4125                 } else {
4126                         /* Use the same cluster size as the other segments */
4127                         resid = 0;
4128                 }
4129         }
4130
4131         needed = *numlist;
4132         if (resid > 0) {
4133                 /*
4134                  * Attempt to allocate composite mbuf + cluster elements for
4135                  * the residual data in each chain; record the number of such
4136                  * elements that can be allocated so that we know how many
4137                  * segment chains we can afford to create.
4138                  */
4139                 if (r_bufsize <= m_maxsize(MC_CL))
4140                         rcp = m_cache(MC_MBUF_CL);
4141                 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4142                         rcp = m_cache(MC_MBUF_BIGCL);
4143                 else
4144                         rcp = m_cache(MC_MBUF_16KCL);
4145                 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4146
4147                 if (needed == 0)
4148                         goto fail;
4149
4150                 /* This is temporarily reduced for calculation */
4151                 ASSERT(nsegs > 1);
4152                 nsegs--;
4153         }
4154
4155         /*
4156          * Attempt to allocate the rest of the composite mbuf + cluster
4157          * elements for the number of segment chains that we need.
4158          */
4159         if (bufsize <= m_maxsize(MC_CL))
4160                 cp = m_cache(MC_MBUF_CL);
4161         else if (bufsize <= m_maxsize(MC_BIGCL))
4162                 cp = m_cache(MC_MBUF_BIGCL);
4163         else
4164                 cp = m_cache(MC_MBUF_16KCL);
4165         needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4166
4167         /* Round it down to avoid creating a partial segment chain */
4168         needed = (needed / nsegs) * nsegs;
4169         if (needed == 0)
4170                 goto fail;
4171
4172         if (resid > 0) {
4173                 /*
4174                  * We're about to construct the chain(s); take into account
4175                  * the number of segments we have created above to hold the
4176                  * residual data for each chain, as well as restore the
4177                  * original count of segments per chain.
4178                  */
4179                 ASSERT(nsegs > 0);
4180                 needed += needed / nsegs;
4181                 nsegs++;
4182         }
4183
4184         for (;;) {
4185                 struct mbuf *m;
4186                 u_int32_t flag;
4187                 struct ext_ref *rfa;
4188                 void *cl;
4189                 int pkthdr;
4190
4191                 ++num;
4192                 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4193                         m = (struct mbuf *)mp_list;
4194                         mp_list = mp_list->obj_next;
4195                 } else {
4196                         m = (struct mbuf *)rmp_list;
4197                         rmp_list = rmp_list->obj_next;
4198                 }
4199                 ASSERT(m != NULL);
4200                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4201                 VERIFY(m->m_ext.ext_free == NULL ||
4202                     m->m_ext.ext_free == m_bigfree ||
4203                     m->m_ext.ext_free == m_16kfree);
4204
4205                 cl = m->m_ext.ext_buf;
4206                 rfa = MEXT_RFA(m);
4207
4208                 ASSERT(cl != NULL && rfa != NULL);
4209                 VERIFY(MBUF_IS_COMPOSITE(m));
4210
4211                 flag = MEXT_FLAGS(m);
4212
4213                 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4214                 if (pkthdr)
4215                         first = m;
4216                 MBUF_INIT(m, pkthdr, MT_DATA);
4217                 if (m->m_ext.ext_free == m_16kfree) {
4218                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4219                 } else if (m->m_ext.ext_free == m_bigfree) {
4220                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4221                 } else {
4222                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4223                 }
4224 #if CONFIG_MACF_NET
4225                 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4226                         --num;
4227                         m_freem(m);
4228                         break;
4229                 }
4230 #endif /* MAC_NET */
4231
4232                 *np = m;
4233                 if ((num % nsegs) == 0)
4234                         np = &first->m_nextpkt;
4235                 else
4236                         np = &m->m_next;
4237
4238                 if (num == needed)
4239                         break;
4240         }
4241
4242         if (num > 0) {
4243                 mtype_stat_add(MT_DATA, num);
4244                 mtype_stat_sub(MT_FREE, num);
4245         }
4246
4247         num /= nsegs;
4248
4249         /* We've got them all; return to caller */
4250         if (num == *numlist) {
4251                 ASSERT(mp_list == NULL && rmp_list == NULL);
4252                 return (top);
4253         }
4254
4255 fail:
4256         /* Free up what's left of the above */
4257         if (mp_list != NULL)
4258                 mcache_free_ext(cp, mp_list);
4259         if (rmp_list != NULL)
4260                 mcache_free_ext(rcp, rmp_list);
4261         if (wantall && top != NULL) {
4262                 m_freem(top);
4263                 return (NULL);
4264         }
4265         *numlist = num;
4266         return (top);
4267 }
4268
4269 /*
4270  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4271  * packets on receive ring.
4272  */
4273 __private_extern__ struct mbuf *
4274 m_getpacket_how(int wait)
4275 {
4276         unsigned int num_needed = 1;
4277
4278         return (m_getpackets_internal(&num_needed, 1, wait, 1,
4279             m_maxsize(MC_CL)));
4280 }
4281
4282 /*
4283  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4284  * packets on receive ring.
4285  */
4286 struct mbuf *
4287 m_getpacket(void)
4288 {
4289         unsigned int num_needed = 1;
4290
4291         return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4292             m_maxsize(MC_CL)));
4293 }
4294
4295 /*
4296  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4297  * if this can't be met, return whatever number were available.  Set up the
4298  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4299  * are chained on the m_nextpkt field.  Any packets requested beyond this are
4300  * chained onto the last packet header's m_next field.
4301  */
4302 struct mbuf *
4303 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4304 {
4305         unsigned int n = num_needed;
4306
4307         return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4308             m_maxsize(MC_CL)));
4309 }
4310
4311 /*
4312  * Return a list of mbuf hdrs set up as packet hdrs chained together
4313  * on the m_nextpkt field
4314  */
4315 struct mbuf *
4316 m_getpackethdrs(int num_needed, int how)
4317 {
4318         struct mbuf *m;
4319         struct mbuf **np, *top;
4320
4321         top = NULL;
4322         np = &top;
4323
4324         while (num_needed--) {
4325                 m = _M_RETRYHDR(how, MT_DATA);
4326                 if (m == NULL)
4327                         break;
4328
4329                 *np = m;
4330                 np = &m->m_nextpkt;
4331         }
4332
4333         return (top);
4334 }
4335
4336 /*
4337  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4338  * for mbufs packets freed.  Used by the drivers.
4339  */
4340 int
4341 m_freem_list(struct mbuf *m)
4342 {
4343         struct mbuf *nextpkt;
4344         mcache_obj_t *mp_list = NULL;
4345         mcache_obj_t *mcl_list = NULL;
4346         mcache_obj_t *mbc_list = NULL;
4347         mcache_obj_t *m16k_list = NULL;
4348         mcache_obj_t *m_mcl_list = NULL;
4349         mcache_obj_t *m_mbc_list = NULL;
4350         mcache_obj_t *m_m16k_list = NULL;
4351         mcache_obj_t *ref_list = NULL;
4352         int pktcount = 0;
4353         int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4354
4355         while (m != NULL) {
4356                 pktcount++;
4357
4358                 nextpkt = m->m_nextpkt;
4359                 m->m_nextpkt = NULL;
4360
4361                 while (m != NULL) {
4362                         struct mbuf *next = m->m_next;
4363                         mcache_obj_t *o, *rfa;
4364                         u_int32_t refcnt, composite;
4365
4366                         if (m->m_type == MT_FREE)
4367                                 panic("m_free: freeing an already freed mbuf");
4368
4369                         if (m->m_type != MT_FREE)
4370                                 mt_free++;
4371
4372                         if (m->m_flags & M_PKTHDR) {
4373                                 /* Check for scratch area overflow */
4374                                 m_redzone_verify(m);
4375                                 /* Free the aux data and tags if there is any */
4376                                 m_tag_delete_chain(m, NULL);
4377                         }
4378
4379                         if (!(m->m_flags & M_EXT))
4380                                 goto simple_free;
4381
4382                         o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4383                         refcnt = m_decref(m);
4384                         composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4385                         if (refcnt == 0 && !composite) {
4386                                 if (m->m_ext.ext_free == NULL) {
4387                                         o->obj_next = mcl_list;
4388                                         mcl_list = o;
4389                                 } else if (m->m_ext.ext_free == m_bigfree) {
4390                                         o->obj_next = mbc_list;
4391                                         mbc_list = o;
4392                                 } else if (m->m_ext.ext_free == m_16kfree) {
4393                                         o->obj_next = m16k_list;
4394                                         m16k_list = o;
4395                                 } else {
4396                                         (*(m->m_ext.ext_free))((caddr_t)o,
4397                                             m->m_ext.ext_size,
4398                                             m->m_ext.ext_arg);
4399                                 }
4400                                 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4401                                 rfa->obj_next = ref_list;
4402                                 ref_list = rfa;
4403                                 MEXT_RFA(m) = NULL;
4404                         } else if (refcnt == 0 && composite) {
4405                                 VERIFY(m->m_type != MT_FREE);
4406                                 /*
4407                                  * Amortize the costs of atomic operations
4408                                  * by doing them at the end, if possible.
4409                                  */
4410                                 if (m->m_type == MT_DATA)
4411                                         mt_data++;
4412                                 else if (m->m_type == MT_HEADER)
4413                                         mt_header++;
4414                                 else if (m->m_type == MT_SONAME)
4415                                         mt_soname++;
4416                                 else if (m->m_type == MT_TAG)
4417                                         mt_tag++;
4418                                 else
4419                                         mtype_stat_dec(m->m_type);
4420
4421                                 m->m_type = MT_FREE;
4422                                 m->m_flags = M_EXT;
4423                                 m->m_len = 0;
4424                                 m->m_next = m->m_nextpkt = NULL;
4425
4426                                 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4427
4428                                 /* "Free" into the intermediate cache */
4429                                 o = (mcache_obj_t *)m;
4430                                 if (m->m_ext.ext_free == NULL) {
4431                                         o->obj_next = m_mcl_list;
4432                                         m_mcl_list = o;
4433                                 } else if (m->m_ext.ext_free == m_bigfree) {
4434                                         o->obj_next = m_mbc_list;
4435                                         m_mbc_list = o;
4436                                 } else {
4437                                         VERIFY(m->m_ext.ext_free == m_16kfree);
4438                                         o->obj_next = m_m16k_list;
4439                                         m_m16k_list = o;
4440                                 }
4441                                 m = next;
4442                                 continue;
4443                         }
4444 simple_free:
4445                         /*
4446                          * Amortize the costs of atomic operations
4447                          * by doing them at the end, if possible.
4448                          */
4449                         if (m->m_type == MT_DATA)
4450                                 mt_data++;
4451                         else if (m->m_type == MT_HEADER)
4452                                 mt_header++;
4453                         else if (m->m_type == MT_SONAME)
4454                                 mt_soname++;
4455                         else if (m->m_type == MT_TAG)
4456                                 mt_tag++;
4457                         else if (m->m_type != MT_FREE)
4458                                 mtype_stat_dec(m->m_type);
4459
4460                         m->m_type = MT_FREE;
4461                         m->m_flags = m->m_len = 0;
4462                         m->m_next = m->m_nextpkt = NULL;
4463
4464                         ((mcache_obj_t *)m)->obj_next = mp_list;
4465                         mp_list = (mcache_obj_t *)m;
4466
4467                         m = next;
4468                 }
4469
4470                 m = nextpkt;
4471         }
4472
4473         if (mt_free > 0)
4474                 mtype_stat_add(MT_FREE, mt_free);
4475         if (mt_data > 0)
4476                 mtype_stat_sub(MT_DATA, mt_data);
4477         if (mt_header > 0)
4478                 mtype_stat_sub(MT_HEADER, mt_header);
4479         if (mt_soname > 0)
4480                 mtype_stat_sub(MT_SONAME, mt_soname);
4481         if (mt_tag > 0)
4482                 mtype_stat_sub(MT_TAG, mt_tag);
4483
4484         if (mp_list != NULL)
4485                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4486         if (mcl_list != NULL)
4487                 mcache_free_ext(m_cache(MC_CL), mcl_list);
4488         if (mbc_list != NULL)
4489                 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4490         if (m16k_list != NULL)
4491                 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4492         if (m_mcl_list != NULL)
4493                 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4494         if (m_mbc_list != NULL)
4495                 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4496         if (m_m16k_list != NULL)
4497                 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4498         if (ref_list != NULL)
4499                 mcache_free_ext(ref_cache, ref_list);
4500
4501         return (pktcount);
4502 }
4503
4504 void
4505 m_freem(struct mbuf *m)
4506 {
4507         while (m != NULL)
4508                 m = m_free(m);
4509 }
4510
4511 /*
4512  * Mbuffer utility routines.
4513  */
4514
4515 /*
4516  * Compute the amount of space available before the current start
4517  * of data in an mbuf.
4518  */
4519 int
4520 m_leadingspace(struct mbuf *m)
4521 {
4522         if (m->m_flags & M_EXT) {
4523                 if (MCLHASREFERENCE(m))
4524                         return (0);
4525                 return (m->m_data - m->m_ext.ext_buf);
4526         }
4527         if (m->m_flags & M_PKTHDR)
4528                 return (m->m_data - m->m_pktdat);
4529         return (m->m_data - m->m_dat);
4530 }
4531
4532 /*
4533  * Compute the amount of space available after the end of data in an mbuf.
4534  */
4535 int
4536 m_trailingspace(struct mbuf *m)
4537 {
4538         if (m->m_flags & M_EXT) {
4539                 if (MCLHASREFERENCE(m))
4540                         return (0);
4541                 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4542                     (m->m_data + m->m_len));
4543         }
4544         return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4545 }
4546
4547 /*
4548  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4549  * copy junk along.  Does not adjust packet header length.
4550  */
4551 struct mbuf *
4552 m_prepend(struct mbuf *m, int len, int how)
4553 {
4554         struct mbuf *mn;
4555
4556         _MGET(mn, how, m->m_type);
4557         if (mn == NULL) {
4558                 m_freem(m);
4559                 return (NULL);
4560         }
4561         if (m->m_flags & M_PKTHDR) {
4562                 M_COPY_PKTHDR(mn, m);
4563                 m->m_flags &= ~M_PKTHDR;
4564         }
4565         mn->m_next = m;
4566         m = mn;
4567         if (m->m_flags & M_PKTHDR) {
4568                 VERIFY(len <= MHLEN);
4569                 MH_ALIGN(m, len);
4570         } else {
4571                 VERIFY(len <= MLEN);
4572                 M_ALIGN(m, len);
4573         }
4574         m->m_len = len;
4575         return (m);
4576 }
4577
4578 /*
4579  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4580  * chain, copy junk along, and adjust length.
4581  */
4582 struct mbuf *
4583 m_prepend_2(struct mbuf *m, int len, int how, int align)
4584 {
4585         if (M_LEADINGSPACE(m) >= len &&
4586             (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4587                 m->m_data -= len;
4588                 m->m_len += len;
4589         } else {
4590                 m = m_prepend(m, len, how);
4591         }
4592         if ((m) && (m->m_flags & M_PKTHDR))
4593                 m->m_pkthdr.len += len;
4594         return (m);
4595 }
4596
4597 /*
4598  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4599  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4600  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4601  */
4602 int MCFail;
4603
4604 struct mbuf *
4605 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4606 {
4607         struct mbuf *n, *mhdr = NULL, **np;
4608         int off = off0;
4609         struct mbuf *top;
4610         int copyhdr = 0;
4611
4612         if (off < 0 || len < 0)
4613                 panic("m_copym: invalid offset %d or len %d", off, len);
4614
4615         VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4616             mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4617
4618         if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4619             mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4620                 mhdr = m;
4621                 copyhdr = 1;
4622         }
4623
4624         while (off >= m->m_len) {
4625                 if (m->m_next == NULL)
4626                         panic("m_copym: invalid mbuf chain");
4627                 off -= m->m_len;
4628                 m = m->m_next;
4629         }
4630         np = &top;
4631         top = NULL;
4632
4633         while (len > 0) {
4634                 if (m == NULL) {
4635                         if (len != M_COPYALL)
4636                                 panic("m_copym: len != M_COPYALL");
4637                         break;
4638                 }
4639
4640                 if (copyhdr)
4641                         n = _M_RETRYHDR(wait, m->m_type);
4642                 else
4643                         n = _M_RETRY(wait, m->m_type);
4644                 *np = n;
4645
4646                 if (n == NULL)
4647                         goto nospace;
4648
4649                 if (copyhdr != 0) {
4650                         if ((mode == M_COPYM_MOVE_HDR) ||
4651                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4652                                 M_COPY_PKTHDR(n, mhdr);
4653                         } else if ((mode == M_COPYM_COPY_HDR) ||
4654                             (mode == M_COPYM_MUST_COPY_HDR)) {
4655                                 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4656                                         goto nospace;
4657                         }
4658                         if (len == M_COPYALL)
4659                                 n->m_pkthdr.len -= off0;
4660                         else
4661                                 n->m_pkthdr.len = len;
4662                         copyhdr = 0;
4663                         /*
4664                          * There is data to copy from the packet header mbuf
4665                          * if it is empty or it is before the starting offset
4666                          */
4667                         if (mhdr != m) {
4668                                 np = &n->m_next;
4669                                 continue;
4670                         }
4671                 }
4672                 n->m_len = MIN(len, (m->m_len - off));
4673                 if (m->m_flags & M_EXT) {
4674                         n->m_ext = m->m_ext;
4675                         m_incref(m);
4676                         n->m_data = m->m_data + off;
4677                         n->m_flags |= M_EXT;
4678                 } else {
4679                         /*
4680                          * Limit to the capacity of the destination
4681                          */
4682                         if (n->m_flags & M_PKTHDR)
4683                                 n->m_len = MIN(n->m_len, MHLEN);
4684                         else
4685                                 n->m_len = MIN(n->m_len, MLEN);
4686
4687                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4688                                 panic("%s n %p copy overflow",
4689                                         __func__, n);
4690
4691                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4692                             (unsigned)n->m_len);
4693                 }
4694                 if (len != M_COPYALL)
4695                         len -= n->m_len;
4696                 off = 0;
4697                 m = m->m_next;
4698                 np = &n->m_next;
4699         }
4700
4701         if (top == NULL)
4702                 MCFail++;
4703
4704         return (top);
4705 nospace:
4706
4707         m_freem(top);
4708         MCFail++;
4709         return (NULL);
4710 }
4711
4712
4713 struct mbuf *
4714 m_copym(struct mbuf *m, int off0, int len, int wait)
4715 {
4716         return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4717 }
4718
4719 /*
4720  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4721  * within this routine also, the last mbuf and offset accessed are passed
4722  * out and can be passed back in to avoid having to rescan the entire mbuf
4723  * list (normally hung off of the socket)
      *
      * m0, off0, len0: source chain, starting byte offset and byte count.
      * wait:           allocation behaviour; converted with MSLEEPF() for
      *                 the mcache allocator.
      * m_lastm, m_off: optional resume position.  If *m_lastm is non-NULL
      *                 on entry, copying starts at that mbuf and at offset
      *                 *m_off instead of rescanning from m0.  Once the
      *                 whole request has been copied, both are updated to
      *                 the position right after the last byte copied.
      * mode:           selects M_COPY_PKTHDR() (the *_MOVE_HDR modes) or
      *                 m_dup_pkthdr() (the *_COPY_HDR modes) for the
      *                 packet header.
      *
      * Returns the new chain, or NULL (with MCFail incremented) if the
      * mbufs could not be allocated or the header could not be duplicated.
4724  */
4725 struct mbuf *
4726 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
4727     struct mbuf **m_lastm, int *m_off, uint32_t mode)
4728 {
4729         struct mbuf *m = m0, *n, **np = NULL;
4730         int off = off0, len = len0;
4731         struct mbuf *top = NULL;
4732         int mcflags = MSLEEPF(wait);
4733         int copyhdr = 0;
4734         int type = 0;
4735         mcache_obj_t *list = NULL;
4736         int needed = 0;
4737
             /* A packet header is only carried over for a copy from offset 0. */
4738         if (off == 0 && (m->m_flags & M_PKTHDR))
4739                 copyhdr = 1;
4740
             /*
              * Resume from the position saved by an earlier call if one was
              * supplied; otherwise walk the chain to the mbuf that contains
              * the starting offset.
              * NOTE(review): *m_off is read here without a NULL check,
              * whereas m_off is NULL-checked before the write-back further
              * down -- confirm callers always pass both or neither.
              */
4741         if (m_lastm != NULL && *m_lastm != NULL) {
4742                 m = *m_lastm;
4743                 off = *m_off;
4744         } else {
4745                 while (off >= m->m_len) {
4746                         off -= m->m_len;
4747                         m = m->m_next;
4748                 }
4749         }
4750
             /*
              * Dry run over the source: count how many source mbufs the
              * requested range spans.  Only the first one is entered at a
              * non-zero offset.  The extra increment after the loop pays
              * for the additional mbuf that becomes "top" below.
              */
4751         n = m;
4752         while (len > 0) {
4753                 needed++;
4754                 ASSERT(n != NULL);
4755                 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4756                 n = n->m_next;
4757         }
4758         needed++;
4759         len = len0;
4760
4761         /*
4762          * If the caller doesn't want to be put to sleep, mark it with
4763          * MCR_TRYHARD so that we may reclaim buffers from other places
4764          * before giving up.
4765          */
4766         if (mcflags & MCR_NOSLEEP)
4767                 mcflags |= MCR_TRYHARD;
4768
             /* Obtain every mbuf in one request; a short count is a failure. */
4769         if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4770             mcflags) != needed)
4771                 goto nospace;
4772
             /*
              * From here on "needed" is reused to count the mbufs linked
              * behind "top"; it feeds the statistics update at the end.
              */
4773         needed = 0;
4774         while (len > 0) {
                     /* Take the next pre-allocated mbuf off the list. */
4775                 n = (struct mbuf *)list;
4776                 list = list->obj_next;
4777                 ASSERT(n != NULL && m != NULL);
4778
                     /*
                      * The first mbuf is initialised as MT_HEADER, later
                      * ones with the type of the source mbuf being copied.
                      * The second MBUF_INIT argument is non-zero only for
                      * the first mbuf (presumably selecting packet-header
                      * initialisation -- confirm against MBUF_INIT).
                      */
4779                 type = (top == NULL) ? MT_HEADER : m->m_type;
4780                 MBUF_INIT(n, (top == NULL), type);
4781 #if CONFIG_MACF_NET
4782                 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4783                         mtype_stat_inc(MT_HEADER);
4784                         mtype_stat_dec(MT_FREE);
4785                         m_free(n);
4786                         goto nospace;
4787                 }
4788 #endif /* MAC_NET */
4789
                     /*
                      * The first mbuf only becomes the head of the new
                      * chain; no data is placed in it on this pass and the
                      * source position is not advanced.
                      */
4790                 if (top == NULL) {
4791                         top = n;
4792                         np = &top->m_next;
4793                         continue;
4794                 } else {
4795                         needed++;
4796                         *np = n;
4797                 }
4798
                     /* Carry the packet header over once, as "mode" dictates. */
4799                 if (copyhdr) {
4800                         if ((mode == M_COPYM_MOVE_HDR) ||
4801                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4802                                 M_COPY_PKTHDR(n, m);
4803                         } else if ((mode == M_COPYM_COPY_HDR) ||
4804                             (mode == M_COPYM_MUST_COPY_HDR)) {
4805                                 if (m_dup_pkthdr(n, m, wait) == 0)
4806                                         goto nospace;
4807                         }
4808                         n->m_pkthdr.len = len;
4809                         copyhdr = 0;
4810                 }
4811                 n->m_len = MIN(len, (m->m_len - off));
4812
4813                 if (m->m_flags & M_EXT) {
                             /* Share the external storage instead of copying. */
4814                         n->m_ext = m->m_ext;
4815                         m_incref(m);
4816                         n->m_data = m->m_data + off;
4817                         n->m_flags |= M_EXT;
4818                 } else {
                             /* Refuse to write past the end of the new mbuf. */
4819                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4820                                 panic("%s n %p copy overflow",
4821                                         __func__, n);
4822
4823                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4824                             (unsigned)n->m_len);
4825                 }
4826                 len -= n->m_len;
4827
                     /*
                      * Request satisfied: report where the copy stopped so a
                      * later call can resume there -- either the start of
                      * the next source mbuf, or part-way into this one.
                      */
4828                 if (len == 0) {
4829                         if (m_lastm != NULL && m_off != NULL) {
4830                                 if ((off + n->m_len) == m->m_len) {
4831                                         *m_lastm = m->m_next;
4832                                         *m_off  = 0;
4833                                 } else {
4834                                         *m_lastm = m;
4835                                         *m_off  = off + n->m_len;
4836                                 }
4837                         }
4838                         break;
4839                 }
4840                 off = 0;
4841                 m = m->m_next;
4842                 np = &n->m_next;
4843         }
4844
             /*
              * Statistics: one MT_HEADER for "top", "needed" mbufs charged
              * to "type" as last assigned in the loop, and the same total
              * (needed + 1) taken off MT_FREE.
              */
4845         mtype_stat_inc(MT_HEADER);
4846         mtype_stat_add(type, needed);
4847         mtype_stat_sub(MT_FREE, needed + 1);
4848
4849         ASSERT(list == NULL);
4850         return (top);
4851
4852 nospace:
             /* Give back whatever is still on the list, then the partial chain. */
4853         if (list != NULL)
4854                 mcache_free_ext(m_cache(MC_MBUF), list);
4855         if (top != NULL)
4856                 m_freem(top);
4857         MCFail++;
4858         return (NULL);
4859 }
4860
4861 /*
4862  * Copy data from an mbuf chain starting "off" bytes from the beginning,
4863  * continuing for "len" bytes, into the indicated buffer.
4864  */
4865 void
4866 m_copydata(struct mbuf *m, int off, int len, void *vp)
4867 {
4868         unsigned count;
4869         char *cp = vp;
4870
4871         if (off < 0 || len < 0)
4872                 panic("m_copydata: invalid offset %d or len %d", off, len);
4873
4874         while (off > 0) {
4875                 if (m == NULL)
4876                         panic("m_copydata: invalid mbuf chain");
4877                 if (off < m->m_len)
4878                         break;
4879                 off -= m->m_len;
4880                 m = m->m_next;
4881         }
4882         while (len > 0) {
4883                 if (m == NULL)
4884                         panic("m_copydata: invalid mbuf chain");
4885                 count = MIN(m->m_len - off, len);
4886                 bcopy(MTOD(m, caddr_t) + off, cp, count);
4887                 len -= count;
4888                 cp += count;
4889                 off = 0;
4890                 m = m->m_next;
4891         }
4892 }
4893
4894 /*
4895  * Concatenate mbuf chain n to m.  Both chains must be of the same type
4896  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
4897  */
4898 void
4899 m_cat(struct mbuf *m, struct mbuf *n)
4900 {
4901         while (m->m_next)
4902                 m = m->m_next;
4903         while (n) {
4904                 if ((m->m_flags & M_EXT) ||
4905                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4906                         /* just join the two chains */
4907                         m->m_next = n;
4908                         return;
4909                 }
4910                 /* splat the data from one into the other */
4911                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4912                     (u_int)n->m_len);
4913                 m->m_len += n->m_len;
4914                 n = m_free(n);
4915         }
4916 }
4917
4918 void
4919 m_adj(struct mbuf *mp, int req_len)
4920 {
4921         int len = req_len;
4922         struct mbuf *m;
4923         int count;
4924
4925         if ((m = mp) == NULL)
4926                 return;
4927         if (len >= 0) {
4928                 /*
4929                  * Trim from head.
4930                  */
4931                 while (m != NULL && len > 0) {
4932                         if (m->m_len <= len) {
4933                                 len -= m->m_len;
4934                                 m->m_len = 0;
4935                                 m = m->m_next;
4936                         } else {
4937                                 m->m_len -= len;
4938                                 m->m_data += len;
4939                                 len = 0;
4940                         }
4941                 }
4942                 m = mp;
4943                 if (m->m_flags & M_PKTHDR)
4944                         m->m_pkthdr.len -= (req_len - len);
4945         } else {
4946                 /*
4947                  * Trim from tail.  Scan the mbuf chain,
4948                  * calculating its length and finding the last mbuf.
4949                  * If the adjustment only affects this mbuf, then just
4950                  * adjust and return.  Otherwise, rescan and truncate
4951                  * after the remaining size.
4952                  */
4953                 len = -len;
4954                 count = 0;
4955                 for (;;) {
4956                         count += m->m_len;
4957                         if (m->m_next == (struct mbuf *)0)
4958                                 break;
4959                         m = m->m_next;
4960                 }
4961                 if (m->m_len >= len) {
4962                         m->m_len -= len;
4963                         m = mp;
4964                         if (m->m_flags & M_PKTHDR)
4965                                 m->m_pkthdr.len -= len;
4966                         return;
4967                 }
4968                 count -= len;
4969                 if (count < 0)
4970                         count = 0;
4971                 /*
4972                  * Correct length for chain is "count".
4973                  * Find the mbuf with last data, adjust its length,
4974                  * and toss data from remaining mbufs on chain.
4975                  */
4976                 m = mp;
4977                 if (m->m_flags & M_PKTHDR)
4978                         m->m_pkthdr.len = count;
4979                 for (; m; m = m->m_next) {
4980                         if (m->m_len >= count) {
4981                                 m->m_len = count;
4982                                 break;
4983                         }
4984                         count -= m->m_len;
4985                 }
4986                 while ((m = m->m_next))
4987                         m->m_len = 0;
4988         }
4989 }
4990
4991 /*
4992  * Rearange an mbuf chain so that len bytes are contiguous
4993  * and in the data area of an mbuf (so that mtod and dtom
4994  * will work for a structure of size len).  Returns the resulting
4995  * mbuf chain on success, frees it and returns null on failure.
4996  * If there is room, it will add up to max_protohdr-len extra bytes to the
4997  * contiguous region in an attempt to avoid being called next time.
4998  */
4999 int MPFail;
5000
5001 struct mbuf *
5002 m_pullup(struct mbuf *n, int len)
5003 {
5004         struct mbuf *m;
5005         int count;
5006         int space;
5007
5008         /*
5009          * If first mbuf has no cluster, and has room for len bytes
5010          * without shifting current data, pullup into it,
5011          * otherwise allocate a new mbuf to prepend to the chain.
5012          */
5013         if ((n->m_flags & M_EXT) == 0 &&
5014             n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
5015                 if (n->m_len >= len)
5016                         return (n);
5017                 m = n;
5018                 n = n->m_next;
5019                 len -= m->m_len;
5020         } else {
5021                 if (len > MHLEN)
5022                         goto bad;
5023                 _MGET(m, M_DONTWAIT, n->m_type);
5024                 if (m == 0)
5025                         goto bad;
5026                 m->m_len = 0;
5027                 if (n->m_flags & M_PKTHDR) {
5028                         M_COPY_PKTHDR(m, n);
5029                         n->m_flags &= ~M_PKTHDR;
5030                 }
5031         }
5032         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5033         do {
5034                 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5035                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5036                     (unsigned)count);
5037                 len -= count;
5038                 m->m_len += count;
5039                 n->m_len -= count;
5040                 space -= count;
5041                 if (n->m_len)
5042                         n->m_data += count;
5043                 else
5044                         n = m_free(n);
5045         } while (len > 0 && n);
5046         if (len > 0) {
5047                 (void) m_free(m);
5048                 goto bad;
5049         }
5050         m->m_next = n;
5051         return (m);
5052 bad:
5053         m_freem(n);
5054         MPFail++;
5055         return (0);
5056 }
5057
5058 /*
5059  * Like m_pullup(), except a new mbuf is always allocated, and we allow
5060  * the amount of empty space before the data in the new mbuf to be specified
5061  * (in the event that the caller expects to prepend later).
5062  */
5063 __private_extern__ int MSFail = 0;
5064
5065 __private_extern__ struct mbuf *
5066 m_copyup(struct mbuf *n, int len, int dstoff)
5067 {
5068         struct mbuf *m;
5069         int count, space;
5070
5071         if (len > (MHLEN - dstoff))
5072                 goto bad;
5073         MGET(m, M_DONTWAIT, n->m_type);
5074         if (m == NULL)
5075                 goto bad;
5076         m->m_len = 0;
5077         if (n->m_flags & M_PKTHDR) {
5078                 m_copy_pkthdr(m, n);
5079                 n->m_flags &= ~M_PKTHDR;
5080         }
5081         m->m_data += dstoff;
5082         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5083         do {
5084                 count = min(min(max(len, max_protohdr), space), n->m_len);
5085                 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5086                     (unsigned)count);
5087                 len -= count;
5088                 m->m_len += count;
5089                 n->m_len -= count;
5090                 space -= count;
5091                 if (n->m_len)
5092                         n->m_data += count;
5093                 else
5094                         n = m_free(n);
5095         } while (len > 0 && n);
5096         if (len > 0) {
5097                 (void) m_free(m);
5098                 goto bad;
5099         }
5100         m->m_next = n;
5101         return (m);
5102 bad:
5103         m_freem(n);
5104         MSFail++;
5105         return (NULL);
5106 }
5107
/*
 * Cut the chain "m0" in two after its first len0 bytes and return the
 * second part; "m0" keeps the first part.  NULL is returned on failure,
 * in which case an attempt is made to leave the chain as it was.
 * This is m_split0() with packet header handling switched on.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
        struct mbuf *tail;

        tail = m_split0(m0, len0, wait, 1);
        return (tail);
}
5118
5119 static struct mbuf *
5120 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5121 {
5122         struct mbuf *m, *n;
5123         unsigned len = len0, remain;
5124
5125         for (m = m0; m && len > m->m_len; m = m->m_next)
5126                 len -= m->m_len;
5127         if (m == NULL)
5128                 return (NULL);
5129         remain = m->m_len - len;
5130         if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5131                 _MGETHDR(n, wait, m0->m_type);
5132                 if (n == NULL)
5133                         return (NULL);
5134                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5135                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5136                 m0->m_pkthdr.len = len0;
5137                 if (m->m_flags & M_EXT)
5138                         goto extpacket;
5139                 if (remain > MHLEN) {
5140                         /* m can't be the lead packet */
5141                         MH_ALIGN(n, 0);
5142                         n->m_next = m_split(m, len, wait);
5143                         if (n->m_next == NULL) {
5144                                 (void) m_free(n);
5145                                 return (NULL);
5146                         } else
5147                                 return (n);
5148                 } else
5149                         MH_ALIGN(n, remain);
5150         } else if (remain == 0) {
5151                 n = m->m_next;
5152                 m->m_next = NULL;
5153                 return (n);
5154         } else {
5155                 _MGET(n, wait, m->m_type);
5156                 if (n == NULL)
5157                         return (NULL);
5158                 M_ALIGN(n, remain);
5159         }
5160 extpacket:
5161         if (m->m_flags & M_EXT) {
5162                 n->m_flags |= M_EXT;
5163                 n->m_ext = m->m_ext;
5164                 m_incref(m);
5165                 n->m_data = m->m_data + len;
5166         } else {
5167                 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5168         }
5169         n->m_len = remain;
5170         m->m_len = len;
5171         n->m_next = m->m_next;
5172         m->m_next = NULL;
5173         return (n);
5174 }
5175
5176 /*
5177  * Routine to copy from device local memory into mbufs.
5178  */
5179 struct mbuf *
5180 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5181     void (*copy)(const void *, void *, size_t))
5182 {
5183         struct mbuf *m;
5184         struct mbuf *top = NULL, **mp = &top;
5185         int off = off0, len;
5186         char *cp;
5187         char *epkt;
5188
5189         cp = buf;
5190         epkt = cp + totlen;
5191         if (off) {
5192                 /*
5193                  * If 'off' is non-zero, packet is trailer-encapsulated,
5194                  * so we have to skip the type and length fields.
5195                  */
5196                 cp += off + 2 * sizeof (u_int16_t);
5197                 totlen -= 2 * sizeof (u_int16_t);
5198         }
5199         _MGETHDR(m, M_DONTWAIT, MT_DATA);
5200         if (m == NULL)
5201                 return (NULL);
5202         m->m_pkthdr.rcvif = ifp;
5203         m->m_pkthdr.len = totlen;
5204         m->m_len = MHLEN;
5205
5206         while (totlen > 0) {
5207                 if (top != NULL) {
5208                         _MGET(m, M_DONTWAIT, MT_DATA);
5209                         if (m == NULL) {
5210                                 m_freem(top);
5211                                 return (NULL);
5212                         }
5213                         m->m_len = MLEN;
5214                 }
5215                 len = MIN(totlen, epkt - cp);
5216                 if (len >= MINCLSIZE) {
5217                         MCLGET(m, M_DONTWAIT);
5218                         if (m->m_flags & M_EXT) {
5219                                 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5220                         } else {
5221                                 /* give up when it's out of cluster mbufs */
5222                                 if (top != NULL)
5223                                         m_freem(top);
5224                                 m_freem(m);
5225                                 return (NULL);
5226                         }
5227                 } else {
5228                         /*
5229                          * Place initial small packet/header at end of mbuf.
5230                          */
5231                         if (len < m->m_len) {
5232                                 if (top == NULL &&
5233                                     len + max_linkhdr <= m->m_len)
5234                                         m->m_data += max_linkhdr;
5235                                 m->m_len = len;
5236                         } else {
5237                                 len = m->m_len;
5238                         }
5239                 }
5240                 if (copy)
5241                         copy(cp, MTOD(m, caddr_t), (unsigned)len);
5242                 else
5243                         bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5244                 cp += len;
5245                 *mp = m;
5246                 mp = &m->m_next;
5247                 totlen -= len;
5248                 if (cp == epkt)
5249                         cp = buf;
5250         }
5251         return (top);
5252 }
5253
/*
 * Usage threshold, in percent, below which m_howmany() picks
 * MB_GROWTH_NORMAL instead of MB_GROWTH_AGGRESSIVE.  A definition
 * supplied before this point takes precedence over the default.
 */
#if !defined(MBUF_GROWTH_NORMAL_THRESH)
#define MBUF_GROWTH_NORMAL_THRESH 25
#endif /* !MBUF_GROWTH_NORMAL_THRESH */
5257
/*
 * Cluster freelist allocation check.
 *
 * Works out by how many buffers of size "bufsize" the pool should be grown
 * so that a request for "num" of them can be satisfied.  "bufsize" must be
 * the maximum size of either the MC_BIGCL or the MC_16KCL class.
 *
 * Returns 0 when no growth is wanted or when the corresponding limit
 * (nclusters for big clusters, njcl for 16K clusters) has already been
 * reached; otherwise the number of buffers to add, clamped to the class
 * maximum.  The caller must hold mbuf_mlock.
 */
static int
m_howmany(int num, size_t bufsize)
{
        /* i is the running answer; j holds the alternative estimates */
        int i = 0, j = 0;
        u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
        u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
        u_int32_t sumclusters, freeclusters;
        u_int32_t percent_pool, percent_kmem;
        u_int32_t mb_growth, mb_growth_thresh;

        VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
            bufsize == m_maxsize(MC_16KCL));

        lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

        /* Numbers in 2K cluster units */
        m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
        m_clusters = m_total(MC_CL);
        m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
        m_16kclusters = m_total(MC_16KCL);
        /* 16K clusters are accounted separately (against njcl) */
        sumclusters = m_mbclusters + m_clusters + m_bigclusters;

        m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
        m_clfree = m_infree(MC_CL);
        m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
        m_16kclfree = m_infree(MC_16KCL);
        freeclusters = m_mbfree + m_clfree + m_bigclfree;

        /* Bail if we've maxed out the mbuf memory map */
        if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
            (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
            (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
                return (0);
        }

        if (bufsize == m_maxsize(MC_BIGCL)) {
                /* Under minimum */
                if (m_bigclusters < m_minlimit(MC_BIGCL))
                        return (m_minlimit(MC_BIGCL) - m_bigclusters);

                /* share of the existing pool in use, and of the map used */
                percent_pool =
                    ((sumclusters - freeclusters) * 100) / sumclusters;
                percent_kmem = (sumclusters * 100) / nclusters;

                /*
                 * If a light/normal user, grow conservatively (75%)
                 * If a heavy user, grow aggressively (50%)
                 */
                if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
                        mb_growth = MB_GROWTH_NORMAL;
                else
                        mb_growth = MB_GROWTH_AGGRESSIVE;

                if (percent_kmem < 5) {
                        /* For initial allocations */
                        i = num;
                } else {
                        /* Return if >= MBIGCL_LOWAT clusters available */
                        if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
                            m_total(MC_BIGCL) >=
                            MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
                                return (0);

                        /* Ensure at least num clusters are accessible */
                        if (num >= m_infree(MC_BIGCL))
                                i = num - m_infree(MC_BIGCL);
                        if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
                                j = num - (m_total(MC_BIGCL) -
                                    m_minlimit(MC_BIGCL));

                        i = MAX(i, j);

                        /*
                         * Grow pool if percent_pool > 75 (normal growth)
                         * or percent_pool > 50 (aggressive growth).
                         */
                        mb_growth_thresh = 100 - (100 / (1 << mb_growth));
                        if (percent_pool > mb_growth_thresh)
                                j = ((sumclusters + num) >> mb_growth) -
                                    freeclusters;
                        i = MAX(i, j);
                }

                /* Check to ensure we didn't go over limits */
                if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
                        i = m_maxlimit(MC_BIGCL) - m_bigclusters;
                /*
                 * NOTE(review): (i << 1) presumably converts big clusters
                 * into the 2K units used by sumclusters/nclusters.
                 */
                if ((i << 1) + sumclusters >= nclusters)
                        i = (nclusters - sumclusters) >> 1;
                VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
                VERIFY(sumclusters + (i << 1) <= nclusters);

        } else { /* 16K CL */
                VERIFY(njcl > 0);
                /* Ensure at least num clusters are available */
                if (num >= m_16kclfree)
                        i = num - m_16kclfree;

                /* Always grow 16KCL pool aggressively */
                if (((m_16kclusters + num) >> 1) > m_16kclfree)
                        j = ((m_16kclusters + num) >> 1) - m_16kclfree;
                i = MAX(i, j);

                /* Check to ensure we don't go over limit */
                if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
                        i = m_maxlimit(MC_16KCL) - m_16kclusters;
                VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
        }
        return (i);
}
5370 /*
5371  * Return the number of bytes in the mbuf chain, m.
5372  */
5373 unsigned int
5374 m_length(struct mbuf *m)
5375 {
5376         struct mbuf *m0;
5377         unsigned int pktlen;
5378
5379         if (m->m_flags & M_PKTHDR)
5380                 return (m->m_pkthdr.len);
5381
5382         pktlen = 0;
5383         for (m0 = m; m0 != NULL; m0 = m0->m_next)
5384                 pktlen += m0->m_len;
5385         return (pktlen);
5386 }
5387
5388 /*
5389  * Copy data from a buffer back into the indicated mbuf chain,
5390  * starting "off" bytes from the beginning, extending the mbuf
5391  * chain if necessary.
5392  */
5393 void
5394 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5395 {
5396 #if DEBUG
5397         struct mbuf *origm = m0;
5398         int error;
5399 #endif /* DEBUG */
5400
5401         if (m0 == NULL)
5402                 return;
5403
5404 #if DEBUG
5405         error =
5406 #endif /* DEBUG */
5407         m_copyback0(&m0, off, len, cp,
5408             M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5409
5410 #if DEBUG
5411         if (error != 0 || (m0 != NULL && origm != m0))
5412                 panic("m_copyback");
5413 #endif /* DEBUG */
5414 }
5415
5416 struct mbuf *
5417 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5418 {
5419         int error;
5420
5421         /* don't support chain expansion */
5422         VERIFY(off + len <= m_length(m0));
5423
5424         error = m_copyback0(&m0, off, len, cp,
5425             M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5426         if (error) {
5427                 /*
5428                  * no way to recover from partial success.
5429                  * just free the chain.
5430                  */
5431                 m_freem(m0);
5432                 return (NULL);
5433         }
5434         return (m0);
5435 }
5436
5437 /*
5438  * m_makewritable: ensure the specified range writable.
5439  */
5440 int
5441 m_makewritable(struct mbuf **mp, int off, int len, int how)
5442 {
5443         int error;
5444 #if DEBUG
5445         struct mbuf *n;
5446         int origlen, reslen;
5447
5448         origlen = m_length(*mp);
5449 #endif /* DEBUG */
5450
5451 #if 0 /* M_COPYALL is large enough */
5452         if (len == M_COPYALL)
5453                 len = m_length(*mp) - off; /* XXX */
5454 #endif
5455
5456         error = m_copyback0(mp, off, len, NULL,
5457             M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5458
5459 #if DEBUG
5460         reslen = 0;
5461         for (n = *mp; n; n = n->m_next)
5462                 reslen += n->m_len;
5463         if (origlen != reslen)
5464                 panic("m_makewritable: length changed");
5465         if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5466                 panic("m_makewritable: inconsist");
5467 #endif /* DEBUG */
5468
5469         return (error);
5470 }
5471
/*
 * Common worker behind m_copyback(), m_copyback_cow() and m_makewritable().
 *
 * Walks the chain *mp0 to byte offset "off" and then processes "len" bytes
 * as directed by "flags":
 *   M_COPYBACK0_COPYBACK  copy the bytes from "vp" into the chain ("vp"
 *                         must be non-NULL);
 *   M_COPYBACK0_PRESERVE  keep the current contents when an mbuf is
 *                         replaced ("vp" must be NULL);
 *   M_COPYBACK0_COW       mbufs for which m_mclhasreference() is true are
 *                         replaced by newly allocated mbufs rather than
 *                         written in place;
 *   M_COPYBACK0_EXTEND    grow the chain, zero-filling any gap before
 *                         "off", when it is too short.
 * M_COPYBACK0_EXTEND and M_COPYBACK0_COW must not both be set.  "how" is
 * handed to the allocators.
 *
 * *mp0 is updated when the head mbuf is replaced.  Returns 0 on success,
 * and also when the chain ends early without M_COPYBACK0_EXTEND or when an
 * allocation fails while extending; returns ENOBUFS when splitting or
 * allocating fails on the copy-on-write path.
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
        int mlen;
        struct mbuf *m, *n;
        struct mbuf **mp;       /* link that points at the current mbuf */
        int totlen = 0;         /* bytes of chain accounted for so far */
        const char *cp = vp;

        VERIFY(mp0 != NULL);
        VERIFY(*mp0 != NULL);
        VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
        VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

        /*
         * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
         * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
         */

        VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

        mp = mp0;
        m = *mp;
        /* Phase 1: skip whole mbufs until "off" falls inside "m". */
        while (off > (mlen = m->m_len)) {
                off -= mlen;
                totlen += mlen;
                if (m->m_next == NULL) {
                        int tspace;
                        /* also entered from phase 2 when the chain runs out */
extend:
                        if (!(flags & M_COPYBACK0_EXTEND))
                                goto out;

                        /*
                         * try to make some space at the end of "m".
                         */

                        mlen = m->m_len;
                        if (off + len >= MINCLSIZE &&
                            !(m->m_flags & M_EXT) && m->m_len == 0) {
                                MCLGET(m, how);
                        }
                        tspace = M_TRAILINGSPACE(m);
                        if (tspace > 0) {
                                tspace = MIN(tspace, off + len);
                                VERIFY(tspace > 0);
                                /* zero only the gap that precedes "off" */
                                bzero(mtod(m, char *) + m->m_len,
                                    MIN(off, tspace));
                                m->m_len += tspace;
                                /* undo the skip so "m" is looked at again */
                                off += mlen;
                                totlen -= mlen;
                                continue;
                        }

                        /*
                         * need to allocate an mbuf.
                         */

                        if (off + len >= MINCLSIZE) {
                                n = m_getcl(how, m->m_type, 0);
                        } else {
                                n = _M_GET(how, m->m_type);
                        }
                        if (n == NULL) {
                                /* reported as success: returns 0 via "out" */
                                goto out;
                        }
                        n->m_len = 0;
                        n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
                        bzero(mtod(n, char *), MIN(n->m_len, off));
                        m->m_next = n;
                }
                mp = &m->m_next;
                m = m->m_next;
        }
        /* Phase 2: process "len" bytes starting "off" bytes into "m". */
        while (len > 0) {
                mlen = m->m_len - off;
                if (mlen != 0 && m_mclhasreference(m)) {
                        char *datap;
                        int eatlen;

                        /*
                         * this mbuf is read-only.
                         * allocate a new writable mbuf and try again.
                         */

#if DIAGNOSTIC
                        if (!(flags & M_COPYBACK0_COW))
                                panic("m_copyback0: read-only");
#endif /* DIAGNOSTIC */

                        /*
                         * if we're going to write into the middle of
                         * a mbuf, split it first.
                         */
                        if (off > 0 && len < mlen) {
                                n = m_split0(m, off, how, 0);
                                if (n == NULL)
                                        goto enobufs;
                                m->m_next = n;
                                mp = &m->m_next;
                                m = n;
                                off = 0;
                                continue;
                        }

                        /*
                         * XXX TODO coalesce into the trailingspace of
                         * the previous mbuf when possible.
                         */

                        /*
                         * allocate a new mbuf.  copy packet header if needed.
                         */
                        n = _M_GET(how, m->m_type);
                        if (n == NULL)
                                goto enobufs;
                        if (off == 0 && (m->m_flags & M_PKTHDR)) {
                                M_COPY_PKTHDR(n, m);
                                n->m_len = MHLEN;
                        } else {
                                /*
                                 * NOTE(review): the cluster is requested
                                 * with M_DONTWAIT here rather than "how".
                                 */
                                if (len >= MINCLSIZE)
                                        MCLGET(n, M_DONTWAIT);
                                n->m_len =
                                    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
                        }
                        if (n->m_len > len)
                                n->m_len = len;

                        /*
                         * free the region which has been overwritten.
                         * copying data from old mbufs if requested.
                         */
                        if (flags & M_COPYBACK0_PRESERVE)
                                datap = mtod(n, char *);
                        else
                                datap = NULL;
                        /* bytes of old chain data that "n" takes over */
                        eatlen = n->m_len;
                        VERIFY(off == 0 || eatlen >= mlen);
                        if (off > 0) {
                                VERIFY(len >= mlen);
                                /* keep the head of "m"; "n" takes its tail */
                                m->m_len = off;
                                m->m_next = n;
                                if (datap) {
                                        m_copydata(m, off, mlen, datap);
                                        datap += mlen;
                                }
                                eatlen -= mlen;
                                mp = &m->m_next;
                                m = m->m_next;
                        }
                        /* trim, and free once empty, the mbufs "n" covers */
                        while (m != NULL && m_mclhasreference(m) &&
                            n->m_type == m->m_type && eatlen > 0) {
                                mlen = MIN(eatlen, m->m_len);
                                if (datap) {
                                        m_copydata(m, 0, mlen, datap);
                                        datap += mlen;
                                }
                                m->m_data += mlen;
                                m->m_len -= mlen;
                                eatlen -= mlen;
                                if (m->m_len == 0)
                                        *mp = m = m_free(m);
                        }
                        /* give back whatever part of "n" was not needed */
                        if (eatlen > 0)
                                n->m_len -= eatlen;
                        n->m_next = m;
                        *mp = m = n;
                        continue;
                }
                mlen = MIN(mlen, len);
                if (flags & M_COPYBACK0_COPYBACK) {
                        bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
                        cp += mlen;
                }
                len -= mlen;
                mlen += off;
                off = 0;
                totlen += mlen;
                if (len == 0)
                        break;
                if (m->m_next == NULL) {
                        goto extend;
                }
                mp = &m->m_next;
                m = m->m_next;
        }
out:
        /* grow the recorded packet length if the chain was extended */
        if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
                VERIFY(flags & M_COPYBACK0_EXTEND);
                m->m_pkthdr.len = totlen;
        }

        return (0);

enobufs:
        return (ENOBUFS);
}
5669
5670 uint64_t
5671 mcl_to_paddr(char *addr)
5672 {
5673         vm_offset_t base_phys;
5674
5675         if (!MBUF_IN_MAP(addr))
5676                 return (0);
5677         base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5678
5679         if (base_phys == 0)
5680                 return (0);
5681         return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5682 }
5683
5684 /*
5685  * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5686  * And really copy the thing.  That way, we don't "precompute" checksums
5687  * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
5688  * small packets, don't dup into a cluster.  That way received  packets
5689  * don't take up too much room in the sockbuf (cf. sbspace()).
5690  */
5691 int MDFail;
5692
5693 struct mbuf *
5694 m_dup(struct mbuf *m, int how)
5695 {
5696         struct mbuf *n, **np;
5697         struct mbuf *top;
5698         int copyhdr = 0;
5699
5700         np = &top;
5701         top = NULL;
5702         if (m->m_flags & M_PKTHDR)
5703                 copyhdr = 1;
5704
5705         /*
5706          * Quick check: if we have one mbuf and its data fits in an
5707          *  mbuf with packet header, just copy and go.
5708          */
5709         if (m->m_next == NULL) {
5710                 /* Then just move the data into an mbuf and be done... */
5711                 if (copyhdr) {
5712                         if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5713                                 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5714                                         return (NULL);
5715                                 n->m_len = m->m_len;
5716                                 m_dup_pkthdr(n, m, how);
5717                                 bcopy(m->m_data, n->m_data, m->m_len);
5718                                 return (n);
5719                         }
5720                 } else if (m->m_len <= MLEN) {
5721                         if ((n = _M_GET(how, m->m_type)) == NULL)
5722                                 return (NULL);
5723                         bcopy(m->m_data, n->m_data, m->m_len);
5724                         n->m_len = m->m_len;
5725                         return (n);
5726                 }
5727         }
5728         while (m != NULL) {
5729 #if BLUE_DEBUG
5730                 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5731                     m->m_data);
5732 #endif
5733                 if (copyhdr)
5734                         n = _M_GETHDR(how, m->m_type);
5735                 else
5736                         n = _M_GET(how, m->m_type);
5737                 if (n == NULL)
5738                         goto nospace;
5739                 if (m->m_flags & M_EXT) {
5740                         if (m->m_len <= m_maxsize(MC_CL))
5741                                 MCLGET(n, how);
5742                         else if (m->m_len <= m_maxsize(MC_BIGCL))
5743                                 n = m_mbigget(n, how);
5744                         else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5745                                 n = m_m16kget(n, how);
5746                         if (!(n->m_flags & M_EXT)) {
5747                                 (void) m_free(n);
5748                                 goto nospace;
5749                         }
5750                 }
5751                 *np = n;
5752                 if (copyhdr) {
5753                         /* Don't use M_COPY_PKTHDR: preserve m_data */
5754                         m_dup_pkthdr(n, m, how);
5755                         copyhdr = 0;
5756                         if (!(n->m_flags & M_EXT))
5757                                 n->m_data = n->m_pktdat;
5758                 }
5759                 n->m_len = m->m_len;
5760                 /*
5761                  * Get the dup on the same bdry as the original
5762                  * Assume that the two mbufs have the same offset to data area
5763                  * (up to word boundaries)
5764                  */
5765                 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5766                 m = m->m_next;
5767                 np = &n->m_next;
5768 #if BLUE_DEBUG
5769                 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5770                     n->m_data);
5771 #endif
5772         }
5773
5774         if (top == NULL)
5775                 MDFail++;
5776         return (top);
5777
5778 nospace:
5779         m_freem(top);
5780         MDFail++;
5781         return (NULL);
5782 }
5783
/*
 * True when mbuf (m) has M_EXT set and its data crosses a page boundary:
 * either the data starts page-aligned and is longer than one page, or it
 * starts unaligned and runs past the next page boundary.  The argument is
 * evaluated more than once.
 */
#define MBUF_MULTIPAGES(m)                                              \
        (((m)->m_flags & M_EXT) &&                                      \
            ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&                   \
                (m)->m_len > PAGE_SIZE) ||                              \
            (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&                   \
                P2ROUNDUP((m)->m_data, PAGE_SIZE) <                     \
                ((uintptr_t)(m)->m_data + (m)->m_len))))
5790
5791 static struct mbuf *
5792 m_expand(struct mbuf *m, struct mbuf **last)
5793 {
5794         struct mbuf *top = NULL;
5795         struct mbuf **nm = &top;
5796         uintptr_t data0, data;
5797         unsigned int len0, len;
5798
5799         VERIFY(MBUF_MULTIPAGES(m));
5800         VERIFY(m->m_next == NULL);
5801         data0 = (uintptr_t)m->m_data;
5802         len0 = m->m_len;
5803         *last = top;
5804
5805         for (;;) {
5806                 struct mbuf *n;
5807
5808                 data = data0;
5809                 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
5810                         len = PAGE_SIZE;
5811                 else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
5812                     P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
5813                         len = P2ROUNDUP(data, PAGE_SIZE) - data;
5814                 else
5815                         len = len0;
5816
5817                 VERIFY(len > 0);
5818                 VERIFY(m->m_flags & M_EXT);
5819                 m->m_data = (void *)data;
5820                 m->m_len = len;
5821
5822                 *nm = *last = m;
5823                 nm = &m->m_next;
5824                 m->m_next = NULL;
5825
5826                 data0 += len;
5827                 len0 -= len;
5828                 if (len0 == 0)
5829                         break;
5830
5831                 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5832                 if (n == NULL) {
5833                         m_freem(top);
5834                         top = *last = NULL;
5835                         break;
5836                 }
5837
5838                 n->m_ext = m->m_ext;
5839                 m_incref(m);
5840                 n->m_flags |= M_EXT;
5841                 m = n;
5842         }
5843         return (top);
5844 }
5845
5846 struct mbuf *
5847 m_normalize(struct mbuf *m)
5848 {
5849         struct mbuf *top = NULL;
5850         struct mbuf **nm = &top;
5851         boolean_t expanded = FALSE;
5852
5853         while (m != NULL) {
5854                 struct mbuf *n;
5855
5856                 n = m->m_next;
5857                 m->m_next = NULL;
5858
5859                 /* Does the data cross one or more page boundaries? */
5860                 if (MBUF_MULTIPAGES(m)) {
5861                         struct mbuf *last;
5862                         if ((m = m_expand(m, &last)) == NULL) {
5863                                 m_freem(n);
5864                                 m_freem(top);
5865                                 top = NULL;
5866                                 break;
5867                         }
5868                         *nm = m;
5869                         nm = &last->m_next;
5870                         expanded = TRUE;
5871                 } else {
5872                         *nm = m;
5873                         nm = &m->m_next;
5874                 }
5875                 m = n;
5876         }
5877         if (expanded)
5878                 atomic_add_32(&mb_normalized, 1);
5879         return (top);
5880 }
5881
5882 /*
5883  * Append the specified data to the indicated mbuf chain,
5884  * Extend the mbuf chain if the new data does not fit in
5885  * existing space.
5886  *
5887  * Return 1 if able to complete the job; otherwise 0.
5888  */
5889 int
5890 m_append(struct mbuf *m0, int len, caddr_t cp)
5891 {
5892         struct mbuf *m, *n;
5893         int remainder, space;
5894
5895         for (m = m0; m->m_next != NULL; m = m->m_next)
5896                 ;
5897         remainder = len;
5898         space = M_TRAILINGSPACE(m);
5899         if (space > 0) {
5900                 /*
5901                  * Copy into available space.
5902                  */
5903                 if (space > remainder)
5904                         space = remainder;
5905                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5906                 m->m_len += space;
5907                 cp += space, remainder -= space;
5908         }
5909         while (remainder > 0) {
5910                 /*
5911                  * Allocate a new mbuf; could check space
5912                  * and allocate a cluster instead.
5913                  */
5914                 n = m_get(M_WAITOK, m->m_type);
5915                 if (n == NULL)
5916                         break;
5917                 n->m_len = min(MLEN, remainder);
5918                 bcopy(cp, mtod(n, caddr_t), n->m_len);
5919                 cp += n->m_len;
5920                 remainder -= n->m_len;
5921                 m->m_next = n;
5922                 m = n;
5923         }
5924         if (m0->m_flags & M_PKTHDR)
5925                 m0->m_pkthdr.len += len - remainder;
5926         return (remainder == 0);
5927 }
5928
5929 struct mbuf *
5930 m_last(struct mbuf *m)
5931 {
5932         while (m->m_next != NULL)
5933                 m = m->m_next;
5934         return (m);
5935 }
5936
5937 unsigned int
5938 m_fixhdr(struct mbuf *m0)
5939 {
5940         u_int len;
5941
5942         VERIFY(m0->m_flags & M_PKTHDR);
5943
5944         len = m_length2(m0, NULL);
5945         m0->m_pkthdr.len = len;
5946         return (len);
5947 }
5948
5949 unsigned int
5950 m_length2(struct mbuf *m0, struct mbuf **last)
5951 {
5952         struct mbuf *m;
5953         u_int len;
5954
5955         len = 0;
5956         for (m = m0; m != NULL; m = m->m_next) {
5957                 len += m->m_len;
5958                 if (m->m_next == NULL)
5959                         break;
5960         }
5961         if (last != NULL)
5962                 *last = m;
5963         return (len);
5964 }
5965
5966 /*
5967  * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5968  * and clusters.  If allocation fails and this cannot be completed, NULL will
5969  * be returned, but the passed in chain will be unchanged.  Upon success,
5970  * the original chain will be freed, and the new chain will be returned.
5971  *
5972  * If a non-packet header is passed in, the original mbuf (chain?) will
5973  * be returned unharmed.
5974  *
5975  * If offset is specfied, the first mbuf in the chain will have a leading
5976  * space of the amount stated by the "off" parameter.
5977  *
5978  * This routine requires that the m_pkthdr.header field of the original
5979  * mbuf chain is cleared by the caller.
5980  */
5981 struct mbuf *
5982 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5983 {
5984         struct mbuf *m_new = NULL, *m_final = NULL;
5985         int progress = 0, length, pktlen;
5986
5987         if (!(m0->m_flags & M_PKTHDR))
5988                 return (m0);
5989
5990         VERIFY(off < MHLEN);
5991         m_fixhdr(m0); /* Needed sanity check */
5992
5993         pktlen = m0->m_pkthdr.len + off;
5994         if (pktlen > MHLEN)
5995                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5996         else
5997                 m_final = m_gethdr(how, MT_DATA);
5998
5999         if (m_final == NULL)
6000                 goto nospace;
6001
6002         if (off > 0) {
6003                 pktlen -= off;
6004                 m_final->m_data += off;
6005         }
6006
6007         /*
6008          * Caller must have handled the contents pointed to by this
6009          * pointer before coming here, as otherwise it will point to
6010          * the original mbuf which will get freed upon success.
6011          */
6012         VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6013
6014         if (m_dup_pkthdr(m_final, m0, how) == 0)
6015                 goto nospace;
6016
6017         m_new = m_final;
6018
6019         while (progress < pktlen) {
6020                 length = pktlen - progress;
6021                 if (length > MCLBYTES)
6022                         length = MCLBYTES;
6023                 length -= ((m_new == m_final) ? off : 0);
6024
6025                 if (m_new == NULL) {
6026                         if (length > MLEN)
6027                                 m_new = m_getcl(how, MT_DATA, 0);
6028                         else
6029                                 m_new = m_get(how, MT_DATA);
6030                         if (m_new == NULL)
6031                                 goto nospace;
6032                 }
6033
6034                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6035                 progress += length;
6036                 m_new->m_len = length;
6037                 if (m_new != m_final)
6038                         m_cat(m_final, m_new);
6039                 m_new = NULL;
6040         }
6041         m_freem(m0);
6042         m0 = m_final;
6043         return (m0);
6044 nospace:
6045         if (m_final)
6046                 m_freem(m_final);
6047         return (NULL);
6048 }
6049
6050 struct mbuf *
6051 m_defrag(struct mbuf *m0, int how)
6052 {
6053         return (m_defrag_offset(m0, 0, how));
6054 }
6055
6056 void
6057 m_mchtype(struct mbuf *m, int t)
6058 {
6059         mtype_stat_inc(t);
6060         mtype_stat_dec(m->m_type);
6061         (m)->m_type = t;
6062 }
6063
6064 void *
6065 m_mtod(struct mbuf *m)
6066 {
6067         return (MTOD(m, void *));
6068 }
6069
6070 struct mbuf *
6071 m_dtom(void *x)
6072 {
6073         return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6074 }
6075
/*
 * Function form of the _MCHECK() macro, applied to mbuf m.
 */
void
m_mcheck(struct mbuf *m)
{
        _MCHECK(m);
        return;
}
6081
6082 /*
6083  * Return a pointer to mbuf/offset of location in mbuf chain.
6084  */
6085 struct mbuf *
6086 m_getptr(struct mbuf *m, int loc, int *off)
6087 {
6088
6089         while (loc >= 0) {
6090                 /* Normal end of search. */
6091                 if (m->m_len > loc) {
6092                         *off = loc;
6093                         return (m);
6094                 } else {
6095                         loc -= m->m_len;
6096                         if (m->m_next == NULL) {
6097                                 if (loc == 0) {
6098                                         /* Point at the end of valid data. */
6099                                         *off = m->m_len;
6100                                         return (m);
6101                                 }
6102                                 return (NULL);
6103                         }
6104                         m = m->m_next;
6105                 }
6106         }
6107         return (NULL);
6108 }
6109
6110 /*
6111  * Inform the corresponding mcache(s) that there's a waiter below.
6112  */
6113 static void
6114 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6115 {
6116         mcache_waiter_inc(m_cache(class));
6117         if (comp) {
6118                 if (class == MC_CL) {
6119                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6120                 } else if (class == MC_BIGCL) {
6121                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6122                 } else if (class == MC_16KCL) {
6123                         mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6124                 } else {
6125                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6126                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6127                 }
6128         }
6129 }
6130
6131 /*
6132  * Inform the corresponding mcache(s) that there's no more waiter below.
6133  */
6134 static void
6135 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6136 {
6137         mcache_waiter_dec(m_cache(class));
6138         if (comp) {
6139                 if (class == MC_CL) {
6140                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6141                 } else if (class == MC_BIGCL) {
6142                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6143                 } else if (class == MC_16KCL) {
6144                         mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6145                 } else {
6146                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6147                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6148                 }
6149         }
6150 }
6151
6152 /*
6153  * Called during slab (blocking and non-blocking) allocation.  If there
6154  * is at least one waiter, and the time since the first waiter is blocked
6155  * is greater than the watchdog timeout, panic the system.
6156  */
6157 static void
6158 mbuf_watchdog(void)
6159 {
6160         struct timeval now;
6161         unsigned int since;
6162
6163         if (mb_waiters == 0 || !mb_watchdog)
6164                 return;
6165
6166         microuptime(&now);
6167         since = now.tv_sec - mb_wdtstart.tv_sec;
6168         if (since >= MB_WDT_MAXTIME) {
6169                 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6170                     mb_waiters, since, mbuf_dump());
6171                 /* NOTREACHED */
6172         }
6173 }
6174
6175 /*
6176  * Called during blocking allocation.  Returns TRUE if one or more objects
6177  * are available at the per-CPU caches layer and that allocation should be
6178  * retried at that level.
6179  */
6180 static boolean_t
6181 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6182 {
6183         boolean_t mcache_retry = FALSE;
6184
6185         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6186
6187         /* Check if there's anything at the cache layer */
6188         if (mbuf_cached_above(class, wait)) {
6189                 mcache_retry = TRUE;
6190                 goto done;
6191         }
6192
6193         /* Nothing?  Then try hard to get it from somewhere */
6194         m_reclaim(class, num, (wait & MCR_COMP));
6195
6196         /* We tried hard and got something? */
6197         if (m_infree(class) > 0) {
6198                 mbstat.m_wait++;
6199                 goto done;
6200         } else if (mbuf_cached_above(class, wait)) {
6201                 mbstat.m_wait++;
6202                 mcache_retry = TRUE;
6203                 goto done;
6204         } else if (wait & MCR_TRYHARD) {
6205                 mcache_retry = TRUE;
6206                 goto done;
6207         }
6208
6209         /*
6210          * There's really nothing for us right now; inform the
6211          * cache(s) that there is a waiter below and go to sleep.
6212          */
6213         mbuf_waiter_inc(class, (wait & MCR_COMP));
6214
6215         VERIFY(!(wait & MCR_NOSLEEP));
6216
6217         /*
6218          * If this is the first waiter, arm the watchdog timer.  Otherwise
6219          * check if we need to panic the system due to watchdog timeout.
6220          */
6221         if (mb_waiters == 0)
6222                 microuptime(&mb_wdtstart);
6223         else
6224                 mbuf_watchdog();
6225
6226         mb_waiters++;
6227         (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6228
6229         /* We are now up; stop getting notified until next round */
6230         mbuf_waiter_dec(class, (wait & MCR_COMP));
6231
6232         /* We waited and got something */
6233         if (m_infree(class) > 0) {
6234                 mbstat.m_wait++;
6235                 goto done;
6236         } else if (mbuf_cached_above(class, wait)) {
6237                 mbstat.m_wait++;
6238                 mcache_retry = TRUE;
6239         }
6240 done:
6241         return (mcache_retry);
6242 }
6243
6244 static void
6245 mbuf_worker_thread(void)
6246 {
6247         int mbuf_expand;
6248
6249         while (1) {
6250                 lck_mtx_lock(mbuf_mlock);
6251
6252                 mbuf_expand = 0;
6253                 if (mbuf_expand_mcl) {
6254                         int n;
6255
6256                         /* Adjust to current number of cluster in use */
6257                         n = mbuf_expand_mcl -
6258                             (m_total(MC_CL) - m_infree(MC_CL));
6259                         if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6260                                 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6261                         mbuf_expand_mcl = 0;
6262
6263                         if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6264                                 mbuf_expand++;
6265                 }
6266                 if (mbuf_expand_big) {
6267                         int n;
6268
6269                         /* Adjust to current number of 4 KB cluster in use */
6270                         n = mbuf_expand_big -
6271                             (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6272                         if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6273                                 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6274                         mbuf_expand_big = 0;
6275
6276                         if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6277                                 mbuf_expand++;
6278                 }
6279                 if (mbuf_expand_16k) {
6280                         int n;
6281
6282                         /* Adjust to current number of 16 KB cluster in use */
6283                         n = mbuf_expand_16k -
6284                             (m_total(MC_16KCL) - m_infree(MC_16KCL));
6285                         if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6286                                 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6287                         mbuf_expand_16k = 0;
6288
6289                         if (n > 0)
6290                                 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6291                 }
6292
6293                 /*
6294                  * Because we can run out of memory before filling the mbuf
6295                  * map, we should not allocate more clusters than they are
6296                  * mbufs -- otherwise we could have a large number of useless
6297                  * clusters allocated.
6298                  */
6299                 if (mbuf_expand) {
6300                         while (m_total(MC_MBUF) <
6301                             (m_total(MC_BIGCL) + m_total(MC_CL))) {
6302                                 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6303                                         break;
6304                         }
6305                 }
6306
6307                 lck_mtx_unlock(mbuf_mlock);
6308
6309                 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6310                 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6311         }
6312 }
6313
6314 static void
6315 mbuf_worker_thread_init(void)
6316 {
6317         mbuf_worker_ready++;
6318         mbuf_worker_thread();
6319 }
6320
6321 static mcl_slab_t *
6322 slab_get(void *buf)
6323 {
6324         mcl_slabg_t *slg;
6325         unsigned int ix, k;
6326
6327         lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6328
6329         VERIFY(MBUF_IN_MAP(buf));
6330         ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6331         VERIFY(ix < maxslabgrp);
6332
6333         if ((slg = slabstbl[ix]) == NULL) {
6334                 /*
6335                  * In the current implementation, we never shrink the slabs
6336                  * table; if we attempt to reallocate a cluster group when
6337                  * it's already allocated, panic since this is a sign of a
6338                  * memory corruption (slabstbl[ix] got nullified).
6339                  */
6340                 ++slabgrp;
6341                 VERIFY(ix < slabgrp);
6342                 /*
6343                  * Slabs expansion can only be done single threaded; when
6344                  * we get here, it must be as a result of m_clalloc() which
6345                  * is serialized and therefore mb_clalloc_busy must be set.
6346                  */
6347                 VERIFY(mb_clalloc_busy);
6348                 lck_mtx_unlock(mbuf_mlock);
6349
6350                 /* This is a new buffer; create the slabs group for it */
6351                 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6352                     M_WAITOK | M_ZERO);
6353                 MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
6354                     M_TEMP, M_WAITOK | M_ZERO);
6355                 VERIFY(slg != NULL && slg->slg_slab != NULL);
6356
6357                 lck_mtx_lock(mbuf_mlock);
6358                 /*
6359                  * No other thread could have gone into m_clalloc() after
6360                  * we dropped the lock above, so verify that it's true.
6361                  */
6362                 VERIFY(mb_clalloc_busy);
6363
6364                 slabstbl[ix] = slg;
6365
6366                 /* Chain each slab in the group to its forward neighbor */
6367                 for (k = 1; k < NSLABSPMB; k++)
6368                         slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6369                 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6370
6371                 /* And chain the last slab in the previous group to this */
6372                 if (ix > 0) {
6373                         VERIFY(slabstbl[ix - 1]->
6374                             slg_slab[NSLABSPMB - 1].sl_next == NULL);
6375                         slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6376                             &slg->slg_slab[0];
6377                 }
6378         }
6379
6380         ix = MTOPG(buf) % NSLABSPMB;
6381         VERIFY(ix < NSLABSPMB);
6382
6383         return (&slg->slg_slab[ix]);
6384 }
6385
6386 static void
6387 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6388     void *base, void *head, unsigned int len, int refcnt, int chunks)
6389 {
6390         sp->sl_class = class;
6391         sp->sl_flags = flags;
6392         sp->sl_base = base;
6393         sp->sl_head = head;
6394         sp->sl_len = len;
6395         sp->sl_refcnt = refcnt;
6396         sp->sl_chunks = chunks;
6397         slab_detach(sp);
6398 }
6399
6400 static void
6401 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6402 {
6403         VERIFY(slab_is_detached(sp));
6404         m_slab_cnt(class)++;
6405         TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6406         sp->sl_flags &= ~SLF_DETACHED;
6407
6408         /*
6409          * If a buffer spans multiple contiguous pages then mark them as
6410          * detached too
6411          */
6412         if (class == MC_16KCL) {
6413                 int k;
6414                 for (k = 1; k < NSLABSP16KB; k++) {
6415                         sp = sp->sl_next;
6416                         /* Next slab must already be present */
6417                         VERIFY(sp != NULL && slab_is_detached(sp));
6418                         sp->sl_flags &= ~SLF_DETACHED;
6419                 }
6420         }
6421 }
6422
6423 static void
6424 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6425 {
6426         int k;
6427         VERIFY(!slab_is_detached(sp));
6428         VERIFY(m_slab_cnt(class) > 0);
6429         m_slab_cnt(class)--;
6430         TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6431         slab_detach(sp);
6432         if (class == MC_16KCL) {
6433                 for (k = 1; k < NSLABSP16KB; k++) {
6434                         sp = sp->sl_next;
6435                         /* Next slab must already be present */
6436                         VERIFY(sp != NULL);
6437                         VERIFY(!slab_is_detached(sp));
6438                         slab_detach(sp);
6439                 }
6440         }
6441 }
6442
6443 static boolean_t
6444 slab_inrange(mcl_slab_t *sp, void *buf)
6445 {
6446         return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6447             (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6448 }
6449
6450 #undef panic
6451
6452 static void
6453 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6454 {
6455         int i;
6456         unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6457         uintptr_t buf = (uintptr_t)sp->sl_base;
6458
6459         for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6460                 void *next = ((mcache_obj_t *)buf)->obj_next;
6461                 if (next != addr)
6462                         continue;
6463                 if (!mclverify) {
6464                         if (next != NULL && !MBUF_IN_MAP(next)) {
6465                                 mcache_t *cp = m_cache(sp->sl_class);
6466                                 panic("%s: %s buffer %p in slab %p modified "
6467                                     "after free at offset 0: %p out of range "
6468                                     "[%p-%p)\n", __func__, cp->mc_name,
6469                                     (void *)buf, sp, next, mbutl, embutl);
6470                                 /* NOTREACHED */
6471                         }
6472                 } else {
6473                         mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6474                             (mcache_obj_t *)buf);
6475                         mcl_audit_verify_nextptr(next, mca);
6476                 }
6477         }
6478 }
6479
6480 static void
6481 slab_detach(mcl_slab_t *sp)
6482 {
6483         sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6484         sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6485         sp->sl_flags |= SLF_DETACHED;
6486 }
6487
6488 static boolean_t
6489 slab_is_detached(mcl_slab_t *sp)
6490 {
6491         return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6492             (intptr_t)sp->sl_link.tqe_prev == -1 &&
6493             (sp->sl_flags & SLF_DETACHED));
6494 }
6495
6496 static void
6497 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6498     mcache_obj_t **con_list, size_t con_size, unsigned int num)
6499 {
6500         mcache_audit_t *mca, *mca_tail;
6501         mcache_obj_t *con = NULL;
6502         boolean_t save_contents = (con_list != NULL);
6503         unsigned int i, ix;
6504
6505         ASSERT(num <= NMBPG);
6506         ASSERT(con_list == NULL || con_size != 0);
6507
6508         ix = MTOPG(buf);
6509         VERIFY(ix < maxclaudit);
6510
6511         /* Make sure we haven't been here before */
6512         for (i = 0; i < NMBPG; i++)
6513                 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6514
6515         mca = mca_tail = *mca_list;
6516         if (save_contents)
6517                 con = *con_list;
6518
6519         for (i = 0; i < num; i++) {
6520                 mcache_audit_t *next;
6521
6522                 next = mca->mca_next;
6523                 bzero(mca, sizeof (*mca));
6524                 mca->mca_next = next;
6525                 mclaudit[ix].cl_audit[i] = mca;
6526
6527                 /* Attach the contents buffer if requested */
6528                 if (save_contents) {
6529                         mcl_saved_contents_t *msc =
6530                             (mcl_saved_contents_t *)(void *)con;
6531
6532                         VERIFY(msc != NULL);
6533                         VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6534                         VERIFY(con_size == sizeof (*msc));
6535                         mca->mca_contents_size = con_size;
6536                         mca->mca_contents = msc;
6537                         con = con->obj_next;
6538                         bzero(mca->mca_contents, mca->mca_contents_size);
6539                 }
6540
6541                 mca_tail = mca;
6542                 mca = mca->mca_next;
6543         }
6544
6545         if (save_contents)
6546                 *con_list = con;
6547
6548         *mca_list = mca_tail->mca_next;
6549         mca_tail->mca_next = NULL;
6550 }
6551
6552 static void
6553 mcl_audit_free(void *buf, unsigned int num)
6554 {
6555         unsigned int i, ix;
6556         mcache_audit_t *mca, *mca_list;
6557
6558         ix = MTOPG(buf);
6559         VERIFY(ix < maxclaudit);
6560
6561         if (mclaudit[ix].cl_audit[0] != NULL) {
6562                 mca_list = mclaudit[ix].cl_audit[0];
6563                 for (i = 0; i < num; i++) {
6564                         mca = mclaudit[ix].cl_audit[i];
6565                         mclaudit[ix].cl_audit[i] = NULL;
6566                         if (mca->mca_contents)
6567                                 mcache_free(mcl_audit_con_cache,
6568                                     mca->mca_contents);
6569                 }
6570                 mcache_free_ext(mcache_audit_cache,
6571                     (mcache_obj_t *)mca_list);
6572         }
6573 }
6574
6575 /*
6576  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6577  * the corresponding audit structure for that buffer.
6578  */
6579 static mcache_audit_t *
6580 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
6581 {
6582         mcache_audit_t *mca = NULL;
6583         int ix = MTOPG(mobj), m_idx = 0;
6584         unsigned char *page_addr;
6585
6586         VERIFY(ix < maxclaudit);
6587         VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
6588
6589         page_addr = PGTOM(ix);
6590
6591         switch (class) {
6592         case MC_MBUF:
6593                 /*
6594                  * For the mbuf case, find the index of the page
6595                  * used by the mbuf and use that index to locate the
6596                  * base address of the page.  Then find out the
6597                  * mbuf index relative to the page base and use
6598                  * it to locate the audit structure.
6599                  */
6600                 m_idx = MBPAGEIDX(page_addr, mobj);
6601                 VERIFY(m_idx < (int)NMBPG);
6602                 mca = mclaudit[ix].cl_audit[m_idx];
6603                 break;
6604
6605         case MC_CL:
6606                 /*
6607                  * Same thing as above, but for 2KB clusters in a page.
6608                  */
6609                 m_idx = CLPAGEIDX(page_addr, mobj);
6610                 VERIFY(m_idx < (int)NCLPG);
6611                 mca = mclaudit[ix].cl_audit[m_idx];
6612                 break;
6613
6614         case MC_BIGCL:
6615                 m_idx = BCLPAGEIDX(page_addr, mobj);
6616                 VERIFY(m_idx < (int)NBCLPG);
6617                 mca = mclaudit[ix].cl_audit[m_idx];
6618                 break;
6619         case MC_16KCL:
6620                 /*
6621                  * Same as above, but only return the first element.
6622                  */
6623                 mca = mclaudit[ix].cl_audit[0];
6624                 break;
6625
6626         default:
6627                 VERIFY(0);
6628                 /* NOTREACHED */
6629         }
6630
6631         return (mca);
6632 }
6633
6634 static void
6635 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6636     boolean_t alloc)
6637 {
6638         struct mbuf *m = addr;
6639         mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6640
6641         VERIFY(mca->mca_contents != NULL &&
6642             mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6643
6644         if (mclverify)
6645                 mcl_audit_verify_nextptr(next, mca);
6646
6647         if (!alloc) {
6648                 /* Save constructed mbuf fields */
6649                 mcl_audit_save_mbuf(m, mca);
6650                 if (mclverify) {
6651                         mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6652                             m_maxsize(MC_MBUF));
6653                 }
6654                 ((mcache_obj_t *)m)->obj_next = next;
6655                 return;
6656         }
6657
6658         /* Check if the buffer has been corrupted while in freelist */
6659         if (mclverify) {
6660                 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6661         }
6662         /* Restore constructed mbuf fields */
6663         mcl_audit_restore_mbuf(m, mca, composite);
6664 }
6665
6666 static void
6667 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6668 {
6669         struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6670
6671         if (composite) {
6672                 struct mbuf *next = m->m_next;
6673                 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6674                     MBUF_IS_COMPOSITE(ms));
6675                 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6676                 /*
6677                  * We could have hand-picked the mbuf fields and restore
6678                  * them individually, but that will be a maintenance
6679                  * headache.  Instead, restore everything that was saved;
6680                  * the mbuf layer will recheck and reinitialize anyway.
6681                  */
6682                 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6683                 m->m_next = next;
6684         } else {
6685                 /*
6686                  * For a regular mbuf (no cluster attached) there's nothing
6687                  * to restore other than the type field, which is expected
6688                  * to be MT_FREE.
6689                  */
6690                 m->m_type = ms->m_type;
6691         }
6692         _MCHECK(m);
6693 }
6694
6695 static void
6696 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6697 {
6698         VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6699         _MCHECK(m);
6700         bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
6701 }
6702
6703 static void
6704 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6705     boolean_t save_next)
6706 {
6707         mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6708
6709         if (!alloc) {
6710                 if (mclverify) {
6711                         mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6712                 }
6713                 if (save_next) {
6714                         mcl_audit_verify_nextptr(next, mca);
6715                         ((mcache_obj_t *)addr)->obj_next = next;
6716                 }
6717         } else if (mclverify) {
6718                 /* Check if the buffer has been corrupted while in freelist */
6719                 mcl_audit_verify_nextptr(next, mca);
6720                 mcache_audit_free_verify_set(mca, addr, 0, size);
6721         }
6722 }
6723
6724 static void
6725 mcl_audit_scratch(mcache_audit_t *mca)
6726 {
6727         void *stack[MCACHE_STACK_DEPTH + 1];
6728         mcl_scratch_audit_t *msa;
6729         struct timeval now;
6730
6731         VERIFY(mca->mca_contents != NULL);
6732         msa = MCA_SAVED_SCRATCH_PTR(mca);
6733
6734         msa->msa_pthread = msa->msa_thread;
6735         msa->msa_thread = current_thread();
6736         bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6737         msa->msa_pdepth = msa->msa_depth;
6738         bzero(stack, sizeof (stack));
6739         msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
6740         bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
6741
6742         msa->msa_ptstamp = msa->msa_tstamp;
6743         microuptime(&now);
6744         /* tstamp is in ms relative to base_ts */
6745         msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6746         if ((now.tv_sec - mb_start.tv_sec) > 0)
6747                 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6748 }
6749
6750 static void
6751 mcl_audit_mcheck_panic(struct mbuf *m)
6752 {
6753         mcache_audit_t *mca;
6754
6755         MRANGE(m);
6756         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6757
6758         panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6759             m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6760         /* NOTREACHED */
6761 }
6762
6763 static void
6764 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6765 {
6766         if (next != NULL && !MBUF_IN_MAP(next) &&
6767             (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6768                 panic("mcl_audit: buffer %p modified after free at offset 0: "
6769                     "%p out of range [%p-%p)\n%s\n",
6770                     mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6771                 /* NOTREACHED */
6772         }
6773 }
6774
6775 /* This function turns on mbuf leak detection */
6776 static void
6777 mleak_activate(void)
6778 {
6779         mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6780         PE_parse_boot_argn("mleak_sample_factor",
6781             &mleak_table.mleak_sample_factor,
6782             sizeof (mleak_table.mleak_sample_factor));
6783
6784         if (mleak_table.mleak_sample_factor == 0)
6785                 mclfindleak = 0;
6786
6787         if (mclfindleak == 0)
6788                 return;
6789
6790         vm_size_t alloc_size =
6791             mleak_alloc_buckets * sizeof (struct mallocation);
6792         vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6793
6794         MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6795             M_TEMP, M_WAITOK | M_ZERO);
6796         VERIFY(mleak_allocations != NULL);
6797
6798         MALLOC(mleak_traces, struct mtrace *, trace_size,
6799             M_TEMP, M_WAITOK | M_ZERO);
6800         VERIFY(mleak_traces != NULL);
6801
6802         MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6803             M_TEMP, M_WAITOK | M_ZERO);
6804         VERIFY(mleak_stat != NULL);
6805         mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6806 #ifdef __LP64__
6807         mleak_stat->ml_isaddr64 = 1;
6808 #endif /* __LP64__ */
6809 }
6810
6811 static void
6812 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6813 {
6814         int temp;
6815
6816         if (mclfindleak == 0)
6817                 return;
6818
6819         if (!alloc)
6820                 return (mleak_free(addr));
6821
6822         temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6823
6824         if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6825                 uintptr_t bt[MLEAK_STACK_DEPTH];
6826                 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6827                 mleak_log(bt, addr, logged, num);
6828         }
6829 }
6830
6831 /*
6832  * This function records the allocation in the mleak_allocations table
6833  * and the backtrace in the mleak_traces table; if allocation slot is in use,
6834  * replace old allocation with new one if the trace slot is in use, return
6835  * (or increment refcount if same trace).
6836  */
6837 static boolean_t
6838 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6839 {
6840         struct mallocation *allocation;
6841         struct mtrace *trace;
6842         uint32_t trace_index;
6843
6844         /* Quit if someone else modifying the tables */
6845         if (!lck_mtx_try_lock_spin(mleak_lock)) {
6846                 mleak_table.total_conflicts++;
6847                 return (FALSE);
6848         }
6849
6850         allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6851             mleak_alloc_buckets)];
6852         trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6853         trace = &mleak_traces[trace_index];
6854
6855         VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6856         VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6857
6858         allocation->hitcount++;
6859         trace->hitcount++;
6860
6861         /*
6862          * If the allocation bucket we want is occupied
6863          * and the occupier has the same trace, just bail.
6864          */
6865         if (allocation->element != NULL &&
6866             trace_index == allocation->trace_index) {
6867                 mleak_table.alloc_collisions++;
6868                 lck_mtx_unlock(mleak_lock);
6869                 return (TRUE);
6870         }
6871
6872         /*
6873          * Store the backtrace in the traces array;
6874          * Size of zero = trace bucket is free.
6875          */
6876         if (trace->allocs > 0 &&
6877             bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6878                 /* Different, unique trace, but the same hash! Bail out. */
6879                 trace->collisions++;
6880                 mleak_table.trace_collisions++;
6881                 lck_mtx_unlock(mleak_lock);
6882                 return (TRUE);
6883         } else if (trace->allocs > 0) {
6884                 /* Same trace, already added, so increment refcount */
6885                 trace->allocs++;
6886         } else {
6887                 /* Found an unused trace bucket, so record the trace here */
6888                 if (trace->depth != 0) {
6889                         /* this slot previously used but not currently in use */
6890                         mleak_table.trace_overwrites++;
6891                 }
6892                 mleak_table.trace_recorded++;
6893                 trace->allocs = 1;
6894                 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6895                 trace->depth = depth;
6896                 trace->collisions = 0;
6897         }
6898
6899         /* Step 2: Store the allocation record in the allocations array */
6900         if (allocation->element != NULL) {
6901                 /*
6902                  * Replace an existing allocation.  No need to preserve
6903                  * because only a subset of the allocations are being
6904                  * recorded anyway.
6905                  */
6906                 mleak_table.alloc_collisions++;
6907         } else if (allocation->trace_index != 0) {
6908                 mleak_table.alloc_overwrites++;
6909         }
6910         allocation->element = addr;
6911         allocation->trace_index = trace_index;
6912         allocation->count = num;
6913         mleak_table.alloc_recorded++;
6914         mleak_table.outstanding_allocs++;
6915
6916         lck_mtx_unlock(mleak_lock);
6917         return (TRUE);
6918 }
6919
6920 static void
6921 mleak_free(mcache_obj_t *addr)
6922 {
6923         while (addr != NULL) {
6924                 struct mallocation *allocation = &mleak_allocations
6925                     [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6926
6927                 if (allocation->element == addr &&
6928                     allocation->trace_index < mleak_trace_buckets) {
6929                         lck_mtx_lock_spin(mleak_lock);
6930                         if (allocation->element == addr &&
6931                             allocation->trace_index < mleak_trace_buckets) {
6932                                 struct mtrace *trace;
6933                                 trace = &mleak_traces[allocation->trace_index];
6934                                 /* allocs = 0 means trace bucket is unused */
6935                                 if (trace->allocs > 0)
6936                                         trace->allocs--;
6937                                 if (trace->allocs == 0)
6938                                         trace->depth = 0;
6939                                 /* NULL element means alloc bucket is unused */
6940                                 allocation->element = NULL;
6941                                 mleak_table.outstanding_allocs--;
6942                         }
6943                         lck_mtx_unlock(mleak_lock);
6944                 }
6945                 addr = addr->obj_next;
6946         }
6947 }
6948
6949 static void
6950 mleak_sort_traces()
6951 {
6952         int i, j, k;
6953         struct mtrace *swap;
6954
6955         for(i = 0; i < MLEAK_NUM_TRACES; i++)
6956                 mleak_top_trace[i] = NULL;
6957
6958         for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6959         {
6960                 if (mleak_traces[i].allocs <= 0)
6961                         continue;
6962
6963                 mleak_top_trace[j] = &mleak_traces[i];
6964                 for (k = j; k > 0; k--) {
6965                         if (mleak_top_trace[k]->allocs <=
6966                             mleak_top_trace[k-1]->allocs)
6967                                 break;
6968
6969                         swap = mleak_top_trace[k-1];
6970                         mleak_top_trace[k-1] = mleak_top_trace[k];
6971                         mleak_top_trace[k] = swap;
6972                 }
6973                 j++;
6974         }
6975
6976         j--;
6977         for(; i < mleak_trace_buckets; i++) {
6978                 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6979                         continue;
6980
6981                 mleak_top_trace[j] = &mleak_traces[i];
6982
6983                 for (k = j; k > 0; k--) {
6984                         if (mleak_top_trace[k]->allocs <=
6985                             mleak_top_trace[k-1]->allocs)
6986                                 break;
6987
6988                         swap = mleak_top_trace[k-1];
6989                         mleak_top_trace[k-1] = mleak_top_trace[k];
6990                         mleak_top_trace[k] = swap;
6991                 }
6992         }
6993 }
6994
6995 static void
6996 mleak_update_stats()
6997 {
6998         mleak_trace_stat_t *mltr;
6999         int i;
7000
7001         VERIFY(mleak_stat != NULL);
7002 #ifdef __LP64__
7003         VERIFY(mleak_stat->ml_isaddr64);
7004 #else
7005         VERIFY(!mleak_stat->ml_isaddr64);
7006 #endif /* !__LP64__ */
7007         VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7008
7009         mleak_sort_traces();
7010
7011         mltr = &mleak_stat->ml_trace[0];
7012         bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
7013         for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7014         int j;
7015
7016                 if (mleak_top_trace[i] == NULL ||
7017                     mleak_top_trace[i]->allocs == 0)
7018                         continue;
7019
7020                 mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
7021                 mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
7022                 mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
7023                 mltr->mltr_depth        = mleak_top_trace[i]->depth;
7024
7025                 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7026                 for (j = 0; j < mltr->mltr_depth; j++)
7027                         mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7028
7029                 mltr++;
7030         }
7031 }
7032
7033 static struct mbtypes {
7034         int             mt_type;
7035         const char      *mt_name;
7036 } mbtypes[] = {
7037         { MT_DATA,      "data" },
7038         { MT_OOBDATA,   "oob data" },
7039         { MT_CONTROL,   "ancillary data" },
7040         { MT_HEADER,    "packet headers" },
7041         { MT_SOCKET,    "socket structures" },
7042         { MT_PCB,       "protocol control blocks" },
7043         { MT_RTABLE,    "routing table entries" },
7044         { MT_HTABLE,    "IMP host table entries" },
7045         { MT_ATABLE,    "address resolution tables" },
7046         { MT_FTABLE,    "fragment reassembly queue headers" },
7047         { MT_SONAME,    "socket names and addresses" },
7048         { MT_SOOPTS,    "socket options" },
7049         { MT_RIGHTS,    "access rights" },
7050         { MT_IFADDR,    "interface addresses" },
7051         { MT_TAG,       "packet tags" },
7052         { 0,            NULL }
7053 };
7054
7055 #define MBUF_DUMP_BUF_CHK() {   \
7056         clen -= k;              \
7057         if (clen < 1)           \
7058                 goto done;      \
7059         c += k;                 \
7060 }
7061
7062 static char *
7063 mbuf_dump(void)
7064 {
7065         unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
7066         u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
7067         u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
7068         u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
7069         int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7070         uint8_t seen[256];
7071         struct mbtypes *mp;
7072         mb_class_stat_t *sp;
7073         mleak_trace_stat_t *mltr;
7074         char *c = mbuf_dump_buf;
7075         int i, k, clen = MBUF_DUMP_BUF_SIZE;
7076
7077         mbuf_dump_buf[0] = '\0';
7078
7079         /* synchronize all statistics in the mbuf table */
7080         mbuf_stat_sync();
7081         mbuf_mtypes_sync(TRUE);
7082
7083         sp = &mb_stat->mbs_class[0];
7084         for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7085                 u_int32_t mem;
7086
7087                 if (m_class(i) == MC_MBUF) {
7088                         m_mbufs = sp->mbcl_active;
7089                 } else if (m_class(i) == MC_CL) {
7090                         m_clfree = sp->mbcl_total - sp->mbcl_active;
7091                 } else if (m_class(i) == MC_BIGCL) {
7092                         m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7093                 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7094                         m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7095                         m_16kclusters = sp->mbcl_total;
7096                 } else if (m_class(i) == MC_MBUF_CL) {
7097                         m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7098                 } else if (m_class(i) == MC_MBUF_BIGCL) {
7099                         m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7100                 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7101                         m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7102                 }
7103
7104                 mem = sp->mbcl_ctotal * sp->mbcl_size;
7105                 totmem += mem;
7106                 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7107                     sp->mbcl_size;
7108
7109         }
7110
7111         /* adjust free counts to include composite caches */
7112         m_clfree += m_mbufclfree;
7113         m_bigclfree += m_mbufbigclfree;
7114         m_16kclfree += m_mbuf16kclfree;
7115
7116         totmbufs = 0;
7117         for (mp = mbtypes; mp->mt_name != NULL; mp++)
7118                 totmbufs += mbstat.m_mtypes[mp->mt_type];
7119         if (totmbufs > m_mbufs)
7120                 totmbufs = m_mbufs;
7121         k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7122         MBUF_DUMP_BUF_CHK();
7123
7124         bzero(&seen, sizeof (seen));
7125         for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7126                 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7127                         seen[mp->mt_type] = 1;
7128                         k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7129                             mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7130                         MBUF_DUMP_BUF_CHK();
7131                 }
7132         }
7133         seen[MT_FREE] = 1;
7134         for (i = 0; i < nmbtypes; i++)
7135                 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7136                         k = snprintf(c, clen, "\t%u mbufs allocated to "
7137                             "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7138                         MBUF_DUMP_BUF_CHK();
7139                 }
7140         if ((m_mbufs - totmbufs) > 0) {
7141                 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7142                     m_mbufs - totmbufs);
7143                 MBUF_DUMP_BUF_CHK();
7144         }
7145         k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7146             "%u/%u mbuf 4KB clusters in use\n",
7147             (unsigned int)(mbstat.m_clusters - m_clfree),
7148             (unsigned int)mbstat.m_clusters,
7149             (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7150             (unsigned int)mbstat.m_bigclusters);
7151         MBUF_DUMP_BUF_CHK();
7152
7153         if (njcl > 0) {
7154                 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7155                     m_16kclusters - m_16kclfree, m_16kclusters,
7156                     njclbytes / 1024);
7157                 MBUF_DUMP_BUF_CHK();
7158         }
7159         totused = totmem - totfree;
7160         if (totmem == 0) {
7161                 totpct = 0;
7162         } else if (totused < (ULONG_MAX / 100)) {
7163                 totpct = (totused * 100) / totmem;
7164         } else {
7165                 u_long totmem1 = totmem / 100;
7166                 u_long totused1 = totused / 100;
7167                 totpct = (totused1 * 100) / totmem1;
7168         }
7169         k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7170             "in use)\n", totmem / 1024, totpct);
7171         MBUF_DUMP_BUF_CHK();
7172
7173         /* mbuf leak detection statistics */
7174         mleak_update_stats();
7175
7176         k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7177         MBUF_DUMP_BUF_CHK();
7178         k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7179             mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7180             mleak_table.mleak_sample_factor);
7181         MBUF_DUMP_BUF_CHK();
7182         k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7183             mleak_table.outstanding_allocs);
7184         MBUF_DUMP_BUF_CHK();
7185         k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7186             mleak_table.alloc_recorded, mleak_table.trace_recorded);
7187         MBUF_DUMP_BUF_CHK();
7188         k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7189             mleak_table.alloc_collisions, mleak_table.trace_collisions);
7190         MBUF_DUMP_BUF_CHK();
7191         k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7192             mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7193         MBUF_DUMP_BUF_CHK();
7194         k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7195             mleak_table.total_conflicts);
7196         MBUF_DUMP_BUF_CHK();
7197
7198         k = snprintf(c, clen, "top %d outstanding traces:\n",
7199             mleak_stat->ml_cnt);
7200         MBUF_DUMP_BUF_CHK();
7201         for (i = 0; i < mleak_stat->ml_cnt; i++) {
7202                 mltr = &mleak_stat->ml_trace[i];
7203                 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7204                     "%llu hit(s), %llu collision(s)\n", (i + 1),
7205                     mltr->mltr_allocs, mltr->mltr_hitcount,
7206                     mltr->mltr_collisions);
7207                 MBUF_DUMP_BUF_CHK();
7208         }
7209
7210         if (mleak_stat->ml_isaddr64)
7211                 k = snprintf(c, clen, MB_LEAK_HDR_64);
7212         else
7213                 k = snprintf(c, clen, MB_LEAK_HDR_32);
7214         MBUF_DUMP_BUF_CHK();
7215
7216         for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7217                 int j;
7218                 k = snprintf(c, clen, "%2d: ", (i + 1));
7219                 MBUF_DUMP_BUF_CHK();
7220                 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7221                         mltr = &mleak_stat->ml_trace[j];
7222                         if (i < mltr->mltr_depth) {
7223                                 if (mleak_stat->ml_isaddr64) {
7224                                         k = snprintf(c, clen, "0x%0llx  ",
7225                                             (uint64_t)VM_KERNEL_UNSLIDE(
7226                                                 mltr->mltr_addr[i]));
7227                                 } else {
7228                                         k = snprintf(c, clen,
7229                                             "0x%08x  ",
7230                                             (uint32_t)VM_KERNEL_UNSLIDE(
7231                                                 mltr->mltr_addr[i]));
7232                                 }
7233                         } else {
7234                                 if (mleak_stat->ml_isaddr64)
7235                                         k = snprintf(c, clen,
7236                                             MB_LEAK_SPACING_64);
7237                                 else
7238                                         k = snprintf(c, clen,
7239                                             MB_LEAK_SPACING_32);
7240                         }
7241                         MBUF_DUMP_BUF_CHK();
7242                 }
7243                 k = snprintf(c, clen, "\n");
7244                 MBUF_DUMP_BUF_CHK();
7245         }
7246 done:
7247         return (mbuf_dump_buf);
7248 }
7249
7250 #undef MBUF_DUMP_BUF_CHK
7251
7252 /*
7253  * Convert between a regular and a packet header mbuf.  Caller is responsible
7254  * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7255  */
7256 int
7257 m_reinit(struct mbuf *m, int hdr)
7258 {
7259         int ret = 0;
7260
7261         if (hdr) {
7262                 VERIFY(!(m->m_flags & M_PKTHDR));
7263                 if (!(m->m_flags & M_EXT) &&
7264                     (m->m_data != m->m_dat || m->m_len > 0)) {
7265                         /*
7266                          * If there's no external cluster attached and the
7267                          * mbuf appears to contain user data, we cannot
7268                          * safely convert this to a packet header mbuf,
7269                          * as the packet header structure might overlap
7270                          * with the data.
7271                          */
7272                         printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7273                             "m_data %llx (expected %llx), "
7274                             "m_len %d (expected 0)\n",
7275                             __func__,
7276                             (uint64_t)VM_KERNEL_ADDRPERM(m),
7277                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7278                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7279                         ret = EBUSY;
7280                 } else {
7281                         VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7282                         m->m_flags |= M_PKTHDR;
7283                         MBUF_INIT_PKTHDR(m);
7284                 }
7285         } else {
7286                 /* Check for scratch area overflow */
7287                 m_redzone_verify(m);
7288                 /* Free the aux data and tags if there is any */
7289                 m_tag_delete_chain(m, NULL);
7290                 m->m_flags &= ~M_PKTHDR;
7291         }
7292
7293         return (ret);
7294 }
7295
7296 void
7297 m_scratch_init(struct mbuf *m)
7298 {
7299         struct pkthdr *pkt = &m->m_pkthdr;
7300
7301         VERIFY(m->m_flags & M_PKTHDR);
7302
7303         /* See comments in <rdar://problem/14040693> */
7304         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7305                 panic_plain("Invalid attempt to modify guarded module-private "
7306                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7307                 /* NOTREACHED */
7308         }
7309
7310         bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7311 }
7312
7313 /*
7314  * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7315  * xnu that intend on utilizing the module-private area should directly
7316  * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
7317  * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7318  * to handing it off to another module, respectively.
7319  */
7320 u_int32_t
7321 m_scratch_get(struct mbuf *m, u_int8_t **p)
7322 {
7323         struct pkthdr *pkt = &m->m_pkthdr;
7324
7325         VERIFY(m->m_flags & M_PKTHDR);
7326
7327         /* See comments in <rdar://problem/14040693> */
7328         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7329                 panic_plain("Invalid attempt to access guarded module-private "
7330                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7331                 /* NOTREACHED */
7332         }
7333
7334         if (mcltrace) {
7335                 mcache_audit_t *mca;
7336
7337                 lck_mtx_lock(mbuf_mlock);
7338                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7339                 if (mca->mca_uflags & MB_SCVALID)
7340                         mcl_audit_scratch(mca);
7341                 lck_mtx_unlock(mbuf_mlock);
7342         }
7343
7344         *p = (u_int8_t *)&pkt->pkt_mpriv;
7345         return (sizeof (pkt->pkt_mpriv));
7346 }
7347
7348 static void
7349 m_redzone_init(struct mbuf *m)
7350 {
7351         VERIFY(m->m_flags & M_PKTHDR);
7352         /*
7353          * Each mbuf has a unique red zone pattern, which is a XOR
7354          * of the red zone cookie and the address of the mbuf.
7355          */
7356         m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7357 }
7358
7359 static void
7360 m_redzone_verify(struct mbuf *m)
7361 {
7362         u_int32_t mb_redzone;
7363
7364         VERIFY(m->m_flags & M_PKTHDR);
7365
7366         mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7367         if (m->m_pkthdr.redzone != mb_redzone) {
7368                 panic("mbuf %p redzone violation with value 0x%x "
7369                     "(instead of 0x%x, using cookie 0x%x)\n",
7370                     m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7371                 /* NOTREACHED */
7372         }
7373 }
7374
7375 /*
7376  * Send a report of mbuf usage if the usage is at least 6% of max limit
7377  * or if there has been at least 3% increase since the last report.
7378  *
7379  * The values 6% and 3% are chosen so that we can do simple arithmetic
7380  * with shift operations.
7381  */
7382 static boolean_t
7383 mbuf_report_usage(mbuf_class_t cl)
7384 {
7385         /* if a report is already in progress, nothing to do */
7386         if (mb_peak_newreport)
7387                 return (TRUE);
7388
7389         if (m_total(cl) > m_peak(cl) &&
7390             m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7391             (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7392                 return (TRUE);
7393         return (FALSE);
7394 }
7395
7396 __private_extern__ void
7397 mbuf_report_peak_usage(void)
7398 {
7399         int i = 0;
7400         u_int64_t uptime;
7401         struct nstat_sysinfo_data ns_data;
7402         uint32_t memreleased = 0;
7403
7404         uptime = net_uptime();
7405         lck_mtx_lock(mbuf_mlock);
7406
7407         /* Generate an initial report after 1 week of uptime */
7408         if (!mb_peak_firstreport &&
7409             uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7410                 mb_peak_newreport = TRUE;
7411                 mb_peak_firstreport = TRUE;
7412         }
7413
7414         if (!mb_peak_newreport) {
7415                 lck_mtx_unlock(mbuf_mlock);
7416                 return;
7417         }
7418
7419         /*
7420          * Since a report is being generated before 1 week,
7421          * we do not need to force another one later
7422          */
7423         if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7424                 mb_peak_firstreport = TRUE;
7425
7426         for (i = 0; i < NELEM(mbuf_table); i++) {
7427                 m_peak(m_class(i)) = m_total(m_class(i));
7428                 memreleased += m_release_cnt(i);
7429                 m_release_cnt(i) = 0;
7430         }
7431         mb_peak_newreport = FALSE;
7432         lck_mtx_unlock(mbuf_mlock);
7433
7434         bzero(&ns_data, sizeof(ns_data));
7435         ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7436         ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7437         ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7438         ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7439         ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
7440         ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7441         ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7442         ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7443         ns_data.u.mb_stats.memreleased = memreleased;
7444
7445         nstat_sysinfo_send_data(&ns_data);
7446 }
7447
7448 /*
7449  * Called by the VM when there's memory pressure.
7450  */
/*
 * m_drain: give unused mbuf slab memory back (via kmem_free() on mb_map).
 *
 * Outline, in the order the code performs it:
 *   1. Return immediately if draining is disabled (mb_drain_maxint == 0)
 *      or mb_waiters is non-zero.
 *   2. Rate-limit: the first call only records the time; later calls
 *      return unless more than mb_drain_maxint has elapsed since the
 *      last drain.
 *   3. Return if at least 60% of the memory accounted to the classes is
 *      active.
 *   4. Optionally purge the per-class caches, move objects from the
 *      composite freelists to the slabs, then release unused slabs of
 *      the rudimentary classes.
 *   5. Refresh the exported statistics.
 *
 * mbuf_mlock is held from step 2 to the end, except around each call to
 * mcache_purge_cache().
 */
7451 __private_extern__ void
7452 m_drain(void)
7453 {
7454         mbuf_class_t mc;
7455         mcl_slab_t *sp, *sp_tmp, *nsp;
7456         unsigned int num, k, interval, released = 0;
7457         unsigned int total_mem = 0, use_mem = 0;
7458         boolean_t ret, purge_caches = FALSE;
7459         ppnum_t offset;
7460         mcache_obj_t *obj;
7461         float per;
             /* state kept across calls */
7462         static uint64_t last_drain = 0;
7463         static unsigned char scratch[32];
7464         static ppnum_t scratch_pa = 0;
7465
7466         if (mb_drain_maxint == 0 || mb_waiters)
7467                 return;
             /*
              * On the first pass, zero the scratch buffer and look up its
              * physical page; that page is what freed pages are pointed at
              * in the IOMapper further down.
              */
7468         if (scratch_pa == 0) {
7469                 bzero(scratch, sizeof(scratch));
7470                 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
7471                 VERIFY(scratch_pa);
7472         } else if (mclverify) {
7473                 /*
7474                  * Panic if a driver wrote to our scratch memory.
7475                  */
7476                 for (k = 0; k < sizeof(scratch); k++)
7477                         if (scratch[k])
7478                                 panic("suspect DMA to freed address");
7479         }
7480         /*
7481          * Don't free memory too often as that could cause excessive
7482          * waiting times for mbufs.  Purge caches if we were asked to drain
7483          * in the last 5 minutes.
7484          */
             /*
              * NOTE(review): the code below purges when the interval is at
              * most 5 * mb_drain_maxint; whether that equals "5 minutes"
              * depends on the value and unit of mb_drain_maxint -- confirm.
              */
7485         lck_mtx_lock(mbuf_mlock);
7486         if (last_drain == 0) {
7487                 last_drain = net_uptime();
7488                 lck_mtx_unlock(mbuf_mlock);
7489                 return;
7490         }
7491         interval = net_uptime() - last_drain;
7492         if (interval <= mb_drain_maxint) {
7493                 lck_mtx_unlock(mbuf_mlock);
7494                 return;
7495         }
7496         if (interval <= mb_drain_maxint * 5)
7497                 purge_caches = TRUE;
7498         last_drain = net_uptime();
7499         /*
7500          * Don't free any memory if we're using 60% or more.
7501          */
7502         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7503                 total_mem += m_total(mc) * m_maxsize(mc);
7504                 use_mem += m_active(mc) * m_maxsize(mc);
7505         }
             /*
              * NOTE(review): the ratio is computed in floating point and
              * total_mem is not checked for zero before the division;
              * the byte totals are also accumulated in unsigned int.
              * Confirm both are acceptable here.
              */
7506         per = (float)use_mem / (float)total_mem;
7507         if (per >= 0.6) {
7508                 lck_mtx_unlock(mbuf_mlock);
7509                 return;
7510         }
7511         /*
7512          * Purge all the caches.  This effectively disables
7513          * caching for a few seconds, but the mbuf worker thread will
7514          * re-enable them again.
7515          */
             /*
              * Classes whose total is below their average total are
              * skipped.  mbuf_mlock is dropped for the duration of each
              * mcache_purge_cache() call and re-taken afterwards.
              */
7516         if (purge_caches == TRUE)
7517                 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7518                         if (m_total(mc) < m_avgtotal(mc))
7519                                 continue;
7520                         lck_mtx_unlock(mbuf_mlock);
7521                         ret = mcache_purge_cache(m_cache(mc), FALSE);
7522                         lck_mtx_lock(mbuf_mlock);
7523                         if (ret == TRUE)
7524                                 m_purge_cnt(mc)++;
7525                 }
7526         /*
7527          * Move the objects from the composite class freelist to
7528          * the rudimentary slabs list, but keep at least 10% of the average
7529          * total in the freelist.
7530          */
7531         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7532                 while (m_cobjlist(mc) &&
7533                     m_total(mc) < m_avgtotal(mc) &&
7534                     m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
                             /* unlink the head object, then free it alone */
7535                         obj = m_cobjlist(mc);
7536                         m_cobjlist(mc) = obj->obj_next;
7537                         obj->obj_next = NULL;
7538                         num = cslab_free(mc, obj, 1);
7539                         VERIFY(num == 1);
7540                         m_free_cnt(mc)++;
7541                         m_infree(mc)--;
7542                         /* cslab_free() handles m_total */
7543                 }
7544         }
7545         /*
7546          * Free the buffers present in the slab list up to 10% of the total
7547          * average per class.
7548          *
7549          * We walk the list backwards in an attempt to reduce fragmentation.
7550          */
             /* mc is cast to int so the loop ends after class index 0 */
7551         for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
7552                 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
7553                         /*
7554                          * Process only unused slabs occupying memory.
7555                          */
7556                         if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
7557                             sp->sl_base == NULL)
7558                                 continue;
                             /* stop once this class is at its floor */
7559                         if (m_total(mc) < m_avgtotal(mc) ||
7560                             m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
7561                                 break;
7562                         slab_remove(sp, mc);
                             /*
                              * Per-class accounting for the slab being
                              * released.
                              * NOTE(review): the MC_CL and MC_BIGCL cases
                              * adjust the counters by NCLPG / NBCLPG but
                              * pass NMBPG to mcl_audit_free(); confirm
                              * that this count is what mcl_audit_free()
                              * expects for those classes.
                              */
7563                         switch (mc) {
7564                         case MC_MBUF:
7565                                 m_infree(mc) -= NMBPG;
7566                                 m_total(mc) -= NMBPG;
7567                                 if (mclaudit != NULL)
7568                                         mcl_audit_free(sp->sl_base, NMBPG);
7569                                 break;
7570                         case MC_CL:
7571                                 m_infree(mc) -= NCLPG;
7572                                 m_total(mc) -= NCLPG;
7573                                 if (mclaudit != NULL)
7574                                         mcl_audit_free(sp->sl_base, NMBPG);
7575                                 break;
7576                         case MC_BIGCL:
7577                         {
7578                                 m_infree(mc) -= NBCLPG;
7579                                 m_total(mc) -= NBCLPG;
7580                                 if (mclaudit != NULL)
7581                                         mcl_audit_free(sp->sl_base, NMBPG);
7582                                 break;
7583                         }
7584                         case MC_16KCL:
7585                                 m_infree(mc)--;
7586                                 m_total(mc)--;
                                     /*
                                      * Reset the NSLABSP16KB - 1 slabs
                                      * chained after this one through
                                      * sl_next.
                                      */
7587                                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
7588                                         nsp = nsp->sl_next;
7589                                         VERIFY(nsp->sl_refcnt == 0 &&
7590                                             nsp->sl_base != NULL &&
7591                                             nsp->sl_len == 0);
7592                                         slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
7593                                             0);
7594                                         nsp->sl_flags = 0;
7595                                 }
7596                                 if (mclaudit != NULL)
7597                                         mcl_audit_free(sp->sl_base, 1);
7598                                 break;
7599                         default:
7600                                 /*
7601                                  * The composite classes have their own
7602                                  * freelist (m_cobjlist), so we only
7603                                  * process rudimentary classes here.
7604                                  */
7605                                 VERIFY(0);
7606                         }
                             /*
                              * NOTE(review): "released" is accumulated but
                              * not read again within this function.
                              */
7607                         m_release_cnt(mc) += m_size(mc);
7608                         released += m_size(mc);
7609                         VERIFY(sp->sl_base != NULL &&
7610                             sp->sl_len >= PAGE_SIZE);
7611                         offset = MTOPG(sp->sl_base);
7612                         /*
7613                          * Make sure the IOMapper points to a valid, but
7614                          * bogus, address.  This should prevent further DMA
7615                          * accesses to freed memory.
7616                          */
7617                         IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
7618                         mcl_paddr[offset] = 0;
7619                         kmem_free(mb_map, (vm_offset_t)sp->sl_base,
7620                             sp->sl_len);
                             /* mark the slab descriptor as empty */
7621                         slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
7622                         sp->sl_flags = 0;
7623                 }
7624         }
             /* count this drain and refresh the exported totals */
7625         mbstat.m_drain++;
7626         mbstat.m_bigclusters = m_total(MC_BIGCL);
7627         mbstat.m_clusters = m_total(MC_CL);
7628         mbstat.m_mbufs = m_total(MC_MBUF);
7629         mbuf_stat_sync();
7630         mbuf_mtypes_sync(TRUE);
7631         lck_mtx_unlock(mbuf_mlock);
7632 }
7633
7634 static int
7635 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
7636 {
7637 #pragma unused(arg1, arg2)
7638         int val = 0, err;
7639
7640         err = sysctl_handle_int(oidp, &val, 0, req);
7641         if (err != 0 || req->newptr == USER_ADDR_NULL)
7642                 return (err);
7643         if (val)
7644                 m_drain();
7645
7646         return (err);
7647 }
7648
/*
 * sysctl registrations under the _kern_ipc node for the mbuf subsystem.
 * Entries flagged CTLFLAG_RD are read-only; entries flagged CTLFLAG_RW can
 * also be written.
 */
7649 SYSCTL_DECL(_kern_ipc);
/* read-only structures, each produced by its own handler */
7650 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7651     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7652     0, 0, mbstat_sysctl, "S,mbstat", "");
7653 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7654     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7655     0, 0, mb_stat_sysctl, "S,mb_stat", "");
7656 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7657     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7658     0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7659 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7660     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7661     0, 0, mleak_table_sysctl, "S,mleak_table", "");
/*
 * Writable integer bound directly to mleak_table.mleak_sample_factor.
 * NOTE(review): no handler is attached here, so nothing at this point
 * range-checks a written value; mbuf_dump() divides by this field.
 */
7662 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7663     CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7664 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7665     CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7666 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7667     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
/* writes are routed to m_drain_force_sysctl */
7668 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
7669     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
7670     m_drain_force_sysctl, "I",
7671     "Forces the mbuf garbage collection to run");
/* m_drain() returns immediately while mb_drain_maxint is 0 */
7672 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
7673     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
7674     "Minimum time interval between garbage collection");