bsd/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/malloc.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/kernel.h>
  75 #include <sys/sysctl.h>
  76 #include <sys/syslog.h>
  77 #include <sys/protosw.h>
  78 #include <sys/domain.h>
  79 #include <sys/queue.h>
  80 #include <sys/proc.h>
  81
  82 #include <dev/random/randomdev.h>
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/simple_lock.h>
  86 #include <kern/queue.h>
  87 #include <kern/sched_prim.h>
  88 #include <kern/backtrace.h>
  89 #include <kern/cpu_number.h>
  90 #include <kern/zalloc.h>
  91
  92 #include <libkern/OSAtomic.h>
  93 #include <libkern/OSDebug.h>
  94 #include <libkern/libkern.h>
  95
  96 #include <IOKit/IOMapper.h>
  97
  98 #include <machine/limits.h>
  99 #include <machine/machine_routines.h>
 100
 101 #if CONFIG_MACF_NET
 102 #include <security/mac_framework.h>
 103 #endif /* CONFIG_MACF_NET */
 104
 105 #include <sys/mcache.h>
 106 #include <net/ntstat.h>
 107
 108 /*
 109  * MBUF IMPLEMENTATION NOTES.
 110  *
 111  * Five per-CPU caches are described below (16KB classes are not listed):
 112  *
 113  * MC_MBUF:
 114  *      This is a cache of rudimentary objects of MSIZE in size; each
 115  *      object represents an mbuf structure.  This cache preserves only
 116  *      the m_type field of the mbuf during its transactions.
 117  *
 118  * MC_CL:
 119  *      This is a cache of rudimentary objects of MCLBYTES in size; each
 120  *      object represents a mcluster structure.  This cache does not
 121  *      preserve the contents of the objects during its transactions.
 122  *
 123  * MC_BIGCL:
 124  *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 125  *      object represents a mbigcluster structure.  This cache does not
 126  *      preserve the contents of the objects during its transactions.
 127  *
 128  * MC_MBUF_CL:
 129  *      This is a cache of mbufs each having a cluster attached to it.
 130  *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 131  *      fields of the mbuf related to the external cluster are preserved
 132  *      during transactions.
 133  *
 134  * MC_MBUF_BIGCL:
 135  *      This is a cache of mbufs each having a big cluster attached to it.
 136  *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 137  *      fields of the mbuf related to the external cluster are preserved
 138  *      during transactions.
 139  *
 140  * OBJECT ALLOCATION:
 141  *
 142  * Allocation requests are handled first at the per-CPU (mcache) layer
 143  * before falling back to the slab layer.  Performance is optimal when
 144  * the request is satisfied at the CPU layer because global data/lock
 145  * never gets accessed.  When the slab layer is entered for allocation,
 146  * the slab freelist will be checked first for available objects before
 147  * the VM backing store is invoked.  Slab layer operations are serialized
 148  * for all of the caches as the mbuf global lock is held most of the time.
 149  * Allocation paths are different depending on the class of objects:
 150  *
 151  * a. Rudimentary object:
 152  *
 153  *      { m_get_common(), m_clattach(), m_mclget(),
 154  *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 155  *        composite object allocation }
 156  *                      |       ^
 157  *                      |       |
 158  *                      |       +-----------------------+
 159  *                      v                               |
 160  *         mcache_alloc/mcache_alloc_ext()      mbuf_slab_audit()
 161  *                      |                               ^
 162  *                      v                               |
 163  *                 [CPU cache] -------> (found?) -------+
 164  *                      |                               |
 165  *                      v                               |
 166  *               mbuf_slab_alloc()                      |
 167  *                      |                               |
 168  *                      v                               |
 169  *      +---------> [freelist] -------> (found?) -------+
 170  *      |               |
 171  *      |               v
 172  *      |           m_clalloc()
 173  *      |               |
 174  *      |               v
 175  *      +---<<---- kmem_mb_alloc()
 176  *
 177  * b. Composite object:
 178  *
 179  *      { m_getpackets_internal(), m_allocpacket_internal() }
 180  *                      |       ^
 181  *                      |       |
 182  *                      |       +------ (done) ---------+
 183  *                      v                               |
 184  *         mcache_alloc/mcache_alloc_ext()      mbuf_cslab_audit()
 185  *                      |                               ^
 186  *                      v                               |
 187  *                 [CPU cache] -------> (found?) -------+
 188  *                      |                               |
 189  *                      v                               |
 190  *               mbuf_cslab_alloc()                     |
 191  *                      |                               |
 192  *                      v                               |
 193  *                  [freelist] -------> (found?) -------+
 194  *                      |                               |
 195  *                      v                               |
 196  *              (rudimentary object)                    |
 197  *         mcache_alloc/mcache_alloc_ext() ------>>-----+
 198  *
 199  * Auditing notes: If auditing is enabled, buffers will be subjected to
 200  * integrity checks by the audit routine.  This is done by verifying their
 201  * contents against DEADBEEF (free) pattern before returning them to caller.
 202  * As part of this step, the routine will also record the transaction and
 203  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 204  * also restore any constructed data structure fields if necessary.
 205  *
 206  * OBJECT DEALLOCATION:
 207  *
 208  * Freeing an object simply involves placing it into the CPU cache; this
 209  * pollutes the cache to benefit subsequent allocations.  The slab layer
 210  * will only be entered if the object is to be purged out of the cache.
 211  * During normal operations, this happens only when the CPU layer resizes
 212  * its bucket while it's adjusting to the allocation load.  Deallocation
 213  * paths are different depending on the class of objects:
 214  *
 215  * a. Rudimentary object:
 216  *
 217  *      { m_free(), m_freem_list(), composite object deallocation }
 218  *                      |       ^
 219  *                      |       |
 220  *                      |       +------ (done) ---------+
 221  *                      v                               |
 222  *         mcache_free/mcache_free_ext()                |
 223  *                      |                               |
 224  *                      v                               |
 225  *              mbuf_slab_audit()                       |
 226  *                      |                               |
 227  *                      v                               |
 228  *                 [CPU cache] ---> (not purging?) -----+
 229  *                      |                               |
 230  *                      v                               |
 231  *               mbuf_slab_free()                       |
 232  *                      |                               |
 233  *                      v                               |
 234  *                  [freelist] ----------->>------------+
 235  *       (objects get purged to VM only on demand)
 236  *
 237  * b. Composite object:
 238  *
 239  *      { m_free(), m_freem_list() }
 240  *                      |       ^
 241  *                      |       |
 242  *                      |       +------ (done) ---------+
 243  *                      v                               |
 244  *         mcache_free/mcache_free_ext()                |
 245  *                      |                               |
 246  *                      v                               |
 247  *              mbuf_cslab_audit()                      |
 248  *                      |                               |
 249  *                      v                               |
 250  *                 [CPU cache] ---> (not purging?) -----+
 251  *                      |                               |
 252  *                      v                               |
 253  *               mbuf_cslab_free()                      |
 254  *                      |                               |
 255  *                      v                               |
 256  *                  [freelist] ---> (not purging?) -----+
 257  *                      |                               |
 258  *                      v                               |
 259  *              (rudimentary object)                    |
 260  *         mcache_free/mcache_free_ext() ------->>------+
 261  *
 262  * Auditing notes: If auditing is enabled, the audit routine will save
 263  * any constructed data structure fields (if necessary) before filling the
 264  * contents of the buffers with DEADBEEF (free) pattern and recording the
 265  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 266  * expected to contain the free pattern.
 267  *
 268  * DEBUGGING:
 269  *
 270  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 271  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 272  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 273  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 274  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 275  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 276  *
 277  * Each object is associated with exactly one mcache_audit_t structure that
 278  * contains the information related to its last buffer transaction.  Given
 279  * an address of an object, the audit structure can be retrieved by finding
 280  * the position of the object relative to the base address of the cluster:
 281  *
 282  *      +------------+                  +=============+
 283  *      | mbuf addr  |                  | mclaudit[i] |
 284  *      +------------+                  +=============+
 285  *            |                         | cl_audit[0] |
 286  *      i = MTOBG(addr)                 +-------------+
 287  *            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
 288  *      b = BGTOM(i)            |       +-------------+
 289  *            |                 |       |     ...     |
 290  *      x = MCLIDX(b, addr)     |       +-------------+
 291  *            |                 |       | cl_audit[7] |
 292  *            +-----------------+       +-------------+
 293  *               (e.g. x == 1)
 294  *
 295  * The mclaudit[] array is allocated at initialization time, but its contents
 296  * get populated when the corresponding cluster is created.  Because a page
 297  * can be turned into NMBPG number of mbufs, we reserve enough space for the
 298  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 299  * gets (or has not yet been) turned into mbufs will use only cl_audit[0], with
 300  * the remaining entries unused.  For a 16KB cluster, only one entry from the
 301  * first page is allocated and used for the entire object.
 302  */
 303
 304 /* TODO: should be in header file */
 305 /* kernel translator */
 306 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
 307 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 308 extern vm_map_t mb_map;         /* special map */
 309
 310 static uint32_t mb_kmem_contig_failed;
 311 static uint32_t mb_kmem_failed;
 312 static uint32_t mb_kmem_one_failed;
 313 /* Timestamp of allocation failures. */
 314 static uint64_t mb_kmem_contig_failed_ts;
 315 static uint64_t mb_kmem_failed_ts;
 316 static uint64_t mb_kmem_one_failed_ts;
 317 static uint64_t mb_kmem_contig_failed_size;
 318 static uint64_t mb_kmem_failed_size;
 319 static uint32_t mb_kmem_stats[6];
 320 static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
 321                                               "INVALID_ADDRESS",
 322                                               "RESOURCE_SHORTAGE",
 323                                               "NO_SPACE",
 324                                               "KERN_FAILURE",
 325                                               "OTHERS" };
 326
 327 /* Global lock */
 328 decl_lck_mtx_data(static, mbuf_mlock_data);
 329 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
 330 static lck_attr_t *mbuf_mlock_attr;
 331 static lck_grp_t *mbuf_mlock_grp;
 332 static lck_grp_attr_t *mbuf_mlock_grp_attr;
 333
 334 /* Back-end (common) layer */
 335 static uint64_t mb_expand_cnt;
 336 static uint64_t mb_expand_cl_cnt;
 337 static uint64_t mb_expand_cl_total;
 338 static uint64_t mb_expand_bigcl_cnt;
 339 static uint64_t mb_expand_bigcl_total;
 340 static uint64_t mb_expand_16kcl_cnt;
 341 static uint64_t mb_expand_16kcl_total;
 342 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
 343 static uint32_t mbuf_worker_run_cnt;
 344 static uint64_t mbuf_worker_last_runtime;
 345 static int mbuf_worker_ready;   /* worker thread is runnable */
 346 static int ncpu;                /* number of CPUs */
 347 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
 348 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
 349 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
 350 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
 351 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
 352 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
 353 static unsigned int mb_normalized; /* number of packets "normalized" */
 354
 355 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
 356 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
 357
 358 typedef enum {
 359         MC_MBUF = 0,    /* Regular mbuf (rudimentary) */
 360         MC_CL,          /* Cluster (rudimentary) */
 361         MC_BIGCL,       /* Large (4KB) cluster (rudimentary) */
 362         MC_16KCL,       /* Jumbo (16KB) cluster (== MBUF_CLASS_LAST) */
 363         MC_MBUF_CL,     /* Composite: mbuf + cluster */
 364         MC_MBUF_BIGCL,  /* Composite: mbuf + large (4KB) cluster */
 365         MC_MBUF_16KCL   /* Composite: mbuf + jumbo (16KB) cluster */
 366 } mbuf_class_t;
 367
 368 #define MBUF_CLASS_MIN          MC_MBUF
 369 #define MBUF_CLASS_MAX          MC_MBUF_16KCL
 370 #define MBUF_CLASS_LAST         MC_16KCL
 371 #define MBUF_CLASS_VALID(c) \
 372         ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
 373 #define MBUF_CLASS_COMPOSITE(c) \
 374         ((int)(c) > MBUF_CLASS_LAST)
 375
 376
 377 /*
 378  * mbuf specific mcache allocation request flags.
 379  */
 380 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
 381
 382 /*
 383  * Per-cluster slab structure.
 384  *
 385  * A slab is a cluster control structure that contains one or more object
 386  * chunks; the available chunks are chained in the slab's freelist (sl_head).
 387  * Each time a chunk is taken out of the slab, the slab's reference count
 388  * gets incremented.  When all chunks have been taken out, the empty slab
 389  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 390  * returned to a slab causes the slab's reference count to be decremented;
 391  * it also causes the slab to be reinserted into the class's slab list, if
 392  * it is not already there.
 393  *
 394  * Compartmentalizing of the object chunks into slabs allows us to easily
 395  * merge one or more slabs together when the adjacent slabs are idle, as
 396  * well as to convert or move a slab from one class to another; e.g. the
 397  * mbuf cluster slab can be converted to a regular cluster slab when all
 398  * mbufs in the slab have been freed.
 399  *
 400  * A slab may also span across multiple clusters for chunks larger than
 401  * a cluster's size.  In this case, only the slab of the first cluster is
 402  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 403  * that they are part of the larger slab.
 404  *
 405  * Each slab controls a page of memory.
 406  */
 407 typedef struct mcl_slab {
 408         struct mcl_slab *sl_next;       /* neighboring slab */
 409         u_int8_t        sl_class;       /* controlling mbuf class (MC_*) */
 410         int8_t          sl_refcnt;      /* outstanding allocations (chunks taken out) */
 411         int8_t          sl_chunks;      /* chunks (bufs) in this slab */
 412         u_int16_t       sl_flags;       /* slab flags (SLF_*, see below) */
 413         u_int16_t       sl_len;         /* slab length */
 414         void            *sl_base;       /* base of allocated memory */
 415         void            *sl_head;       /* first free buffer (slab's freelist head) */
 416         TAILQ_ENTRY(mcl_slab) sl_link;  /* next/prev slab on freelist */
 417 } mcl_slab_t;
 418
 419 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
 420 #define SLF_PARTIAL     0x0002          /* part of another slab */
 421 #define SLF_DETACHED    0x0004          /* not in slab freelist */
 422
 423 /*
 424  * The array of slabs is broken into groups of arrays per 1MB of kernel
 425  * memory to reduce the footprint.  Each group is allocated on demand
 426  * whenever a new piece of memory mapped in from the VM crosses the 1MB
 427  * boundary.
 428  */
 429 #define NSLABSPMB       ((1 << MBSHIFT) >> PAGE_SHIFT)
 430
 431 typedef struct mcl_slabg {
 432         mcl_slab_t      *slg_slab;      /* group of slabs */
 433 } mcl_slabg_t;
 434
 435 /*
 436  * Number of slabs needed to control a 16KB cluster object.
 437  */
 438 #define NSLABSP16KB     (M16KCLBYTES >> PAGE_SHIFT)
 439
 440 /*
 441  * Per-cluster audit structure.
 442  */
 443 typedef struct {
 444         mcache_audit_t  **cl_audit;     /* array of audits */
 445 } mcl_audit_t;
 446
 447 typedef struct {
 448         struct thread   *msa_thread;    /* thread doing transaction */
 449         struct thread   *msa_pthread;   /* previous transaction thread */
 450         uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
 451         uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
 452         uint16_t        msa_depth;      /* pc stack depth */
 453         uint16_t        msa_pdepth;     /* previous transaction pc stack depth */
 454         void            *msa_stack[MCACHE_STACK_DEPTH];  /* pc stack; presumably msa_depth entries valid */
 455         void            *msa_pstack[MCACHE_STACK_DEPTH]; /* prev pc stack; presumably msa_pdepth entries valid */
 456 } mcl_scratch_audit_t;
 457
 458 typedef struct {
 459         /*
 460          * Size of data from the beginning of an mbuf that covers m_hdr,
 461          * pkthdr and m_ext structures.  If auditing is enabled, we allocate
 462          * a shadow mbuf structure of this size inside each audit structure,
 463          * and the contents of the real mbuf get copied into it when the mbuf
 464          * is freed.  This allows us to pattern-fill the mbuf for integrity
 465          * checking, and to preserve any constructed mbuf fields (e.g. mbuf +
 466          * cluster cache case).  Note that we don't save the contents of
 467          * clusters when they are freed; we simply pattern-fill them.
 468          */
 469         u_int8_t                sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
 470         mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8))); /* see MCA_SAVED_SCRATCH_PTR() */
 471 } mcl_saved_contents_t;
 472
 473 #define AUDIT_CONTENTS_SIZE     (sizeof (mcl_saved_contents_t))
 474
 475 #define MCA_SAVED_MBUF_PTR(_mca)                                        \
 476         ((struct mbuf *)(void *)((mcl_saved_contents_t *)               \
 477         (_mca)->mca_contents)->sc_mbuf)
 478 #define MCA_SAVED_MBUF_SIZE                                             \
 479         (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
 480 #define MCA_SAVED_SCRATCH_PTR(_mca)                                     \
 481         (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
 482
 483 /*
 484  * mbuf specific mcache audit flags
 485  */
 486 #define MB_INUSE        0x01    /* object has not been returned to slab */
 487 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
 488 #define MB_SCVALID      0x04    /* object has valid saved contents */
 489
 490 /*
 491  * Each of the following two arrays holds up to nmbclusters elements.
 492  */
 493 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
 494 static unsigned int maxclaudit; /* max # of entries in audit table */
 495 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
 496 static unsigned int maxslabgrp; /* max # of entries in slabs table */
 497 static unsigned int slabgrp;    /* # of entries in slabs table */
 498
 499 /* Globals */
 500 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
 501 int njcl;                       /* # of clusters for jumbo sizes */
 502 int njclbytes;                  /* size of a jumbo cluster */
 503 unsigned char *mbutl;           /* first mapped cluster address */
 504 unsigned char *embutl;          /* ending virtual address of mclusters */
 505 int _max_linkhdr;               /* largest link-level header */
 506 int _max_protohdr;              /* largest protocol header */
 507 int max_hdr;                    /* largest link+protocol header */
 508 int max_datalen;                /* MHLEN - max_hdr */
 509
 510 static boolean_t mclverify;     /* debug: pattern-checking */
 511 static boolean_t mcltrace;      /* debug: stack tracing */
 512 static boolean_t mclfindleak;   /* debug: leak detection */
 513 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
 514
 515 static struct timeval mb_start; /* beginning of time */
 516
 517 /* mbuf leak detection variables */
 518 static struct mleak_table mleak_table;
 519 static mleak_stat_t *mleak_stat;
 520
 521 #define MLEAK_STAT_SIZE(n) \
 522         __builtin_offsetof(mleak_stat_t, ml_trace[n])
 523
 524 struct mallocation {
 525         mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
 526         u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
 527         u_int32_t count;        /* How many objects were requested */
 528         u_int64_t hitcount;     /* for determining hash effectiveness */
 529 };
 530
 531 struct mtrace {
 532         u_int64_t       collisions;     /* NOTE(review): presumably hash collisions on this slot -- confirm */
 533         u_int64_t       hitcount;       /* NOTE(review): presumably # of times this trace was matched -- confirm */
 534         u_int64_t       allocs;         /* NOTE(review): presumably allocations attributed to this trace -- confirm */
 535         u_int64_t       depth;          /* NOTE(review): presumably # of valid entries in addr[] -- confirm */
 536         uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace addresses (up to MLEAK_STACK_DEPTH) */
 537 };
 538
 539 /* Size must be a power of two for the zhash to be able to just mask off bits */
 540 #define MLEAK_ALLOCATION_MAP_NUM        512
 541 #define MLEAK_TRACE_MAP_NUM             256
 542
 543 /*
 544  * Sample factor for how often to record a trace.  This can be overridden
 545  * by the boot-arg mleak_sample_factor.
 546  */
 547 #define MLEAK_SAMPLE_FACTOR             500
 548
 549 /*
 550  * Number of top leakers recorded.
 551  */
 552 #define MLEAK_NUM_TRACES                5
 553
 554 #define MB_LEAK_SPACING_64 "                    "
 555 #define MB_LEAK_SPACING_32 "            "
 556
 557
 558 #define MB_LEAK_HDR_32  "\n\
 559     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
 560     ----------  ----------  ----------  ----------  ---------- \n\
 561 "
 562
 563 #define MB_LEAK_HDR_64  "\n\
 564     trace [1]           trace [2]           trace [3]       \
 565         trace [4]           trace [5]      \n\
 566     ------------------  ------------------  ------------------  \
 567     ------------------  ------------------ \n\
 568 "
 569
 570 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
 571 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
 572
 573 /* Hashmaps of allocations and their corresponding traces */
 574 static struct mallocation *mleak_allocations;
 575 static struct mtrace *mleak_traces;
 576 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 577
 578 /* Lock to protect mleak tables from concurrent modification */
 579 decl_lck_mtx_data(static, mleak_lock_data);
 580 static lck_mtx_t *mleak_lock = &mleak_lock_data;
 581 static lck_attr_t *mleak_lock_attr;
 582 static lck_grp_t *mleak_lock_grp;
 583 static lck_grp_attr_t *mleak_lock_grp_attr;
 584
 585 /* *Failed* large allocations. */
 586 struct mtracelarge {
 587         uint64_t        size;           /* size of the failed allocation (presumably bytes -- confirm) */
 588         uint64_t        depth;          /* NOTE(review): presumably # of valid entries in addr[] -- confirm */
 589         uintptr_t       addr[MLEAK_STACK_DEPTH]; /* backtrace addresses (up to MLEAK_STACK_DEPTH) */
 590 };
 591
 592 #define MTRACELARGE_NUM_TRACES          5
 593 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
 594
 595 static void mtracelarge_register(size_t size);
 596
 597 /* Lock to protect the completion callback table */
 598 static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
 599 static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
 600 static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
 601 decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
 602 lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
 603
 604 extern u_int32_t high_sb_max;
 605
 606 /* The minimum number of objects that are allocated, to start. */
 607 #define MINCL           32
 608 #define MINBIGCL        (MINCL >> 1)
 609 #define MIN16KCL        (MINCL >> 2)
 610
 611 /* Low watermarks (only map in pages once free counts go below) */
 612 #define MBIGCL_LOWAT    MINBIGCL
 613 #define M16KCL_LOWAT    MIN16KCL
 614
 615 typedef struct {
 616         mbuf_class_t    mtbl_class;     /* class type; see m_class() */
 617         mcache_t        *mtbl_cache;    /* mcache for this buffer class; see m_cache() */
 618         TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list; see m_slablist() */
 619         mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist; see m_cobjlist() */
 620         mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl (m_total() etc.) */
 621         u_int32_t       mtbl_maxsize;   /* maximum buffer size; see m_maxsize() */
 622         int             mtbl_minlimit;  /* minimum allowed; see m_minlimit() */
 623         int             mtbl_maxlimit;  /* maximum allowed; see m_maxlimit() */
 624         u_int32_t       mtbl_wantpurge; /* purge during next reclaim; see m_wantpurge() */
 625         uint32_t        mtbl_avgtotal;  /* average total on iOS; see m_avgtotal() */
 626         u_int32_t       mtbl_expand;    /* worker should expand the class; see m_region_expand() */
 627 } mbuf_table_t;
 628
     /*
      * Shorthand accessors for mbuf_table[], indexed by class (c).
      * m_cname() through m_release_cnt() go through the entry's mtbl_stats
      * pointer, so they may only be used once that pointer has been set
      * (mbuf_table_init() does so); the remaining macros read fields of
      * the table entry itself.  Every macro expands to an lvalue.
      */
 629 #define m_class(c)      mbuf_table[c].mtbl_class
 630 #define m_cache(c)      mbuf_table[c].mtbl_cache
 631 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
 632 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
 633 #define m_maxsize(c)    mbuf_table[c].mtbl_maxsize
 634 #define m_minlimit(c)   mbuf_table[c].mtbl_minlimit
 635 #define m_maxlimit(c)   mbuf_table[c].mtbl_maxlimit
 636 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
 637 #define m_avgtotal(c)   mbuf_table[c].mtbl_avgtotal
 638 #define m_cname(c)      mbuf_table[c].mtbl_stats->mbcl_cname
 639 #define m_size(c)       mbuf_table[c].mtbl_stats->mbcl_size
 640 #define m_total(c)      mbuf_table[c].mtbl_stats->mbcl_total
 641 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
 642 #define m_infree(c)     mbuf_table[c].mtbl_stats->mbcl_infree
 643 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
 644 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
 645 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
 646 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
 647 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
 648 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
 649 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
 650 #define m_peak(c)       mbuf_table[c].mtbl_stats->mbcl_peak_reported
 651 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
 652 #define m_region_expand(c)      mbuf_table[c].mtbl_expand
 653
     /*
      * Per-class allocator state, one entry per mbuf class.  The m_*()
      * accessors index this array with MC_* values, so the entry order is
      * expected to follow the mbuf_class_t numbering (not visible here --
      * TODO confirm).  Entries use positional initializers; going by the
      * visible tail of mbuf_table_t the last eight values are cobjlist,
      * stats, maxsize, minlimit, maxlimit, wantpurge, avgtotal and expand.
      * Of those only avgtotal is given a non-zero value here; the stats
      * pointer, sizes and limits are filled in by mbuf_table_init().
      */
 654 static mbuf_table_t mbuf_table[] = {
 655         /*
 656          * The caches for mbufs, regular clusters and big clusters.
 657          * The average total values were based on data gathered by actual
 658          * usage patterns on iOS.
 659          */
 660         { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
 661             NULL, NULL, 0, 0, 0, 0, 3000, 0 },
 662         { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
 663             NULL, NULL, 0, 0, 0, 0, 2000, 0 },
 664         { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
 665             NULL, NULL, 0, 0, 0, 0, 1000, 0 },
 666         { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
 667             NULL, NULL, 0, 0, 0, 0, 200, 0 },
 668         /*
 669          * The following are special caches; they serve as intermediate
 670          * caches backed by the above rudimentary caches.  Each object
 671          * in the cache is an mbuf with a cluster attached to it.  Unlike
 672          * the above caches, these intermediate caches do not directly
 673          * deal with the slab structures; instead, the constructed
 674          * cached elements are simply stored in the freelists.
 675          */
 676         { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
 677         { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
 678         { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
 679 };
 680
     /* Element count of a true array; not valid when (a) is a pointer */
 681 #define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
 682
 683 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
 684 static int mb_waiters;                  /* number of waiters */
 685
     /* NOTE(review): presumably state for mbuf_report_peak_usage(); confirm */
 686 boolean_t mb_peak_newreport = FALSE;
 687 boolean_t mb_peak_firstreport = FALSE;
 688
 689 /* generate a report by default after 1 week of uptime (value in seconds) */
 690 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD        604800
 691
 692 #define MB_WDT_MAXTIME  10              /* # of secs before watchdog panic */
 693 static struct timeval mb_wdtstart;      /* watchdog start timestamp */
     /* NOTE(review): presumably the buffer returned by mbuf_dump(); confirm */
 694 static char *mbuf_dump_buf;
 695
     /* NOTE(review): presumably the size in bytes of mbuf_dump_buf; confirm */
 696 #define MBUF_DUMP_BUF_SIZE      4096
 697
 698 /*
 699  * mbuf watchdog is enabled by default on embedded platforms.  It is
 700  * also toggleable via the kern.ipc.mb_watchdog sysctl.
 701  * Garbage collection is also enabled by default on embedded platforms.
 702  * mb_drain_maxint controls the amount of time to wait (in seconds) before
 703  * consecutive calls to m_drain().
      * On non-embedded builds both variables are initialized to 0.
 704  */
 705 #if CONFIG_EMBEDDED
 706 static unsigned int mb_watchdog = 1;
 707 static unsigned int mb_drain_maxint = 60;
 708 #else
 709 static unsigned int mb_watchdog = 0;
 710 static unsigned int mb_drain_maxint = 0;
 711 #endif /* CONFIG_EMBEDDED */
 712
     /*
      * NOTE(review): going by the names, these hold the values used to
      * obscure the external-free routine and ext_ref pointers kept in
      * mbufs.  They are only declared here; confirm against the code
      * that initializes and uses them.
      */
 713 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
 714 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
 715
 716 /* Red zone */
     /* m_redzone_init() is invoked by MBUF_INIT_PKTHDR() for every pkthdr mbuf */
 717 static u_int32_t mb_redzone_cookie;
 718 static void m_redzone_init(struct mbuf *);
 719 static void m_redzone_verify(struct mbuf *m);
 720
 721 /* The following are used to serialize m_clalloc() */
 722 static boolean_t mb_clalloc_busy;
     /* the wait channel is simply the address of the busy flag above */
 723 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
 724 static int mb_clalloc_waiters;
 725
     /*
      * Prototypes for the file-local (static) routines.  The one exception
      * is mbuf_report_peak_usage(), which is declared __private_extern__.
      * The sysctl handlers take their parameters from SYSCTL_HANDLER_ARGS.
      */
 726 static void mbuf_mtypes_sync(boolean_t);
 727 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
 728 static void mbuf_stat_sync(void);
 729 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
 730 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
 731 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
 732 static char *mbuf_dump(void);
 733 static void mbuf_table_init(void);
 734 static inline void m_incref(struct mbuf *);
 735 static inline u_int16_t m_decref(struct mbuf *);
 736 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
 737 static void mbuf_worker_thread_init(void);
 738 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
 739 static void slab_free(mbuf_class_t, mcache_obj_t *);
 740 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
 741     unsigned int, int);
 742 static void mbuf_slab_free(void *, mcache_obj_t *, int);
 743 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
 744 static void mbuf_slab_notify(void *, u_int32_t);
 745 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
 746     unsigned int);
 747 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
 748 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
 749     unsigned int, int);
 750 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
 751 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
 752 static int freelist_populate(mbuf_class_t, unsigned int, int);
 753 static void freelist_init(mbuf_class_t);
 754 static boolean_t mbuf_cached_above(mbuf_class_t, int);
 755 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
 756 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
 757 static int m_howmany(int, size_t);
 758 static void mbuf_worker_thread(void);
 759 static void mbuf_watchdog(void);
 760 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
 761
 762 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
 763     size_t, unsigned int);
 764 static void mcl_audit_free(void *, unsigned int);
 765 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
 766 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
 767 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
 768     boolean_t);
 769 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
 770 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
 771 static void mcl_audit_scratch(mcache_audit_t *);
 772 static void mcl_audit_mcheck_panic(struct mbuf *);
 773 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
 774
 775 static void mleak_activate(void);
 776 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
 777 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
 778 static void mleak_free(mcache_obj_t *);
 779 static void mleak_sort_traces(void);
 780 static void mleak_update_stats(void);
 781
 782 static mcl_slab_t *slab_get(void *);
 783 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
 784     void *, void *, unsigned int, int, int);
 785 static void slab_insert(mcl_slab_t *, mbuf_class_t);
 786 static void slab_remove(mcl_slab_t *, mbuf_class_t);
 787 static boolean_t slab_inrange(mcl_slab_t *, void *);
 788 static void slab_nextptr_panic(mcl_slab_t *, void *);
 789 static void slab_detach(mcl_slab_t *);
 790 static boolean_t slab_is_detached(mcl_slab_t *);
 791
 792 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
 793 static struct mbuf *m_split0(struct mbuf *, int, int, int);
 794 __private_extern__ void mbuf_report_peak_usage(void);
 795 static boolean_t mbuf_report_usage(mbuf_class_t);
 796
 797 /* flags for m_copyback0(); each one occupies its own bit */
 798 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
 799 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
 800 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
 801 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
 802
 803 /*
 804  * This flag is set for all mbufs that come out of and into the composite
 805  * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 806  * are marked with such a flag have clusters attached to them, and will be
 807  * treated differently when they are freed; instead of being placed back
 808  * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 809  * are placed back into the appropriate composite cache's freelist, and the
 810  * actual freeing is deferred until the composite objects are purged.  At
 811  * such a time, this flag will be cleared from the mbufs and the objects
 812  * will be freed into their own separate freelists.
 813  */
 814 #define EXTF_COMPOSITE  0x1
 815
 816 /*
 817  * This flag indicates that the external cluster is read-only, i.e. it is
 818  * or was referred to by more than one mbuf.  Once set, this flag is never
 819  * cleared.
 820  */
 821 #define EXTF_READONLY   0x2
 822 /*
 823  * This flag indicates that the external cluster is paired with the mbuf.
 824  * Pairing implies an external free routine defined which will be invoked
 825  * when the reference count drops to the minimum at m_free time.  This
 826  * flag is never cleared.
 827  */
 828 #define EXTF_PAIRED     0x4
 829
     /* Union of the three EXTF_* bits defined above */
 830 #define EXTF_MASK       \
 831         (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
 832
     /*
      * Accessors for the members of the structure that m_get_rfa(m) returns
      * (minref, refcnt, prefcnt, flags, priv, paired, ext_token).  Each one
      * expands to an lvalue; MEXT_INIT() assigns through them.
      */
 833 #define MEXT_MINREF(m)          ((m_get_rfa(m))->minref)
 834 #define MEXT_REF(m)             ((m_get_rfa(m))->refcnt)
 835 #define MEXT_PREF(m)            ((m_get_rfa(m))->prefcnt)
 836 #define MEXT_FLAGS(m)           ((m_get_rfa(m))->flags)
 837 #define MEXT_PRIV(m)            ((m_get_rfa(m))->priv)
 838 #define MEXT_PMBUF(m)           ((m_get_rfa(m))->paired)
 839 #define MEXT_TOKEN(m)           ((m_get_rfa(m))->ext_token)
     /*
      * True when the reference count equals the minimum reference count
      * and, of the EXTF_MASK bits, exactly EXTF_COMPOSITE is set.
      */
 840 #define MBUF_IS_COMPOSITE(m)                                            \
 841         (MEXT_REF(m) == MEXT_MINREF(m) &&                               \
 842         (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
 843 /*
 844  * This macro can be used to test if the mbuf is paired to an external
 845  * cluster.  The test for MEXT_PMBUF being equal to the mbuf in subject
 846  * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
 847  * and thus survives calls to m_free_paired.
      * M_EXT is tested first, before any MEXT_*() accessor is evaluated.
 848  */
 849 #define MBUF_IS_PAIRED(m)                                               \
 850         (((m)->m_flags & M_EXT) &&                                      \
 851         (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED &&                   \
 852         MEXT_PMBUF(m) == (m))
 853
 854 /*
 855  * Macros used to verify the integrity of the mbuf.
      *
      * _MCHECK(m) panics unless m is of type MT_FREE or is a paired mbuf
      * (MBUF_IS_PAIRED); when mclaudit is non-NULL the panic is delegated
      * to mcl_audit_mcheck_panic().  MBUF_IN_MAP(addr) is true when addr
      * lies in [mbutl, embutl), and MRANGE(addr) panics when it does not.
      *
      * _MCHECK and MRANGE expand to a bare { } block rather than to
      * do { } while (0), so "MRANGE(x);" cannot be used as the un-braced
      * body of an if statement that has an else branch.
 856  */
 857 #define _MCHECK(m) {                                                    \
 858         if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) {             \
 859                 if (mclaudit == NULL)                                   \
 860                         panic("MCHECK: m_type=%d m=%p",                 \
 861                             (u_int16_t)(m)->m_type, m);                 \
 862                 else                                                    \
 863                         mcl_audit_mcheck_panic(m);                      \
 864         }                                                               \
 865 }
 866
 867 #define MBUF_IN_MAP(addr)                                               \
 868         ((unsigned char *)(addr) >= mbutl &&                            \
 869         (unsigned char *)(addr) < embutl)
 870
 871 #define MRANGE(addr) {                                                  \
 872         if (!MBUF_IN_MAP(addr))                                         \
 873                 panic("MRANGE: address out of range 0x%p", addr);       \
 874 }
 875
 876 /*
 877  * Macro version of mtod: yields the mbuf's m_data pointer cast to type t.
 878  */
 879 #define MTOD(m, t)      ((t)((m)->m_data))
 880
 881 /*
 882  * Macros to obtain page index given a base cluster address
      * (MTOPG) and, conversely, the address of the page with a given
      * index (PGTOM).  Both are relative to mbutl and use PAGE_SHIFT.
      * The argument is fully parenthesized so that an expression argument
      * such as "p + off" or "a | b" is converted as a whole.
 883  */
 884 #define MTOPG(x)        (((unsigned char *)(x) - mbutl) >> PAGE_SHIFT)
 885 #define PGTOM(x)        (mbutl + ((x) << PAGE_SHIFT))
 886
 887 /*
 888  * Macro to find the mbuf index relative to a base.
      * Computed as the byte distance from c to m, shifted right by
      * MSIZESHIFT.
 889  */
 890 #define MBPAGEIDX(c, m) \
 891         (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)
 892
 893 /*
 894  * Same thing for 2KB cluster index.
      * The byte distance is shifted right by MCLSHIFT.
 895  */
 896 #define CLPAGEIDX(c, m) \
 897         (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
 898
 899 /*
 900  * Macro to find 4KB cluster index relative to a base
      * The byte distance is shifted right by MBIGCLSHIFT.
 901  */
 902 #define BCLPAGEIDX(c, m) \
 903         (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
 904
 905 /*
 906  * Macros used during mbuf and cluster initialization.
      *
      * MBUF_INIT_PKTHDR(m) resets the packet header fields listed below and
      * then runs m_classifier_init(), m_tag_init(), m_scratch_init() and
      * m_redzone_init() on m.
      *
      * MBUF_INIT(m, pkthdr, type) first runs _MCHECK(m) (while m_type still
      * holds its previous value), clears m_next, m_nextpkt and m_len, and
      * sets m_type.  With pkthdr == 0, m_data points at m_dat and m_flags
      * is 0; otherwise m_data points at m_pktdat, m_flags is M_PKTHDR and
      * MBUF_INIT_PKTHDR(m) is applied.
      *
      * Both expand to a bare { } block and evaluate m more than once.
 907  */
 908 #define MBUF_INIT_PKTHDR(m) {                                           \
 909         (m)->m_pkthdr.rcvif = NULL;                                     \
 910         (m)->m_pkthdr.pkt_hdr = NULL;                                   \
 911         (m)->m_pkthdr.len = 0;                                          \
 912         (m)->m_pkthdr.csum_flags = 0;                                   \
 913         (m)->m_pkthdr.csum_data = 0;                                    \
 914         (m)->m_pkthdr.vlan_tag = 0;                                     \
 915         m_classifier_init(m, 0);                                        \
 916         m_tag_init(m, 1);                                               \
 917         m_scratch_init(m);                                              \
 918         m_redzone_init(m);                                              \
 919 }
 920
 921 #define MBUF_INIT(m, pkthdr, type) {                                    \
 922         _MCHECK(m);                                                     \
 923         (m)->m_next = (m)->m_nextpkt = NULL;                            \
 924         (m)->m_len = 0;                                                 \
 925         (m)->m_type = type;                                             \
 926         if ((pkthdr) == 0) {                                            \
 927                 (m)->m_data = (m)->m_dat;                               \
 928                 (m)->m_flags = 0;                                       \
 929         } else {                                                        \
 930                 (m)->m_data = (m)->m_pktdat;                            \
 931                 (m)->m_flags = M_PKTHDR;                                \
 932                 MBUF_INIT_PKTHDR(m);                                    \
 933         }                                                               \
 934 }
 935
     /*
      * Attach an external buffer to m: point m_data and m_ext.ext_buf at
      * buf, set M_EXT, call m_set_ext(m, rfa, free, arg), record the
      * buffer size, and then store min, ref, pref, flag, priv and pm
      * through the MEXT_*() accessors.  The accessors go through
      * m_get_rfa(m), so the m_set_ext() call must stay ahead of them
      * (presumably it is what installs rfa -- TODO confirm).
      */
 936 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag,   \
 937     priv, pm) {                                                         \
 938         (m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
 939         (m)->m_flags |= M_EXT;                                          \
 940         m_set_ext((m), (rfa), (free), (arg));                           \
 941         (m)->m_ext.ext_size = (size);                                   \
 942         MEXT_MINREF(m) = (min);                                         \
 943         MEXT_REF(m) = (ref);                                            \
 944         MEXT_PREF(m) = (pref);                                          \
 945         MEXT_FLAGS(m) = (flag);                                         \
 946         MEXT_PRIV(m) = (priv);                                          \
 947         MEXT_PMBUF(m) = (pm);                                           \
 948 }
 949
     /*
      * Class-specific wrappers around MEXT_INIT().  All three pass a NULL
      * free argument, a minimum reference count of 0, a prefcnt of 0, a
      * priv of 0 and no paired mbuf.  They differ in the buffer size
      * (m_maxsize() of the class) and in the free routine: none for 2KB
      * clusters, m_bigfree for 4KB and m_16kfree for 16KB clusters.
      */
 950 #define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
 951         MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0,         \
 952             ref, 0, flag, 0, NULL)
 953
 954 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
 955         MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
 956             ref, 0, flag, 0, NULL)
 957
 958 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
 959         MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
 960             ref, 0, flag, 0, NULL)
 961
 962 /*
 963  * Macro to convert BSD malloc sleep flag to mcache's:
      * MCR_NOSLEEP when M_DONTWAIT is set in f, MCR_SLEEP otherwise.
 964  */
 965 #define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
 966
 967 /*
 968  * The structure that holds all mbuf class statistics exportable via sysctl.
 969  * Similar to mbstat structure, the mb_stat structure is protected by the
 970  * global mbuf lock.  It contains additional information about the classes
 971  * that allows for a more accurate view of the state of the allocator.
 972  */
 973 struct mb_stat *mb_stat;
 974 struct omb_stat *omb_stat;      /* For backwards compatibility */
 975
     /*
      * Size in bytes of an mb_stat / omb_stat that carries n class entries,
      * i.e. the offset of mbs_class[n].  Both use __builtin_offsetof so
      * that neither relies on forming an address from a null pointer.
      */
 976 #define MB_STAT_SIZE(n) \
 977         __builtin_offsetof(mb_stat_t, mbs_class[n])
 978 #define OMB_STAT_SIZE(n) \
 979         __builtin_offsetof(struct omb_stat, mbs_class[n])
 980
 981 /*
 982  * The legacy structure holding all of the mbuf allocation statistics.
 983  * The actual statistics used by the kernel are stored in the mbuf_table
 984  * instead, and are updated atomically while the global mbuf lock is held.
 985  * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 986  * Unlike before, the kernel no longer relies on the contents of mbstat for
 987  * its operations (e.g. cluster expansion) because the structure is exposed
 988  * to outside and could possibly be modified, therefore making it unsafe.
 989  * With the exception of the mbstat.m_mtypes array (see below), all of the
 990  * statistics are updated as they change.
 991  */
 992 struct mbstat mbstat;
 993
     /* Number of slots in the mbstat.m_mtypes[] array */
 994 #define MBSTAT_MTYPES_MAX \
 995         (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
 996
 997 /*
 998  * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 999  * atomically and stored in a per-CPU structure which is lock-free; this is
1000  * done in order to avoid writing to the global mbstat data structure which
1001  * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
1002  * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1003  * array and returned to the application.  Any updates for types greater than
1004  * or equal to MT_MAX would be done atomically to the mbstat; this slows down
1005  * performance but is okay since the kernel uses only up to MT_MAX-1 while
1006  * anything beyond that (up to type 255) is considered a corner case.
1007  */
     /* One CPU's counters, aligned to MAX_CPU_CACHE_LINE_SIZE */
1008 typedef struct {
1009         unsigned int    cpu_mtypes[MT_MAX];
1010 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
1011
     /*
      * Declared with a single element but indexed by CPU number; the real
      * allocation is made elsewhere (presumably sized with
      * MBUF_MTYPES_SIZE() -- TODO confirm).
      */
1012 typedef struct {
1013         mtypes_cpu_t    mbs_cpu[1];
1014 } mbuf_mtypes_t;
1015
1016 static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
1017
     /* Byte offset of mbs_cpu[n] within mbuf_mtypes_t */
1018 #define MBUF_MTYPES_SIZE(n) \
1019         ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
1020
     /* Address of the slot belonging to the calling CPU (cpu_number()) in p */
1021 #define MTYPES_CPU(p) \
1022         ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
1023
     /*
      * Add n to the counter of the given mbuf type.  Types below MT_MAX are
      * counted in the calling CPU's private array; types from MT_MAX up to
      * MBSTAT_MTYPES_MAX - 1 are added straight to mbstat.m_mtypes[] with a
      * 16-bit atomic add; any other type is ignored.  Expands to a bare
      * { } block.
      */
1024 #define mtype_stat_add(type, n) {                                       \
1025         if ((unsigned)(type) < MT_MAX) {                                \
1026                 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
1027                 atomic_add_32(&mbs->cpu_mtypes[type], n);               \
1028         } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
1029                 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
1030         }                                                               \
1031 }
1032
     /* Convenience forms: subtract n, add 1, subtract 1 */
1033 #define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
1034 #define mtype_stat_inc(t)       mtype_stat_add(t, 1)
1035 #define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
1036
     /*
      * Fold the per-CPU mbuf type counters into mbstat.m_mtypes[].
      *
      * The counters of CPUs 0 .. ncpu-1 are summed into a local total (each
      * CPU's array is first copied with bcopy); the first MT_MAX slots of
      * mbstat.m_mtypes[] are then overwritten with the totals.
      *
      * locked: TRUE when the caller already holds mbuf_mlock (asserted);
      *         FALSE to have this routine take and release mbuf_mlock
      *         around the update of mbstat.
      */
1037 static void
1038 mbuf_mtypes_sync(boolean_t locked)
1039 {
1040         int m, n;
1041         mtypes_cpu_t mtc;
1042
1043         if (locked)
1044                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1045
1046         bzero(&mtc, sizeof (mtc));
1047         for (m = 0; m < ncpu; m++) {
1048                 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
1049                 mtypes_cpu_t temp;
1050
                     /* work on a private copy of this CPU's counters */
1051                 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
1052                     sizeof (temp.cpu_mtypes));
1053
1054                 for (n = 0; n < MT_MAX; n++)
1055                         mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
1056         }
1057         if (!locked)
1058                 lck_mtx_lock(mbuf_mlock);
1059         for (n = 0; n < MT_MAX; n++)
1060                 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1061         if (!locked)
1062                 lck_mtx_unlock(mbuf_mlock);
1063 }
1064
     /*
      * sysctl handler that exports the legacy mbstat structure.  Refreshes
      * mbstat.m_mtypes[] through mbuf_mtypes_sync(FALSE), then copies the
      * whole of mbstat out to the request.  Returns the SYSCTL_OUT() result.
      */
1065 static int
1066 mbstat_sysctl SYSCTL_HANDLER_ARGS
1067 {
1068 #pragma unused(oidp, arg1, arg2)
1069         mbuf_mtypes_sync(FALSE);
1070
1071         return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
1072 }
1073
     /*
      * Refresh the derived fields of every class's mb_class_stat_t from the
      * state of its mcache.  Must be called with mbuf_mlock held (asserted).
      *
      * For each entry of mbuf_table[] this sets the cache state, the number
      * of objects held in the cache layer (mbcl_mc_cached), the active
      * count, the three waiter/retry counters, and mbcl_ctotal.
      */
1074 static void
1075 mbuf_stat_sync(void)
1076 {
1077         mb_class_stat_t *sp;
1078         mcache_cpu_t *ccp;
1079         mcache_t *cp;
1080         int k, m, bktsize;
1081
1082         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1083
1084         for (k = 0; k < NELEM(mbuf_table); k++) {
1085                 cp = m_cache(k);
                     /* the bucket size is sampled from CPU 0's cache */
1086                 ccp = &cp->mc_cpu[0];
1087                 bktsize = ccp->cc_bktsize;
1088                 sp = mbuf_table[k].mtbl_stats;
1089
1090                 if (cp->mc_flags & MCF_NOCPUCACHE)
1091                         sp->mbcl_mc_state = MCS_DISABLED;
1092                 else if (cp->mc_purge_cnt > 0)
1093                         sp->mbcl_mc_state = MCS_PURGING;
1094                 else if (bktsize == 0)
1095                         sp->mbcl_mc_state = MCS_OFFLINE;
1096                 else
1097                         sp->mbcl_mc_state = MCS_ONLINE;
1098
                     /*
                      * Cached = objects in every CPU's cc_objs and cc_pobjs
                      * (counted only when positive) plus the full buckets
                      * (mc_full.bl_total buckets of bktsize objects).
                      */
1099                 sp->mbcl_mc_cached = 0;
1100                 for (m = 0; m < ncpu; m++) {
1101                         ccp = &cp->mc_cpu[m];
1102                         if (ccp->cc_objs > 0)
1103                                 sp->mbcl_mc_cached += ccp->cc_objs;
1104                         if (ccp->cc_pobjs > 0)
1105                                 sp->mbcl_mc_cached += ccp->cc_pobjs;
1106                 }
1107                 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
                     /* active = total minus what is cached or on the freelist */
1108                 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1109                     sp->mbcl_infree;
1110
1111                 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1112                 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1113                 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1114
1115                 /* Calculate total count specific to each class */
1116                 sp->mbcl_ctotal = sp->mbcl_total;
1117                 switch (m_class(k)) {
1118                 case MC_MBUF:
1119                         /* Deduct mbufs used in composite caches */
                             /*
                              * NOTE(review): m_total(MC_MBUF_16KCL) is not
                              * deducted here although the MC_16KCL case
                              * below deducts it; confirm this is intended.
                              */
1120                         sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1121                             m_total(MC_MBUF_BIGCL));
1122                         break;
1123
1124                 case MC_CL:
1125                         /* Deduct clusters used in composite cache */
1126                         sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1127                         break;
1128
1129                 case MC_BIGCL:
1130                         /* Deduct clusters used in composite cache */
1131                         sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1132                         break;
1133
1134                 case MC_16KCL:
1135                         /* Deduct clusters used in composite cache */
1136                         sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1137                         break;
1138
1139                 default:
1140                         break;
1141                 }
1142         }
1143 }
1144
     /*
      * sysctl handler that exports the per-class statistics.
      *
      * Under mbuf_mlock the statistics are refreshed with mbuf_stat_sync().
      * A 64-bit requesting process receives mb_stat as is; any other
      * process receives omb_stat, which is rebuilt here field by field
      * from mb_stat.  Returns the SYSCTL_OUT() result.
      *
      * NOTE(review): the copy-out runs after mbuf_mlock has been dropped,
      * while statp still points at the shared mb_stat/omb_stat buffers;
      * confirm that concurrent updates during the copy are acceptable.
      */
1145 static int
1146 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1147 {
1148 #pragma unused(oidp, arg1, arg2)
1149         void *statp;
1150         int k, statsz, proc64 = proc_is64bit(req->p);
1151
1152         lck_mtx_lock(mbuf_mlock);
1153         mbuf_stat_sync();
1154
1155         if (!proc64) {
1156                 struct omb_class_stat *oc;
1157                 struct mb_class_stat *c;
1158
1159                 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1160                 oc = &omb_stat->mbs_class[0];
1161                 c = &mb_stat->mbs_class[0];
1162                 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1163                         (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1164                             "%s", c->mbcl_cname);
1165                         oc->mbcl_size = c->mbcl_size;
1166                         oc->mbcl_total = c->mbcl_total;
1167                         oc->mbcl_active = c->mbcl_active;
1168                         oc->mbcl_infree = c->mbcl_infree;
1169                         oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1170                         oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1171                         oc->mbcl_free_cnt = c->mbcl_free_cnt;
1172                         oc->mbcl_notified = c->mbcl_notified;
1173                         oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1174                         oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1175                         oc->mbcl_ctotal = c->mbcl_ctotal;
1176                         oc->mbcl_release_cnt = c->mbcl_release_cnt;
1177                         oc->mbcl_mc_state = c->mbcl_mc_state;
1178                         oc->mbcl_mc_cached = c->mbcl_mc_cached;
1179                         oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1180                         oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1181                         oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1182                 }
1183                 statp = omb_stat;
1184                 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1185         } else {
1186                 statp = mb_stat;
1187                 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1188         }
1189
1190         lck_mtx_unlock(mbuf_mlock);
1191
1192         return (SYSCTL_OUT(req, statp, statsz));
1193 }
1194
     /*
      * sysctl handler that exports mleak_stat.  Returns ENXIO unless both
      * mclfindleak and mclexpleak are set.  Otherwise, with mleak_lock
      * held, it calls mleak_update_stats() and copies out
      * MLEAK_STAT_SIZE(MLEAK_NUM_TRACES) bytes of mleak_stat, returning the
      * SYSCTL_OUT() result.
      */
1195 static int
1196 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1197 {
1198 #pragma unused(oidp, arg1, arg2)
1199         int i;
1200
1201         /* Ensure leak tracing turned on */
1202         if (!mclfindleak || !mclexpleak)
1203                 return (ENXIO);
1204
1205         lck_mtx_lock(mleak_lock);
1206         mleak_update_stats();
1207         i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1208         lck_mtx_unlock(mleak_lock);
1209
1210         return (i);
1211 }
1212
     /*
      * sysctl handler that exports mleak_table.  Returns ENXIO unless both
      * mclfindleak and mclexpleak are set.  Otherwise it copies out
      * sizeof (mleak_table) bytes with mleak_lock held and returns the
      * SYSCTL_OUT() result.
      */
1213 static int
1214 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1215 {
1216 #pragma unused(oidp, arg1, arg2)
1217         int i = 0;
1218
1219         /* Ensure leak tracing turned on */
1220         if (!mclfindleak || !mclexpleak)
1221                 return (ENXIO);
1222
1223         lck_mtx_lock(mleak_lock);
1224         i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1225         lck_mtx_unlock(mleak_lock);
1226
1227         return (i);
1228 }
1229
     /*
      * Atomically add one reference to the external cluster of m.
      *
      * The 16-bit count behind MEXT_REF(m) is incremented with a
      * compare-and-swap retry loop; an increment that wraps to 0 trips the
      * ASSERT.  When the new count exceeds MEXT_MINREF(m) + 1 and
      * EXTF_READONLY is not yet set, the flag is set atomically.
      */
1230 static inline void
1231 m_incref(struct mbuf *m)
1232 {
1233         UInt16 old, new;
1234         volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1235
             /* retry until the swap succeeds against an unchanged value */
1236         do {
1237                 old = *addr;
1238                 new = old + 1;
1239                 ASSERT(new != 0);
1240         } while (!OSCompareAndSwap16(old, new, addr));
1241
1242         /*
1243          * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1244          * we don't clear the flag when the refcount goes back to the
1245          * minimum, to simplify code calling m_mclhasreference().
1246          */
1247         if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY))
1248                 (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1249 }
1250
     /*
      * Atomically drop one reference from the external cluster of m and
      * return the resulting count.
      *
      * The 16-bit count behind MEXT_REF(m) is decremented with a
      * compare-and-swap retry loop; attempting to decrement a count that
      * is already 0 trips the ASSERT.
      */
1251 static inline u_int16_t
1252 m_decref(struct mbuf *m)
1253 {
1254         UInt16 old, new;
1255         volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1256
1257         do {
1258                 old = *addr;
1259                 new = old - 1;
1260                 ASSERT(old != 0);
1261         } while (!OSCompareAndSwap16(old, new, addr));
1262
1263         return (new);
1264 }
1265
     /*
      * Set up mbuf_table[] and the statistics structures.
      *
      * Allocates omb_stat and mb_stat (each sized for NELEM(mbuf_table)
      * classes), points every table entry's mtbl_stats at its slot in
      * mb_stat, derives njcl and nclusters from nmbclusters, fills in the
      * minimum/maximum limits, sizes and names of all seven classes, and
      * finally resets the legacy mbstat structure.
      */
1266 static void
1267 mbuf_table_init(void)
1268 {
1269         unsigned int b, c, s;
1270         int m, config_mbuf_jumbo = 0;
1271
1272         MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1273             M_TEMP, M_WAITOK | M_ZERO);
1274         VERIFY(omb_stat != NULL);
1275
1276         MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1277             M_TEMP, M_WAITOK | M_ZERO);
1278         VERIFY(mb_stat != NULL);
1279
             /* from here on the m_size()/m_cname()/... accessors are usable */
1280         mb_stat->mbs_cnt = NELEM(mbuf_table);
1281         for (m = 0; m < NELEM(mbuf_table); m++)
1282                 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1283
1284 #if CONFIG_MBUF_JUMBO
1285         config_mbuf_jumbo = 1;
1286 #endif /* CONFIG_MBUF_JUMBO */
1287
1288         if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1289                 /*
1290                  * Set aside 1/3 of the mbuf cluster map for jumbo
1291                  * clusters; we do this only on platforms where jumbo
1292                  * cluster pool is enabled.
1293                  */
1294                 njcl = nmbclusters / 3;
1295                 njclbytes = M16KCLBYTES;
1296         }
1297
1298         /*
1299          * nclusters holds both the 2KB and 4KB pools, so ensure it's
1300          * a multiple of 4KB clusters.
1301          */
1302         nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1303         if (njcl > 0) {
1304                 /*
1305                  * Each jumbo cluster takes 8 2KB clusters, so make
1306                  * sure that the pool size is evenly divisible by 8;
1307                  * njcl is in 2KB unit, hence treated as such.
1308                  */
1309                 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1310
1311                 /* Update nclusters with rounded down value of njcl */
1312                 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1313         }
1314
1315         /*
1316          * njcl is valid only on platforms with 16KB jumbo clusters or
1317          * with 16KB pages, where it is configured to 1/3 of the pool
1318          * size.  On these platforms, the remaining is used for 2KB
1319          * and 4KB clusters.  On platforms without 16KB jumbo clusters,
1320          * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
1321          * cluster can either be split into 16 mbufs, or into 2 2KB
1322          * clusters.
1323          *
1324          *  +---+---+------------ ... -----------+------- ... -------+
1325          *  | c | b |              s             |        njcl       |
1326          *  +---+---+------------ ... -----------+------- ... -------+
1327          *
1328          * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1329          * clusters (1/64th each.)
1330          */
1331         c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
1332         b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1333         s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */
1334
1335         /*
1336          * 1/64th (c) is reserved for 2KB clusters.
1337          */
1338         m_minlimit(MC_CL) = c;
1339         m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
1340         m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1341         (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1342
1343         /*
1344          * Another 1/64th (b) of the map is reserved for 4KB clusters.
1345          * It cannot be turned into 2KB clusters or mbufs.
1346          */
1347         m_minlimit(MC_BIGCL) = b;
1348         m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
1349         m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1350         (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1351
1352         /*
1353          * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1354          */
1355         m_minlimit(MC_MBUF) = 0;
1356         m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
1357         m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1358         (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1359
1360         /*
1361          * Set limits for the composite classes.
                  * Each inherits the maximum limit of its cluster class; its
                  * m_size is the mbuf size plus the cluster size.
1362          */
1363         m_minlimit(MC_MBUF_CL) = 0;
1364         m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1365         m_maxsize(MC_MBUF_CL) = MCLBYTES;
1366         m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1367         (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1368
1369         m_minlimit(MC_MBUF_BIGCL) = 0;
1370         m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1371         m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1372         m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1373         (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1374
1375         /*
1376          * And for jumbo classes.
1377          */
1378         m_minlimit(MC_16KCL) = 0;
1379         m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
1380         m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1381         (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1382
1383         m_minlimit(MC_MBUF_16KCL) = 0;
1384         m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1385         m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1386         m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1387         (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1388
1389         /*
1390          * Initialize the legacy mbstat structure.
1391          */
1392         bzero(&mbstat, sizeof (mbstat));
1393         mbstat.m_msize = m_maxsize(MC_MBUF);
1394         mbstat.m_mclbytes = m_maxsize(MC_CL);
1395         mbstat.m_minclsize = MINCLSIZE;
1396         mbstat.m_mlen = MLEN;
1397         mbstat.m_mhlen = MHLEN;
1398         mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1399 }
1400
1401 #if defined(__LP64__)
     /*
      * Lookup tables consulted by mbuf_default_ncl() on 64-bit kernels to
      * choose an mbuf pool size from the amount of memory.
      *
      * Each entry pairs a memory threshold (nt_maxmem) with the pool size
      * (nt_mbpool) that applies once memory reaches that threshold.  Entries
      * are listed in ascending nt_maxmem order and every table is terminated
      * by an all-zero sentinel; the lookup loop stops on nt_mbpool == 0.
      * Memory sizes below the first threshold still get the first entry's
      * pool size.
      */
1402 typedef struct ncl_tbl {
1403         uint64_t nt_maxmem;     /* memory (sane) size */
1404         uint32_t nt_mbpool;     /* mbuf pool size */
1405 } ncl_tbl_t;
1406
1407 /* Non-server */
1408 static ncl_tbl_t ncl_table[] = {
1409         { (1ULL << GBSHIFT)       /*  1 GB */,  (64 << MBSHIFT)  /*  64 MB */ },
1410         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1411         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1412         { 0, 0 }                /* sentinel: end of table */
1413 };
1414
1415 /* Server */
1416 static ncl_tbl_t ncl_table_srv[] = {
1417         { (1ULL << GBSHIFT)       /*  1 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1418         { (1ULL << (GBSHIFT + 2)) /*  4 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1419         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (160 << MBSHIFT) /* 160 MB */ },
1420         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (192 << MBSHIFT) /* 192 MB */ },
1421         { (1ULL << (GBSHIFT + 5)) /* 32 GB */,  (256 << MBSHIFT) /* 256 MB */ },
1422         { (1ULL << (GBSHIFT + 6)) /* 64 GB */,  (384 << MBSHIFT) /* 384 MB */ },
1423         { 0, 0 }                /* sentinel: end of table */
1424 };
1425 #endif /* __LP64__ */
1426
     /*
      * Compute the default number of mbuf clusters for a given memory size.
      *
      *      server  non-zero selects the server table (ncl_table_srv) instead
      *              of the non-server one (ncl_table); ignored on 32-bit
      *              kernels.
      *      mem     memory size to scale the pool against; it is compared
      *              with the nt_maxmem thresholds of the selected table.
      *
      * The return value is a cluster count in MCLBYTES-sized units: the
      * 32-bit arm divides by MCLBYTES and the 64-bit arm shifts the pool
      * size right by MCLSHIFT.
      */
1427 __private_extern__ unsigned int
1428 mbuf_default_ncl(int server, uint64_t mem)
1429 {
1430 #if !defined(__LP64__)
1431 #pragma unused(server)
1432         unsigned int n;
1433         /*
1434          * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
              * The pool is 1/16th of memory, capped at 32768 clusters.
1435          */
1436         if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1437                 n = 32768;
1438 #else
1439         unsigned int n, i;
1440         ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1441         /*
1442          * 64-bit kernel (mbuf pool size based on table).
              * Start from the first entry so that memory sizes below the first
              * threshold still get a pool, then keep the pool size of the last
              * entry whose threshold does not exceed mem.
1443          */
1444         n = tbl[0].nt_mbpool;
1445         for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1446                 if (mem < tbl[i].nt_maxmem)
1447                         break;
1448                 n = tbl[i].nt_mbpool;
1449         }
1450         n >>= MCLSHIFT;         /* pool size -> number of clusters */
1451 #endif /* !__LP64__ */
1452         return (n);
1453 }
1454
     /*
      * One-time initialization of the mbuf allocator.
      *
      * In order, this routine:
      *   - records the start time in mb_start;
      *   - checks at compile time that the public MBUF_* constants match
      *     their private counterparts;
      *   - seeds the red zone cookie and the ext ref/free obscuring values
      *     from read_random();
      *   - defaults nmbclusters to NMBCLUSTERS when it is still zero and
      *     sets up the class table through mbuf_table_init();
      *   - initializes the global mbuf lock and the leak-detection lock;
      *   - allocates the slab group table, the audit structures (only when
      *     MCF_DEBUG is set in mbuf_debug), the per-CPU statistics area and
      *     the mcl_paddr page array;
      *   - populates the MC_BIGCL freelist while holding mbuf_mlock and
      *     seeds every class's peak counter;
      *   - starts a kernel thread running mbuf_worker_thread_init();
      *   - creates the ext_ref cache and one cache per mbuf class;
      *   - scales sb_max down when it exceeds 1/16th of the cluster pool;
      *   - allocates mbuf_dump_buf and sets up the lock for the tx
      *     completion callback table.
      *
      * Allocation results are checked with VERIFY(); failure to allocate the
      * tx completion lock group/attributes panics.
      */
1455 __private_extern__ void
1456 mbinit(void)
1457 {
1458         unsigned int m;
1459         unsigned int initmcl = 0;
1460         void *buf;
1461         thread_t thread = THREAD_NULL;
1462
1463         microuptime(&mb_start);
1464
1465         /*
1466          * These MBUF_ values must be equal to their private counterparts.
1467          */
1468         _CASSERT(MBUF_EXT == M_EXT);
1469         _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1470         _CASSERT(MBUF_EOR == M_EOR);
1471         _CASSERT(MBUF_LOOP == M_LOOP);
1472         _CASSERT(MBUF_BCAST == M_BCAST);
1473         _CASSERT(MBUF_MCAST == M_MCAST);
1474         _CASSERT(MBUF_FRAG == M_FRAG);
1475         _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1476         _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1477         _CASSERT(MBUF_PROMISC == M_PROMISC);
1478         _CASSERT(MBUF_HASFCS == M_HASFCS);
1479
1480         _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1481         _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1482         _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1483         _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1484         _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1485         _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1486         _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1487         _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1488         _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1489         _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1490         _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1491         _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1492         _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1493         _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1494         _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1495
1496         _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1497         _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1498         _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1499         _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1500         _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1501         _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1502         _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1503         _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1504         _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1505         _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1506         _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1507         _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1508         _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1509         _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1510
1511         _CASSERT(MBUF_WAITOK == M_WAIT);
1512         _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1513         _CASSERT(MBUF_COPYALL == M_COPYALL);
1514
1515         _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1516         _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1517         _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1518         _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1519         _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1520         _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1521         _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1522         _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1523         _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1524         _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1525
1526         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1527         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1528         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1529         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1530
1531         /* Module specific scratch space (32-bit alignment requirement) */
1532         _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1533             sizeof (uint32_t)));
1534
1535         /* Initialize random red zone cookie value */
1536         _CASSERT(sizeof (mb_redzone_cookie) ==
1537             sizeof (((struct pkthdr *)0)->redzone));
1538         read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1539         read_random(&mb_obscure_extref, sizeof (mb_obscure_extref));
1540         read_random(&mb_obscure_extfree, sizeof (mb_obscure_extfree));
             /* Force the two low-order bits of both obscuring values on */
1541         mb_obscure_extref |= 0x3;
1542         mb_obscure_extfree |= 0x3;
1543
1544         /* Make sure we don't save more than we should */
1545         _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1546
             /* Fall back to the compile-time default if nothing was configured */
1547         if (nmbclusters == 0)
1548                 nmbclusters = NMBCLUSTERS;
1549
1550         /* This should be a sane (at least even) value by now */
1551         VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1552
1553         /* Setup the mbuf table */
1554         mbuf_table_init();
1555
1556         /* Global lock for common layer */
1557         mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1558         mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1559         mbuf_mlock_attr = lck_attr_alloc_init();
1560         lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1561
1562         /*
1563          * Allocate cluster slabs table:
1564          *
1565          *      maxslabgrp = (N * 2048) / (1024 * 1024)
1566          *
1567          * Where N is nmbclusters rounded up to the nearest 512.  This yields
1568          * mcl_slab_g_t units, each one representing a MB of memory.
1569          */
1570         maxslabgrp =
1571             (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1572         MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1573             M_TEMP, M_WAITOK | M_ZERO);
1574         VERIFY(slabstbl != NULL);
1575
1576         /*
1577          * Allocate audit structures, if needed:
1578          *
1579          *      maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1580          *
1581          * This yields mcl_audit_t units, each one representing a page.
1582          */
1583         PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1584         mbuf_debug |= mcache_getflags();
1585         if (mbuf_debug & MCF_DEBUG) {
1586                 int l;
1587                 mcl_audit_t *mclad;
1588                 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1589                 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1590                     M_TEMP, M_WAITOK | M_ZERO);
1591                 VERIFY(mclaudit != NULL);
                     /* Each page's audit entry gets an array of NMBPG pointers */
1592                 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1593                         MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1594                             NMBPG * sizeof(mcache_audit_t *),
1595                             M_TEMP, M_WAITOK | M_ZERO);
1596                         VERIFY(mclad[l].cl_audit != NULL);
1597                 }
1598
1599                 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1600                     AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1601                 VERIFY(mcl_audit_con_cache != NULL);
1602         }
             /* Derive the individual debugging switches from mbuf_debug */
1603         mclverify = (mbuf_debug & MCF_VERIFY);
1604         mcltrace = (mbuf_debug & MCF_TRACE);
1605         mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1606         mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1607
1608         /* Enable mbuf leak logging, with a lock to protect the tables */
1609
1610         mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1611         mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1612         mleak_lock_attr = lck_attr_alloc_init();
1613         lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1614
1615         mleak_activate();
1616
1617         /*
1618          * Allocate structure for per-CPU statistics that's aligned
1619          * on the CPU cache boundary; this code assumes that we never
1620          * uninitialize this framework, since the original address
1621          * before alignment is not saved.
1622          */
1623         ncpu = ml_get_max_cpus();
1624         MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1625             M_TEMP, M_WAITOK);
1626         VERIFY(buf != NULL);
1627
1628         mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1629             CPU_CACHE_LINE_SIZE);
1630         bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1631
1632         /* Calculate the number of pages assigned to the cluster pool */
1633         mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1634         MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1635             M_TEMP, M_WAITOK);
1636         VERIFY(mcl_paddr != NULL);
1637
1638         /* Register with the I/O Bus mapper */
1639         mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1640         bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1641
             /*
              * embutl is mbutl plus nmbclusters * MCLBYTES; the distance
              * between the two must be a multiple of MBIGCLBYTES.
              */
1642         embutl = (mbutl + (nmbclusters * MCLBYTES));
1643         VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1644
1645         /* Prime up the freelist */
1646         PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1647         if (initmcl != 0) {
1648                 initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
1649                 if (initmcl > m_maxlimit(MC_BIGCL))
1650                         initmcl = m_maxlimit(MC_BIGCL);
1651         }
             /* Never prime fewer than the class minimum */
1652         if (initmcl < m_minlimit(MC_BIGCL))
1653                 initmcl = m_minlimit(MC_BIGCL);
1654
1655         lck_mtx_lock(mbuf_mlock);
1656
1657         /*
1658          * For classes with non-zero minimum limits, populate their freelists
1659          * so that m_total(class) is at least m_minlimit(class).
1660          */
1661         VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1662         freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1663         VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1664         freelist_init(m_class(MC_CL));
1665
1666         for (m = 0; m < NELEM(mbuf_table); m++) {
1667                 /* Make sure we didn't miss any */
1668                 VERIFY(m_minlimit(m_class(m)) == 0 ||
1669                     m_total(m_class(m)) >= m_minlimit(m_class(m)));
1670
1671                 /* populate the initial sizes and report from there on */
1672                 m_peak(m_class(m)) = m_total(m_class(m));
1673         }
1674         mb_peak_newreport = FALSE;
1675
1676         lck_mtx_unlock(mbuf_mlock);
1677
             /* Start a kernel thread running mbuf_worker_thread_init() */
1678         (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1679             NULL, &thread);
1680         thread_deallocate(thread);
1681
             /* Cache of struct ext_ref objects */
1682         ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1683             0, 0, MCR_SLEEP);
1684
1685         /* Create the cache for each class */
1686         for (m = 0; m < NELEM(mbuf_table); m++) {
1687                 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1688                 u_int32_t flags;
1689
1690                 flags = mbuf_debug;
                     /*
                      * The three composite classes use the mbuf_cslab_*
                      * callbacks; every other class uses mbuf_slab_*.
                      */
1691                 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1692                     m_class(m) == MC_MBUF_16KCL) {
1693                         allocfunc = mbuf_cslab_alloc;
1694                         freefunc = mbuf_cslab_free;
1695                         auditfunc = mbuf_cslab_audit;
1696                         logfunc = mleak_logger;
1697                 } else {
1698                         allocfunc = mbuf_slab_alloc;
1699                         freefunc = mbuf_slab_free;
1700                         auditfunc = mbuf_slab_audit;
1701                         logfunc = mleak_logger;
1702                 }
1703
1704                 /*
1705                  * Disable per-CPU caches for jumbo classes if there
1706                  * is no jumbo cluster pool available in the system.
1707                  * The cache itself is still created (but will never
1708                  * be populated) since it simplifies the code.
1709                  */
1710                 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1711                     njcl == 0)
1712                         flags |= MCF_NOCPUCACHE;
1713
1714                 if (!mclfindleak)
1715                         flags |= MCF_NOLEAKLOG;
1716
1717                 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1718                     allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1719                     (void *)(uintptr_t)m, flags, MCR_SLEEP);
1720         }
1721
1722         /*
1723          * Set the max limit on sb_max to be 1/16 th of the size of
1724          * memory allocated for mbuf clusters.
1725          */
1726         high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1727         if (high_sb_max < sb_max) {
1728                 /* sb_max is too large for this configuration, scale it down */
1729                 if (high_sb_max > (1 << MBSHIFT)) {
1730                         /* We have at least 16 MB of mbuf pool */
1731                         sb_max = high_sb_max;
1732                 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1733                         /*
1734                          * If we have more than 1M of mbufpool, cap the size of
1735                          * max sock buf at 1M
1736                          */
1737                         sb_max = high_sb_max = (1 << MBSHIFT);
1738                 } else {
1739                         sb_max = high_sb_max;
1740                 }
1741         }
1742
1743         /* allocate space for mbuf_dump_buf */
1744         MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1745         VERIFY(mbuf_dump_buf != NULL);
1746
1747         if (mbuf_debug & MCF_DEBUG) {
1748                 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1749                     (int)_MLEN, (int)_MHLEN);
1750         }
1751
             /* Report the total pool size and its (nclusters/njcl) split, in MB */
1752         printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1753             (nmbclusters << MCLSHIFT) >> MBSHIFT,
1754             (nclusters << MCLSHIFT) >> MBSHIFT,
1755             (njcl << MCLSHIFT) >> MBSHIFT);
1756
1757         /* initialize lock for tx completion callback table */
1758         mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
1759         if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
1760                 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1761                 /* NOTREACHED */
1762         }
1763         mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
1764             mbuf_tx_compl_tbl_lck_grp_attr);
1765         if (mbuf_tx_compl_tbl_lck_grp == NULL) {
1766                 panic("%s: lck_grp_alloc_init failed", __func__);
1767                 /* NOTREACHED */
1768         }
1769         mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
1770         if (mbuf_tx_compl_tbl_lck_attr == NULL) {
1771                 panic("%s: lck_attr_alloc_init failed", __func__);
1772                 /* NOTREACHED */
1773         }
1774         lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
1775             mbuf_tx_compl_tbl_lck_attr);
1776
1777 }
1778
1779 /*
1780  * Obtain a slab of object(s) from the class's freelist.
      *
      * The caller must hold mbuf_mlock.  The object at the head of the chosen
      * slab is unlinked, the slab's reference count is incremented and the
      * class's free count is decremented; for MC_CL and MC_BIGCL the matching
      * mbstat free counter is refreshed as well.  Once the slab has no free
      * object left it is removed from the class's slab list.
      *
      * For MC_MBUF, MC_CL and MC_BIGCL, MCR_COMP in `wait' selects the last
      * slab on the list instead of the first.
      *
      * Returns the object, or NULL when the class's slab list is empty.
1781  */
1782 static mcache_obj_t *
1783 slab_alloc(mbuf_class_t class, int wait)
1784 {
1785         mcl_slab_t *sp;
1786         mcache_obj_t *buf;
1787
1788         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1789
1790         /* This should always be NULL for us */
1791         VERIFY(m_cobjlist(class) == NULL);
1792
1793         /*
1794          * Treat composite objects as having longer lifespan by using
1795          * a slab from the reverse direction, in hoping that this could
1796          * reduce the probability of fragmentation for slabs that hold
1797          * more than one buffer chunks (e.g. mbuf slabs).  For other
1798          * slabs, this probably doesn't make much of a difference.
1799          */
1800         if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1801             && (wait & MCR_COMP))
1802                 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1803         else
1804                 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1805
1806         if (sp == NULL) {
1807                 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1808                 /* The slab list for this class is empty */
1809                 return (NULL);
1810         }
1811
1812         VERIFY(m_infree(class) > 0);
1813         VERIFY(!slab_is_detached(sp));
1814         VERIFY(sp->sl_class == class &&
1815             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
             /* Unlink the object at the head of the slab's free chain */
1816         buf = sp->sl_head;
1817         VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1818         sp->sl_head = buf->obj_next;
1819         /* Increment slab reference */
1820         sp->sl_refcnt++;
1821
             /* An exhausted chain means every chunk of the slab is now in use */
1822         VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1823
1824         if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1825                 slab_nextptr_panic(sp, sp->sl_head);
1826                 /* In case sl_head is in the map but not in the slab */
1827                 VERIFY(slab_inrange(sp, sp->sl_head));
1828                 /* NOTREACHED */
1829         }
1830
1831         if (mclaudit != NULL) {
1832                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1833                 mca->mca_uflags = 0;
1834                 /* Save contents on mbuf objects only */
1835                 if (class == MC_MBUF)
1836                         mca->mca_uflags |= MB_SCVALID;
1837         }
1838
1839         if (class == MC_CL) {
1840                 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1841                 /*
1842                  * A 2K cluster slab can have at most NCLPG references.
1843                  */
1844                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1845                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1846                 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1847         } else if (class == MC_BIGCL) {
1848                 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1849                     m_infree(MC_MBUF_BIGCL);
1850                 /*
1851                  * A 4K cluster slab can have NBCLPG references.
1852                  */
1853                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1854                     sp->sl_len == PAGE_SIZE &&
1855                     (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1856         } else if (class == MC_16KCL) {
1857                 mcl_slab_t *nsp;
1858                 int k;
1859
1860                 --m_infree(MC_16KCL);
1861                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1862                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1863                 /*
1864                  * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1865                  * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1866                  * most 1 reference.
1867                  */
1868                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1869                         nsp = nsp->sl_next;
1870                         /* Next slab must already be present */
1871                         VERIFY(nsp != NULL);
1872                         nsp->sl_refcnt++;
1873                         VERIFY(!slab_is_detached(nsp));
1874                         VERIFY(nsp->sl_class == MC_16KCL &&
1875                             nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1876                             nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1877                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1878                             nsp->sl_head == NULL);
1879                 }
1880         } else {
1881                 VERIFY(class == MC_MBUF);
1882                 --m_infree(MC_MBUF);
1883                 /*
1884                  * If auditing is turned on, this check is
1885                  * deferred until later in mbuf_slab_audit().
1886                  */
1887                 if (mclaudit == NULL)
1888                         _MCHECK((struct mbuf *)buf);
1889                 /*
1890                  * Since we have incremented the reference count above,
1891                  * an mbuf slab (formerly a 4KB cluster slab that was cut
1892                  * up into mbufs) must have a reference count between 1
1893                  * and NMBPG at this point.
1894                  */
1895                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1896                     sp->sl_chunks == NMBPG &&
1897                     sp->sl_len == PAGE_SIZE);
1898                 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1899         }
1900
1901         /* If empty, remove this slab from the class's freelist */
1902         if (sp->sl_head == NULL) {
1903                 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1904                 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1905                 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1906                 slab_remove(sp, class);
1907         }
1908
1909         return (buf);
1910 }
1911
1912 /*
1913  * Place a slab of object(s) back into a class's slab list.
1914  */
1915 static void
1916 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1917 {
1918         mcl_slab_t *sp;
1919         boolean_t reinit_supercl = false;
1920         mbuf_class_t super_class;
1921
1922         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1923
1924         VERIFY(class != MC_16KCL || njcl > 0);
1925         VERIFY(buf->obj_next == NULL);
1926
1927         /*
1928          * Synchronizing with m_clalloc, as it reads m_total, while we here
1929          * are modifying m_total.
1930          */
1931         while (mb_clalloc_busy) {
1932                 mb_clalloc_waiters++;
1933                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1934                     (PZERO-1), "m_clalloc", NULL);
1935                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1936         }
1937
1938         /* We are busy now; tell everyone else to go away */
1939         mb_clalloc_busy = TRUE;
1940
1941         sp = slab_get(buf);
1942         VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1943             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1944
1945         /* Decrement slab reference */
1946         sp->sl_refcnt--;
1947
1948         if (class == MC_CL) {
1949                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1950                 /*
1951                  * A slab that has been split for 2KB clusters can have
1952                  * at most NCLPG - 1 outstanding references at this point.
1953                  */
1954                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1955                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1956                 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1957                     (slab_is_detached(sp) && sp->sl_head == NULL));
1958         } else if (class == MC_BIGCL) {
1959                 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1960
1961                 /* A 4KB cluster slab can have NBCLPG references at most */
1962                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1963                 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1964                     (slab_is_detached(sp) && sp->sl_head == NULL));
1965         } else if (class == MC_16KCL) {
1966                 mcl_slab_t *nsp;
1967                 int k;
1968                 /*
1969                  * A 16KB cluster takes NSLABSP16KB slabs, all must
1970                  * now have 0 reference.
1971                  */
1972                 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1973                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1974                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1975                 VERIFY(slab_is_detached(sp));
1976                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1977                         nsp = nsp->sl_next;
1978                         /* Next slab must already be present */
1979                         VERIFY(nsp != NULL);
1980                         nsp->sl_refcnt--;
1981                         VERIFY(slab_is_detached(nsp));
1982                         VERIFY(nsp->sl_class == MC_16KCL &&
1983                             (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1984                             nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1985                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1986                             nsp->sl_head == NULL);
1987                 }
1988         } else {
1989                 /*
1990                  * A slab that has been split for mbufs has at most
1991                  * NMBPG reference counts.  Since we have decremented
1992                  * one reference above, it must now be between 0 and
1993                  * NMBPG-1.
1994                  */
1995                 VERIFY(class == MC_MBUF);
1996                 VERIFY(sp->sl_refcnt >= 0 &&
1997                     sp->sl_refcnt <= (NMBPG - 1) &&
1998                     sp->sl_chunks == NMBPG &&
1999                     sp->sl_len == PAGE_SIZE);
2000                 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2001                     (slab_is_detached(sp) && sp->sl_head == NULL));
2002         }
2003
2004         /*
2005          * When auditing is enabled, ensure that the buffer still
2006          * contains the free pattern.  Otherwise it got corrupted
2007          * while at the CPU cache layer.
2008          */
2009         if (mclaudit != NULL) {
2010                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2011                 if (mclverify) {
2012                         mcache_audit_free_verify(mca, buf, 0,
2013                             m_maxsize(class));
2014                 }
2015                 mca->mca_uflags &= ~MB_SCVALID;
2016         }
2017
2018         if (class == MC_CL) {
2019                 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2020                 buf->obj_next = sp->sl_head;
2021         } else if (class == MC_BIGCL) {
2022                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2023                     m_infree(MC_MBUF_BIGCL);
2024                 buf->obj_next = sp->sl_head;
2025         } else if (class == MC_16KCL) {
2026                 ++m_infree(MC_16KCL);
2027         } else {
2028                 ++m_infree(MC_MBUF);
2029                 buf->obj_next = sp->sl_head;
2030         }
2031         sp->sl_head = buf;
2032
2033         /*
2034          * If a slab has been split to either one which holds 2KB clusters,
2035          * or one which holds mbufs, turn it back to one which holds a
2036          * 4 or 16 KB cluster depending on the page size.
2037          */
2038         if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2039                 super_class = MC_BIGCL;
2040         } else {
2041                 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2042                 super_class = MC_16KCL;
2043         }
2044         if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2045             m_total(class) >= (m_minlimit(class) + NMBPG) &&
2046             m_total(super_class) < m_maxlimit(super_class)) {
2047                 int i = NMBPG;
2048
2049                 m_total(MC_MBUF) -= NMBPG;
2050                 mbstat.m_mbufs = m_total(MC_MBUF);
2051                 m_infree(MC_MBUF) -= NMBPG;
2052                 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2053
2054                 while (i--) {
2055                         struct mbuf *m = sp->sl_head;
2056                         VERIFY(m != NULL);
2057                         sp->sl_head = m->m_next;
2058                         m->m_next = NULL;
2059                 }
2060                 reinit_supercl = true;
2061         } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2062             m_total(class) >=  (m_minlimit(class) + NCLPG) &&
2063             m_total(super_class) < m_maxlimit(super_class)) {
2064                 int i = NCLPG;
2065
2066                 m_total(MC_CL) -= NCLPG;
2067                 mbstat.m_clusters = m_total(MC_CL);
2068                 m_infree(MC_CL) -= NCLPG;
2069
2070                 while (i--) {
2071                         union mcluster *c = sp->sl_head;
2072                         VERIFY(c != NULL);
2073                         sp->sl_head = c->mcl_next;
2074                         c->mcl_next = NULL;
2075                 }
2076                 reinit_supercl = true;
2077         } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2078             sp->sl_refcnt == 0 &&
2079             m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2080             m_total(super_class) < m_maxlimit(super_class)) {
2081                 int i = NBCLPG;
2082
2083                 VERIFY(super_class == MC_16KCL);
2084                 m_total(MC_BIGCL) -= NBCLPG;
2085                 mbstat.m_bigclusters = m_total(MC_BIGCL);
2086                 m_infree(MC_BIGCL) -= NBCLPG;
2087
2088                 while (i--) {
2089                         union mbigcluster *bc = sp->sl_head;
2090                         VERIFY(bc != NULL);
2091                         sp->sl_head = bc->mbc_next;
2092                         bc->mbc_next = NULL;
2093                 }
2094                 reinit_supercl = true;
2095         }
2096
2097         if (reinit_supercl) {
2098                 VERIFY(sp->sl_head == NULL);
2099                 VERIFY(m_total(class) >= m_minlimit(class));
2100                 slab_remove(sp, class);
2101
2102                 /* Reinitialize it as a cluster for the super class */
2103                 m_total(super_class)++;
2104                 m_infree(super_class)++;
2105                 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2106                     sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2107
2108                 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2109                     sp->sl_base, PAGE_SIZE, 0, 1);
2110                 if (mclverify)
2111                         mcache_set_pattern(MCACHE_FREE_PATTERN,
2112                             (caddr_t)sp->sl_base, sp->sl_len);
2113                 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2114
2115                 if (super_class == MC_BIGCL) {
2116                         mbstat.m_bigclusters = m_total(MC_BIGCL);
2117                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2118                             m_infree(MC_MBUF_BIGCL);
2119                 }
2120
2121                 VERIFY(slab_is_detached(sp));
2122                 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2123
2124                 /* And finally switch class */
2125                 class = super_class;
2126         }
2127
2128         /* Reinsert the slab to the class's slab list */
2129         if (slab_is_detached(sp))
2130                 slab_insert(sp, class);
2131
2132         /* We're done; let others enter */
2133         mb_clalloc_busy = FALSE;
2134         if (mb_clalloc_waiters > 0) {
2135                 mb_clalloc_waiters = 0;
2136                 wakeup(mb_clalloc_waitchan);
2137         }
2138 }
2139
2140 /*
2141  * Common allocator for rudimentary objects called by the CPU cache layer
2142  * during an allocation request whenever there is no available element in the
2143  * bucket layer.  It returns one or more elements from the appropriate global
2144  * freelist.  If the freelist is empty, it will attempt to populate it and
2145  * retry the allocation.
2146  */
2147 static unsigned int
2148 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2149 {
2150         mbuf_class_t class = (mbuf_class_t)arg;
2151         unsigned int need = num;
2152         mcache_obj_t **list = *plist;
2153
2154         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2155         ASSERT(need > 0);
2156
2157         lck_mtx_lock(mbuf_mlock);
2158
2159         for (;;) {
2160                 if ((*list = slab_alloc(class, wait)) != NULL) {
2161                         (*list)->obj_next = NULL;
2162                         list = *plist = &(*list)->obj_next;
2163
2164                         if (--need == 0) {
2165                                 /*
2166                                  * If the number of elements in freelist has
2167                                  * dropped below low watermark, asynchronously
2168                                  * populate the freelist now rather than doing
2169                                  * it later when we run out of elements.
2170                                  */
2171                                 if (!mbuf_cached_above(class, wait) &&
2172                                     m_infree(class) < (m_total(class) >> 5)) {
2173                                         (void) freelist_populate(class, 1,
2174                                             M_DONTWAIT);
2175                                 }
2176                                 break;
2177                         }
2178                 } else {
2179                         VERIFY(m_infree(class) == 0 || class == MC_CL);
2180
2181                         (void) freelist_populate(class, 1,
2182                             (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2183
2184                         if (m_infree(class) > 0)
2185                                 continue;
2186
2187                         /* Check if there's anything at the cache layer */
2188                         if (mbuf_cached_above(class, wait))
2189                                 break;
2190
2191                         /* watchdog checkpoint */
2192                         mbuf_watchdog();
2193
2194                         /* We have nothing and cannot block; give up */
2195                         if (wait & MCR_NOSLEEP) {
2196                                 if (!(wait & MCR_TRYHARD)) {
2197                                         m_fail_cnt(class)++;
2198                                         mbstat.m_drops++;
2199                                         break;
2200                                 }
2201                         }
2202
2203                         /*
2204                          * If the freelist is still empty and the caller is
2205                          * willing to be blocked, sleep on the wait channel
2206                          * until an element is available.  Otherwise, if
2207                          * MCR_TRYHARD is set, do our best to satisfy the
2208                          * request without having to go to sleep.
2209                          */
2210                         if (mbuf_worker_ready &&
2211                             mbuf_sleep(class, need, wait))
2212                                 break;
2213
2214                         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2215                 }
2216         }
2217
2218         m_alloc_cnt(class) += num - need;
2219         lck_mtx_unlock(mbuf_mlock);
2220
2221         return (num - need);
2222 }
2223
2224 /*
2225  * Common de-allocator for rudimentary objects called by the CPU cache
2226  * layer when one or more elements need to be returned to the appropriate
2227  * global freelist.
2228  */
2229 static void
2230 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2231 {
2232         mbuf_class_t class = (mbuf_class_t)arg;
2233         mcache_obj_t *nlist;
2234         unsigned int num = 0;
2235         int w;
2236
2237         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2238
2239         lck_mtx_lock(mbuf_mlock);
2240
2241         for (;;) {
2242                 nlist = list->obj_next;
2243                 list->obj_next = NULL;
2244                 slab_free(class, list);
2245                 ++num;
2246                 if ((list = nlist) == NULL)
2247                         break;
2248         }
2249         m_free_cnt(class) += num;
2250
2251         if ((w = mb_waiters) > 0)
2252                 mb_waiters = 0;
2253
2254         lck_mtx_unlock(mbuf_mlock);
2255
2256         if (w != 0)
2257                 wakeup(mb_waitchan);
2258 }
2259
2260 /*
2261  * Common auditor for rudimentary objects called by the CPU cache layer
2262  * during an allocation or free request.  For the former, this is called
2263  * after the objects are obtained from either the bucket or slab layer
2264  * and before they are returned to the caller.  For the latter, this is
2265  * called immediately during free and before placing the objects into
2266  * the bucket or slab layer.
2267  */
2268 static void
2269 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2270 {
2271         mbuf_class_t class = (mbuf_class_t)arg;
2272         mcache_audit_t *mca;
2273
2274         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2275
2276         while (list != NULL) {
2277                 lck_mtx_lock(mbuf_mlock);
2278                 mca = mcl_audit_buf2mca(class, list);
2279
2280                 /* Do the sanity checks */
2281                 if (class == MC_MBUF) {
2282                         mcl_audit_mbuf(mca, list, FALSE, alloc);
2283                         ASSERT(mca->mca_uflags & MB_SCVALID);
2284                 } else {
2285                         mcl_audit_cluster(mca, list, m_maxsize(class),
2286                             alloc, TRUE);
2287                         ASSERT(!(mca->mca_uflags & MB_SCVALID));
2288                 }
2289                 /* Record this transaction */
2290                 if (mcltrace)
2291                         mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2292
2293                 if (alloc)
2294                         mca->mca_uflags |= MB_INUSE;
2295                 else
2296                         mca->mca_uflags &= ~MB_INUSE;
2297                 /* Unpair the object (unconditionally) */
2298                 mca->mca_uptr = NULL;
2299                 lck_mtx_unlock(mbuf_mlock);
2300
2301                 list = list->obj_next;
2302         }
2303 }
2304
2305 /*
2306  * Common notify routine for all caches.  It is called by mcache when
2307  * one or more objects get freed.  We use this indication to trigger
2308  * the wakeup of any sleeping threads so that they can retry their
2309  * allocation requests.
2310  */
2311 static void
2312 mbuf_slab_notify(void *arg, u_int32_t reason)
2313 {
2314         mbuf_class_t class = (mbuf_class_t)arg;
2315         int w;
2316
2317         ASSERT(MBUF_CLASS_VALID(class));
2318
2319         if (reason != MCN_RETRYALLOC)
2320                 return;
2321
2322         lck_mtx_lock(mbuf_mlock);
2323         if ((w = mb_waiters) > 0) {
2324                 m_notified(class)++;
2325                 mb_waiters = 0;
2326         }
2327         lck_mtx_unlock(mbuf_mlock);
2328
2329         if (w != 0)
2330                 wakeup(mb_waitchan);
2331 }
2332
2333 /*
2334  * Obtain object(s) from the composite class's freelist.
2335  */
2336 static unsigned int
2337 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2338 {
2339         unsigned int need = num;
2340         mcl_slab_t *sp, *clsp, *nsp;
2341         struct mbuf *m;
2342         mcache_obj_t **list = *plist;
2343         void *cl;
2344
2345         VERIFY(need > 0);
2346         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2347         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2348
2349         /* Get what we can from the freelist */
2350         while ((*list = m_cobjlist(class)) != NULL) {
2351                 MRANGE(*list);
2352
2353                 m = (struct mbuf *)*list;
2354                 sp = slab_get(m);
2355                 cl = m->m_ext.ext_buf;
2356                 clsp = slab_get(cl);
2357                 VERIFY(m->m_flags == M_EXT && cl != NULL);
2358                 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2359
2360                 if (class == MC_MBUF_CL) {
2361                         VERIFY(clsp->sl_refcnt >= 1 &&
2362                             clsp->sl_refcnt <= NCLPG);
2363                 } else {
2364                         VERIFY(clsp->sl_refcnt >= 1 &&
2365                             clsp->sl_refcnt <= NBCLPG);
2366                 }
2367
2368                 if (class == MC_MBUF_16KCL) {
2369                         int k;
2370                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2371                                 nsp = nsp->sl_next;
2372                                 /* Next slab must already be present */
2373                                 VERIFY(nsp != NULL);
2374                                 VERIFY(nsp->sl_refcnt == 1);
2375                         }
2376                 }
2377
2378                 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2379                     !MBUF_IN_MAP(m_cobjlist(class))) {
2380                         slab_nextptr_panic(sp, m_cobjlist(class));
2381                         /* NOTREACHED */
2382                 }
2383                 (*list)->obj_next = NULL;
2384                 list = *plist = &(*list)->obj_next;
2385
2386                 if (--need == 0)
2387                         break;
2388         }
2389         m_infree(class) -= (num - need);
2390
2391         return (num - need);
2392 }
2393
2394 /*
2395  * Place object(s) back into a composite class's freelist.
2396  */
2397 static unsigned int
2398 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2399 {
2400         mcache_obj_t *o, *tail;
2401         unsigned int num = 0;
2402         struct mbuf *m, *ms;
2403         mcache_audit_t *mca = NULL;
2404         mcache_obj_t *ref_list = NULL;
2405         mcl_slab_t *clsp, *nsp;
2406         void *cl;
2407         mbuf_class_t cl_class;
2408
2409         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2410         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2411         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2412
2413         if (class == MC_MBUF_CL) {
2414                 cl_class = MC_CL;
2415         } else if (class == MC_MBUF_BIGCL) {
2416                 cl_class = MC_BIGCL;
2417         } else {
2418                 VERIFY(class == MC_MBUF_16KCL);
2419                 cl_class = MC_16KCL;
2420         }
2421
2422         o = tail = list;
2423
2424         while ((m = ms = (struct mbuf *)o) != NULL) {
2425                 mcache_obj_t *rfa, *nexto = o->obj_next;
2426
2427                 /* Do the mbuf sanity checks */
2428                 if (mclaudit != NULL) {
2429                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2430                         if (mclverify) {
2431                                 mcache_audit_free_verify(mca, m, 0,
2432                                     m_maxsize(MC_MBUF));
2433                         }
2434                         ms = MCA_SAVED_MBUF_PTR(mca);
2435                 }
2436
2437                 /* Do the cluster sanity checks */
2438                 cl = ms->m_ext.ext_buf;
2439                 clsp = slab_get(cl);
2440                 if (mclverify) {
2441                         size_t size = m_maxsize(cl_class);
2442                         mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2443                             (mcache_obj_t *)cl), cl, 0, size);
2444                 }
2445                 VERIFY(ms->m_type == MT_FREE);
2446                 VERIFY(ms->m_flags == M_EXT);
2447                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2448                 if (cl_class == MC_CL) {
2449                         VERIFY(clsp->sl_refcnt >= 1 &&
2450                             clsp->sl_refcnt <= NCLPG);
2451                 } else {
2452                         VERIFY(clsp->sl_refcnt >= 1 &&
2453                             clsp->sl_refcnt <= NBCLPG);
2454                 }
2455                 if (cl_class == MC_16KCL) {
2456                         int k;
2457                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2458                                 nsp = nsp->sl_next;
2459                                 /* Next slab must already be present */
2460                                 VERIFY(nsp != NULL);
2461                                 VERIFY(nsp->sl_refcnt == 1);
2462                         }
2463                 }
2464
2465                 /*
2466                  * If we're asked to purge, restore the actual mbuf using
2467                  * contents of the shadow structure (if auditing is enabled)
2468                  * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2469                  * about to free it and the attached cluster into their caches.
2470                  */
2471                 if (purged) {
2472                         /* Restore constructed mbuf fields */
2473                         if (mclaudit != NULL)
2474                                 mcl_audit_restore_mbuf(m, mca, TRUE);
2475
2476                         MEXT_MINREF(m) = 0;
2477                         MEXT_REF(m) = 0;
2478                         MEXT_PREF(m) = 0;
2479                         MEXT_FLAGS(m) = 0;
2480                         MEXT_PRIV(m) = 0;
2481                         MEXT_PMBUF(m) = NULL;
2482                         MEXT_TOKEN(m) = 0;
2483
2484                         rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
2485                         m_set_ext(m, NULL, NULL, NULL);
2486                         rfa->obj_next = ref_list;
2487                         ref_list = rfa;
2488
2489                         m->m_type = MT_FREE;
2490                         m->m_flags = m->m_len = 0;
2491                         m->m_next = m->m_nextpkt = NULL;
2492
2493                         /* Save mbuf fields and make auditing happy */
2494                         if (mclaudit != NULL)
2495                                 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2496
2497                         VERIFY(m_total(class) > 0);
2498                         m_total(class)--;
2499
2500                         /* Free the mbuf */
2501                         o->obj_next = NULL;
2502                         slab_free(MC_MBUF, o);
2503
2504                         /* And free the cluster */
2505                         ((mcache_obj_t *)cl)->obj_next = NULL;
2506                         if (class == MC_MBUF_CL)
2507                                 slab_free(MC_CL, cl);
2508                         else if (class == MC_MBUF_BIGCL)
2509                                 slab_free(MC_BIGCL, cl);
2510                         else
2511                                 slab_free(MC_16KCL, cl);
2512                 }
2513
2514                 ++num;
2515                 tail = o;
2516                 o = nexto;
2517         }
2518
2519         if (!purged) {
2520                 tail->obj_next = m_cobjlist(class);
2521                 m_cobjlist(class) = list;
2522                 m_infree(class) += num;
2523         } else if (ref_list != NULL) {
2524                 mcache_free_ext(ref_cache, ref_list);
2525         }
2526
2527         return (num);
2528 }
2529
2530 /*
2531  * Common allocator for composite objects called by the CPU cache layer
2532  * during an allocation request whenever there is no available element in
2533  * the bucket layer.  It returns one or more composite elements from the
2534  * appropriate global freelist.  If the freelist is empty, it will attempt
2535  * to obtain the rudimentary objects from their caches and construct them
2536  * into composite mbuf + cluster objects.
2537  */
2538 static unsigned int
2539 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2540     int wait)
2541 {
2542         mbuf_class_t class = (mbuf_class_t)arg;
2543         mbuf_class_t cl_class = 0;
2544         unsigned int num = 0, cnum = 0, want = needed;
2545         mcache_obj_t *ref_list = NULL;
2546         mcache_obj_t *mp_list = NULL;
2547         mcache_obj_t *clp_list = NULL;
2548         mcache_obj_t **list;
2549         struct ext_ref *rfa;
2550         struct mbuf *m;
2551         void *cl;
2552
2553         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2554         ASSERT(needed > 0);
2555
2556         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2557
2558         /* There should not be any slab for this class */
2559         VERIFY(m_slab_cnt(class) == 0 &&
2560             m_slablist(class).tqh_first == NULL &&
2561             m_slablist(class).tqh_last == NULL);
2562
2563         lck_mtx_lock(mbuf_mlock);
2564
2565         /* Try using the freelist first */
2566         num = cslab_alloc(class, plist, needed);
2567         list = *plist;
2568         if (num == needed) {
2569                 m_alloc_cnt(class) += num;
2570                 lck_mtx_unlock(mbuf_mlock);
2571                 return (needed);
2572         }
2573
2574         lck_mtx_unlock(mbuf_mlock);
2575
2576         /*
2577          * We could not satisfy the request using the freelist alone;
2578          * allocate from the appropriate rudimentary caches and use
2579          * whatever we can get to construct the composite objects.
2580          */
2581         needed -= num;
2582
2583         /*
2584          * Mark these allocation requests as coming from a composite cache.
2585          * Also, if the caller is willing to be blocked, mark the request
2586          * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2587          * slab layer waiting for the individual object when one or more
2588          * of the already-constructed composite objects are available.
2589          */
2590         wait |= MCR_COMP;
2591         if (!(wait & MCR_NOSLEEP))
2592                 wait |= MCR_FAILOK;
2593
2594         /* allocate mbufs */
2595         needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2596         if (needed == 0) {
2597                 ASSERT(mp_list == NULL);
2598                 goto fail;
2599         }
2600
2601         /* allocate clusters */
2602         if (class == MC_MBUF_CL) {
2603                 cl_class = MC_CL;
2604         } else if (class == MC_MBUF_BIGCL) {
2605                 cl_class = MC_BIGCL;
2606         } else {
2607                 VERIFY(class == MC_MBUF_16KCL);
2608                 cl_class = MC_16KCL;
2609         }
2610         needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2611         if (needed == 0) {
2612                 ASSERT(clp_list == NULL);
2613                 goto fail;
2614         }
2615
2616         needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2617         if (needed == 0) {
2618                 ASSERT(ref_list == NULL);
2619                 goto fail;
2620         }
2621
2622         /*
2623          * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2624          * overs will get freed accordingly before we return to caller.
2625          */
2626         for (cnum = 0; cnum < needed; cnum++) {
2627                 struct mbuf *ms;
2628
2629                 m = ms = (struct mbuf *)mp_list;
2630                 mp_list = mp_list->obj_next;
2631
2632                 cl = clp_list;
2633                 clp_list = clp_list->obj_next;
2634                 ((mcache_obj_t *)cl)->obj_next = NULL;
2635
2636                 rfa = (struct ext_ref *)ref_list;
2637                 ref_list = ref_list->obj_next;
2638                 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2639
2640                 /*
2641                  * If auditing is enabled, construct the shadow mbuf
2642                  * in the audit structure instead of in the actual one.
2643                  * mbuf_cslab_audit() will take care of restoring the
2644                  * contents after the integrity check.
2645                  */
2646                 if (mclaudit != NULL) {
2647                         mcache_audit_t *mca, *cl_mca;
2648
2649                         lck_mtx_lock(mbuf_mlock);
2650                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2651                         ms = MCA_SAVED_MBUF_PTR(mca);
2652                         cl_mca = mcl_audit_buf2mca(cl_class,
2653                             (mcache_obj_t *)cl);
2654
2655                         /*
2656                          * Pair them up.  Note that this is done at the time
2657                          * the mbuf+cluster objects are constructed.  This
2658                          * information should be treated as "best effort"
2659                          * debugging hint since more than one mbufs can refer
2660                          * to a cluster.  In that case, the cluster might not
2661                          * be freed along with the mbuf it was paired with.
2662                          */
2663                         mca->mca_uptr = cl_mca;
2664                         cl_mca->mca_uptr = mca;
2665
2666                         ASSERT(mca->mca_uflags & MB_SCVALID);
2667                         ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2668                         lck_mtx_unlock(mbuf_mlock);
2669
2670                         /* Technically, they are in the freelist */
2671                         if (mclverify) {
2672                                 size_t size;
2673
2674                                 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2675                                     m_maxsize(MC_MBUF));
2676
2677                                 if (class == MC_MBUF_CL)
2678                                         size = m_maxsize(MC_CL);
2679                                 else if (class == MC_MBUF_BIGCL)
2680                                         size = m_maxsize(MC_BIGCL);
2681                                 else
2682                                         size = m_maxsize(MC_16KCL);
2683
2684                                 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2685                                     size);
2686                         }
2687                 }
2688
2689                 MBUF_INIT(ms, 0, MT_FREE);
2690                 if (class == MC_MBUF_16KCL) {
2691                         MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2692                 } else if (class == MC_MBUF_BIGCL) {
2693                         MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2694                 } else {
2695                         MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2696                 }
2697                 VERIFY(ms->m_flags == M_EXT);
2698                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2699
2700                 *list = (mcache_obj_t *)m;
2701                 (*list)->obj_next = NULL;
2702                 list = *plist = &(*list)->obj_next;
2703         }
2704
2705 fail:
2706         /*
2707          * Free up what's left of the above.
2708          */
2709         if (mp_list != NULL)
2710                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2711         if (clp_list != NULL)
2712                 mcache_free_ext(m_cache(cl_class), clp_list);
2713         if (ref_list != NULL)
2714                 mcache_free_ext(ref_cache, ref_list);
2715
2716         lck_mtx_lock(mbuf_mlock);
2717         if (num > 0 || cnum > 0) {
2718                 m_total(class) += cnum;
2719                 VERIFY(m_total(class) <= m_maxlimit(class));
2720                 m_alloc_cnt(class) += num + cnum;
2721         }
2722         if ((num + cnum) < want)
2723                 m_fail_cnt(class) += (want - (num + cnum));
2724         lck_mtx_unlock(mbuf_mlock);
2725
2726         return (num + cnum);
2727 }
2728
2729 /*
2730  * Common de-allocator for composite objects called by the CPU cache
2731  * layer when one or more elements need to be returned to the appropriate
2732  * global freelist.
2733  */
2734 static void
2735 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2736 {
2737         mbuf_class_t class = (mbuf_class_t)arg;
2738         unsigned int num;
2739         int w;
2740
2741         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2742
2743         lck_mtx_lock(mbuf_mlock);
2744
2745         num = cslab_free(class, list, purged);
2746         m_free_cnt(class) += num;
2747
2748         if ((w = mb_waiters) > 0)
2749                 mb_waiters = 0;
2750
2751         lck_mtx_unlock(mbuf_mlock);
2752
2753         if (w != 0)
2754                 wakeup(mb_waitchan);
2755 }
2756
2757 /*
2758  * Common auditor for composite objects called by the CPU cache layer
2759  * during an allocation or free request.  For the former, this is called
2760  * after the objects are obtained from either the bucket or slab layer
2761  * and before they are returned to the caller.  For the latter, this is
2762  * called immediately during free and before placing the objects into
2763  * the bucket or slab layer.
2764  */
2765 static void
2766 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2767 {
2768         mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2769         mcache_audit_t *mca;
2770         struct mbuf *m, *ms;
2771         mcl_slab_t *clsp, *nsp;
2772         size_t cl_size;
2773         void *cl;
2774
2775         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2776         if (class == MC_MBUF_CL)
2777                 cl_class = MC_CL;
2778         else if (class == MC_MBUF_BIGCL)
2779                 cl_class = MC_BIGCL;
2780         else
2781                 cl_class = MC_16KCL;
2782         cl_size = m_maxsize(cl_class);
2783
2784         while ((m = ms = (struct mbuf *)list) != NULL) {
2785                 lck_mtx_lock(mbuf_mlock);
2786                 /* Do the mbuf sanity checks and record its transaction */
2787                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2788                 mcl_audit_mbuf(mca, m, TRUE, alloc);
2789                 if (mcltrace)
2790                         mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2791
2792                 if (alloc)
2793                         mca->mca_uflags |= MB_COMP_INUSE;
2794                 else
2795                         mca->mca_uflags &= ~MB_COMP_INUSE;
2796
2797                 /*
2798                  * Use the shadow mbuf in the audit structure if we are
2799                  * freeing, since the contents of the actual mbuf has been
2800                  * pattern-filled by the above call to mcl_audit_mbuf().
2801                  */
2802                 if (!alloc && mclverify)
2803                         ms = MCA_SAVED_MBUF_PTR(mca);
2804
2805                 /* Do the cluster sanity checks and record its transaction */
2806                 cl = ms->m_ext.ext_buf;
2807                 clsp = slab_get(cl);
2808                 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2809                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2810                 if (class == MC_MBUF_CL)
2811                         VERIFY(clsp->sl_refcnt >= 1 &&
2812                             clsp->sl_refcnt <= NCLPG);
2813                 else
2814                         VERIFY(clsp->sl_refcnt >= 1 &&
2815                             clsp->sl_refcnt <= NBCLPG);
2816
2817                 if (class == MC_MBUF_16KCL) {
2818                         int k;
2819                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2820                                 nsp = nsp->sl_next;
2821                                 /* Next slab must already be present */
2822                                 VERIFY(nsp != NULL);
2823                                 VERIFY(nsp->sl_refcnt == 1);
2824                         }
2825                 }
2826
2827
2828                 mca = mcl_audit_buf2mca(cl_class, cl);
2829                 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2830                 if (mcltrace)
2831                         mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2832
2833                 if (alloc)
2834                         mca->mca_uflags |= MB_COMP_INUSE;
2835                 else
2836                         mca->mca_uflags &= ~MB_COMP_INUSE;
2837                 lck_mtx_unlock(mbuf_mlock);
2838
2839                 list = list->obj_next;
2840         }
2841 }
2842
2843 static void
2844 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2845                  uint64_t alloc_size, kern_return_t error)
2846 {
2847
2848         *cnt = *cnt + 1;
2849         *ts = net_uptime();
2850         if (size) {
2851                 *size = alloc_size;
2852         }
2853         _CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]) ==
2854             sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[0]));
2855         switch (error) {
2856         case KERN_SUCCESS:
2857                 break;
2858         case KERN_INVALID_ARGUMENT:
2859                 mb_kmem_stats[0]++;
2860                 break;
2861         case KERN_INVALID_ADDRESS:
2862                 mb_kmem_stats[1]++;
2863                 break;
2864         case KERN_RESOURCE_SHORTAGE:
2865                 mb_kmem_stats[2]++;
2866                 break;
2867         case KERN_NO_SPACE:
2868                 mb_kmem_stats[3]++;
2869                 break;
2870         case KERN_FAILURE:
2871                 mb_kmem_stats[4]++;
2872                 break;
2873         default:
2874                 mb_kmem_stats[5]++;
2875                 break;
2876         }
2877 }
2878
2879 /*
2880  * Allocate some number of mbuf clusters and place on cluster freelist.
2881  */
2882 static int
2883 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2884 {
2885         int i, count = 0;
2886         vm_size_t size = 0;
2887         int numpages = 0, large_buffer;
2888         vm_offset_t page = 0;
2889         mcache_audit_t *mca_list = NULL;
2890         mcache_obj_t *con_list = NULL;
2891         mcl_slab_t *sp;
2892         mbuf_class_t class;
2893         kern_return_t error;
2894
2895         /* Set if a buffer allocation needs allocation of multiple pages */
2896         large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2897                 PAGE_SIZE < M16KCLBYTES);
2898         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2899             bufsize == m_maxsize(MC_16KCL));
2900
2901         VERIFY((bufsize == PAGE_SIZE) ||
2902             (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2903
2904         if (bufsize == m_size(MC_BIGCL))
2905                 class = MC_BIGCL;
2906         else
2907                 class = MC_16KCL;
2908
2909         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2910
2911         /*
2912          * Multiple threads may attempt to populate the cluster map one
2913          * after another.  Since we drop the lock below prior to acquiring
2914          * the physical page(s), our view of the cluster map may no longer
2915          * be accurate, and we could end up over-committing the pages beyond
2916          * the maximum allowed for each class.  To prevent it, this entire
2917          * operation (including the page mapping) is serialized.
2918          */
2919         while (mb_clalloc_busy) {
2920                 mb_clalloc_waiters++;
2921                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2922                     (PZERO-1), "m_clalloc", NULL);
2923                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2924         }
2925
2926         /* We are busy now; tell everyone else to go away */
2927         mb_clalloc_busy = TRUE;
2928
2929         /*
2930          * Honor the caller's wish to block or not block.  We have a way
2931          * to grow the pool asynchronously using the mbuf worker thread.
2932          */
2933         i = m_howmany(num, bufsize);
2934         if (i <= 0 || (wait & M_DONTWAIT))
2935                 goto out;
2936
2937         lck_mtx_unlock(mbuf_mlock);
2938
2939         size = round_page(i * bufsize);
2940         page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2941
2942         /*
2943          * If we did ask for "n" 16KB physically contiguous chunks
2944          * and didn't get them, then please try again without this
2945          * restriction.
2946          */
2947         net_update_uptime();
2948         if (large_buffer && page == 0) {
2949                 m_vm_error_stats(&mb_kmem_contig_failed,
2950                     &mb_kmem_contig_failed_ts,
2951                     &mb_kmem_contig_failed_size,
2952                     size, error);
2953                 page = kmem_mb_alloc(mb_map, size, 0, &error);
2954         }
2955
2956         if (page == 0) {
2957                 m_vm_error_stats(&mb_kmem_failed,
2958                     &mb_kmem_failed_ts,
2959                     &mb_kmem_failed_size,
2960                     size, error);
2961 #if PAGE_SIZE == 4096
2962                 if (bufsize == m_maxsize(MC_BIGCL)) {
2963 #else
2964                 if (bufsize >= m_maxsize(MC_BIGCL)) {
2965 #endif
2966                         /* Try for 1 page if failed */
2967                         size = PAGE_SIZE;
2968                         page = kmem_mb_alloc(mb_map, size, 0, &error);
2969                         if (page == 0) {
2970                                 m_vm_error_stats(&mb_kmem_one_failed,
2971                                     &mb_kmem_one_failed_ts,
2972                                     NULL, size, error);
2973                         }
2974                 }
2975
2976                 if (page == 0) {
2977                         lck_mtx_lock(mbuf_mlock);
2978                         goto out;
2979                 }
2980         }
2981
2982         VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2983         numpages = size / PAGE_SIZE;
2984
2985         /* If auditing is enabled, allocate the audit structures now */
2986         if (mclaudit != NULL) {
2987                 int needed;
2988
2989                 /*
2990                  * Yes, I realize this is a waste of memory for clusters
2991                  * that never get transformed into mbufs, as we may end
2992                  * up with NMBPG-1 unused audit structures per cluster.
2993                  * But doing so tremendously simplifies the allocation
2994                  * strategy, since at this point we are not holding the
2995                  * mbuf lock and the caller is okay to be blocked.
2996                  */
2997                 if (bufsize == PAGE_SIZE) {
2998                         needed = numpages * NMBPG;
2999
3000                         i = mcache_alloc_ext(mcl_audit_con_cache,
3001                             &con_list, needed, MCR_SLEEP);
3002
3003                         VERIFY(con_list != NULL && i == needed);
3004                 } else {
3005                         /*
3006                          * if multiple 4K pages are being used for a
3007                          * 16K cluster
3008                          */
3009                         needed = numpages / NSLABSP16KB;
3010                 }
3011
3012                 i = mcache_alloc_ext(mcache_audit_cache,
3013                     (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3014
3015                 VERIFY(mca_list != NULL && i == needed);
3016         }
3017
3018         lck_mtx_lock(mbuf_mlock);
3019
3020         for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3021                 ppnum_t offset =
3022                     ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3023                 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3024
3025                 /*
3026                  * If there is a mapper the appropriate I/O page is
3027                  * returned; zero out the page to discard its past
3028                  * contents to prevent exposing leftover kernel memory.
3029                  */
3030                 VERIFY(offset < mcl_pages);
3031                 if (mcl_paddr_base != 0) {
3032                         bzero((void *)(uintptr_t) page, PAGE_SIZE);
3033                         new_page = IOMapperInsertPage(mcl_paddr_base,
3034                             offset, new_page);
3035                 }
3036                 mcl_paddr[offset] = new_page;
3037
3038                 /* Pattern-fill this fresh page */
3039                 if (mclverify) {
3040                         mcache_set_pattern(MCACHE_FREE_PATTERN,
3041                             (caddr_t)page, PAGE_SIZE);
3042                 }
3043                 if (bufsize == PAGE_SIZE) {
3044                         mcache_obj_t *buf;
3045                         /* One for the entire page */
3046                         sp = slab_get((void *)page);
3047                         if (mclaudit != NULL) {
3048                                 mcl_audit_init((void *)page,
3049                                     &mca_list, &con_list,
3050                                     AUDIT_CONTENTS_SIZE, NMBPG);
3051                         }
3052                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3053                         slab_init(sp, class, SLF_MAPPED, (void *)page,
3054                             (void *)page, PAGE_SIZE, 0, 1);
3055                         buf = (mcache_obj_t *)page;
3056                         buf->obj_next = NULL;
3057
3058                         /* Insert this slab */
3059                         slab_insert(sp, class);
3060
3061                         /* Update stats now since slab_get drops the lock */
3062                         ++m_infree(class);
3063                         ++m_total(class);
3064                         VERIFY(m_total(class) <= m_maxlimit(class));
3065                         if (class == MC_BIGCL) {
3066                                 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3067                                     m_infree(MC_MBUF_BIGCL);
3068                                 mbstat.m_bigclusters = m_total(MC_BIGCL);
3069                         }
3070                         ++count;
3071                 } else if ((bufsize > PAGE_SIZE) &&
3072                     (i % NSLABSP16KB) == 0) {
3073                         union m16kcluster *m16kcl = (union m16kcluster *)page;
3074                         mcl_slab_t *nsp;
3075                         int k;
3076
3077                         /* One for the entire 16KB */
3078                         sp = slab_get(m16kcl);
3079                         if (mclaudit != NULL)
3080                                 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3081
3082                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3083                         slab_init(sp, MC_16KCL, SLF_MAPPED,
3084                             m16kcl, m16kcl, bufsize, 0, 1);
3085                         m16kcl->m16kcl_next = NULL;
3086
3087                         /*
3088                          * 2nd-Nth page's slab is part of the first one,
3089                          * where N is NSLABSP16KB.
3090                          */
3091                         for (k = 1; k < NSLABSP16KB; k++) {
3092                                 nsp = slab_get(((union mbigcluster *)page) + k);
3093                                 VERIFY(nsp->sl_refcnt == 0 &&
3094                                     nsp->sl_flags == 0);
3095                                 slab_init(nsp, MC_16KCL,
3096                                     SLF_MAPPED | SLF_PARTIAL,
3097                                     m16kcl, NULL, 0, 0, 0);
3098                         }
3099                         /* Insert this slab */
3100                         slab_insert(sp, MC_16KCL);
3101
3102                         /* Update stats now since slab_get drops the lock */
3103                         ++m_infree(MC_16KCL);
3104                         ++m_total(MC_16KCL);
3105                         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3106                         ++count;
3107                 }
3108         }
3109         VERIFY(mca_list == NULL && con_list == NULL);
3110
3111         if (!mb_peak_newreport && mbuf_report_usage(class))
3112                 mb_peak_newreport = TRUE;
3113
3114         /* We're done; let others enter */
3115         mb_clalloc_busy = FALSE;
3116         if (mb_clalloc_waiters > 0) {
3117                 mb_clalloc_waiters = 0;
3118                 wakeup(mb_clalloc_waitchan);
3119         }
3120
3121         return (count);
3122 out:
3123         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3124
3125         mtracelarge_register(size);
3126
3127         /* We're done; let others enter */
3128         mb_clalloc_busy = FALSE;
3129         if (mb_clalloc_waiters > 0) {
3130                 mb_clalloc_waiters = 0;
3131                 wakeup(mb_clalloc_waitchan);
3132         }
3133
3134         /*
3135          * When non-blocking we kick a thread if we have to grow the
3136          * pool or if the number of free clusters is less than requested.
3137          */
3138         if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3139                 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3140                 mbuf_worker_needs_wakeup = FALSE;
3141         }
3142         if (class == MC_BIGCL) {
3143                 if (i > 0) {
3144                         /*
3145                          * Remember total number of 4KB clusters needed
3146                          * at this time.
3147                          */
3148                         i += m_total(MC_BIGCL);
3149                         if (i > m_region_expand(MC_BIGCL)) {
3150                                 m_region_expand(MC_BIGCL) = i;
3151                         }
3152                 }
3153                 if (m_infree(MC_BIGCL) >= num)
3154                         return (1);
3155         } else {
3156                 if (i > 0) {
3157                         /*
3158                          * Remember total number of 16KB clusters needed
3159                          * at this time.
3160                          */
3161                         i += m_total(MC_16KCL);
3162                         if (i > m_region_expand(MC_16KCL)) {
3163                                 m_region_expand(MC_16KCL) = i;
3164                         }
3165                 }
3166                 if (m_infree(MC_16KCL) >= num)
3167                         return (1);
3168         }
3169         return (0);
3170 }
3171
3172 /*
3173  * Populate the global freelist of the corresponding buffer class.
3174  */
3175 static int
3176 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
3177 {
3178         mcache_obj_t *o = NULL;
3179         int i, numpages = 0, count;
3180         mbuf_class_t super_class;
3181
3182         VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
3183             class == MC_16KCL);
3184
3185         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3186
3187         VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
3188             PAGE_SIZE == m_maxsize(MC_16KCL));
3189
3190         if (m_maxsize(class) >= PAGE_SIZE)
3191                 return(m_clalloc(num, wait, m_maxsize(class)) != 0);
3192
3193         /*
3194          * The rest of the function will allocate pages and will slice
3195          * them up into the right size
3196          */
3197
3198         numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
3199
3200         /* Currently assume that pages are 4K or 16K */
3201         if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3202                 super_class = MC_BIGCL;
3203         else
3204                 super_class = MC_16KCL;
3205
3206         i = m_clalloc(numpages, wait, m_maxsize(super_class));
3207
3208         /* how many objects will we cut the page into? */
3209         int numobj = PAGE_SIZE / m_maxsize(class);
3210
3211         for (count = 0; count < numpages; count++) {
3212                 /* respect totals, minlimit, maxlimit */
3213                 if (m_total(super_class) <= m_minlimit(super_class) ||
3214                     m_total(class) >= m_maxlimit(class))
3215                         break;
3216
3217                 if ((o = slab_alloc(super_class, wait)) == NULL)
3218                         break;
3219
3220                 struct mbuf *m = (struct mbuf *)o;
3221                 union mcluster *c = (union mcluster *)o;
3222                 union mbigcluster *mbc = (union mbigcluster *)o;
3223                 mcl_slab_t *sp = slab_get(o);
3224                 mcache_audit_t *mca = NULL;
3225
3226                 /*
3227                  * since one full page will be converted to MC_MBUF or
3228                  * MC_CL, verify that the reference count will match that
3229                  * assumption
3230                  */
3231                 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
3232                 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
3233                 /*
3234                  * Make sure that the cluster is unmolested
3235                  * while in freelist
3236                  */
3237                 if (mclverify) {
3238                         mca = mcl_audit_buf2mca(super_class,
3239                             (mcache_obj_t *)o);
3240                         mcache_audit_free_verify(mca,
3241                             (mcache_obj_t *)o, 0, m_maxsize(super_class));
3242                 }
3243
3244                 /* Reinitialize it as an mbuf or 2K or 4K slab */
3245                 slab_init(sp, class, sp->sl_flags,
3246                     sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
3247
3248                 VERIFY(sp->sl_head == NULL);
3249
3250                 VERIFY(m_total(super_class) >= 1);
3251                 m_total(super_class)--;
3252
3253                 if (super_class == MC_BIGCL)
3254                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3255
3256                 m_total(class) += numobj;
3257                 VERIFY(m_total(class) <= m_maxlimit(class));
3258                 m_infree(class) += numobj;
3259
3260                 if (!mb_peak_newreport && mbuf_report_usage(class))
3261                         mb_peak_newreport = TRUE;
3262
3263                 i = numobj;
3264                 if (class == MC_MBUF) {
3265                         mbstat.m_mbufs = m_total(MC_MBUF);
3266                         mtype_stat_add(MT_FREE, NMBPG);
3267                         while (i--) {
3268                                 /*
3269                                  * If auditing is enabled, construct the
3270                                  * shadow mbuf in the audit structure
3271                                  * instead of the actual one.
3272                                  * mbuf_slab_audit() will take care of
3273                                  * restoring the contents after the
3274                                  * integrity check.
3275                                  */
3276                                 if (mclaudit != NULL) {
3277                                         struct mbuf *ms;
3278                                         mca = mcl_audit_buf2mca(MC_MBUF,
3279                                             (mcache_obj_t *)m);
3280                                         ms = MCA_SAVED_MBUF_PTR(mca);
3281                                         ms->m_type = MT_FREE;
3282                                 } else {
3283                                         m->m_type = MT_FREE;
3284                                 }
3285                                 m->m_next = sp->sl_head;
3286                                 sp->sl_head = (void *)m++;
3287                         }
3288                 } else if (class == MC_CL) { /* MC_CL */
3289                         mbstat.m_clfree =
3290                             m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3291                         mbstat.m_clusters = m_total(MC_CL);
3292                         while (i--) {
3293                                 c->mcl_next = sp->sl_head;
3294                                 sp->sl_head = (void *)c++;
3295                         }
3296                 } else {
3297                         VERIFY(class == MC_BIGCL);
3298                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3299                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3300                             m_infree(MC_MBUF_BIGCL);
3301                         while (i--) {
3302                                 mbc->mbc_next = sp->sl_head;
3303                                 sp->sl_head = (void *)mbc++;
3304                         }
3305                 }
3306
3307                 /* Insert into the mbuf or 2k or 4k slab list */
3308                 slab_insert(sp, class);
3309
3310                 if ((i = mb_waiters) > 0)
3311                         mb_waiters = 0;
3312                 if (i != 0)
3313                         wakeup(mb_waitchan);
3314         }
3315         return (count != 0);
3316 }
3317
3318 /*
3319  * For each class, initialize the freelist to hold m_minlimit() objects.
3320  */
3321 static void
3322 freelist_init(mbuf_class_t class)
3323 {
3324         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3325
3326         VERIFY(class == MC_CL || class == MC_BIGCL);
3327         VERIFY(m_total(class) == 0);
3328         VERIFY(m_minlimit(class) > 0);
3329
3330         while (m_total(class) < m_minlimit(class))
3331                 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3332
3333         VERIFY(m_total(class) >= m_minlimit(class));
3334 }
3335
3336 /*
3337  * (Inaccurately) check if it might be worth a trip back to the
3338  * mcache layer due the availability of objects there.  We'll
3339  * end up back here if there's nothing up there.
3340  */
3341 static boolean_t
3342 mbuf_cached_above(mbuf_class_t class, int wait)
3343 {
3344         switch (class) {
3345         case MC_MBUF:
3346                 if (wait & MCR_COMP)
3347                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3348                             !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3349                 break;
3350
3351         case MC_CL:
3352                 if (wait & MCR_COMP)
3353                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3354                 break;
3355
3356         case MC_BIGCL:
3357                 if (wait & MCR_COMP)
3358                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3359                 break;
3360
3361         case MC_16KCL:
3362                 if (wait & MCR_COMP)
3363                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3364                 break;
3365
3366         case MC_MBUF_CL:
3367         case MC_MBUF_BIGCL:
3368         case MC_MBUF_16KCL:
3369                 break;
3370
3371         default:
3372                 VERIFY(0);
3373                 /* NOTREACHED */
3374         }
3375
3376         return (!mcache_bkt_isempty(m_cache(class)));
3377 }
3378
3379 /*
3380  * If possible, convert constructed objects to raw ones.
3381  */
3382 static boolean_t
3383 mbuf_steal(mbuf_class_t class, unsigned int num)
3384 {
3385         mcache_obj_t *top = NULL;
3386         mcache_obj_t **list = &top;
3387         unsigned int tot = 0;
3388
3389         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3390
3391         switch (class) {
3392         case MC_MBUF:
3393         case MC_CL:
3394         case MC_BIGCL:
3395         case MC_16KCL:
3396                 return (FALSE);
3397
3398         case MC_MBUF_CL:
3399         case MC_MBUF_BIGCL:
3400         case MC_MBUF_16KCL:
3401                 /* Get the required number of constructed objects if possible */
3402                 if (m_infree(class) > m_minlimit(class)) {
3403                         tot = cslab_alloc(class, &list,
3404                             MIN(num, m_infree(class)));
3405                 }
3406
3407                 /* And destroy them to get back the raw objects */
3408                 if (top != NULL)
3409                         (void) cslab_free(class, top, 1);
3410                 break;
3411
3412         default:
3413                 VERIFY(0);
3414                 /* NOTREACHED */
3415         }
3416
3417         return (tot == num);
3418 }
3419
3420 static void
3421 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3422 {
3423         int m, bmap = 0;
3424
3425         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3426
3427         VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3428         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3429         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3430
3431         /*
3432          * This logic can be made smarter; for now, simply mark
3433          * all other related classes as potential victims.
3434          */
3435         switch (class) {
3436         case MC_MBUF:
3437                 m_wantpurge(MC_CL)++;
3438                 m_wantpurge(MC_BIGCL)++;
3439                 m_wantpurge(MC_MBUF_CL)++;
3440                 m_wantpurge(MC_MBUF_BIGCL)++;
3441                 break;
3442
3443         case MC_CL:
3444                 m_wantpurge(MC_MBUF)++;
3445                 m_wantpurge(MC_BIGCL)++;
3446                 m_wantpurge(MC_MBUF_BIGCL)++;
3447                 if (!comp)
3448                         m_wantpurge(MC_MBUF_CL)++;
3449                 break;
3450
3451         case MC_BIGCL:
3452                 m_wantpurge(MC_MBUF)++;
3453                 m_wantpurge(MC_CL)++;
3454                 m_wantpurge(MC_MBUF_CL)++;
3455                 if (!comp)
3456                         m_wantpurge(MC_MBUF_BIGCL)++;
3457                 break;
3458
3459         case MC_16KCL:
3460                 if (!comp)
3461                         m_wantpurge(MC_MBUF_16KCL)++;
3462                 break;
3463
3464         default:
3465                 VERIFY(0);
3466                 /* NOTREACHED */
3467         }
3468
3469         /*
3470          * Run through each marked class and check if we really need to
3471          * purge (and therefore temporarily disable) the per-CPU caches
3472          * layer used by the class.  If so, remember the classes since
3473          * we are going to drop the lock below prior to purging.
3474          */
3475         for (m = 0; m < NELEM(mbuf_table); m++) {
3476                 if (m_wantpurge(m) > 0) {
3477                         m_wantpurge(m) = 0;
3478                         /*
3479                          * Try hard to steal the required number of objects
3480                          * from the freelist of other mbuf classes.  Only
3481                          * purge and disable the per-CPU caches layer when
3482                          * we don't have enough; it's the last resort.
3483                          */
3484                         if (!mbuf_steal(m, num))
3485                                 bmap |= (1 << m);
3486                 }
3487         }
3488
3489         lck_mtx_unlock(mbuf_mlock);
3490
3491         if (bmap != 0) {
3492                 /* signal the domains to drain */
3493                 net_drain_domains();
3494
3495                 /* Sigh; we have no other choices but to ask mcache to purge */
3496                 for (m = 0; m < NELEM(mbuf_table); m++) {
3497                         if ((bmap & (1 << m)) &&
3498                             mcache_purge_cache(m_cache(m), TRUE)) {
3499                                 lck_mtx_lock(mbuf_mlock);
3500                                 m_purge_cnt(m)++;
3501                                 mbstat.m_drain++;
3502                                 lck_mtx_unlock(mbuf_mlock);
3503                         }
3504                 }
3505         } else {
3506                 /*
3507                  * Request mcache to reap extra elements from all of its caches;
3508                  * note that all reaps are serialized and happen only at a fixed
3509                  * interval.
3510                  */
3511                 mcache_reap();
3512         }
3513         lck_mtx_lock(mbuf_mlock);
3514 }
3515
3516 static inline struct mbuf *
3517 m_get_common(int wait, short type, int hdr)
3518 {
3519         struct mbuf *m;
3520         int mcflags = MSLEEPF(wait);
3521
3522         /* Is this due to a non-blocking retry?  If so, then try harder */
3523         if (mcflags & MCR_NOSLEEP)
3524                 mcflags |= MCR_TRYHARD;
3525
3526         m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3527         if (m != NULL) {
3528                 MBUF_INIT(m, hdr, type);
3529                 mtype_stat_inc(type);
3530                 mtype_stat_dec(MT_FREE);
3531 #if CONFIG_MACF_NET
3532                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3533                         m_free(m);
3534                         return (NULL);
3535                 }
3536 #endif /* MAC_NET */
3537         }
3538         return (m);
3539 }
3540
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.  All of them end up in m_get_common(); the
 * last argument selects a packet header mbuf (1) or a plain one (0).
 * The retry variants are identical to the plain ones.
 */
#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
#define	_M_RETRY(wait, type)	m_get_common(wait, type, 0)
#define	_M_RETRYHDR(wait, type)	m_get_common(wait, type, 1)
#define	_MGET(m, how, type)	((m) = m_get_common(how, type, 0))
#define	_MGETHDR(m, how, type)	((m) = m_get_common(how, type, 1))
3551
/*
 * Allocate a plain (non packet header) mbuf of the given type.
 * Returns NULL if no mbuf could be allocated.
 */
struct mbuf *
m_get(int wait, int type)
{
	return (m_get_common(wait, type, 0));
}
3557
/*
 * Allocate a packet header mbuf of the given type.
 * Returns NULL if no mbuf could be allocated.
 */
struct mbuf *
m_gethdr(int wait, int type)
{
	return (m_get_common(wait, type, 1));
}
3563
/*
 * Retry variant of m_get(); it performs the very same allocation.
 */
struct mbuf *
m_retry(int wait, int type)
{
	return (m_get_common(wait, type, 0));
}
3569
/*
 * Retry variant of m_gethdr(); it performs the very same allocation.
 */
struct mbuf *
m_retryhdr(int wait, int type)
{
	return (m_get_common(wait, type, 1));
}
3575
3576 struct mbuf *
3577 m_getclr(int wait, int type)
3578 {
3579         struct mbuf *m;
3580
3581         _MGET(m, wait, type);
3582         if (m != NULL)
3583                 bzero(MTOD(m, caddr_t), MLEN);
3584         return (m);
3585 }
3586
3587 static int
3588 m_free_paired(struct mbuf *m)
3589 {
3590         VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
3591
3592         membar_sync();
3593         if (MEXT_PMBUF(m) == m) {
3594                 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
3595                 int16_t oprefcnt, prefcnt;
3596
3597                 /*
3598                  * Paired ref count might be negative in case we lose
3599                  * against another thread clearing MEXT_PMBUF, in the
3600                  * event it occurs after the above memory barrier sync.
3601                  * In that case just ignore as things have been unpaired.
3602                  */
3603                 do {
3604                         oprefcnt = *addr;
3605                         prefcnt = oprefcnt - 1;
3606                 } while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
3607
3608                 if (prefcnt > 1) {
3609                         return (1);
3610                 } else if (prefcnt == 1) {
3611                         (*(m_get_ext_free(m)))(m->m_ext.ext_buf,
3612                             m->m_ext.ext_size, m_get_ext_arg(m));
3613                         return (1);
3614                 } else if (prefcnt == 0) {
3615                         VERIFY(MBUF_IS_PAIRED(m));
3616
3617                         /*
3618                          * Restore minref to its natural value, so that
3619                          * the caller will be able to free the cluster
3620                          * as appropriate.
3621                          */
3622                         MEXT_MINREF(m) = 0;
3623
3624                         /*
3625                          * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
3626                          * as it is immutable.  atomic_set_ptr also causes
3627                          * memory barrier sync.
3628                          */
3629                         atomic_set_ptr(&MEXT_PMBUF(m), NULL);
3630
3631                         switch (m->m_ext.ext_size) {
3632                         case MCLBYTES:
3633                                 m_set_ext(m, m_get_rfa(m), NULL, NULL);
3634                                 break;
3635
3636                         case MBIGCLBYTES:
3637                                 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
3638                                 break;
3639
3640                         case M16KCLBYTES:
3641                                 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
3642                                 break;
3643
3644                         default:
3645                                 VERIFY(0);
3646                                 /* NOTREACHED */
3647                         }
3648                 }
3649         }
3650
3651         /*
3652          * Tell caller the unpair has occurred, and that the reference
3653          * count on the external cluster held for the paired mbuf should
3654          * now be dropped.
3655          */
3656         return (0);
3657 }
3658
3659 struct mbuf *
3660 m_free(struct mbuf *m)
3661 {
3662         struct mbuf *n = m->m_next;
3663
3664         if (m->m_type == MT_FREE)
3665                 panic("m_free: freeing an already freed mbuf");
3666
3667         if (m->m_flags & M_PKTHDR) {
3668                 /* Check for scratch area overflow */
3669                 m_redzone_verify(m);
3670                 /* Free the aux data and tags if there is any */
3671                 m_tag_delete_chain(m, NULL);
3672
3673                 m_do_tx_compl_callback(m, NULL);
3674         }
3675
3676         if (m->m_flags & M_EXT) {
3677                 u_int16_t refcnt;
3678                 u_int32_t composite;
3679                 m_ext_free_func_t m_free_func;
3680
3681                 if (MBUF_IS_PAIRED(m) && m_free_paired(m))
3682                         return (n);
3683
3684                 refcnt = m_decref(m);
3685                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3686                 m_free_func = m_get_ext_free(m);
3687
3688                 if (refcnt == MEXT_MINREF(m) && !composite) {
3689                         if (m_free_func == NULL) {
3690                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3691                         } else if (m_free_func == m_bigfree) {
3692                                 mcache_free(m_cache(MC_BIGCL),
3693                                     m->m_ext.ext_buf);
3694                         } else if (m_free_func == m_16kfree) {
3695                                 mcache_free(m_cache(MC_16KCL),
3696                                     m->m_ext.ext_buf);
3697                         } else {
3698                                 (*m_free_func)(m->m_ext.ext_buf,
3699                                     m->m_ext.ext_size, m_get_ext_arg(m));
3700                         }
3701                         mcache_free(ref_cache, m_get_rfa(m));
3702                         m_set_ext(m, NULL, NULL, NULL);
3703                 } else if (refcnt == MEXT_MINREF(m) && composite) {
3704                         VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3705                         VERIFY(m->m_type != MT_FREE);
3706
3707                         mtype_stat_dec(m->m_type);
3708                         mtype_stat_inc(MT_FREE);
3709
3710                         m->m_type = MT_FREE;
3711                         m->m_flags = M_EXT;
3712                         m->m_len = 0;
3713                         m->m_next = m->m_nextpkt = NULL;
3714
3715                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3716
3717                         /* "Free" into the intermediate cache */
3718                         if (m_free_func == NULL) {
3719                                 mcache_free(m_cache(MC_MBUF_CL), m);
3720                         } else if (m_free_func == m_bigfree) {
3721                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3722                         } else {
3723                                 VERIFY(m_free_func == m_16kfree);
3724                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3725                         }
3726                         return (n);
3727                 }
3728         }
3729
3730         if (m->m_type != MT_FREE) {
3731                 mtype_stat_dec(m->m_type);
3732                 mtype_stat_inc(MT_FREE);
3733         }
3734
3735         m->m_type = MT_FREE;
3736         m->m_flags = m->m_len = 0;
3737         m->m_next = m->m_nextpkt = NULL;
3738
3739         mcache_free(m_cache(MC_MBUF), m);
3740
3741         return (n);
3742 }
3743
3744 __private_extern__ struct mbuf *
3745 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3746     void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3747     int wait, int pair)
3748 {
3749         struct ext_ref *rfa = NULL;
3750
3751         /*
3752          * If pairing is requested and an existing mbuf is provided, reject
3753          * it if it's already been paired to another cluster.  Otherwise,
3754          * allocate a new one or free any existing below.
3755          */
3756         if ((m != NULL && MBUF_IS_PAIRED(m)) ||
3757             (m == NULL && (m = _M_GETHDR(wait, type)) == NULL))
3758                 return (NULL);
3759
3760         if (m->m_flags & M_EXT) {
3761                 u_int16_t refcnt;
3762                 u_int32_t composite;
3763                 m_ext_free_func_t m_free_func;
3764
3765                 refcnt = m_decref(m);
3766                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3767                 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
3768                 m_free_func = m_get_ext_free(m);
3769                 if (refcnt == MEXT_MINREF(m) && !composite) {
3770                         if (m_free_func == NULL) {
3771                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3772                         } else if (m_free_func == m_bigfree) {
3773                                 mcache_free(m_cache(MC_BIGCL),
3774                                     m->m_ext.ext_buf);
3775                         } else if (m_free_func == m_16kfree) {
3776                                 mcache_free(m_cache(MC_16KCL),
3777                                     m->m_ext.ext_buf);
3778                         } else {
3779                                 (*m_free_func)(m->m_ext.ext_buf,
3780                                     m->m_ext.ext_size, m_get_ext_arg(m));
3781                         }
3782                         /* Re-use the reference structure */
3783                         rfa = m_get_rfa(m);
3784                 } else if (refcnt == MEXT_MINREF(m) && composite) {
3785                         VERIFY(m->m_type != MT_FREE);
3786
3787                         mtype_stat_dec(m->m_type);
3788                         mtype_stat_inc(MT_FREE);
3789
3790                         m->m_type = MT_FREE;
3791                         m->m_flags = M_EXT;
3792                         m->m_len = 0;
3793                         m->m_next = m->m_nextpkt = NULL;
3794
3795                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3796
3797                         /* "Free" into the intermediate cache */
3798                         if (m_free_func == NULL) {
3799                                 mcache_free(m_cache(MC_MBUF_CL), m);
3800                         } else if (m_free_func == m_bigfree) {
3801                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3802                         } else {
3803                                 VERIFY(m_free_func == m_16kfree);
3804                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3805                         }
3806                         /*
3807                          * Allocate a new mbuf, since we didn't divorce
3808                          * the composite mbuf + cluster pair above.
3809                          */
3810                         if ((m = _M_GETHDR(wait, type)) == NULL)
3811                                 return (NULL);
3812                 }
3813         }
3814
3815         if (rfa == NULL &&
3816             (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3817                 m_free(m);
3818                 return (NULL);
3819         }
3820
3821         if (!pair) {
3822                 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
3823                     0, 1, 0, 0, 0, NULL);
3824         } else {
3825                 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
3826                     1, 1, 1, EXTF_PAIRED, 0, m);
3827         }
3828
3829         return (m);
3830 }
3831
3832 /*
3833  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3834  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3835  */
3836 struct mbuf *
3837 m_getcl(int wait, int type, int flags)
3838 {
3839         struct mbuf *m;
3840         int mcflags = MSLEEPF(wait);
3841         int hdr = (flags & M_PKTHDR);
3842
3843         /* Is this due to a non-blocking retry?  If so, then try harder */
3844         if (mcflags & MCR_NOSLEEP)
3845                 mcflags |= MCR_TRYHARD;
3846
3847         m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3848         if (m != NULL) {
3849                 u_int16_t flag;
3850                 struct ext_ref *rfa;
3851                 void *cl;
3852
3853                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3854                 cl = m->m_ext.ext_buf;
3855                 rfa = m_get_rfa(m);
3856
3857                 ASSERT(cl != NULL && rfa != NULL);
3858                 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3859
3860                 flag = MEXT_FLAGS(m);
3861
3862                 MBUF_INIT(m, hdr, type);
3863                 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3864
3865                 mtype_stat_inc(type);
3866                 mtype_stat_dec(MT_FREE);
3867 #if CONFIG_MACF_NET
3868                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3869                         m_freem(m);
3870                         return (NULL);
3871                 }
3872 #endif /* MAC_NET */
3873         }
3874         return (m);
3875 }
3876
3877 /* m_mclget() add an mbuf cluster to a normal mbuf */
3878 struct mbuf *
3879 m_mclget(struct mbuf *m, int wait)
3880 {
3881         struct ext_ref *rfa;
3882
3883         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3884                 return (m);
3885
3886         m->m_ext.ext_buf = m_mclalloc(wait);
3887         if (m->m_ext.ext_buf != NULL) {
3888                 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3889         } else {
3890                 mcache_free(ref_cache, rfa);
3891         }
3892         return (m);
3893 }
3894
3895 /* Allocate an mbuf cluster */
3896 caddr_t
3897 m_mclalloc(int wait)
3898 {
3899         int mcflags = MSLEEPF(wait);
3900
3901         /* Is this due to a non-blocking retry?  If so, then try harder */
3902         if (mcflags & MCR_NOSLEEP)
3903                 mcflags |= MCR_TRYHARD;
3904
3905         return (mcache_alloc(m_cache(MC_CL), mcflags));
3906 }
3907
3908 /* Free an mbuf cluster */
3909 void
3910 m_mclfree(caddr_t p)
3911 {
3912         mcache_free(m_cache(MC_CL), p);
3913 }
3914
3915 /*
3916  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3917  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3918  */
3919 int
3920 m_mclhasreference(struct mbuf *m)
3921 {
3922         if (!(m->m_flags & M_EXT))
3923                 return (0);
3924
3925         ASSERT(m_get_rfa(m) != NULL);
3926
3927         return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3928 }
3929
3930 __private_extern__ caddr_t
3931 m_bigalloc(int wait)
3932 {
3933         int mcflags = MSLEEPF(wait);
3934
3935         /* Is this due to a non-blocking retry?  If so, then try harder */
3936         if (mcflags & MCR_NOSLEEP)
3937                 mcflags |= MCR_TRYHARD;
3938
3939         return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3940 }
3941
3942 __private_extern__ void
3943 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3944 {
3945         mcache_free(m_cache(MC_BIGCL), p);
3946 }
3947
3948 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3949 __private_extern__ struct mbuf *
3950 m_mbigget(struct mbuf *m, int wait)
3951 {
3952         struct ext_ref *rfa;
3953
3954         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3955                 return (m);
3956
3957         m->m_ext.ext_buf =  m_bigalloc(wait);
3958         if (m->m_ext.ext_buf != NULL) {
3959                 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3960         } else {
3961                 mcache_free(ref_cache, rfa);
3962         }
3963         return (m);
3964 }
3965
3966 __private_extern__ caddr_t
3967 m_16kalloc(int wait)
3968 {
3969         int mcflags = MSLEEPF(wait);
3970
3971         /* Is this due to a non-blocking retry?  If so, then try harder */
3972         if (mcflags & MCR_NOSLEEP)
3973                 mcflags |= MCR_TRYHARD;
3974
3975         return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3976 }
3977
3978 __private_extern__ void
3979 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3980 {
3981         mcache_free(m_cache(MC_16KCL), p);
3982 }
3983
3984 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3985 __private_extern__ struct mbuf *
3986 m_m16kget(struct mbuf *m, int wait)
3987 {
3988         struct ext_ref *rfa;
3989
3990         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3991                 return (m);
3992
3993         m->m_ext.ext_buf =  m_16kalloc(wait);
3994         if (m->m_ext.ext_buf != NULL) {
3995                 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3996         } else {
3997                 mcache_free(ref_cache, rfa);
3998         }
3999         return (m);
4000 }
4001
4002 /*
4003  * "Move" mbuf pkthdr from "from" to "to".
4004  * "from" must have M_PKTHDR set, and "to" must be empty.
4005  */
4006 void
4007 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4008 {
4009         VERIFY(from->m_flags & M_PKTHDR);
4010
4011         /* Check for scratch area overflow */
4012         m_redzone_verify(from);
4013
4014         if (to->m_flags & M_PKTHDR) {
4015                 /* Check for scratch area overflow */
4016                 m_redzone_verify(to);
4017                 /* We will be taking over the tags of 'to' */
4018                 m_tag_delete_chain(to, NULL);
4019         }
4020         to->m_pkthdr = from->m_pkthdr;          /* especially tags */
4021         m_classifier_init(from, 0);             /* purge classifier info */
4022         m_tag_init(from, 1);                    /* purge all tags from src */
4023         m_scratch_init(from);                   /* clear src scratch area */
4024         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4025         if ((to->m_flags & M_EXT) == 0)
4026                 to->m_data = to->m_pktdat;
4027         m_redzone_init(to);                     /* setup red zone on dst */
4028 }
4029
4030 /*
4031  * Duplicate "from"'s mbuf pkthdr in "to".
4032  * "from" must have M_PKTHDR set, and "to" must be empty.
4033  * In particular, this does a deep copy of the packet tags.
4034  */
4035 static int
4036 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
4037 {
4038         VERIFY(from->m_flags & M_PKTHDR);
4039
4040         /* Check for scratch area overflow */
4041         m_redzone_verify(from);
4042
4043         if (to->m_flags & M_PKTHDR) {
4044                 /* Check for scratch area overflow */
4045                 m_redzone_verify(to);
4046                 /* We will be taking over the tags of 'to' */
4047                 m_tag_delete_chain(to, NULL);
4048         }
4049         to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4050         if ((to->m_flags & M_EXT) == 0)
4051                 to->m_data = to->m_pktdat;
4052         to->m_pkthdr = from->m_pkthdr;
4053         m_redzone_init(to);                     /* setup red zone on dst */
4054         m_tag_init(to, 0);                      /* preserve dst static tags */
4055         return (m_tag_copy_chain(to, from, how));
4056 }
4057
4058 void
4059 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4060 {
4061         memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4062 #if PF_ECN
4063         m_pftag(to)->pftag_hdr = NULL;
4064         m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
4065 #endif /* PF_ECN */
4066 }
4067
4068 void
4069 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4070 {
4071         VERIFY(m->m_flags & M_PKTHDR);
4072
4073         m->m_pkthdr.pkt_proto = 0;
4074         m->m_pkthdr.pkt_flowsrc = 0;
4075         m->m_pkthdr.pkt_flowid = 0;
4076         m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
4077         /* preserve service class and interface info for loopback packets */
4078         if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
4079                 (void) m_set_service_class(m, MBUF_SC_BE);
4080         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4081                 m->m_pkthdr.pkt_ifainfo = 0;
4082         /*
4083          * Preserve timestamp if requested
4084          */
4085         if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID))
4086                 m->m_pkthdr.pkt_timestamp = 0;
4087 }
4088
4089 void
4090 m_copy_classifier(struct mbuf *to, struct mbuf *from)
4091 {
4092         VERIFY(to->m_flags & M_PKTHDR);
4093         VERIFY(from->m_flags & M_PKTHDR);
4094
4095         to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4096         to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4097         to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4098         to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4099         (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4100         to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
4101 }
4102
4103 /*
4104  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4105  * if wantall is not set, return whatever number were available.  Set up the
4106  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4107  * are chained on the m_nextpkt field.  Any packets requested beyond this
4108  * are chained onto the last packet header's m_next field.  The size of
4109  * the cluster is controlled by the parameter bufsize.
4110  */
4111 __private_extern__ struct mbuf *
4112 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4113     int wait, int wantall, size_t bufsize)
4114 {
4115         struct mbuf *m;
4116         struct mbuf **np, *top;
4117         unsigned int pnum, needed = *num_needed;
4118         mcache_obj_t *mp_list = NULL;
4119         int mcflags = MSLEEPF(wait);
4120         u_int16_t flag;
4121         struct ext_ref *rfa;
4122         mcache_t *cp;
4123         void *cl;
4124
4125         ASSERT(bufsize == m_maxsize(MC_CL) ||
4126             bufsize == m_maxsize(MC_BIGCL) ||
4127             bufsize == m_maxsize(MC_16KCL));
4128
4129         /*
4130          * Caller must first check for njcl because this
4131          * routine is internal and not exposed/used via KPI.
4132          */
4133         VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4134
4135         top = NULL;
4136         np = &top;
4137         pnum = 0;
4138
4139         /*
4140          * The caller doesn't want all the requested buffers; only some.
4141          * Try hard to get what we can, but don't block.  This effectively
4142          * overrides MCR_SLEEP, since this thread will not go to sleep
4143          * if we can't get all the buffers.
4144          */
4145         if (!wantall || (mcflags & MCR_NOSLEEP))
4146                 mcflags |= MCR_TRYHARD;
4147
4148         /* Allocate the composite mbuf + cluster elements from the cache */
4149         if (bufsize == m_maxsize(MC_CL))
4150                 cp = m_cache(MC_MBUF_CL);
4151         else if (bufsize == m_maxsize(MC_BIGCL))
4152                 cp = m_cache(MC_MBUF_BIGCL);
4153         else
4154                 cp = m_cache(MC_MBUF_16KCL);
4155         needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4156
4157         for (pnum = 0; pnum < needed; pnum++) {
4158                 m = (struct mbuf *)mp_list;
4159                 mp_list = mp_list->obj_next;
4160
4161                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4162                 cl = m->m_ext.ext_buf;
4163                 rfa = m_get_rfa(m);
4164
4165                 ASSERT(cl != NULL && rfa != NULL);
4166                 VERIFY(MBUF_IS_COMPOSITE(m));
4167
4168                 flag = MEXT_FLAGS(m);
4169
4170                 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4171                 if (bufsize == m_maxsize(MC_16KCL)) {
4172                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4173                 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4174                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4175                 } else {
4176                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4177                 }
4178
4179                 if (num_with_pkthdrs > 0) {
4180                         --num_with_pkthdrs;
4181 #if CONFIG_MACF_NET
4182                         if (mac_mbuf_label_init(m, wait) != 0) {
4183                                 m_freem(m);
4184                                 break;
4185                         }
4186 #endif /* MAC_NET */
4187                 }
4188
4189                 *np = m;
4190                 if (num_with_pkthdrs > 0)
4191                         np = &m->m_nextpkt;
4192                 else
4193                         np = &m->m_next;
4194         }
4195         ASSERT(pnum != *num_needed || mp_list == NULL);
4196         if (mp_list != NULL)
4197                 mcache_free_ext(cp, mp_list);
4198
4199         if (pnum > 0) {
4200                 mtype_stat_add(MT_DATA, pnum);
4201                 mtype_stat_sub(MT_FREE, pnum);
4202         }
4203
4204         if (wantall && (pnum != *num_needed)) {
4205                 if (top != NULL)
4206                         m_freem_list(top);
4207                 return (NULL);
4208         }
4209
4210         if (pnum > *num_needed) {
4211                 printf("%s: File a radar related to <rdar://10146739>. \
4212                         needed = %u, pnum = %u, num_needed = %u \n",
4213                         __func__, needed, pnum, *num_needed);
4214         }
4215
4216         *num_needed = pnum;
4217         return (top);
4218 }
4219
/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not NULL and the
 * value pointed to is not zero, it specifies the maximum number of segments
 * for a chain of mbufs.  If maxsegments is NULL or the value pointed to
 * is zero, the caller does not have any restriction on the number of
 * segments.  The actual number of segments of an mbuf chain is returned in
 * the value pointed to by maxsegments.
 */
4232 __private_extern__ struct mbuf *
4233 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4234     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4235 {
4236         struct mbuf **np, *top, *first = NULL;
4237         size_t bufsize, r_bufsize;
4238         unsigned int num = 0;
4239         unsigned int nsegs = 0;
4240         unsigned int needed, resid;
4241         int mcflags = MSLEEPF(wait);
4242         mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4243         mcache_t *cp = NULL, *rcp = NULL;
4244
4245         if (*numlist == 0)
4246                 return (NULL);
4247
4248         top = NULL;
4249         np = &top;
4250
4251         if (wantsize == 0) {
4252                 if (packetlen <= MINCLSIZE) {
4253                         bufsize = packetlen;
4254                 } else if (packetlen > m_maxsize(MC_CL)) {
4255                         /* Use 4KB if jumbo cluster pool isn't available */
4256                         if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
4257                                 bufsize = m_maxsize(MC_BIGCL);
4258                         else
4259                                 bufsize = m_maxsize(MC_16KCL);
4260                 } else {
4261                         bufsize = m_maxsize(MC_CL);
4262                 }
4263         } else if (wantsize == m_maxsize(MC_CL) ||
4264             wantsize == m_maxsize(MC_BIGCL) ||
4265             (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4266                 bufsize = wantsize;
4267         } else {
4268                 return (NULL);
4269         }
4270
4271         if (bufsize <= MHLEN) {
4272                 nsegs = 1;
4273         } else if (bufsize <= MINCLSIZE) {
4274                 if (maxsegments != NULL && *maxsegments == 1) {
4275                         bufsize = m_maxsize(MC_CL);
4276                         nsegs = 1;
4277                 } else {
4278                         nsegs = 2;
4279                 }
4280         } else if (bufsize == m_maxsize(MC_16KCL)) {
4281                 VERIFY(njcl > 0);
4282                 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4283         } else if (bufsize == m_maxsize(MC_BIGCL)) {
4284                 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4285         } else {
4286                 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4287         }
4288         if (maxsegments != NULL) {
4289                 if (*maxsegments && nsegs > *maxsegments) {
4290                         *maxsegments = nsegs;
4291                         return (NULL);
4292                 }
4293                 *maxsegments = nsegs;
4294         }
4295
4296         /*
4297          * The caller doesn't want all the requested buffers; only some.
4298          * Try hard to get what we can, but don't block.  This effectively
4299          * overrides MCR_SLEEP, since this thread will not go to sleep
4300          * if we can't get all the buffers.
4301          */
4302         if (!wantall || (mcflags & MCR_NOSLEEP))
4303                 mcflags |= MCR_TRYHARD;
4304
4305         /*
4306          * Simple case where all elements in the lists/chains are mbufs.
4307          * Unless bufsize is greater than MHLEN, each segment chain is made
4308          * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
4309          * of 2 mbufs; the second one is used for the residual data, i.e.
4310          * the remaining data that cannot fit into the first mbuf.
4311          */
4312         if (bufsize <= MINCLSIZE) {
4313                 /* Allocate the elements in one shot from the mbuf cache */
4314                 ASSERT(bufsize <= MHLEN || nsegs == 2);
4315                 cp = m_cache(MC_MBUF);
4316                 needed = mcache_alloc_ext(cp, &mp_list,
4317                     (*numlist) * nsegs, mcflags);
4318
4319                 /*
4320                  * The number of elements must be even if we are to use an
4321                  * mbuf (instead of a cluster) to store the residual data.
4322                  * If we couldn't allocate the requested number of mbufs,
4323                  * trim the number down (if it's odd) in order to avoid
4324                  * creating a partial segment chain.
4325                  */
4326                 if (bufsize > MHLEN && (needed & 0x1))
4327                         needed--;
4328
4329                 while (num < needed) {
4330                         struct mbuf *m;
4331
4332                         m = (struct mbuf *)mp_list;
4333                         mp_list = mp_list->obj_next;
4334                         ASSERT(m != NULL);
4335
4336                         MBUF_INIT(m, 1, MT_DATA);
4337 #if CONFIG_MACF_NET
4338                         if (mac_init_mbuf(m, wait) != 0) {
4339                                 m_free(m);
4340                                 break;
4341                         }
4342 #endif /* MAC_NET */
4343                         num++;
4344                         if (bufsize > MHLEN) {
4345                                 /* A second mbuf for this segment chain */
4346                                 m->m_next = (struct mbuf *)mp_list;
4347                                 mp_list = mp_list->obj_next;
4348                                 ASSERT(m->m_next != NULL);
4349
4350                                 MBUF_INIT(m->m_next, 0, MT_DATA);
4351                                 num++;
4352                         }
4353                         *np = m;
4354                         np = &m->m_nextpkt;
4355                 }
4356                 ASSERT(num != *numlist || mp_list == NULL);
4357
4358                 if (num > 0) {
4359                         mtype_stat_add(MT_DATA, num);
4360                         mtype_stat_sub(MT_FREE, num);
4361                 }
4362                 num /= nsegs;
4363
4364                 /* We've got them all; return to caller */
4365                 if (num == *numlist)
4366                         return (top);
4367
4368                 goto fail;
4369         }
4370
4371         /*
4372          * Complex cases where elements are made up of one or more composite
4373          * mbufs + cluster, depending on packetlen.  Each N-segment chain can
4374          * be illustrated as follows:
4375          *
4376          * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4377          *
4378          * Every composite mbuf + cluster element comes from the intermediate
4379          * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
4380          * the last composite element will come from the MC_MBUF_CL cache,
4381          * unless the residual data is larger than 2KB where we use the
4382          * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4383          * data is defined as extra data beyond the first element that cannot
4384          * fit into the previous element, i.e. there is no residual data if
4385          * the chain only has 1 segment.
4386          */
4387         r_bufsize = bufsize;
4388         resid = packetlen > bufsize ? packetlen % bufsize : 0;
4389         if (resid > 0) {
4390                 /* There is residual data; figure out the cluster size */
4391                 if (wantsize == 0 && packetlen > MINCLSIZE) {
4392                         /*
4393                          * Caller didn't request that all of the segments
4394                          * in the chain use the same cluster size; use the
4395                          * smaller of the cluster sizes.
4396                          */
4397                         if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4398                                 r_bufsize = m_maxsize(MC_16KCL);
4399                         else if (resid > m_maxsize(MC_CL))
4400                                 r_bufsize = m_maxsize(MC_BIGCL);
4401                         else
4402                                 r_bufsize = m_maxsize(MC_CL);
4403                 } else {
4404                         /* Use the same cluster size as the other segments */
4405                         resid = 0;
4406                 }
4407         }
4408
4409         needed = *numlist;
4410         if (resid > 0) {
4411                 /*
4412                  * Attempt to allocate composite mbuf + cluster elements for
4413                  * the residual data in each chain; record the number of such
4414                  * elements that can be allocated so that we know how many
4415                  * segment chains we can afford to create.
4416                  */
4417                 if (r_bufsize <= m_maxsize(MC_CL))
4418                         rcp = m_cache(MC_MBUF_CL);
4419                 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4420                         rcp = m_cache(MC_MBUF_BIGCL);
4421                 else
4422                         rcp = m_cache(MC_MBUF_16KCL);
4423                 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4424
4425                 if (needed == 0)
4426                         goto fail;
4427
4428                 /* This is temporarily reduced for calculation */
4429                 ASSERT(nsegs > 1);
4430                 nsegs--;
4431         }
4432
4433         /*
4434          * Attempt to allocate the rest of the composite mbuf + cluster
4435          * elements for the number of segment chains that we need.
4436          */
4437         if (bufsize <= m_maxsize(MC_CL))
4438                 cp = m_cache(MC_MBUF_CL);
4439         else if (bufsize <= m_maxsize(MC_BIGCL))
4440                 cp = m_cache(MC_MBUF_BIGCL);
4441         else
4442                 cp = m_cache(MC_MBUF_16KCL);
4443         needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4444
4445         /* Round it down to avoid creating a partial segment chain */
4446         needed = (needed / nsegs) * nsegs;
4447         if (needed == 0)
4448                 goto fail;
4449
4450         if (resid > 0) {
4451                 /*
4452                  * We're about to construct the chain(s); take into account
4453                  * the number of segments we have created above to hold the
4454                  * residual data for each chain, as well as restore the
4455                  * original count of segments per chain.
4456                  */
4457                 ASSERT(nsegs > 0);
4458                 needed += needed / nsegs;
4459                 nsegs++;
4460         }
4461
4462         for (;;) {
4463                 struct mbuf *m;
4464                 u_int16_t flag;
4465                 struct ext_ref *rfa;
4466                 void *cl;
4467                 int pkthdr;
4468                 m_ext_free_func_t m_free_func;
4469
4470                 ++num;
4471                 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4472                         m = (struct mbuf *)mp_list;
4473                         mp_list = mp_list->obj_next;
4474                 } else {
4475                         m = (struct mbuf *)rmp_list;
4476                         rmp_list = rmp_list->obj_next;
4477                 }
4478                 m_free_func = m_get_ext_free(m);
4479                 ASSERT(m != NULL);
4480                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4481                 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4482                     m_free_func == m_16kfree);
4483
4484                 cl = m->m_ext.ext_buf;
4485                 rfa = m_get_rfa(m);
4486
4487                 ASSERT(cl != NULL && rfa != NULL);
4488                 VERIFY(MBUF_IS_COMPOSITE(m));
4489
4490                 flag = MEXT_FLAGS(m);
4491
4492                 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4493                 if (pkthdr)
4494                         first = m;
4495                 MBUF_INIT(m, pkthdr, MT_DATA);
4496                 if (m_free_func == m_16kfree) {
4497                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4498                 } else if (m_free_func == m_bigfree) {
4499                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4500                 } else {
4501                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4502                 }
4503 #if CONFIG_MACF_NET
4504                 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4505                         --num;
4506                         m_freem(m);
4507                         break;
4508                 }
4509 #endif /* MAC_NET */
4510
4511                 *np = m;
4512                 if ((num % nsegs) == 0)
4513                         np = &first->m_nextpkt;
4514                 else
4515                         np = &m->m_next;
4516
4517                 if (num == needed)
4518                         break;
4519         }
4520
4521         if (num > 0) {
4522                 mtype_stat_add(MT_DATA, num);
4523                 mtype_stat_sub(MT_FREE, num);
4524         }
4525
4526         num /= nsegs;
4527
4528         /* We've got them all; return to caller */
4529         if (num == *numlist) {
4530                 ASSERT(mp_list == NULL && rmp_list == NULL);
4531                 return (top);
4532         }
4533
4534 fail:
4535         /* Free up what's left of the above */
4536         if (mp_list != NULL)
4537                 mcache_free_ext(cp, mp_list);
4538         if (rmp_list != NULL)
4539                 mcache_free_ext(rcp, rmp_list);
4540         if (wantall && top != NULL) {
4541                 m_freem(top);
4542                 return (NULL);
4543         }
4544         *numlist = num;
4545         return (top);
4546 }
4547
4548 /*
4549  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4550  * packets on receive ring.
4551  */
4552 __private_extern__ struct mbuf *
4553 m_getpacket_how(int wait)
4554 {
4555         unsigned int num_needed = 1;
4556
4557         return (m_getpackets_internal(&num_needed, 1, wait, 1,
4558             m_maxsize(MC_CL)));
4559 }
4560
4561 /*
4562  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4563  * packets on receive ring.
4564  */
4565 struct mbuf *
4566 m_getpacket(void)
4567 {
4568         unsigned int num_needed = 1;
4569
4570         return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4571             m_maxsize(MC_CL)));
4572 }
4573
4574 /*
4575  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4576  * if this can't be met, return whatever number were available.  Set up the
4577  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4578  * are chained on the m_nextpkt field.  Any packets requested beyond this are
4579  * chained onto the last packet header's m_next field.
4580  */
4581 struct mbuf *
4582 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4583 {
4584         unsigned int n = num_needed;
4585
4586         return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4587             m_maxsize(MC_CL)));
4588 }
4589
4590 /*
4591  * Return a list of mbuf hdrs set up as packet hdrs chained together
4592  * on the m_nextpkt field
4593  */
4594 struct mbuf *
4595 m_getpackethdrs(int num_needed, int how)
4596 {
4597         struct mbuf *m;
4598         struct mbuf **np, *top;
4599
4600         top = NULL;
4601         np = &top;
4602
4603         while (num_needed--) {
4604                 m = _M_RETRYHDR(how, MT_DATA);
4605                 if (m == NULL)
4606                         break;
4607
4608                 *np = m;
4609                 np = &m->m_nextpkt;
4610         }
4611
4612         return (top);
4613 }
4614
4615 /*
4616  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4617  * for mbufs packets freed.  Used by the drivers.
4618  */
4619 int
4620 m_freem_list(struct mbuf *m)
4621 {
4622         struct mbuf *nextpkt;
4623         mcache_obj_t *mp_list = NULL;
4624         mcache_obj_t *mcl_list = NULL;
4625         mcache_obj_t *mbc_list = NULL;
4626         mcache_obj_t *m16k_list = NULL;
4627         mcache_obj_t *m_mcl_list = NULL;
4628         mcache_obj_t *m_mbc_list = NULL;
4629         mcache_obj_t *m_m16k_list = NULL;
4630         mcache_obj_t *ref_list = NULL;
4631         int pktcount = 0;
4632         int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4633
4634         while (m != NULL) {
4635                 pktcount++;
4636
4637                 nextpkt = m->m_nextpkt;
4638                 m->m_nextpkt = NULL;
4639
4640                 while (m != NULL) {
4641                         struct mbuf *next = m->m_next;
4642                         mcache_obj_t *o, *rfa;
4643                         u_int32_t composite;
4644                         u_int16_t refcnt;
4645                         m_ext_free_func_t m_free_func;
4646
4647                         if (m->m_type == MT_FREE)
4648                                 panic("m_free: freeing an already freed mbuf");
4649
4650                         if (m->m_flags & M_PKTHDR) {
4651                                 /* Check for scratch area overflow */
4652                                 m_redzone_verify(m);
4653                                 /* Free the aux data and tags if there is any */
4654                                 m_tag_delete_chain(m, NULL);
4655                         }
4656
4657                         if (!(m->m_flags & M_EXT)) {
4658                                 mt_free++;
4659                                 goto simple_free;
4660                         }
4661
4662                         if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4663                                 m = next;
4664                                 continue;
4665                         }
4666
4667                         mt_free++;
4668
4669                         o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4670                         refcnt = m_decref(m);
4671                         composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4672                         m_free_func = m_get_ext_free(m);
4673                         if (refcnt == MEXT_MINREF(m) && !composite) {
4674                                 if (m_free_func == NULL) {
4675                                         o->obj_next = mcl_list;
4676                                         mcl_list = o;
4677                                 } else if (m_free_func == m_bigfree) {
4678                                         o->obj_next = mbc_list;
4679                                         mbc_list = o;
4680                                 } else if (m_free_func == m_16kfree) {
4681                                         o->obj_next = m16k_list;
4682                                         m16k_list = o;
4683                                 } else {
4684                                         (*(m_free_func))((caddr_t)o,
4685                                             m->m_ext.ext_size,
4686                                             m_get_ext_arg(m));
4687                                 }
4688                                 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4689                                 rfa->obj_next = ref_list;
4690                                 ref_list = rfa;
4691                                 m_set_ext(m, NULL, NULL, NULL);
4692                         } else if (refcnt == MEXT_MINREF(m) && composite) {
4693                                 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4694                                 VERIFY(m->m_type != MT_FREE);
4695                                 /*
4696                                  * Amortize the costs of atomic operations
4697                                  * by doing them at the end, if possible.
4698                                  */
4699                                 if (m->m_type == MT_DATA)
4700                                         mt_data++;
4701                                 else if (m->m_type == MT_HEADER)
4702                                         mt_header++;
4703                                 else if (m->m_type == MT_SONAME)
4704                                         mt_soname++;
4705                                 else if (m->m_type == MT_TAG)
4706                                         mt_tag++;
4707                                 else
4708                                         mtype_stat_dec(m->m_type);
4709
4710                                 m->m_type = MT_FREE;
4711                                 m->m_flags = M_EXT;
4712                                 m->m_len = 0;
4713                                 m->m_next = m->m_nextpkt = NULL;
4714
4715                                 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4716
4717                                 /* "Free" into the intermediate cache */
4718                                 o = (mcache_obj_t *)m;
4719                                 if (m_free_func == NULL) {
4720                                         o->obj_next = m_mcl_list;
4721                                         m_mcl_list = o;
4722                                 } else if (m_free_func == m_bigfree) {
4723                                         o->obj_next = m_mbc_list;
4724                                         m_mbc_list = o;
4725                                 } else {
4726                                         VERIFY(m_free_func == m_16kfree);
4727                                         o->obj_next = m_m16k_list;
4728                                         m_m16k_list = o;
4729                                 }
4730                                 m = next;
4731                                 continue;
4732                         }
4733 simple_free:
4734                         /*
4735                          * Amortize the costs of atomic operations
4736                          * by doing them at the end, if possible.
4737                          */
4738                         if (m->m_type == MT_DATA)
4739                                 mt_data++;
4740                         else if (m->m_type == MT_HEADER)
4741                                 mt_header++;
4742                         else if (m->m_type == MT_SONAME)
4743                                 mt_soname++;
4744                         else if (m->m_type == MT_TAG)
4745                                 mt_tag++;
4746                         else if (m->m_type != MT_FREE)
4747                                 mtype_stat_dec(m->m_type);
4748
4749                         m->m_type = MT_FREE;
4750                         m->m_flags = m->m_len = 0;
4751                         m->m_next = m->m_nextpkt = NULL;
4752
4753                         ((mcache_obj_t *)m)->obj_next = mp_list;
4754                         mp_list = (mcache_obj_t *)m;
4755
4756                         m = next;
4757                 }
4758
4759                 m = nextpkt;
4760         }
4761
4762         if (mt_free > 0)
4763                 mtype_stat_add(MT_FREE, mt_free);
4764         if (mt_data > 0)
4765                 mtype_stat_sub(MT_DATA, mt_data);
4766         if (mt_header > 0)
4767                 mtype_stat_sub(MT_HEADER, mt_header);
4768         if (mt_soname > 0)
4769                 mtype_stat_sub(MT_SONAME, mt_soname);
4770         if (mt_tag > 0)
4771                 mtype_stat_sub(MT_TAG, mt_tag);
4772
4773         if (mp_list != NULL)
4774                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4775         if (mcl_list != NULL)
4776                 mcache_free_ext(m_cache(MC_CL), mcl_list);
4777         if (mbc_list != NULL)
4778                 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4779         if (m16k_list != NULL)
4780                 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4781         if (m_mcl_list != NULL)
4782                 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4783         if (m_mbc_list != NULL)
4784                 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4785         if (m_m16k_list != NULL)
4786                 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4787         if (ref_list != NULL)
4788                 mcache_free_ext(ref_cache, ref_list);
4789
4790         return (pktcount);
4791 }
4792
4793 void
4794 m_freem(struct mbuf *m)
4795 {
4796         while (m != NULL)
4797                 m = m_free(m);
4798 }
4799
4800 /*
4801  * Mbuffer utility routines.
4802  */
4803
4804 /*
4805  * Compute the amount of space available before the current start
4806  * of data in an mbuf.
4807  */
4808 int
4809 m_leadingspace(struct mbuf *m)
4810 {
4811         if (m->m_flags & M_EXT) {
4812                 if (MCLHASREFERENCE(m))
4813                         return (0);
4814                 return (m->m_data - m->m_ext.ext_buf);
4815         }
4816         if (m->m_flags & M_PKTHDR)
4817                 return (m->m_data - m->m_pktdat);
4818         return (m->m_data - m->m_dat);
4819 }
4820
4821 /*
4822  * Compute the amount of space available after the end of data in an mbuf.
4823  */
4824 int
4825 m_trailingspace(struct mbuf *m)
4826 {
4827         if (m->m_flags & M_EXT) {
4828                 if (MCLHASREFERENCE(m))
4829                         return (0);
4830                 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4831                     (m->m_data + m->m_len));
4832         }
4833         return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4834 }
4835
4836 /*
4837  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4838  * copy junk along.  Does not adjust packet header length.
4839  */
4840 struct mbuf *
4841 m_prepend(struct mbuf *m, int len, int how)
4842 {
4843         struct mbuf *mn;
4844
4845         _MGET(mn, how, m->m_type);
4846         if (mn == NULL) {
4847                 m_freem(m);
4848                 return (NULL);
4849         }
4850         if (m->m_flags & M_PKTHDR) {
4851                 M_COPY_PKTHDR(mn, m);
4852                 m->m_flags &= ~M_PKTHDR;
4853         }
4854         mn->m_next = m;
4855         m = mn;
4856         if (m->m_flags & M_PKTHDR) {
4857                 VERIFY(len <= MHLEN);
4858                 MH_ALIGN(m, len);
4859         } else {
4860                 VERIFY(len <= MLEN);
4861                 M_ALIGN(m, len);
4862         }
4863         m->m_len = len;
4864         return (m);
4865 }
4866
4867 /*
4868  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4869  * chain, copy junk along, and adjust length.
4870  */
4871 struct mbuf *
4872 m_prepend_2(struct mbuf *m, int len, int how, int align)
4873 {
4874         if (M_LEADINGSPACE(m) >= len &&
4875             (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4876                 m->m_data -= len;
4877                 m->m_len += len;
4878         } else {
4879                 m = m_prepend(m, len, how);
4880         }
4881         if ((m) && (m->m_flags & M_PKTHDR))
4882                 m->m_pkthdr.len += len;
4883         return (m);
4884 }
4885
4886 /*
4887  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4888  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4889  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4890  */
4891 int MCFail;
4892
4893 struct mbuf *
4894 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4895 {
4896         struct mbuf *n, *mhdr = NULL, **np;
4897         int off = off0;
4898         struct mbuf *top;
4899         int copyhdr = 0;
4900
4901         if (off < 0 || len < 0)
4902                 panic("m_copym: invalid offset %d or len %d", off, len);
4903
4904         VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4905             mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4906
4907         if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4908             mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4909                 mhdr = m;
4910                 copyhdr = 1;
4911         }
4912
4913         while (off >= m->m_len) {
4914                 if (m->m_next == NULL)
4915                         panic("m_copym: invalid mbuf chain");
4916                 off -= m->m_len;
4917                 m = m->m_next;
4918         }
4919         np = &top;
4920         top = NULL;
4921
4922         while (len > 0) {
4923                 if (m == NULL) {
4924                         if (len != M_COPYALL)
4925                                 panic("m_copym: len != M_COPYALL");
4926                         break;
4927                 }
4928
4929                 if (copyhdr)
4930                         n = _M_RETRYHDR(wait, m->m_type);
4931                 else
4932                         n = _M_RETRY(wait, m->m_type);
4933                 *np = n;
4934
4935                 if (n == NULL)
4936                         goto nospace;
4937
4938                 if (copyhdr != 0) {
4939                         if ((mode == M_COPYM_MOVE_HDR) ||
4940                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4941                                 M_COPY_PKTHDR(n, mhdr);
4942                         } else if ((mode == M_COPYM_COPY_HDR) ||
4943                             (mode == M_COPYM_MUST_COPY_HDR)) {
4944                                 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4945                                         goto nospace;
4946                         }
4947                         if (len == M_COPYALL)
4948                                 n->m_pkthdr.len -= off0;
4949                         else
4950                                 n->m_pkthdr.len = len;
4951                         copyhdr = 0;
4952                         /*
4953                          * There is data to copy from the packet header mbuf
4954                          * if it is empty or it is before the starting offset
4955                          */
4956                         if (mhdr != m) {
4957                                 np = &n->m_next;
4958                                 continue;
4959                         }
4960                 }
4961                 n->m_len = MIN(len, (m->m_len - off));
4962                 if (m->m_flags & M_EXT) {
4963                         n->m_ext = m->m_ext;
4964                         m_incref(m);
4965                         n->m_data = m->m_data + off;
4966                         n->m_flags |= M_EXT;
4967                 } else {
4968                         /*
4969                          * Limit to the capacity of the destination
4970                          */
4971                         if (n->m_flags & M_PKTHDR)
4972                                 n->m_len = MIN(n->m_len, MHLEN);
4973                         else
4974                                 n->m_len = MIN(n->m_len, MLEN);
4975
4976                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4977                                 panic("%s n %p copy overflow",
4978                                         __func__, n);
4979
4980                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4981                             (unsigned)n->m_len);
4982                 }
4983                 if (len != M_COPYALL)
4984                         len -= n->m_len;
4985                 off = 0;
4986                 m = m->m_next;
4987                 np = &n->m_next;
4988         }
4989
4990         if (top == NULL)
4991                 MCFail++;
4992
4993         return (top);
4994 nospace:
4995
4996         m_freem(top);
4997         MCFail++;
4998         return (NULL);
4999 }
5000
5001
5002 struct mbuf *
5003 m_copym(struct mbuf *m, int off0, int len, int wait)
5004 {
5005         return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
5006 }
5007
5008 /*
5009  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
5010  * within this routine also, the last mbuf and offset accessed are passed
5011  * out and can be passed back in to avoid having to rescan the entire mbuf
5012  * list (normally hung off of the socket)
5013  */
5014 struct mbuf *
5015 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
5016     struct mbuf **m_lastm, int *m_off, uint32_t mode)
5017 {
5018         struct mbuf *m = m0, *n, **np = NULL;
5019         int off = off0, len = len0;
5020         struct mbuf *top = NULL;
5021         int mcflags = MSLEEPF(wait);
5022         int copyhdr = 0;
5023         int type = 0;
5024         mcache_obj_t *list = NULL;
5025         int needed = 0;
5026
5027         if (off == 0 && (m->m_flags & M_PKTHDR))
5028                 copyhdr = 1;
5029
5030         if (m_lastm != NULL && *m_lastm != NULL) {
5031                 m = *m_lastm;
5032                 off = *m_off;
5033         } else {
5034                 while (off >= m->m_len) {
5035                         off -= m->m_len;
5036                         m = m->m_next;
5037                 }
5038         }
5039
5040         n = m;
5041         while (len > 0) {
5042                 needed++;
5043                 ASSERT(n != NULL);
5044                 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
5045                 n = n->m_next;
5046         }
5047         needed++;
5048         len = len0;
5049
5050         /*
5051          * If the caller doesn't want to be put to sleep, mark it with
5052          * MCR_TRYHARD so that we may reclaim buffers from other places
5053          * before giving up.
5054          */
5055         if (mcflags & MCR_NOSLEEP)
5056                 mcflags |= MCR_TRYHARD;
5057
5058         if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
5059             mcflags) != needed)
5060                 goto nospace;
5061
5062         needed = 0;
5063         while (len > 0) {
5064                 n = (struct mbuf *)list;
5065                 list = list->obj_next;
5066                 ASSERT(n != NULL && m != NULL);
5067
5068                 type = (top == NULL) ? MT_HEADER : m->m_type;
5069                 MBUF_INIT(n, (top == NULL), type);
5070 #if CONFIG_MACF_NET
5071                 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
5072                         mtype_stat_inc(MT_HEADER);
5073                         mtype_stat_dec(MT_FREE);
5074                         m_free(n);
5075                         goto nospace;
5076                 }
5077 #endif /* MAC_NET */
5078
5079                 if (top == NULL) {
5080                         top = n;
5081                         np = &top->m_next;
5082                         continue;
5083                 } else {
5084                         needed++;
5085                         *np = n;
5086                 }
5087
5088                 if (copyhdr) {
5089                         if ((mode == M_COPYM_MOVE_HDR) ||
5090                             (mode == M_COPYM_MUST_MOVE_HDR)) {
5091                                 M_COPY_PKTHDR(n, m);
5092                         } else if ((mode == M_COPYM_COPY_HDR) ||
5093                             (mode == M_COPYM_MUST_COPY_HDR)) {
5094                                 if (m_dup_pkthdr(n, m, wait) == 0)
5095                                         goto nospace;
5096                         }
5097                         n->m_pkthdr.len = len;
5098                         copyhdr = 0;
5099                 }
5100                 n->m_len = MIN(len, (m->m_len - off));
5101
5102                 if (m->m_flags & M_EXT) {
5103                         n->m_ext = m->m_ext;
5104                         m_incref(m);
5105                         n->m_data = m->m_data + off;
5106                         n->m_flags |= M_EXT;
5107                 } else {
5108                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
5109                                 panic("%s n %p copy overflow",
5110                                         __func__, n);
5111
5112                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5113                             (unsigned)n->m_len);
5114                 }
5115                 len -= n->m_len;
5116
5117                 if (len == 0) {
5118                         if (m_lastm != NULL && m_off != NULL) {
5119                                 if ((off + n->m_len) == m->m_len) {
5120                                         *m_lastm = m->m_next;
5121                                         *m_off  = 0;
5122                                 } else {
5123                                         *m_lastm = m;
5124                                         *m_off  = off + n->m_len;
5125                                 }
5126                         }
5127                         break;
5128                 }
5129                 off = 0;
5130                 m = m->m_next;
5131                 np = &n->m_next;
5132         }
5133
5134         mtype_stat_inc(MT_HEADER);
5135         mtype_stat_add(type, needed);
5136         mtype_stat_sub(MT_FREE, needed + 1);
5137
5138         ASSERT(list == NULL);
5139         return (top);
5140
5141 nospace:
5142         if (list != NULL)
5143                 mcache_free_ext(m_cache(MC_MBUF), list);
5144         if (top != NULL)
5145                 m_freem(top);
5146         MCFail++;
5147         return (NULL);
5148 }
5149
5150 /*
5151  * Copy data from an mbuf chain starting "off" bytes from the beginning,
5152  * continuing for "len" bytes, into the indicated buffer.
5153  */
5154 void
5155 m_copydata(struct mbuf *m, int off, int len, void *vp)
5156 {
5157         int off0 = off, len0 = len;
5158         struct mbuf *m0 = m;
5159         unsigned count;
5160         char *cp = vp;
5161
5162         if (__improbable(off < 0 || len < 0)) {
5163                 panic("%s: invalid offset %d or len %d", __func__, off, len);
5164                 /* NOTREACHED */
5165         }
5166
5167         while (off > 0) {
5168                 if (__improbable(m == NULL)) {
5169                         panic("%s: invalid mbuf chain %p [off %d, len %d]",
5170                             __func__, m0, off0, len0);
5171                         /* NOTREACHED */
5172                 }
5173                 if (off < m->m_len)
5174                         break;
5175                 off -= m->m_len;
5176                 m = m->m_next;
5177         }
5178         while (len > 0) {
5179                 if (__improbable(m == NULL)) {
5180                         panic("%s: invalid mbuf chain %p [off %d, len %d]",
5181                             __func__, m0, off0, len0);
5182                         /* NOTREACHED */
5183                 }
5184                 count = MIN(m->m_len - off, len);
5185                 bcopy(MTOD(m, caddr_t) + off, cp, count);
5186                 len -= count;
5187                 cp += count;
5188                 off = 0;
5189                 m = m->m_next;
5190         }
5191 }
5192
5193 /*
5194  * Concatenate mbuf chain n to m.  Both chains must be of the same type
5195  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
5196  */
5197 void
5198 m_cat(struct mbuf *m, struct mbuf *n)
5199 {
5200         while (m->m_next)
5201                 m = m->m_next;
5202         while (n) {
5203                 if ((m->m_flags & M_EXT) ||
5204                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5205                         /* just join the two chains */
5206                         m->m_next = n;
5207                         return;
5208                 }
5209                 /* splat the data from one into the other */
5210                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5211                     (u_int)n->m_len);
5212                 m->m_len += n->m_len;
5213                 n = m_free(n);
5214         }
5215 }
5216
5217 void
5218 m_adj(struct mbuf *mp, int req_len)
5219 {
5220         int len = req_len;
5221         struct mbuf *m;
5222         int count;
5223
5224         if ((m = mp) == NULL)
5225                 return;
5226         if (len >= 0) {
5227                 /*
5228                  * Trim from head.
5229                  */
5230                 while (m != NULL && len > 0) {
5231                         if (m->m_len <= len) {
5232                                 len -= m->m_len;
5233                                 m->m_len = 0;
5234                                 m = m->m_next;
5235                         } else {
5236                                 m->m_len -= len;
5237                                 m->m_data += len;
5238                                 len = 0;
5239                         }
5240                 }
5241                 m = mp;
5242                 if (m->m_flags & M_PKTHDR)
5243                         m->m_pkthdr.len -= (req_len - len);
5244         } else {
5245                 /*
5246                  * Trim from tail.  Scan the mbuf chain,
5247                  * calculating its length and finding the last mbuf.
5248                  * If the adjustment only affects this mbuf, then just
5249                  * adjust and return.  Otherwise, rescan and truncate
5250                  * after the remaining size.
5251                  */
5252                 len = -len;
5253                 count = 0;
5254                 for (;;) {
5255                         count += m->m_len;
5256                         if (m->m_next == (struct mbuf *)0)
5257                                 break;
5258                         m = m->m_next;
5259                 }
5260                 if (m->m_len >= len) {
5261                         m->m_len -= len;
5262                         m = mp;
5263                         if (m->m_flags & M_PKTHDR)
5264                                 m->m_pkthdr.len -= len;
5265                         return;
5266                 }
5267                 count -= len;
5268                 if (count < 0)
5269                         count = 0;
5270                 /*
5271                  * Correct length for chain is "count".
5272                  * Find the mbuf with last data, adjust its length,
5273                  * and toss data from remaining mbufs on chain.
5274                  */
5275                 m = mp;
5276                 if (m->m_flags & M_PKTHDR)
5277                         m->m_pkthdr.len = count;
5278                 for (; m; m = m->m_next) {
5279                         if (m->m_len >= count) {
5280                                 m->m_len = count;
5281                                 break;
5282                         }
5283                         count -= m->m_len;
5284                 }
5285                 while ((m = m->m_next))
5286                         m->m_len = 0;
5287         }
5288 }
5289
5290 /*
5291  * Rearange an mbuf chain so that len bytes are contiguous
5292  * and in the data area of an mbuf (so that mtod and dtom
5293  * will work for a structure of size len).  Returns the resulting
5294  * mbuf chain on success, frees it and returns null on failure.
5295  * If there is room, it will add up to max_protohdr-len extra bytes to the
5296  * contiguous region in an attempt to avoid being called next time.
5297  */
5298 int MPFail;
5299
5300 struct mbuf *
5301 m_pullup(struct mbuf *n, int len)
5302 {
5303         struct mbuf *m;
5304         int count;
5305         int space;
5306
5307         /*
5308          * If first mbuf has no cluster, and has room for len bytes
5309          * without shifting current data, pullup into it,
5310          * otherwise allocate a new mbuf to prepend to the chain.
5311          */
5312         if ((n->m_flags & M_EXT) == 0 &&
5313             n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
5314                 if (n->m_len >= len)
5315                         return (n);
5316                 m = n;
5317                 n = n->m_next;
5318                 len -= m->m_len;
5319         } else {
5320                 if (len > MHLEN)
5321                         goto bad;
5322                 _MGET(m, M_DONTWAIT, n->m_type);
5323                 if (m == 0)
5324                         goto bad;
5325                 m->m_len = 0;
5326                 if (n->m_flags & M_PKTHDR) {
5327                         M_COPY_PKTHDR(m, n);
5328                         n->m_flags &= ~M_PKTHDR;
5329                 }
5330         }
5331         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5332         do {
5333                 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5334                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5335                     (unsigned)count);
5336                 len -= count;
5337                 m->m_len += count;
5338                 n->m_len -= count;
5339                 space -= count;
5340                 if (n->m_len)
5341                         n->m_data += count;
5342                 else
5343                         n = m_free(n);
5344         } while (len > 0 && n);
5345         if (len > 0) {
5346                 (void) m_free(m);
5347                 goto bad;
5348         }
5349         m->m_next = n;
5350         return (m);
5351 bad:
5352         m_freem(n);
5353         MPFail++;
5354         return (0);
5355 }
5356
5357 /*
5358  * Like m_pullup(), except a new mbuf is always allocated, and we allow
5359  * the amount of empty space before the data in the new mbuf to be specified
5360  * (in the event that the caller expects to prepend later).
5361  */
5362 __private_extern__ int MSFail = 0;
5363
5364 __private_extern__ struct mbuf *
5365 m_copyup(struct mbuf *n, int len, int dstoff)
5366 {
5367         struct mbuf *m;
5368         int count, space;
5369
5370         if (len > (MHLEN - dstoff))
5371                 goto bad;
5372         MGET(m, M_DONTWAIT, n->m_type);
5373         if (m == NULL)
5374                 goto bad;
5375         m->m_len = 0;
5376         if (n->m_flags & M_PKTHDR) {
5377                 m_copy_pkthdr(m, n);
5378                 n->m_flags &= ~M_PKTHDR;
5379         }
5380         m->m_data += dstoff;
5381         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5382         do {
5383                 count = min(min(max(len, max_protohdr), space), n->m_len);
5384                 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5385                     (unsigned)count);
5386                 len -= count;
5387                 m->m_len += count;
5388                 n->m_len -= count;
5389                 space -= count;
5390                 if (n->m_len)
5391                         n->m_data += count;
5392                 else
5393                         n = m_free(n);
5394         } while (len > 0 && n);
5395         if (len > 0) {
5396                 (void) m_free(m);
5397                 goto bad;
5398         }
5399         m->m_next = n;
5400         return (m);
5401 bad:
5402         m_freem(n);
5403         MSFail++;
5404         return (NULL);
5405 }
5406
/*
 * Split an mbuf chain after its first len0 bytes and hand back the
 * tail.  Returns NULL on failure, in which case an attempt is made to
 * leave the chain as it was.  This is m_split0() with packet-header
 * copying enabled.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
        const int copy_pkthdr = 1;

        return (m_split0(m0, len0, wait, copy_pkthdr));
}
5417
5418 static struct mbuf *
5419 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5420 {
5421         struct mbuf *m, *n;
5422         unsigned len = len0, remain;
5423
5424         for (m = m0; m && len > m->m_len; m = m->m_next)
5425                 len -= m->m_len;
5426         if (m == NULL)
5427                 return (NULL);
5428         remain = m->m_len - len;
5429         if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5430                 _MGETHDR(n, wait, m0->m_type);
5431                 if (n == NULL)
5432                         return (NULL);
5433                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5434                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5435                 m0->m_pkthdr.len = len0;
5436                 if (m->m_flags & M_EXT)
5437                         goto extpacket;
5438                 if (remain > MHLEN) {
5439                         /* m can't be the lead packet */
5440                         MH_ALIGN(n, 0);
5441                         n->m_next = m_split(m, len, wait);
5442                         if (n->m_next == NULL) {
5443                                 (void) m_free(n);
5444                                 return (NULL);
5445                         } else
5446                                 return (n);
5447                 } else
5448                         MH_ALIGN(n, remain);
5449         } else if (remain == 0) {
5450                 n = m->m_next;
5451                 m->m_next = NULL;
5452                 return (n);
5453         } else {
5454                 _MGET(n, wait, m->m_type);
5455                 if (n == NULL)
5456                         return (NULL);
5457                 M_ALIGN(n, remain);
5458         }
5459 extpacket:
5460         if (m->m_flags & M_EXT) {
5461                 n->m_flags |= M_EXT;
5462                 n->m_ext = m->m_ext;
5463                 m_incref(m);
5464                 n->m_data = m->m_data + len;
5465         } else {
5466                 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5467         }
5468         n->m_len = remain;
5469         m->m_len = len;
5470         n->m_next = m->m_next;
5471         m->m_next = NULL;
5472         return (n);
5473 }
5474
5475 /*
5476  * Routine to copy from device local memory into mbufs.
5477  */
5478 struct mbuf *
5479 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5480     void (*copy)(const void *, void *, size_t))
5481 {
5482         struct mbuf *m;
5483         struct mbuf *top = NULL, **mp = &top;
5484         int off = off0, len;
5485         char *cp;
5486         char *epkt;
5487
5488         cp = buf;
5489         epkt = cp + totlen;
5490         if (off) {
5491                 /*
5492                  * If 'off' is non-zero, packet is trailer-encapsulated,
5493                  * so we have to skip the type and length fields.
5494                  */
5495                 cp += off + 2 * sizeof (u_int16_t);
5496                 totlen -= 2 * sizeof (u_int16_t);
5497         }
5498         _MGETHDR(m, M_DONTWAIT, MT_DATA);
5499         if (m == NULL)
5500                 return (NULL);
5501         m->m_pkthdr.rcvif = ifp;
5502         m->m_pkthdr.len = totlen;
5503         m->m_len = MHLEN;
5504
5505         while (totlen > 0) {
5506                 if (top != NULL) {
5507                         _MGET(m, M_DONTWAIT, MT_DATA);
5508                         if (m == NULL) {
5509                                 m_freem(top);
5510                                 return (NULL);
5511                         }
5512                         m->m_len = MLEN;
5513                 }
5514                 len = MIN(totlen, epkt - cp);
5515                 if (len >= MINCLSIZE) {
5516                         MCLGET(m, M_DONTWAIT);
5517                         if (m->m_flags & M_EXT) {
5518                                 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5519                         } else {
5520                                 /* give up when it's out of cluster mbufs */
5521                                 if (top != NULL)
5522                                         m_freem(top);
5523                                 m_freem(m);
5524                                 return (NULL);
5525                         }
5526                 } else {
5527                         /*
5528                          * Place initial small packet/header at end of mbuf.
5529                          */
5530                         if (len < m->m_len) {
5531                                 if (top == NULL &&
5532                                     len + max_linkhdr <= m->m_len)
5533                                         m->m_data += max_linkhdr;
5534                                 m->m_len = len;
5535                         } else {
5536                                 len = m->m_len;
5537                         }
5538                 }
5539                 if (copy)
5540                         copy(cp, MTOD(m, caddr_t), (unsigned)len);
5541                 else
5542                         bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5543                 cp += len;
5544                 *mp = m;
5545                 mp = &m->m_next;
5546                 totlen -= len;
5547                 if (cp == epkt)
5548                         cp = buf;
5549         }
5550         return (top);
5551 }
5552
/*
 * Percentage of nclusters in use (percent_kmem in m_howmany()) below
 * which the pool grows with MB_GROWTH_NORMAL; at or above it,
 * MB_GROWTH_AGGRESSIVE is used.  May be overridden at build time.
 */
#ifndef MBUF_GROWTH_NORMAL_THRESH
#define MBUF_GROWTH_NORMAL_THRESH 25
#endif

/*
 * Cluster freelist allocation check.
 *
 * Computes by how much the pool of buffers of size bufsize should grow
 * so that num of them can be satisfied.  bufsize must be the size of
 * either the MC_BIGCL or the MC_16KCL class, and mbuf_mlock must be
 * held.  Returns 0 when the relevant map is already exhausted or when
 * enough buffers are free; otherwise returns the computed growth count,
 * capped by the class maximum (and, for MC_BIGCL, by nclusters).
 * Presumably the result is in units of bufsize-sized buffers -- confirm
 * against the callers.
 */
static int
m_howmany(int num, size_t bufsize)
{
        /* i is the running answer; j holds a competing estimate. */
        int i = 0, j = 0;
        u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
        u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
        u_int32_t sumclusters, freeclusters;
        u_int32_t percent_pool, percent_kmem;
        u_int32_t mb_growth, mb_growth_thresh;

        VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
            bufsize == m_maxsize(MC_16KCL));

        LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

        /* Numbers in 2K cluster units */
        m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
        m_clusters = m_total(MC_CL);
        m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
        m_16kclusters = m_total(MC_16KCL);
        /* 16K clusters are not part of the sum; they are limited by njcl. */
        sumclusters = m_mbclusters + m_clusters + m_bigclusters;

        /* Same conversion for the free counts. */
        m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
        m_clfree = m_infree(MC_CL);
        m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
        m_16kclfree = m_infree(MC_16KCL);
        freeclusters = m_mbfree + m_clfree + m_bigclfree;

        /* Bail if we've maxed out the mbuf memory map */
        if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
            (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
            (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
                return (0);
        }

        if (bufsize == m_maxsize(MC_BIGCL)) {
                /*
                 * Under minimum
                 * NOTE(review): m_bigclusters was scaled by NCLPBGSHIFT
                 * above while m_minlimit(MC_BIGCL) is used unscaled --
                 * confirm the units are meant to differ.
                 */
                if (m_bigclusters < m_minlimit(MC_BIGCL))
                        return (m_minlimit(MC_BIGCL) - m_bigclusters);

                /* Share of the pool in use, and share of nclusters taken. */
                percent_pool =
                    ((sumclusters - freeclusters) * 100) / sumclusters;
                percent_kmem = (sumclusters * 100) / nclusters;

                /*
                 * If a light/normal user, grow conservatively (75%)
                 * If a heavy user, grow aggressively (50%)
                 */
                if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
                        mb_growth = MB_GROWTH_NORMAL;
                else
                        mb_growth = MB_GROWTH_AGGRESSIVE;

                if (percent_kmem < 5) {
                        /* For initial allocations */
                        i = num;
                } else {
                        /* Return if >= MBIGCL_LOWAT clusters available */
                        if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
                            m_total(MC_BIGCL) >=
                            MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
                                return (0);

                        /* Ensure at least num clusters are accessible */
                        if (num >= m_infree(MC_BIGCL))
                                i = num - m_infree(MC_BIGCL);
                        if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
                                j = num - (m_total(MC_BIGCL) -
                                    m_minlimit(MC_BIGCL));

                        i = MAX(i, j);

                        /*
                         * Grow pool if percent_pool > 75 (normal growth)
                         * or percent_pool > 50 (aggressive growth).
                         */
                        mb_growth_thresh = 100 - (100 / (1 << mb_growth));
                        if (percent_pool > mb_growth_thresh)
                                j = ((sumclusters + num) >> mb_growth) -
                                    freeclusters;
                        i = MAX(i, j);
                }

                /*
                 * Check to ensure we didn't go over limits
                 * NOTE(review): the first clamp compares against the
                 * scaled m_bigclusters while the VERIFY below uses the
                 * unscaled m_total(MC_BIGCL) -- confirm this is intended.
                 */
                if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
                        i = m_maxlimit(MC_BIGCL) - m_bigclusters;
                /* i << 1 expresses i in the 2K units used by sumclusters. */
                if ((i << 1) + sumclusters >= nclusters)
                        i = (nclusters - sumclusters) >> 1;
                VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
                VERIFY(sumclusters + (i << 1) <= nclusters);

        } else { /* 16K CL */
                VERIFY(njcl > 0);
                /* Ensure at least num clusters are available */
                if (num >= m_16kclfree)
                        i = num - m_16kclfree;

                /* Always grow 16KCL pool aggressively */
                if (((m_16kclusters + num) >> 1) > m_16kclfree)
                        j = ((m_16kclusters + num) >> 1) - m_16kclfree;
                i = MAX(i, j);

                /* Check to ensure we don't go over limit */
                if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL))
                        i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
        }
        return (i);
}
5668 /*
5669  * Return the number of bytes in the mbuf chain, m.
5670  */
5671 unsigned int
5672 m_length(struct mbuf *m)
5673 {
5674         struct mbuf *m0;
5675         unsigned int pktlen;
5676
5677         if (m->m_flags & M_PKTHDR)
5678                 return (m->m_pkthdr.len);
5679
5680         pktlen = 0;
5681         for (m0 = m; m0 != NULL; m0 = m0->m_next)
5682                 pktlen += m0->m_len;
5683         return (pktlen);
5684 }
5685
5686 /*
5687  * Copy data from a buffer back into the indicated mbuf chain,
5688  * starting "off" bytes from the beginning, extending the mbuf
5689  * chain if necessary.
5690  */
5691 void
5692 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5693 {
5694 #if DEBUG
5695         struct mbuf *origm = m0;
5696         int error;
5697 #endif /* DEBUG */
5698
5699         if (m0 == NULL)
5700                 return;
5701
5702 #if DEBUG
5703         error =
5704 #endif /* DEBUG */
5705         m_copyback0(&m0, off, len, cp,
5706             M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5707
5708 #if DEBUG
5709         if (error != 0 || (m0 != NULL && origm != m0))
5710                 panic("m_copyback");
5711 #endif /* DEBUG */
5712 }
5713
5714 struct mbuf *
5715 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5716 {
5717         int error;
5718
5719         /* don't support chain expansion */
5720         VERIFY(off + len <= m_length(m0));
5721
5722         error = m_copyback0(&m0, off, len, cp,
5723             M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5724         if (error) {
5725                 /*
5726                  * no way to recover from partial success.
5727                  * just free the chain.
5728                  */
5729                 m_freem(m0);
5730                 return (NULL);
5731         }
5732         return (m0);
5733 }
5734
5735 /*
5736  * m_makewritable: ensure the specified range writable.
5737  */
5738 int
5739 m_makewritable(struct mbuf **mp, int off, int len, int how)
5740 {
5741         int error;
5742 #if DEBUG
5743         struct mbuf *n;
5744         int origlen, reslen;
5745
5746         origlen = m_length(*mp);
5747 #endif /* DEBUG */
5748
5749 #if 0 /* M_COPYALL is large enough */
5750         if (len == M_COPYALL)
5751                 len = m_length(*mp) - off; /* XXX */
5752 #endif
5753
5754         error = m_copyback0(mp, off, len, NULL,
5755             M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5756
5757 #if DEBUG
5758         reslen = 0;
5759         for (n = *mp; n; n = n->m_next)
5760                 reslen += n->m_len;
5761         if (origlen != reslen)
5762                 panic("m_makewritable: length changed");
5763         if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5764                 panic("m_makewritable: inconsist");
5765 #endif /* DEBUG */
5766
5767         return (error);
5768 }
5769
/*
 * Common worker for m_copyback(), m_copyback_cow() and m_makewritable().
 *
 * Operates on len bytes of chain *mp0 starting at offset off:
 *   M_COPYBACK0_COPYBACK  copy the bytes from vp into the chain
 *                         (vp must be non-NULL);
 *   M_COPYBACK0_PRESERVE  keep the existing bytes when an mbuf is
 *                         replaced (vp must be NULL);
 *   M_COPYBACK0_EXTEND    grow the chain when it is too short;
 *   M_COPYBACK0_COW       replace mbufs for which m_mclhasreference()
 *                         is true with freshly allocated ones.
 * EXTEND and COW must not both be set.  "how" is passed to the mbuf
 * allocators.
 *
 * *mp0 may be changed when the first mbuf is replaced.  Returns ENOBUFS
 * when an allocation needed to replace a read-only mbuf fails, 0
 * otherwise.  Note that 0 is also returned when the chain was too short
 * and could not be extended (flag not set, or allocation failure while
 * extending).
 */
static int
m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
    int how)
{
        int mlen;
        struct mbuf *m, *n;
        struct mbuf **mp;       /* link that points at the current mbuf */
        int totlen = 0;         /* bytes of chain accounted for so far */
        const char *cp = vp;

        VERIFY(mp0 != NULL);
        VERIFY(*mp0 != NULL);
        VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
        VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);

        /*
         * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
         * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
         */

        VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);

        mp = mp0;
        m = *mp;
        /* Phase 1: walk (and if needed grow) the chain up to offset off. */
        while (off > (mlen = m->m_len)) {
                off -= mlen;
                totlen += mlen;
                if (m->m_next == NULL) {
                        int tspace;
                        /* Also entered by goto from the copy loop below. */
extend:
                        if (!(flags & M_COPYBACK0_EXTEND))
                                goto out;

                        /*
                         * try to make some space at the end of "m".
                         */

                        mlen = m->m_len;
                        if (off + len >= MINCLSIZE &&
                            !(m->m_flags & M_EXT) && m->m_len == 0) {
                                MCLGET(m, how);
                        }
                        tspace = M_TRAILINGSPACE(m);
                        if (tspace > 0) {
                                tspace = MIN(tspace, off + len);
                                VERIFY(tspace > 0);
                                /* Zero only the gap in front of the data. */
                                bzero(mtod(m, char *) + m->m_len,
                                    MIN(off, tspace));
                                m->m_len += tspace;
                                /* Undo the accounting; m is revisited. */
                                off += mlen;
                                totlen -= mlen;
                                continue;
                        }

                        /*
                         * need to allocate an mbuf.
                         */

                        if (off + len >= MINCLSIZE) {
                                n = m_getcl(how, m->m_type, 0);
                        } else {
                                n = _M_GET(how, m->m_type);
                        }
                        if (n == NULL) {
                                goto out;
                        }
                        /*
                         * m_len is zeroed first, presumably so that
                         * M_TRAILINGSPACE() sees an empty mbuf.
                         */
                        n->m_len = 0;
                        n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
                        bzero(mtod(n, char *), MIN(n->m_len, off));
                        m->m_next = n;
                }
                mp = &m->m_next;
                m = m->m_next;
        }
        /* Phase 2: process len bytes starting at offset off within m. */
        while (len > 0) {
                mlen = m->m_len - off;
                if (mlen != 0 && m_mclhasreference(m)) {
                        char *datap;    /* destination for preserved bytes */
                        int eatlen;     /* bytes of old chain n replaces */

                        /*
                         * this mbuf is read-only.
                         * allocate a new writable mbuf and try again.
                         */

#if DIAGNOSTIC
                        if (!(flags & M_COPYBACK0_COW))
                                panic("m_copyback0: read-only");
#endif /* DIAGNOSTIC */

                        /*
                         * if we're going to write into the middle of
                         * a mbuf, split it first.
                         */
                        if (off > 0 && len < mlen) {
                                n = m_split0(m, off, how, 0);
                                if (n == NULL)
                                        goto enobufs;
                                m->m_next = n;
                                mp = &m->m_next;
                                m = n;
                                off = 0;
                                continue;
                        }

                        /*
                         * XXX TODO coalesce into the trailingspace of
                         * the previous mbuf when possible.
                         */

                        /*
                         * allocate a new mbuf.  copy packet header if needed.
                         */
                        n = _M_GET(how, m->m_type);
                        if (n == NULL)
                                goto enobufs;
                        if (off == 0 && (m->m_flags & M_PKTHDR)) {
                                M_COPY_PKTHDR(n, m);
                                n->m_len = MHLEN;
                        } else {
                                /* Note: uses M_DONTWAIT here, not "how". */
                                if (len >= MINCLSIZE)
                                        MCLGET(n, M_DONTWAIT);
                                n->m_len =
                                    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
                        }
                        if (n->m_len > len)
                                n->m_len = len;

                        /*
                         * free the region which has been overwritten.
                         * copying data from old mbufs if requested.
                         */
                        if (flags & M_COPYBACK0_PRESERVE)
                                datap = mtod(n, char *);
                        else
                                datap = NULL;
                        eatlen = n->m_len;
                        VERIFY(off == 0 || eatlen >= mlen);
                        if (off > 0) {
                                /* Keep m's first off bytes; n takes the rest. */
                                VERIFY(len >= mlen);
                                m->m_len = off;
                                m->m_next = n;
                                if (datap) {
                                        m_copydata(m, off, mlen, datap);
                                        datap += mlen;
                                }
                                eatlen -= mlen;
                                mp = &m->m_next;
                                m = m->m_next;
                        }
                        /*
                         * Consume following read-only mbufs of the same
                         * type that n covers, freeing those fully eaten.
                         */
                        while (m != NULL && m_mclhasreference(m) &&
                            n->m_type == m->m_type && eatlen > 0) {
                                mlen = MIN(eatlen, m->m_len);
                                if (datap) {
                                        m_copydata(m, 0, mlen, datap);
                                        datap += mlen;
                                }
                                m->m_data += mlen;
                                m->m_len -= mlen;
                                eatlen -= mlen;
                                if (m->m_len == 0)
                                        *mp = m = m_free(m);
                        }
                        /* n was sized for more than could be consumed. */
                        if (eatlen > 0)
                                n->m_len -= eatlen;
                        n->m_next = m;
                        *mp = m = n;
                        continue;
                }
                mlen = MIN(mlen, len);
                if (flags & M_COPYBACK0_COPYBACK) {
                        bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
                        cp += mlen;
                }
                len -= mlen;
                mlen += off;
                off = 0;
                totlen += mlen;
                if (len == 0)
                        break;
                if (m->m_next == NULL) {
                        goto extend;
                }
                mp = &m->m_next;
                m = m->m_next;
        }
out:
        /* A grown chain needs its packet header length raised to match. */
        if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
                VERIFY(flags & M_COPYBACK0_EXTEND);
                m->m_pkthdr.len = totlen;
        }

        return (0);

enobufs:
        return (ENOBUFS);
}
5967
5968 uint64_t
5969 mcl_to_paddr(char *addr)
5970 {
5971         vm_offset_t base_phys;
5972
5973         if (!MBUF_IN_MAP(addr))
5974                 return (0);
5975         base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5976
5977         if (base_phys == 0)
5978                 return (0);
5979         return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5980 }
5981
5982 /*
5983  * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5984  * And really copy the thing.  That way, we don't "precompute" checksums
5985  * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
5986  * small packets, don't dup into a cluster.  That way received  packets
5987  * don't take up too much room in the sockbuf (cf. sbspace()).
5988  */
5989 int MDFail;
5990
5991 struct mbuf *
5992 m_dup(struct mbuf *m, int how)
5993 {
5994         struct mbuf *n, **np;
5995         struct mbuf *top;
5996         int copyhdr = 0;
5997
5998         np = &top;
5999         top = NULL;
6000         if (m->m_flags & M_PKTHDR)
6001                 copyhdr = 1;
6002
6003         /*
6004          * Quick check: if we have one mbuf and its data fits in an
6005          *  mbuf with packet header, just copy and go.
6006          */
6007         if (m->m_next == NULL) {
6008                 /* Then just move the data into an mbuf and be done... */
6009                 if (copyhdr) {
6010                         if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
6011                                 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
6012                                         return (NULL);
6013                                 n->m_len = m->m_len;
6014                                 m_dup_pkthdr(n, m, how);
6015                                 bcopy(m->m_data, n->m_data, m->m_len);
6016                                 return (n);
6017                         }
6018                 } else if (m->m_len <= MLEN) {
6019                         if ((n = _M_GET(how, m->m_type)) == NULL)
6020                                 return (NULL);
6021                         bcopy(m->m_data, n->m_data, m->m_len);
6022                         n->m_len = m->m_len;
6023                         return (n);
6024                 }
6025         }
6026         while (m != NULL) {
6027 #if BLUE_DEBUG
6028                 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
6029                     m->m_data);
6030 #endif
6031                 if (copyhdr)
6032                         n = _M_GETHDR(how, m->m_type);
6033                 else
6034                         n = _M_GET(how, m->m_type);
6035                 if (n == NULL)
6036                         goto nospace;
6037                 if (m->m_flags & M_EXT) {
6038                         if (m->m_len <= m_maxsize(MC_CL))
6039                                 MCLGET(n, how);
6040                         else if (m->m_len <= m_maxsize(MC_BIGCL))
6041                                 n = m_mbigget(n, how);
6042                         else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
6043                                 n = m_m16kget(n, how);
6044                         if (!(n->m_flags & M_EXT)) {
6045                                 (void) m_free(n);
6046                                 goto nospace;
6047                         }
6048                 }
6049                 *np = n;
6050                 if (copyhdr) {
6051                         /* Don't use M_COPY_PKTHDR: preserve m_data */
6052                         m_dup_pkthdr(n, m, how);
6053                         copyhdr = 0;
6054                         if (!(n->m_flags & M_EXT))
6055                                 n->m_data = n->m_pktdat;
6056                 }
6057                 n->m_len = m->m_len;
6058                 /*
6059                  * Get the dup on the same bdry as the original
6060                  * Assume that the two mbufs have the same offset to data area
6061                  * (up to word boundaries)
6062                  */
6063                 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
6064                 m = m->m_next;
6065                 np = &n->m_next;
6066 #if BLUE_DEBUG
6067                 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
6068                     n->m_data);
6069 #endif
6070         }
6071
6072         if (top == NULL)
6073                 MDFail++;
6074         return (top);
6075
6076 nospace:
6077         m_freem(top);
6078         MDFail++;
6079         return (NULL);
6080 }
6081
/*
 * Evaluates to non-zero when mbuf (m) has M_EXT set and its data,
 * [m_data, m_data + m_len), extends past the end of the page that m_data
 * starts in.  A page-aligned m_data qualifies when m_len exceeds
 * PAGE_SIZE; an unaligned m_data qualifies when the next page boundary
 * lies before the end of the data.
 *
 * Note that (m) is evaluated several times, so the argument must be free
 * of side effects.
 */
#define MBUF_MULTIPAGES(m)                                              \
        (((m)->m_flags & M_EXT) &&                                      \
        ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE)                          \
        && (m)->m_len > PAGE_SIZE) ||                                   \
        (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&                       \
        P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6088
6089 static struct mbuf *
6090 m_expand(struct mbuf *m, struct mbuf **last)
6091 {
6092         struct mbuf *top = NULL;
6093         struct mbuf **nm = &top;
6094         uintptr_t data0, data;
6095         unsigned int len0, len;
6096
6097         VERIFY(MBUF_MULTIPAGES(m));
6098         VERIFY(m->m_next == NULL);
6099         data0 = (uintptr_t)m->m_data;
6100         len0 = m->m_len;
6101         *last = top;
6102
6103         for (;;) {
6104                 struct mbuf *n;
6105
6106                 data = data0;
6107                 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
6108                         len = PAGE_SIZE;
6109                 else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
6110                     P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
6111                         len = P2ROUNDUP(data, PAGE_SIZE) - data;
6112                 else
6113                         len = len0;
6114
6115                 VERIFY(len > 0);
6116                 VERIFY(m->m_flags & M_EXT);
6117                 m->m_data = (void *)data;
6118                 m->m_len = len;
6119
6120                 *nm = *last = m;
6121                 nm = &m->m_next;
6122                 m->m_next = NULL;
6123
6124                 data0 += len;
6125                 len0 -= len;
6126                 if (len0 == 0)
6127                         break;
6128
6129                 n = _M_RETRY(M_DONTWAIT, MT_DATA);
6130                 if (n == NULL) {
6131                         m_freem(top);
6132                         top = *last = NULL;
6133                         break;
6134                 }
6135
6136                 n->m_ext = m->m_ext;
6137                 m_incref(m);
6138                 n->m_flags |= M_EXT;
6139                 m = n;
6140         }
6141         return (top);
6142 }
6143
6144 struct mbuf *
6145 m_normalize(struct mbuf *m)
6146 {
6147         struct mbuf *top = NULL;
6148         struct mbuf **nm = &top;
6149         boolean_t expanded = FALSE;
6150
6151         while (m != NULL) {
6152                 struct mbuf *n;
6153
6154                 n = m->m_next;
6155                 m->m_next = NULL;
6156
6157                 /* Does the data cross one or more page boundaries? */
6158                 if (MBUF_MULTIPAGES(m)) {
6159                         struct mbuf *last;
6160                         if ((m = m_expand(m, &last)) == NULL) {
6161                                 m_freem(n);
6162                                 m_freem(top);
6163                                 top = NULL;
6164                                 break;
6165                         }
6166                         *nm = m;
6167                         nm = &last->m_next;
6168                         expanded = TRUE;
6169                 } else {
6170                         *nm = m;
6171                         nm = &m->m_next;
6172                 }
6173                 m = n;
6174         }
6175         if (expanded)
6176                 atomic_add_32(&mb_normalized, 1);
6177         return (top);
6178 }
6179
6180 /*
6181  * Append the specified data to the indicated mbuf chain,
6182  * Extend the mbuf chain if the new data does not fit in
6183  * existing space.
6184  *
6185  * Return 1 if able to complete the job; otherwise 0.
6186  */
6187 int
6188 m_append(struct mbuf *m0, int len, caddr_t cp)
6189 {
6190         struct mbuf *m, *n;
6191         int remainder, space;
6192
6193         for (m = m0; m->m_next != NULL; m = m->m_next)
6194                 ;
6195         remainder = len;
6196         space = M_TRAILINGSPACE(m);
6197         if (space > 0) {
6198                 /*
6199                  * Copy into available space.
6200                  */
6201                 if (space > remainder)
6202                         space = remainder;
6203                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6204                 m->m_len += space;
6205                 cp += space;
6206                 remainder -= space;
6207         }
6208         while (remainder > 0) {
6209                 /*
6210                  * Allocate a new mbuf; could check space
6211                  * and allocate a cluster instead.
6212                  */
6213                 n = m_get(M_WAITOK, m->m_type);
6214                 if (n == NULL)
6215                         break;
6216                 n->m_len = min(MLEN, remainder);
6217                 bcopy(cp, mtod(n, caddr_t), n->m_len);
6218                 cp += n->m_len;
6219                 remainder -= n->m_len;
6220                 m->m_next = n;
6221                 m = n;
6222         }
6223         if (m0->m_flags & M_PKTHDR)
6224                 m0->m_pkthdr.len += len - remainder;
6225         return (remainder == 0);
6226 }
6227
6228 struct mbuf *
6229 m_last(struct mbuf *m)
6230 {
6231         while (m->m_next != NULL)
6232                 m = m->m_next;
6233         return (m);
6234 }
6235
6236 unsigned int
6237 m_fixhdr(struct mbuf *m0)
6238 {
6239         u_int len;
6240
6241         VERIFY(m0->m_flags & M_PKTHDR);
6242
6243         len = m_length2(m0, NULL);
6244         m0->m_pkthdr.len = len;
6245         return (len);
6246 }
6247
6248 unsigned int
6249 m_length2(struct mbuf *m0, struct mbuf **last)
6250 {
6251         struct mbuf *m;
6252         u_int len;
6253
6254         len = 0;
6255         for (m = m0; m != NULL; m = m->m_next) {
6256                 len += m->m_len;
6257                 if (m->m_next == NULL)
6258                         break;
6259         }
6260         if (last != NULL)
6261                 *last = m;
6262         return (len);
6263 }
6264
6265 /*
6266  * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6267  * and clusters.  If allocation fails and this cannot be completed, NULL will
6268  * be returned, but the passed in chain will be unchanged.  Upon success,
6269  * the original chain will be freed, and the new chain will be returned.
6270  *
6271  * If a non-packet header is passed in, the original mbuf (chain?) will
6272  * be returned unharmed.
6273  *
6274  * If offset is specfied, the first mbuf in the chain will have a leading
6275  * space of the amount stated by the "off" parameter.
6276  *
6277  * This routine requires that the m_pkthdr.header field of the original
6278  * mbuf chain is cleared by the caller.
6279  */
6280 struct mbuf *
6281 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
6282 {
6283         struct mbuf *m_new = NULL, *m_final = NULL;
6284         int progress = 0, length, pktlen;
6285
6286         if (!(m0->m_flags & M_PKTHDR))
6287                 return (m0);
6288
6289         VERIFY(off < MHLEN);
6290         m_fixhdr(m0); /* Needed sanity check */
6291
6292         pktlen = m0->m_pkthdr.len + off;
6293         if (pktlen > MHLEN)
6294                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6295         else
6296                 m_final = m_gethdr(how, MT_DATA);
6297
6298         if (m_final == NULL)
6299                 goto nospace;
6300
6301         if (off > 0) {
6302                 pktlen -= off;
6303                 m_final->m_data += off;
6304         }
6305
6306         /*
6307          * Caller must have handled the contents pointed to by this
6308          * pointer before coming here, as otherwise it will point to
6309          * the original mbuf which will get freed upon success.
6310          */
6311         VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6312
6313         if (m_dup_pkthdr(m_final, m0, how) == 0)
6314                 goto nospace;
6315
6316         m_new = m_final;
6317
6318         while (progress < pktlen) {
6319                 length = pktlen - progress;
6320                 if (length > MCLBYTES)
6321                         length = MCLBYTES;
6322                 length -= ((m_new == m_final) ? off : 0);
6323                 if (length < 0)
6324                         goto nospace;
6325
6326                 if (m_new == NULL) {
6327                         if (length > MLEN)
6328                                 m_new = m_getcl(how, MT_DATA, 0);
6329                         else
6330                                 m_new = m_get(how, MT_DATA);
6331                         if (m_new == NULL)
6332                                 goto nospace;
6333                 }
6334
6335                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6336                 progress += length;
6337                 m_new->m_len = length;
6338                 if (m_new != m_final)
6339                         m_cat(m_final, m_new);
6340                 m_new = NULL;
6341         }
6342         m_freem(m0);
6343         m0 = m_final;
6344         return (m0);
6345 nospace:
6346         if (m_final)
6347                 m_freem(m_final);
6348         return (NULL);
6349 }
6350
/*
 * Defragment the chain m0 with no leading space in the first mbuf; this
 * is m_defrag_offset() called with an offset of 0.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
        struct mbuf *compact;

        compact = m_defrag_offset(m0, 0, how);
        return (compact);
}
6356
6357 void
6358 m_mchtype(struct mbuf *m, int t)
6359 {
6360         mtype_stat_inc(t);
6361         mtype_stat_dec(m->m_type);
6362         (m)->m_type = t;
6363 }
6364
6365 void *
6366 m_mtod(struct mbuf *m)
6367 {
6368         return (MTOD(m, void *));
6369 }
6370
6371 struct mbuf *
6372 m_dtom(void *x)
6373 {
6374         return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6375 }
6376
/*
 * Function form of the _MCHECK() macro: applies _MCHECK() to mbuf m.
 * NOTE(review): _MCHECK() is defined outside this part of the file;
 * presumably it sanity-checks the mbuf -- confirm against its definition.
 */
void
m_mcheck(struct mbuf *m)
{
        _MCHECK(m);
}
6382
6383 /*
6384  * Return a pointer to mbuf/offset of location in mbuf chain.
6385  */
6386 struct mbuf *
6387 m_getptr(struct mbuf *m, int loc, int *off)
6388 {
6389
6390         while (loc >= 0) {
6391                 /* Normal end of search. */
6392                 if (m->m_len > loc) {
6393                         *off = loc;
6394                         return (m);
6395                 } else {
6396                         loc -= m->m_len;
6397                         if (m->m_next == NULL) {
6398                                 if (loc == 0) {
6399                                         /* Point at the end of valid data. */
6400                                         *off = m->m_len;
6401                                         return (m);
6402                                 }
6403                                 return (NULL);
6404                         }
6405                         m = m->m_next;
6406                 }
6407         }
6408         return (NULL);
6409 }
6410
6411 /*
6412  * Inform the corresponding mcache(s) that there's a waiter below.
6413  */
6414 static void
6415 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6416 {
6417         mcache_waiter_inc(m_cache(class));
6418         if (comp) {
6419                 if (class == MC_CL) {
6420                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6421                 } else if (class == MC_BIGCL) {
6422                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6423                 } else if (class == MC_16KCL) {
6424                         mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6425                 } else {
6426                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6427                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6428                 }
6429         }
6430 }
6431
6432 /*
6433  * Inform the corresponding mcache(s) that there's no more waiter below.
6434  */
6435 static void
6436 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6437 {
6438         mcache_waiter_dec(m_cache(class));
6439         if (comp) {
6440                 if (class == MC_CL) {
6441                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6442                 } else if (class == MC_BIGCL) {
6443                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6444                 } else if (class == MC_16KCL) {
6445                         mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6446                 } else {
6447                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6448                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6449                 }
6450         }
6451 }
6452
6453 /*
6454  * Called during slab (blocking and non-blocking) allocation.  If there
6455  * is at least one waiter, and the time since the first waiter is blocked
6456  * is greater than the watchdog timeout, panic the system.
6457  */
6458 static void
6459 mbuf_watchdog(void)
6460 {
6461         struct timeval now;
6462         unsigned int since;
6463
6464         if (mb_waiters == 0 || !mb_watchdog)
6465                 return;
6466
6467         microuptime(&now);
6468         since = now.tv_sec - mb_wdtstart.tv_sec;
6469         if (since >= MB_WDT_MAXTIME) {
6470                 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6471                     mb_waiters, since, mbuf_dump());
6472                 /* NOTREACHED */
6473         }
6474 }
6475
6476 /*
6477  * Called during blocking allocation.  Returns TRUE if one or more objects
6478  * are available at the per-CPU caches layer and that allocation should be
6479  * retried at that level.
6480  */
6481 static boolean_t
6482 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6483 {
6484         boolean_t mcache_retry = FALSE;
6485
6486         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6487
6488         /* Check if there's anything at the cache layer */
6489         if (mbuf_cached_above(class, wait)) {
6490                 mcache_retry = TRUE;
6491                 goto done;
6492         }
6493
6494         /* Nothing?  Then try hard to get it from somewhere */
6495         m_reclaim(class, num, (wait & MCR_COMP));
6496
6497         /* We tried hard and got something? */
6498         if (m_infree(class) > 0) {
6499                 mbstat.m_wait++;
6500                 goto done;
6501         } else if (mbuf_cached_above(class, wait)) {
6502                 mbstat.m_wait++;
6503                 mcache_retry = TRUE;
6504                 goto done;
6505         } else if (wait & MCR_TRYHARD) {
6506                 mcache_retry = TRUE;
6507                 goto done;
6508         }
6509
6510         /*
6511          * There's really nothing for us right now; inform the
6512          * cache(s) that there is a waiter below and go to sleep.
6513          */
6514         mbuf_waiter_inc(class, (wait & MCR_COMP));
6515
6516         VERIFY(!(wait & MCR_NOSLEEP));
6517
6518         /*
6519          * If this is the first waiter, arm the watchdog timer.  Otherwise
6520          * check if we need to panic the system due to watchdog timeout.
6521          */
6522         if (mb_waiters == 0)
6523                 microuptime(&mb_wdtstart);
6524         else
6525                 mbuf_watchdog();
6526
6527         mb_waiters++;
6528         m_region_expand(class) += m_total(class) + num;
6529         /* wake up the worker thread */
6530         if (class > MC_MBUF && mbuf_worker_ready &&
6531             mbuf_worker_needs_wakeup) {
6532                 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
6533                 mbuf_worker_needs_wakeup = FALSE;
6534         }
6535
6536         (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6537
6538         /* We are now up; stop getting notified until next round */
6539         mbuf_waiter_dec(class, (wait & MCR_COMP));
6540
6541         /* We waited and got something */
6542         if (m_infree(class) > 0) {
6543                 mbstat.m_wait++;
6544                 goto done;
6545         } else if (mbuf_cached_above(class, wait)) {
6546                 mbstat.m_wait++;
6547                 mcache_retry = TRUE;
6548         }
6549 done:
6550         return (mcache_retry);
6551 }
6552
/*
 * Body of the mbuf worker thread; never returns.
 *
 * Each pass runs with mbuf_mlock held.  For each of the MC_CL, MC_BIGCL
 * and MC_16KCL classes that has a pending m_region_expand() request, the
 * request is reduced by the number of objects currently in use, capped
 * so that the class total stays within m_maxlimit(), cleared, and then
 * handed to freelist_populate().  If the MC_CL or MC_BIGCL population
 * added anything, MC_MBUF is populated one unit at a time until there
 * are at least as many mbufs as 2 KB plus 4 KB clusters.
 *
 * The pass ends by flagging that the thread needs a wakeup, recording
 * the run time, and blocking on &mbuf_worker_needs_wakeup with this
 * function passed to thread_block() as the continuation.
 * NOTE(review): with a continuation, execution presumably restarts at
 * the top of this function on wakeup rather than looping -- confirm
 * against thread_block() semantics.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
        int mbuf_expand;

        while (1) {
                lck_mtx_lock(mbuf_mlock);
                mbuf_worker_run_cnt++;
                /* Counts cluster classes (2 KB, 4 KB) that actually grew */
                mbuf_expand = 0;
                if (m_region_expand(MC_CL) > 0) {
                        int n;
                        mb_expand_cl_cnt++;
                        /* Adjust to current number of cluster in use */
                        n = m_region_expand(MC_CL) -
                            (m_total(MC_CL) - m_infree(MC_CL));
                        /* Never grow past the class limit */
                        if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
                                n = m_maxlimit(MC_CL) - m_total(MC_CL);
                        if (n > 0) {
                                mb_expand_cl_total += n;
                        }
                        /* The request is consumed whether or not n > 0 */
                        m_region_expand(MC_CL) = 0;

                        if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
                                mbuf_expand++;
                }
                if (m_region_expand(MC_BIGCL) > 0) {
                        int n;
                        mb_expand_bigcl_cnt++;
                        /* Adjust to current number of 4 KB cluster in use */
                        n = m_region_expand(MC_BIGCL) -
                            (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
                        /* Never grow past the class limit */
                        if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
                                n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
                        if (n > 0) {
                                mb_expand_bigcl_total += n;
                        }
                        /* The request is consumed whether or not n > 0 */
                        m_region_expand(MC_BIGCL) = 0;

                        if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
                                mbuf_expand++;
                }
                if (m_region_expand(MC_16KCL) > 0) {
                        int n;
                        mb_expand_16kcl_cnt++;
                        /* Adjust to current number of 16 KB cluster in use */
                        n = m_region_expand(MC_16KCL) -
                            (m_total(MC_16KCL) - m_infree(MC_16KCL));
                        /* Never grow past the class limit */
                        if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
                                n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
                        if (n > 0) {
                                mb_expand_16kcl_total += n;
                        }
                        /* The request is consumed whether or not n > 0 */
                        m_region_expand(MC_16KCL) = 0;

                        /*
                         * Unlike the two classes above, growth here does
                         * not bump mbuf_expand, so it does not by itself
                         * trigger the mbuf top-up below.
                         */
                        if (n > 0)
                                (void) freelist_populate(MC_16KCL, n, M_WAIT);
                }

                /*
                 * Because we can run out of memory before filling the mbuf
                 * map, we should not allocate more clusters than they are
                 * mbufs -- otherwise we could have a large number of useless
                 * clusters allocated.
                 */
                if (mbuf_expand) {
                        while (m_total(MC_MBUF) <
                            (m_total(MC_BIGCL) + m_total(MC_CL))) {
                                mb_expand_cnt++;
                                /* Stop as soon as nothing more can be added */
                                if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
                                        break;
                        }
                }

                /* Tell mbuf_sleep() that a wakeup() is needed next time */
                mbuf_worker_needs_wakeup = TRUE;
                /*
                 * If there's a deadlock and we're not sending / receiving
                 * packets, net_uptime() won't be updated.  Update it here
                 * so we are sure it's correct.
                 */
                net_update_uptime();
                mbuf_worker_last_runtime = net_uptime();
                /*
                 * Register the wait before dropping the lock, then block
                 * once the lock has been released.
                 */
                assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
                    THREAD_UNINT);
                lck_mtx_unlock(mbuf_mlock);
                (void) thread_block((thread_continue_t)mbuf_worker_thread);
        }
}
6641
6642 __attribute__((noreturn))
6643 static void
6644 mbuf_worker_thread_init(void)
6645 {
6646         mbuf_worker_ready++;
6647         mbuf_worker_thread();
6648 }
6649
6650 static mcl_slab_t *
6651 slab_get(void *buf)
6652 {
6653         mcl_slabg_t *slg;
6654         unsigned int ix, k;
6655
6656         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6657
6658         VERIFY(MBUF_IN_MAP(buf));
6659         ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6660         VERIFY(ix < maxslabgrp);
6661
6662         if ((slg = slabstbl[ix]) == NULL) {
6663                 /*
6664                  * In the current implementation, we never shrink the slabs
6665                  * table; if we attempt to reallocate a cluster group when
6666                  * it's already allocated, panic since this is a sign of a
6667                  * memory corruption (slabstbl[ix] got nullified).
6668                  */
6669                 ++slabgrp;
6670                 VERIFY(ix < slabgrp);
6671                 /*
6672                  * Slabs expansion can only be done single threaded; when
6673                  * we get here, it must be as a result of m_clalloc() which
6674                  * is serialized and therefore mb_clalloc_busy must be set.
6675                  */
6676                 VERIFY(mb_clalloc_busy);
6677                 lck_mtx_unlock(mbuf_mlock);
6678
6679                 /* This is a new buffer; create the slabs group for it */
6680                 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6681                     M_WAITOK | M_ZERO);
6682                 MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
6683                     M_TEMP, M_WAITOK | M_ZERO);
6684                 VERIFY(slg != NULL && slg->slg_slab != NULL);
6685
6686                 lck_mtx_lock(mbuf_mlock);
6687                 /*
6688                  * No other thread could have gone into m_clalloc() after
6689                  * we dropped the lock above, so verify that it's true.
6690                  */
6691                 VERIFY(mb_clalloc_busy);
6692
6693                 slabstbl[ix] = slg;
6694
6695                 /* Chain each slab in the group to its forward neighbor */
6696                 for (k = 1; k < NSLABSPMB; k++)
6697                         slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6698                 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6699
6700                 /* And chain the last slab in the previous group to this */
6701                 if (ix > 0) {
6702                         VERIFY(slabstbl[ix - 1]->
6703                             slg_slab[NSLABSPMB - 1].sl_next == NULL);
6704                         slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6705                             &slg->slg_slab[0];
6706                 }
6707         }
6708
6709         ix = MTOPG(buf) % NSLABSPMB;
6710         VERIFY(ix < NSLABSPMB);
6711
6712         return (&slg->slg_slab[ix]);
6713 }
6714
6715 static void
6716 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6717     void *base, void *head, unsigned int len, int refcnt, int chunks)
6718 {
6719         sp->sl_class = class;
6720         sp->sl_flags = flags;
6721         sp->sl_base = base;
6722         sp->sl_head = head;
6723         sp->sl_len = len;
6724         sp->sl_refcnt = refcnt;
6725         sp->sl_chunks = chunks;
6726         slab_detach(sp);
6727 }
6728
6729 static void
6730 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6731 {
6732         VERIFY(slab_is_detached(sp));
6733         m_slab_cnt(class)++;
6734         TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6735         sp->sl_flags &= ~SLF_DETACHED;
6736
6737         /*
6738          * If a buffer spans multiple contiguous pages then mark them as
6739          * detached too
6740          */
6741         if (class == MC_16KCL) {
6742                 int k;
6743                 for (k = 1; k < NSLABSP16KB; k++) {
6744                         sp = sp->sl_next;
6745                         /* Next slab must already be present */
6746                         VERIFY(sp != NULL && slab_is_detached(sp));
6747                         sp->sl_flags &= ~SLF_DETACHED;
6748                 }
6749         }
6750 }
6751
6752 static void
6753 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6754 {
6755         int k;
6756         VERIFY(!slab_is_detached(sp));
6757         VERIFY(m_slab_cnt(class) > 0);
6758         m_slab_cnt(class)--;
6759         TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6760         slab_detach(sp);
6761         if (class == MC_16KCL) {
6762                 for (k = 1; k < NSLABSP16KB; k++) {
6763                         sp = sp->sl_next;
6764                         /* Next slab must already be present */
6765                         VERIFY(sp != NULL);
6766                         VERIFY(!slab_is_detached(sp));
6767                         slab_detach(sp);
6768                 }
6769         }
6770 }
6771
6772 static boolean_t
6773 slab_inrange(mcl_slab_t *sp, void *buf)
6774 {
6775         return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6776             (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6777 }
6778
6779 #undef panic
6780
6781 static void
6782 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6783 {
6784         int i;
6785         unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6786         uintptr_t buf = (uintptr_t)sp->sl_base;
6787
6788         for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6789                 void *next = ((mcache_obj_t *)buf)->obj_next;
6790                 if (next != addr)
6791                         continue;
6792                 if (!mclverify) {
6793                         if (next != NULL && !MBUF_IN_MAP(next)) {
6794                                 mcache_t *cp = m_cache(sp->sl_class);
6795                                 panic("%s: %s buffer %p in slab %p modified "
6796                                     "after free at offset 0: %p out of range "
6797                                     "[%p-%p)\n", __func__, cp->mc_name,
6798                                     (void *)buf, sp, next, mbutl, embutl);
6799                                 /* NOTREACHED */
6800                         }
6801                 } else {
6802                         mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6803                             (mcache_obj_t *)buf);
6804                         mcl_audit_verify_nextptr(next, mca);
6805                 }
6806         }
6807 }
6808
6809 static void
6810 slab_detach(mcl_slab_t *sp)
6811 {
6812         sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6813         sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6814         sp->sl_flags |= SLF_DETACHED;
6815 }
6816
6817 static boolean_t
6818 slab_is_detached(mcl_slab_t *sp)
6819 {
6820         return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6821             (intptr_t)sp->sl_link.tqe_prev == -1 &&
6822             (sp->sl_flags & SLF_DETACHED));
6823 }
6824
6825 static void
6826 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6827     mcache_obj_t **con_list, size_t con_size, unsigned int num)
6828 {
6829         mcache_audit_t *mca, *mca_tail;
6830         mcache_obj_t *con = NULL;
6831         boolean_t save_contents = (con_list != NULL);
6832         unsigned int i, ix;
6833
6834         ASSERT(num <= NMBPG);
6835         ASSERT(con_list == NULL || con_size != 0);
6836
6837         ix = MTOPG(buf);
6838         VERIFY(ix < maxclaudit);
6839
6840         /* Make sure we haven't been here before */
6841         for (i = 0; i < num; i++)
6842                 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6843
6844         mca = mca_tail = *mca_list;
6845         if (save_contents)
6846                 con = *con_list;
6847
6848         for (i = 0; i < num; i++) {
6849                 mcache_audit_t *next;
6850
6851                 next = mca->mca_next;
6852                 bzero(mca, sizeof (*mca));
6853                 mca->mca_next = next;
6854                 mclaudit[ix].cl_audit[i] = mca;
6855
6856                 /* Attach the contents buffer if requested */
6857                 if (save_contents) {
6858                         mcl_saved_contents_t *msc =
6859                             (mcl_saved_contents_t *)(void *)con;
6860
6861                         VERIFY(msc != NULL);
6862                         VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6863                         VERIFY(con_size == sizeof (*msc));
6864                         mca->mca_contents_size = con_size;
6865                         mca->mca_contents = msc;
6866                         con = con->obj_next;
6867                         bzero(mca->mca_contents, mca->mca_contents_size);
6868                 }
6869
6870                 mca_tail = mca;
6871                 mca = mca->mca_next;
6872         }
6873
6874         if (save_contents)
6875                 *con_list = con;
6876
6877         *mca_list = mca_tail->mca_next;
6878         mca_tail->mca_next = NULL;
6879 }
6880
6881 static void
6882 mcl_audit_free(void *buf, unsigned int num)
6883 {
6884         unsigned int i, ix;
6885         mcache_audit_t *mca, *mca_list;
6886
6887         ix = MTOPG(buf);
6888         VERIFY(ix < maxclaudit);
6889
6890         if (mclaudit[ix].cl_audit[0] != NULL) {
6891                 mca_list = mclaudit[ix].cl_audit[0];
6892                 for (i = 0; i < num; i++) {
6893                         mca = mclaudit[ix].cl_audit[i];
6894                         mclaudit[ix].cl_audit[i] = NULL;
6895                         if (mca->mca_contents)
6896                                 mcache_free(mcl_audit_con_cache,
6897                                     mca->mca_contents);
6898                 }
6899                 mcache_free_ext(mcache_audit_cache,
6900                     (mcache_obj_t *)mca_list);
6901         }
6902 }
6903
6904 /*
6905  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6906  * the corresponding audit structure for that buffer.
6907  */
6908 static mcache_audit_t *
6909 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
6910 {
6911         mcache_audit_t *mca = NULL;
6912         int ix = MTOPG(mobj), m_idx = 0;
6913         unsigned char *page_addr;
6914
6915         VERIFY(ix < maxclaudit);
6916         VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
6917
6918         page_addr = PGTOM(ix);
6919
6920         switch (class) {
6921         case MC_MBUF:
6922                 /*
6923                  * For the mbuf case, find the index of the page
6924                  * used by the mbuf and use that index to locate the
6925                  * base address of the page.  Then find out the
6926                  * mbuf index relative to the page base and use
6927                  * it to locate the audit structure.
6928                  */
6929                 m_idx = MBPAGEIDX(page_addr, mobj);
6930                 VERIFY(m_idx < (int)NMBPG);
6931                 mca = mclaudit[ix].cl_audit[m_idx];
6932                 break;
6933
6934         case MC_CL:
6935                 /*
6936                  * Same thing as above, but for 2KB clusters in a page.
6937                  */
6938                 m_idx = CLPAGEIDX(page_addr, mobj);
6939                 VERIFY(m_idx < (int)NCLPG);
6940                 mca = mclaudit[ix].cl_audit[m_idx];
6941                 break;
6942
6943         case MC_BIGCL:
6944                 m_idx = BCLPAGEIDX(page_addr, mobj);
6945                 VERIFY(m_idx < (int)NBCLPG);
6946                 mca = mclaudit[ix].cl_audit[m_idx];
6947                 break;
6948         case MC_16KCL:
6949                 /*
6950                  * Same as above, but only return the first element.
6951                  */
6952                 mca = mclaudit[ix].cl_audit[0];
6953                 break;
6954
6955         default:
6956                 VERIFY(0);
6957                 /* NOTREACHED */
6958         }
6959
6960         return (mca);
6961 }
6962
6963 static void
6964 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6965     boolean_t alloc)
6966 {
6967         struct mbuf *m = addr;
6968         mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6969
6970         VERIFY(mca->mca_contents != NULL &&
6971             mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6972
6973         if (mclverify)
6974                 mcl_audit_verify_nextptr(next, mca);
6975
6976         if (!alloc) {
6977                 /* Save constructed mbuf fields */
6978                 mcl_audit_save_mbuf(m, mca);
6979                 if (mclverify) {
6980                         mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6981                             m_maxsize(MC_MBUF));
6982                 }
6983                 ((mcache_obj_t *)m)->obj_next = next;
6984                 return;
6985         }
6986
6987         /* Check if the buffer has been corrupted while in freelist */
6988         if (mclverify) {
6989                 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6990         }
6991         /* Restore constructed mbuf fields */
6992         mcl_audit_restore_mbuf(m, mca, composite);
6993 }
6994
6995 static void
6996 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6997 {
6998         struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6999
7000         if (composite) {
7001                 struct mbuf *next = m->m_next;
7002                 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
7003                     MBUF_IS_COMPOSITE(ms));
7004                 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7005                 /*
7006                  * We could have hand-picked the mbuf fields and restore
7007                  * them individually, but that will be a maintenance
7008                  * headache.  Instead, restore everything that was saved;
7009                  * the mbuf layer will recheck and reinitialize anyway.
7010                  */
7011                 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
7012                 m->m_next = next;
7013         } else {
7014                 /*
7015                  * For a regular mbuf (no cluster attached) there's nothing
7016                  * to restore other than the type field, which is expected
7017                  * to be MT_FREE.
7018                  */
7019                 m->m_type = ms->m_type;
7020         }
7021         _MCHECK(m);
7022 }
7023
7024 static void
7025 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
7026 {
7027         VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7028         _MCHECK(m);
7029         bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
7030 }
7031
7032 static void
7033 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
7034     boolean_t save_next)
7035 {
7036         mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
7037
7038         if (!alloc) {
7039                 if (mclverify) {
7040                         mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7041                 }
7042                 if (save_next) {
7043                         mcl_audit_verify_nextptr(next, mca);
7044                         ((mcache_obj_t *)addr)->obj_next = next;
7045                 }
7046         } else if (mclverify) {
7047                 /* Check if the buffer has been corrupted while in freelist */
7048                 mcl_audit_verify_nextptr(next, mca);
7049                 mcache_audit_free_verify_set(mca, addr, 0, size);
7050         }
7051 }
7052
7053 static void
7054 mcl_audit_scratch(mcache_audit_t *mca)
7055 {
7056         void *stack[MCACHE_STACK_DEPTH + 1];
7057         mcl_scratch_audit_t *msa;
7058         struct timeval now;
7059
7060         VERIFY(mca->mca_contents != NULL);
7061         msa = MCA_SAVED_SCRATCH_PTR(mca);
7062
7063         msa->msa_pthread = msa->msa_thread;
7064         msa->msa_thread = current_thread();
7065         bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
7066         msa->msa_pdepth = msa->msa_depth;
7067         bzero(stack, sizeof (stack));
7068         msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
7069         bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
7070
7071         msa->msa_ptstamp = msa->msa_tstamp;
7072         microuptime(&now);
7073         /* tstamp is in ms relative to base_ts */
7074         msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
7075         if ((now.tv_sec - mb_start.tv_sec) > 0)
7076                 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
7077 }
7078
7079 static void
7080 mcl_audit_mcheck_panic(struct mbuf *m)
7081 {
7082         mcache_audit_t *mca;
7083
7084         MRANGE(m);
7085         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7086
7087         panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
7088             m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
7089         /* NOTREACHED */
7090 }
7091
7092 static void
7093 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7094 {
7095         if (next != NULL && !MBUF_IN_MAP(next) &&
7096             (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7097                 panic("mcl_audit: buffer %p modified after free at offset 0: "
7098                     "%p out of range [%p-%p)\n%s\n",
7099                     mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
7100                 /* NOTREACHED */
7101         }
7102 }
7103
7104 /* This function turns on mbuf leak detection */
7105 static void
7106 mleak_activate(void)
7107 {
7108         mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7109         PE_parse_boot_argn("mleak_sample_factor",
7110             &mleak_table.mleak_sample_factor,
7111             sizeof (mleak_table.mleak_sample_factor));
7112
7113         if (mleak_table.mleak_sample_factor == 0)
7114                 mclfindleak = 0;
7115
7116         if (mclfindleak == 0)
7117                 return;
7118
7119         vm_size_t alloc_size =
7120             mleak_alloc_buckets * sizeof (struct mallocation);
7121         vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
7122
7123         MALLOC(mleak_allocations, struct mallocation *, alloc_size,
7124             M_TEMP, M_WAITOK | M_ZERO);
7125         VERIFY(mleak_allocations != NULL);
7126
7127         MALLOC(mleak_traces, struct mtrace *, trace_size,
7128             M_TEMP, M_WAITOK | M_ZERO);
7129         VERIFY(mleak_traces != NULL);
7130
7131         MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7132             M_TEMP, M_WAITOK | M_ZERO);
7133         VERIFY(mleak_stat != NULL);
7134         mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7135 #ifdef __LP64__
7136         mleak_stat->ml_isaddr64 = 1;
7137 #endif /* __LP64__ */
7138 }
7139
7140 static void
7141 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7142 {
7143         int temp;
7144
7145         if (mclfindleak == 0)
7146                 return;
7147
7148         if (!alloc)
7149                 return (mleak_free(addr));
7150
7151         temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7152
7153         if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7154                 uintptr_t bt[MLEAK_STACK_DEPTH];
7155                 int logged = backtrace(bt, MLEAK_STACK_DEPTH);
7156                 mleak_log(bt, addr, logged, num);
7157         }
7158 }
7159
7160 /*
7161  * This function records the allocation in the mleak_allocations table
7162  * and the backtrace in the mleak_traces table; if allocation slot is in use,
7163  * replace old allocation with new one if the trace slot is in use, return
7164  * (or increment refcount if same trace).
7165  */
7166 static boolean_t
7167 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7168 {
7169         struct mallocation *allocation;
7170         struct mtrace *trace;
7171         uint32_t trace_index;
7172
7173         /* Quit if someone else modifying the tables */
7174         if (!lck_mtx_try_lock_spin(mleak_lock)) {
7175                 mleak_table.total_conflicts++;
7176                 return (FALSE);
7177         }
7178
7179         allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7180             mleak_alloc_buckets)];
7181         trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7182         trace = &mleak_traces[trace_index];
7183
7184         VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7185         VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7186
7187         allocation->hitcount++;
7188         trace->hitcount++;
7189
7190         /*
7191          * If the allocation bucket we want is occupied
7192          * and the occupier has the same trace, just bail.
7193          */
7194         if (allocation->element != NULL &&
7195             trace_index == allocation->trace_index) {
7196                 mleak_table.alloc_collisions++;
7197                 lck_mtx_unlock(mleak_lock);
7198                 return (TRUE);
7199         }
7200
7201         /*
7202          * Store the backtrace in the traces array;
7203          * Size of zero = trace bucket is free.
7204          */
7205         if (trace->allocs > 0 &&
7206             bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
7207                 /* Different, unique trace, but the same hash! Bail out. */
7208                 trace->collisions++;
7209                 mleak_table.trace_collisions++;
7210                 lck_mtx_unlock(mleak_lock);
7211                 return (TRUE);
7212         } else if (trace->allocs > 0) {
7213                 /* Same trace, already added, so increment refcount */
7214                 trace->allocs++;
7215         } else {
7216                 /* Found an unused trace bucket, so record the trace here */
7217                 if (trace->depth != 0) {
7218                         /* this slot previously used but not currently in use */
7219                         mleak_table.trace_overwrites++;
7220                 }
7221                 mleak_table.trace_recorded++;
7222                 trace->allocs = 1;
7223                 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
7224                 trace->depth = depth;
7225                 trace->collisions = 0;
7226         }
7227
7228         /* Step 2: Store the allocation record in the allocations array */
7229         if (allocation->element != NULL) {
7230                 /*
7231                  * Replace an existing allocation.  No need to preserve
7232                  * because only a subset of the allocations are being
7233                  * recorded anyway.
7234                  */
7235                 mleak_table.alloc_collisions++;
7236         } else if (allocation->trace_index != 0) {
7237                 mleak_table.alloc_overwrites++;
7238         }
7239         allocation->element = addr;
7240         allocation->trace_index = trace_index;
7241         allocation->count = num;
7242         mleak_table.alloc_recorded++;
7243         mleak_table.outstanding_allocs++;
7244
7245         lck_mtx_unlock(mleak_lock);
7246         return (TRUE);
7247 }
7248
7249 static void
7250 mleak_free(mcache_obj_t *addr)
7251 {
7252         while (addr != NULL) {
7253                 struct mallocation *allocation = &mleak_allocations
7254                     [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7255
7256                 if (allocation->element == addr &&
7257                     allocation->trace_index < mleak_trace_buckets) {
7258                         lck_mtx_lock_spin(mleak_lock);
7259                         if (allocation->element == addr &&
7260                             allocation->trace_index < mleak_trace_buckets) {
7261                                 struct mtrace *trace;
7262                                 trace = &mleak_traces[allocation->trace_index];
7263                                 /* allocs = 0 means trace bucket is unused */
7264                                 if (trace->allocs > 0)
7265                                         trace->allocs--;
7266                                 if (trace->allocs == 0)
7267                                         trace->depth = 0;
7268                                 /* NULL element means alloc bucket is unused */
7269                                 allocation->element = NULL;
7270                                 mleak_table.outstanding_allocs--;
7271                         }
7272                         lck_mtx_unlock(mleak_lock);
7273                 }
7274                 addr = addr->obj_next;
7275         }
7276 }
7277
7278 static void
7279 mleak_sort_traces()
7280 {
7281         int i, j, k;
7282         struct mtrace *swap;
7283
7284         for(i = 0; i < MLEAK_NUM_TRACES; i++)
7285                 mleak_top_trace[i] = NULL;
7286
7287         for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
7288         {
7289                 if (mleak_traces[i].allocs <= 0)
7290                         continue;
7291
7292                 mleak_top_trace[j] = &mleak_traces[i];
7293                 for (k = j; k > 0; k--) {
7294                         if (mleak_top_trace[k]->allocs <=
7295                             mleak_top_trace[k-1]->allocs)
7296                                 break;
7297
7298                         swap = mleak_top_trace[k-1];
7299                         mleak_top_trace[k-1] = mleak_top_trace[k];
7300                         mleak_top_trace[k] = swap;
7301                 }
7302                 j++;
7303         }
7304
7305         j--;
7306         for(; i < mleak_trace_buckets; i++) {
7307                 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
7308                         continue;
7309
7310                 mleak_top_trace[j] = &mleak_traces[i];
7311
7312                 for (k = j; k > 0; k--) {
7313                         if (mleak_top_trace[k]->allocs <=
7314                             mleak_top_trace[k-1]->allocs)
7315                                 break;
7316
7317                         swap = mleak_top_trace[k-1];
7318                         mleak_top_trace[k-1] = mleak_top_trace[k];
7319                         mleak_top_trace[k] = swap;
7320                 }
7321         }
7322 }
7323
7324 static void
7325 mleak_update_stats()
7326 {
7327         mleak_trace_stat_t *mltr;
7328         int i;
7329
7330         VERIFY(mleak_stat != NULL);
7331 #ifdef __LP64__
7332         VERIFY(mleak_stat->ml_isaddr64);
7333 #else
7334         VERIFY(!mleak_stat->ml_isaddr64);
7335 #endif /* !__LP64__ */
7336         VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7337
7338         mleak_sort_traces();
7339
7340         mltr = &mleak_stat->ml_trace[0];
7341         bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
7342         for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7343                 int j;
7344
7345                 if (mleak_top_trace[i] == NULL ||
7346                     mleak_top_trace[i]->allocs == 0)
7347                         continue;
7348
7349                 mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
7350                 mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
7351                 mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
7352                 mltr->mltr_depth        = mleak_top_trace[i]->depth;
7353
7354                 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7355                 for (j = 0; j < mltr->mltr_depth; j++)
7356                         mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7357
7358                 mltr++;
7359         }
7360 }
7361
7362 static struct mbtypes {
7363         int             mt_type;
7364         const char      *mt_name;
7365 } mbtypes[] = {
7366         { MT_DATA,      "data" },
7367         { MT_OOBDATA,   "oob data" },
7368         { MT_CONTROL,   "ancillary data" },
7369         { MT_HEADER,    "packet headers" },
7370         { MT_SOCKET,    "socket structures" },
7371         { MT_PCB,       "protocol control blocks" },
7372         { MT_RTABLE,    "routing table entries" },
7373         { MT_HTABLE,    "IMP host table entries" },
7374         { MT_ATABLE,    "address resolution tables" },
7375         { MT_FTABLE,    "fragment reassembly queue headers" },
7376         { MT_SONAME,    "socket names and addresses" },
7377         { MT_SOOPTS,    "socket options" },
7378         { MT_RIGHTS,    "access rights" },
7379         { MT_IFADDR,    "interface addresses" },
7380         { MT_TAG,       "packet tags" },
7381         { 0,            NULL }
7382 };
7383
7384 #define MBUF_DUMP_BUF_CHK() {   \
7385         clen -= k;              \
7386         if (clen < 1)           \
7387                 goto done;      \
7388         c += k;                 \
7389 }
7390
7391 static char *
7392 mbuf_dump(void)
7393 {
7394         unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
7395             totreturned = 0;
7396         u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
7397         u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
7398         u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
7399         int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7400         uint8_t seen[256];
7401         struct mbtypes *mp;
7402         mb_class_stat_t *sp;
7403         mleak_trace_stat_t *mltr;
7404         char *c = mbuf_dump_buf;
7405         int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
7406
7407         mbuf_dump_buf[0] = '\0';
7408
7409         /* synchronize all statistics in the mbuf table */
7410         mbuf_stat_sync();
7411         mbuf_mtypes_sync(TRUE);
7412
7413         sp = &mb_stat->mbs_class[0];
7414         for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7415                 u_int32_t mem;
7416
7417                 if (m_class(i) == MC_MBUF) {
7418                         m_mbufs = sp->mbcl_active;
7419                 } else if (m_class(i) == MC_CL) {
7420                         m_clfree = sp->mbcl_total - sp->mbcl_active;
7421                 } else if (m_class(i) == MC_BIGCL) {
7422                         m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7423                 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7424                         m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7425                         m_16kclusters = sp->mbcl_total;
7426                 } else if (m_class(i) == MC_MBUF_CL) {
7427                         m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7428                 } else if (m_class(i) == MC_MBUF_BIGCL) {
7429                         m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7430                 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7431                         m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7432                 }
7433
7434                 mem = sp->mbcl_ctotal * sp->mbcl_size;
7435                 totmem += mem;
7436                 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7437                     sp->mbcl_size;
7438                 totreturned += sp->mbcl_release_cnt;
7439
7440         }
7441
7442         /* adjust free counts to include composite caches */
7443         m_clfree += m_mbufclfree;
7444         m_bigclfree += m_mbufbigclfree;
7445         m_16kclfree += m_mbuf16kclfree;
7446
7447         totmbufs = 0;
7448         for (mp = mbtypes; mp->mt_name != NULL; mp++)
7449                 totmbufs += mbstat.m_mtypes[mp->mt_type];
7450         if (totmbufs > m_mbufs)
7451                 totmbufs = m_mbufs;
7452         k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7453         MBUF_DUMP_BUF_CHK();
7454
7455         bzero(&seen, sizeof (seen));
7456         for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7457                 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7458                         seen[mp->mt_type] = 1;
7459                         k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7460                             mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7461                         MBUF_DUMP_BUF_CHK();
7462                 }
7463         }
7464         seen[MT_FREE] = 1;
7465         for (i = 0; i < nmbtypes; i++)
7466                 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7467                         k = snprintf(c, clen, "\t%u mbufs allocated to "
7468                             "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7469                         MBUF_DUMP_BUF_CHK();
7470                 }
7471         if ((m_mbufs - totmbufs) > 0) {
7472                 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7473                     m_mbufs - totmbufs);
7474                 MBUF_DUMP_BUF_CHK();
7475         }
7476         k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7477             "%u/%u mbuf 4KB clusters in use\n",
7478             (unsigned int)(mbstat.m_clusters - m_clfree),
7479             (unsigned int)mbstat.m_clusters,
7480             (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7481             (unsigned int)mbstat.m_bigclusters);
7482         MBUF_DUMP_BUF_CHK();
7483
7484         if (njcl > 0) {
7485                 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7486                     m_16kclusters - m_16kclfree, m_16kclusters,
7487                     njclbytes / 1024);
7488                 MBUF_DUMP_BUF_CHK();
7489         }
7490         totused = totmem - totfree;
7491         if (totmem == 0) {
7492                 totpct = 0;
7493         } else if (totused < (ULONG_MAX / 100)) {
7494                 totpct = (totused * 100) / totmem;
7495         } else {
7496                 u_long totmem1 = totmem / 100;
7497                 u_long totused1 = totused / 100;
7498                 totpct = (totused1 * 100) / totmem1;
7499         }
7500         k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7501             "in use)\n", totmem / 1024, totpct);
7502         MBUF_DUMP_BUF_CHK();
7503         k = snprintf(c, clen, "%lu KB returned to the system\n",
7504             totreturned / 1024);
7505         MBUF_DUMP_BUF_CHK();
7506
7507         net_update_uptime();
7508         k = snprintf(c, clen,
7509             "VM allocation failures: contiguous %u, normal %u, one page %u\n",
7510             mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
7511         MBUF_DUMP_BUF_CHK();
7512         if (mb_kmem_contig_failed_ts || mb_kmem_failed_ts ||
7513             mb_kmem_one_failed_ts) {
7514                 k = snprintf(c, clen,
7515                     "VM allocation failure timestamps: contiguous %llu "
7516                     "(size %llu), normal %llu (size %llu), one page %llu "
7517                     "(now %llu)\n",
7518                     mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
7519                     mb_kmem_failed_ts, mb_kmem_failed_size,
7520                     mb_kmem_one_failed_ts, net_uptime());
7521                 MBUF_DUMP_BUF_CHK();
7522                 k = snprintf(c, clen,
7523                     "VM return codes: ");
7524                 MBUF_DUMP_BUF_CHK();
7525                 for (i = 0;
7526                      i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]);
7527                      i++) {
7528                         k = snprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
7529                             mb_kmem_stats[i]);
7530                         MBUF_DUMP_BUF_CHK();
7531                 }
7532                 k = snprintf(c, clen, "\n");
7533                 MBUF_DUMP_BUF_CHK();
7534         }
7535         k = snprintf(c, clen,
7536             "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
7537             "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
7538             mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
7539             mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
7540             mb_expand_16kcl_total);
7541         MBUF_DUMP_BUF_CHK();
7542         if (mbuf_worker_last_runtime != 0) {
7543                 k = snprintf(c, clen, "worker thread last run time: "
7544                     "%llu (%llu seconds ago)\n",
7545                     mbuf_worker_last_runtime,
7546                     net_uptime() - mbuf_worker_last_runtime);
7547                 MBUF_DUMP_BUF_CHK();
7548         }
7549
7550         k = snprintf(c, clen, "\nlargest allocation failure backtraces:\n");
7551         MBUF_DUMP_BUF_CHK();
7552
7553         for (j = 0; j < MTRACELARGE_NUM_TRACES; j++) {
7554                 struct mtracelarge *trace = &mtracelarge_table[j];
7555                 if (trace->size == 0 || trace->depth == 0)
7556                         continue;
7557                 k = snprintf(c, clen, "size %llu: < ", trace->size);
7558                 MBUF_DUMP_BUF_CHK();
7559                 for (i = 0; i < trace->depth; i++) {
7560                         if (mleak_stat->ml_isaddr64) {
7561                                 k = snprintf(c, clen, "0x%0llx ",
7562                                     (uint64_t)VM_KERNEL_UNSLIDE(
7563                                             trace->addr[i]));
7564                         } else {
7565                                 k = snprintf(c, clen,
7566                                     "0x%08x ",
7567                                     (uint32_t)VM_KERNEL_UNSLIDE(
7568                                             trace->addr[i]));
7569                         }
7570                         MBUF_DUMP_BUF_CHK();
7571                 }
7572                 k = snprintf(c, clen, ">\n");
7573                 MBUF_DUMP_BUF_CHK();
7574         }
7575
7576         /* mbuf leak detection statistics */
7577         mleak_update_stats();
7578
7579         k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7580         MBUF_DUMP_BUF_CHK();
7581         k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7582             mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7583             mleak_table.mleak_sample_factor);
7584         MBUF_DUMP_BUF_CHK();
7585         k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7586             mleak_table.outstanding_allocs);
7587         MBUF_DUMP_BUF_CHK();
7588         k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7589             mleak_table.alloc_recorded, mleak_table.trace_recorded);
7590         MBUF_DUMP_BUF_CHK();
7591         k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7592             mleak_table.alloc_collisions, mleak_table.trace_collisions);
7593         MBUF_DUMP_BUF_CHK();
7594         k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7595             mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7596         MBUF_DUMP_BUF_CHK();
7597         k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7598             mleak_table.total_conflicts);
7599         MBUF_DUMP_BUF_CHK();
7600
7601         k = snprintf(c, clen, "top %d outstanding traces:\n",
7602             mleak_stat->ml_cnt);
7603         MBUF_DUMP_BUF_CHK();
7604         for (i = 0; i < mleak_stat->ml_cnt; i++) {
7605                 mltr = &mleak_stat->ml_trace[i];
7606                 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7607                     "%llu hit(s), %llu collision(s)\n", (i + 1),
7608                     mltr->mltr_allocs, mltr->mltr_hitcount,
7609                     mltr->mltr_collisions);
7610                 MBUF_DUMP_BUF_CHK();
7611         }
7612
7613         if (mleak_stat->ml_isaddr64)
7614                 k = snprintf(c, clen, MB_LEAK_HDR_64);
7615         else
7616                 k = snprintf(c, clen, MB_LEAK_HDR_32);
7617         MBUF_DUMP_BUF_CHK();
7618
7619         for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7620                 k = snprintf(c, clen, "%2d: ", (i + 1));
7621                 MBUF_DUMP_BUF_CHK();
7622                 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7623                         mltr = &mleak_stat->ml_trace[j];
7624                         if (i < mltr->mltr_depth) {
7625                                 if (mleak_stat->ml_isaddr64) {
7626                                         k = snprintf(c, clen, "0x%0llx  ",
7627                                             (uint64_t)VM_KERNEL_UNSLIDE(
7628                                                 mltr->mltr_addr[i]));
7629                                 } else {
7630                                         k = snprintf(c, clen,
7631                                             "0x%08x  ",
7632                                             (uint32_t)VM_KERNEL_UNSLIDE(
7633                                                 mltr->mltr_addr[i]));
7634                                 }
7635                         } else {
7636                                 if (mleak_stat->ml_isaddr64)
7637                                         k = snprintf(c, clen,
7638                                             MB_LEAK_SPACING_64);
7639                                 else
7640                                         k = snprintf(c, clen,
7641                                             MB_LEAK_SPACING_32);
7642                         }
7643                         MBUF_DUMP_BUF_CHK();
7644                 }
7645                 k = snprintf(c, clen, "\n");
7646                 MBUF_DUMP_BUF_CHK();
7647         }
7648 done:
7649         return (mbuf_dump_buf);
7650 }
7651
7652 #undef MBUF_DUMP_BUF_CHK
7653
7654 /*
7655  * Convert between a regular and a packet header mbuf.  Caller is responsible
7656  * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7657  */
7658 int
7659 m_reinit(struct mbuf *m, int hdr)
7660 {
7661         int ret = 0;
7662
7663         if (hdr) {
7664                 VERIFY(!(m->m_flags & M_PKTHDR));
7665                 if (!(m->m_flags & M_EXT) &&
7666                     (m->m_data != m->m_dat || m->m_len > 0)) {
7667                         /*
7668                          * If there's no external cluster attached and the
7669                          * mbuf appears to contain user data, we cannot
7670                          * safely convert this to a packet header mbuf,
7671                          * as the packet header structure might overlap
7672                          * with the data.
7673                          */
7674                         printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7675                             "m_data %llx (expected %llx), "
7676                             "m_len %d (expected 0)\n",
7677                             __func__,
7678                             (uint64_t)VM_KERNEL_ADDRPERM(m),
7679                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7680                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7681                         ret = EBUSY;
7682                 } else {
7683                         VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7684                         m->m_flags |= M_PKTHDR;
7685                         MBUF_INIT_PKTHDR(m);
7686                 }
7687         } else {
7688                 /* Check for scratch area overflow */
7689                 m_redzone_verify(m);
7690                 /* Free the aux data and tags if there is any */
7691                 m_tag_delete_chain(m, NULL);
7692                 m->m_flags &= ~M_PKTHDR;
7693         }
7694
7695         return (ret);
7696 }
7697
7698 int
7699 m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
7700 {
7701         ASSERT(m->m_flags & M_EXT);
7702         return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
7703 }
7704
7705 uint32_t
7706 m_ext_get_prop(struct mbuf *m)
7707 {
7708         ASSERT(m->m_flags & M_EXT);
7709         return (MEXT_PRIV(m));
7710 }
7711
/*
 * An mbuf that is not paired is always reported as active (1).  A paired
 * mbuf is active only while MEXT_PREF is above MEXT_MINREF.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
        if (!MBUF_IS_PAIRED(m))
                return (1);

        return (MEXT_PREF(m) > MEXT_MINREF(m));
}
7717
7718 void
7719 m_ext_paired_activate(struct mbuf *m)
7720 {
7721         struct ext_ref *rfa;
7722         int hdr, type;
7723         caddr_t extbuf;
7724         m_ext_free_func_t extfree;
7725         u_int extsize;
7726
7727         VERIFY(MBUF_IS_PAIRED(m));
7728         VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
7729         VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
7730
7731         hdr = (m->m_flags & M_PKTHDR);
7732         type = m->m_type;
7733         extbuf = m->m_ext.ext_buf;
7734         extfree = m_get_ext_free(m);
7735         extsize = m->m_ext.ext_size;
7736         rfa = m_get_rfa(m);
7737
7738         VERIFY(extbuf != NULL && rfa != NULL);
7739
7740         /*
7741          * Safe to reinitialize packet header tags, since it's
7742          * already taken care of at m_free() time.  Similar to
7743          * what's done in m_clattach() for the cluster.  Bump
7744          * up MEXT_PREF to indicate activation.
7745          */
7746         MBUF_INIT(m, hdr, type);
7747         MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
7748             1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
7749 }
7750
7751 void
7752 m_scratch_init(struct mbuf *m)
7753 {
7754         struct pkthdr *pkt = &m->m_pkthdr;
7755
7756         VERIFY(m->m_flags & M_PKTHDR);
7757
7758         /* See comments in <rdar://problem/14040693> */
7759         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7760                 panic_plain("Invalid attempt to modify guarded module-private "
7761                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7762                 /* NOTREACHED */
7763         }
7764
7765         bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7766 }
7767
7768 /*
7769  * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7770  * xnu that intend on utilizing the module-private area should directly
7771  * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
7772  * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7773  * to handing it off to another module, respectively.
7774  */
7775 u_int32_t
7776 m_scratch_get(struct mbuf *m, u_int8_t **p)
7777 {
7778         struct pkthdr *pkt = &m->m_pkthdr;
7779
7780         VERIFY(m->m_flags & M_PKTHDR);
7781
7782         /* See comments in <rdar://problem/14040693> */
7783         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7784                 panic_plain("Invalid attempt to access guarded module-private "
7785                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7786                 /* NOTREACHED */
7787         }
7788
7789         if (mcltrace) {
7790                 mcache_audit_t *mca;
7791
7792                 lck_mtx_lock(mbuf_mlock);
7793                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7794                 if (mca->mca_uflags & MB_SCVALID)
7795                         mcl_audit_scratch(mca);
7796                 lck_mtx_unlock(mbuf_mlock);
7797         }
7798
7799         *p = (u_int8_t *)&pkt->pkt_mpriv;
7800         return (sizeof (pkt->pkt_mpriv));
7801 }
7802
7803 static void
7804 m_redzone_init(struct mbuf *m)
7805 {
7806         VERIFY(m->m_flags & M_PKTHDR);
7807         /*
7808          * Each mbuf has a unique red zone pattern, which is a XOR
7809          * of the red zone cookie and the address of the mbuf.
7810          */
7811         m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7812 }
7813
7814 static void
7815 m_redzone_verify(struct mbuf *m)
7816 {
7817         u_int32_t mb_redzone;
7818
7819         VERIFY(m->m_flags & M_PKTHDR);
7820
7821         mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7822         if (m->m_pkthdr.redzone != mb_redzone) {
7823                 panic("mbuf %p redzone violation with value 0x%x "
7824                     "(instead of 0x%x, using cookie 0x%x)\n",
7825                     m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7826                 /* NOTREACHED */
7827         }
7828 }
7829
7830 __private_extern__ inline void
7831 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
7832     caddr_t ext_arg)
7833 {
7834         VERIFY(m->m_flags & M_EXT);
7835         if (rfa != NULL) {
7836                 m->m_ext.ext_refflags =
7837                     (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
7838                 if (ext_free != NULL) {
7839                         rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
7840                             mb_obscure_extfree;
7841                         m->m_ext.ext_free = (m_ext_free_func_t)
7842                             (((uintptr_t)ext_free) ^ rfa->ext_token);
7843                         if (ext_arg != NULL) {
7844                                 m->m_ext.ext_arg =
7845                                     (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
7846                         } else {
7847                                 m->m_ext.ext_arg = NULL;
7848                         }
7849                 } else {
7850                         rfa->ext_token = 0;
7851                         m->m_ext.ext_free = NULL;
7852                         m->m_ext.ext_arg = NULL;
7853                 }
7854         } else {
7855                 /*
7856                  * If we are going to loose the cookie in ext_token by
7857                  * resetting the rfa, we should use the global cookie
7858                  * to obscure the ext_free and ext_arg pointers.
7859                  */
7860                 if (ext_free != NULL) {
7861                         m->m_ext.ext_free =
7862                             (m_ext_free_func_t)((uintptr_t)ext_free ^
7863                             mb_obscure_extfree);
7864                         if (ext_arg != NULL) {
7865                                 m->m_ext.ext_arg =
7866                                     (caddr_t)((uintptr_t)ext_arg ^
7867                                     mb_obscure_extfree);
7868                         } else {
7869                                 m->m_ext.ext_arg = NULL;
7870                         }
7871                 } else {
7872                         m->m_ext.ext_free = NULL;
7873                         m->m_ext.ext_arg = NULL;
7874                 }
7875                 m->m_ext.ext_refflags = NULL;
7876         }
7877 }
7878
7879 __private_extern__ inline struct ext_ref *
7880 m_get_rfa(struct mbuf *m)
7881 {
7882         if (m->m_ext.ext_refflags == NULL)
7883                 return (NULL);
7884         else
7885                 return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
7886 }
7887
7888 __private_extern__ inline m_ext_free_func_t
7889 m_get_ext_free(struct mbuf *m)
7890 {
7891         struct ext_ref *rfa;
7892         if (m->m_ext.ext_free == NULL)
7893                 return (NULL);
7894
7895         rfa = m_get_rfa(m);
7896         if (rfa == NULL)
7897                 return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
7898         else
7899                 return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
7900                     ^ rfa->ext_token));
7901 }
7902
7903 __private_extern__ inline caddr_t
7904 m_get_ext_arg(struct mbuf *m)
7905 {
7906         struct ext_ref *rfa;
7907         if (m->m_ext.ext_arg == NULL)
7908                 return (NULL);
7909
7910         rfa = m_get_rfa(m);
7911         if (rfa == NULL) {
7912                 return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
7913         } else {
7914                 return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
7915                     rfa->ext_token));
7916         }
7917 }
7918
7919 /*
7920  * Send a report of mbuf usage if the usage is at least 6% of max limit
7921  * or if there has been at least 3% increase since the last report.
7922  *
7923  * The values 6% and 3% are chosen so that we can do simple arithmetic
7924  * with shift operations.
7925  */
7926 static boolean_t
7927 mbuf_report_usage(mbuf_class_t cl)
7928 {
7929         /* if a report is already in progress, nothing to do */
7930         if (mb_peak_newreport)
7931                 return (TRUE);
7932
7933         if (m_total(cl) > m_peak(cl) &&
7934             m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7935             (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7936                 return (TRUE);
7937         return (FALSE);
7938 }
7939
7940 __private_extern__ void
7941 mbuf_report_peak_usage(void)
7942 {
7943         int i = 0;
7944         u_int64_t uptime;
7945         struct nstat_sysinfo_data ns_data;
7946         uint32_t memreleased = 0;
7947         static uint32_t prevmemreleased;
7948
7949         uptime = net_uptime();
7950         lck_mtx_lock(mbuf_mlock);
7951
7952         /* Generate an initial report after 1 week of uptime */
7953         if (!mb_peak_firstreport &&
7954             uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7955                 mb_peak_newreport = TRUE;
7956                 mb_peak_firstreport = TRUE;
7957         }
7958
7959         if (!mb_peak_newreport) {
7960                 lck_mtx_unlock(mbuf_mlock);
7961                 return;
7962         }
7963
7964         /*
7965          * Since a report is being generated before 1 week,
7966          * we do not need to force another one later
7967          */
7968         if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7969                 mb_peak_firstreport = TRUE;
7970
7971         for (i = 0; i < NELEM(mbuf_table); i++) {
7972                 m_peak(m_class(i)) = m_total(m_class(i));
7973                 memreleased += m_release_cnt(i);
7974         }
7975         memreleased = memreleased - prevmemreleased;
7976         prevmemreleased = memreleased;
7977         mb_peak_newreport = FALSE;
7978         lck_mtx_unlock(mbuf_mlock);
7979
7980         bzero(&ns_data, sizeof(ns_data));
7981         ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7982         ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7983         ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7984         ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7985         ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
7986         ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7987         ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7988         ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7989         ns_data.u.mb_stats.memreleased = memreleased;
7990         ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
7991
7992         nstat_sysinfo_send_data(&ns_data);
7993
7994         /*
7995          * Reset the floor whenever we report a new
7996          * peak to track the trend (increase peek usage
7997          * is not a leak if mbufs get released
7998          * between reports and the floor stays low)
7999          */
8000         total_sbmb_cnt_floor = total_sbmb_cnt_peak;
8001 }
8002
8003 /*
8004  * Called by the VM when there's memory pressure.
8005  */
8006 __private_extern__ void
8007 m_drain(void)
8008 {
8009         mbuf_class_t mc;
8010         mcl_slab_t *sp, *sp_tmp, *nsp;
8011         unsigned int num, k, interval, released = 0;
8012         unsigned long total_mem = 0, use_mem = 0;
8013         boolean_t ret, purge_caches = FALSE;
8014         ppnum_t offset;
8015         mcache_obj_t *obj;
8016         unsigned long per;
8017         static uint64_t last_drain = 0;
8018         static unsigned char scratch[32];
8019         static ppnum_t scratch_pa = 0;
8020
8021         if (mb_drain_maxint == 0 || mb_waiters)
8022                 return;
8023         if (scratch_pa == 0) {
8024                 bzero(scratch, sizeof(scratch));
8025                 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8026                 VERIFY(scratch_pa);
8027         } else if (mclverify) {
8028                 /*
8029                  * Panic if a driver wrote to our scratch memory.
8030                  */
8031                 for (k = 0; k < sizeof(scratch); k++)
8032                         if (scratch[k])
8033                                 panic("suspect DMA to freed address");
8034         }
8035         /*
8036          * Don't free memory too often as that could cause excessive
8037          * waiting times for mbufs.  Purge caches if we were asked to drain
8038          * in the last 5 minutes.
8039          */
8040         lck_mtx_lock(mbuf_mlock);
8041         if (last_drain == 0) {
8042                 last_drain = net_uptime();
8043                 lck_mtx_unlock(mbuf_mlock);
8044                 return;
8045         }
8046         interval = net_uptime() - last_drain;
8047         if (interval <= mb_drain_maxint) {
8048                 lck_mtx_unlock(mbuf_mlock);
8049                 return;
8050         }
8051         if (interval <= mb_drain_maxint * 5)
8052                 purge_caches = TRUE;
8053         last_drain = net_uptime();
8054         /*
8055          * Don't free any memory if we're using 60% or more.
8056          */
8057         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8058                 total_mem += m_total(mc) * m_maxsize(mc);
8059                 use_mem += m_active(mc) * m_maxsize(mc);
8060         }
8061         per = (use_mem * 100) / total_mem;
8062         if (per >= 60) {
8063                 lck_mtx_unlock(mbuf_mlock);
8064                 return;
8065         }
8066         /*
8067          * Purge all the caches.  This effectively disables
8068          * caching for a few seconds, but the mbuf worker thread will
8069          * re-enable them again.
8070          */
8071         if (purge_caches == TRUE)
8072                 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8073                         if (m_total(mc) < m_avgtotal(mc))
8074                                 continue;
8075                         lck_mtx_unlock(mbuf_mlock);
8076                         ret = mcache_purge_cache(m_cache(mc), FALSE);
8077                         lck_mtx_lock(mbuf_mlock);
8078                         if (ret == TRUE)
8079                                 m_purge_cnt(mc)++;
8080                 }
8081         /*
8082          * Move the objects from the composite class freelist to
8083          * the rudimentary slabs list, but keep at least 10% of the average
8084          * total in the freelist.
8085          */
8086         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8087                 while (m_cobjlist(mc) &&
8088                     m_total(mc) < m_avgtotal(mc) &&
8089                     m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8090                         obj = m_cobjlist(mc);
8091                         m_cobjlist(mc) = obj->obj_next;
8092                         obj->obj_next = NULL;
8093                         num = cslab_free(mc, obj, 1);
8094                         VERIFY(num == 1);
8095                         m_free_cnt(mc)++;
8096                         m_infree(mc)--;
8097                         /* cslab_free() handles m_total */
8098                 }
8099         }
8100         /*
8101          * Free the buffers present in the slab list up to 10% of the total
8102          * average per class.
8103          *
8104          * We walk the list backwards in an attempt to reduce fragmentation.
8105          */
8106         for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8107                 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8108                         /*
8109                          * Process only unused slabs occupying memory.
8110                          */
8111                         if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8112                             sp->sl_base == NULL)
8113                                 continue;
8114                         if (m_total(mc) < m_avgtotal(mc) ||
8115                             m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
8116                                 break;
8117                         slab_remove(sp, mc);
8118                         switch (mc) {
8119                         case MC_MBUF:
8120                                 m_infree(mc) -= NMBPG;
8121                                 m_total(mc) -= NMBPG;
8122                                 if (mclaudit != NULL)
8123                                         mcl_audit_free(sp->sl_base, NMBPG);
8124                                 break;
8125                         case MC_CL:
8126                                 m_infree(mc) -= NCLPG;
8127                                 m_total(mc) -= NCLPG;
8128                                 if (mclaudit != NULL)
8129                                         mcl_audit_free(sp->sl_base, NMBPG);
8130                                 break;
8131                         case MC_BIGCL:
8132                         {
8133                                 m_infree(mc) -= NBCLPG;
8134                                 m_total(mc) -= NBCLPG;
8135                                 if (mclaudit != NULL)
8136                                         mcl_audit_free(sp->sl_base, NMBPG);
8137                                 break;
8138                         }
8139                         case MC_16KCL:
8140                                 m_infree(mc)--;
8141                                 m_total(mc)--;
8142                                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8143                                         nsp = nsp->sl_next;
8144                                         VERIFY(nsp->sl_refcnt == 0 &&
8145                                             nsp->sl_base != NULL &&
8146                                             nsp->sl_len == 0);
8147                                         slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8148                                             0);
8149                                         nsp->sl_flags = 0;
8150                                 }
8151                                 if (mclaudit != NULL) {
8152                                         if (sp->sl_len == PAGE_SIZE) {
8153                                                 mcl_audit_free(sp->sl_base,
8154                                                     NMBPG);
8155                                         } else {
8156                                                 mcl_audit_free(sp->sl_base, 1);
8157                                         }
8158                                 }
8159                                 break;
8160                         default:
8161                                 /*
8162                                  * The composite classes have their own
8163                                  * freelist (m_cobjlist), so we only
8164                                  * process rudimentary classes here.
8165                                  */
8166                                 VERIFY(0);
8167                         }
8168                         m_release_cnt(mc) += m_size(mc);
8169                         released += m_size(mc);
8170                         VERIFY(sp->sl_base != NULL &&
8171                             sp->sl_len >= PAGE_SIZE);
8172                         offset = MTOPG(sp->sl_base);
8173                         /*
8174                          * Make sure the IOMapper points to a valid, but
8175                          * bogus, address.  This should prevent further DMA
8176                          * accesses to freed memory.
8177                          */
8178                         IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8179                         mcl_paddr[offset] = 0;
8180                         kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8181                             sp->sl_len);
8182                         slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8183                         sp->sl_flags = 0;
8184                 }
8185         }
8186         mbstat.m_drain++;
8187         mbstat.m_bigclusters = m_total(MC_BIGCL);
8188         mbstat.m_clusters = m_total(MC_CL);
8189         mbstat.m_mbufs = m_total(MC_MBUF);
8190         mbuf_stat_sync();
8191         mbuf_mtypes_sync(TRUE);
8192         lck_mtx_unlock(mbuf_mlock);
8193 }
8194
8195 static int
8196 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8197 {
8198 #pragma unused(arg1, arg2)
8199         int val = 0, err;
8200
8201         err = sysctl_handle_int(oidp, &val, 0, req);
8202         if (err != 0 || req->newptr == USER_ADDR_NULL)
8203                 return (err);
8204         if (val) {
8205                 lck_mtx_lock(mbuf_mlock);
8206                 printf("%s\n", mbuf_dump());
8207                 lck_mtx_unlock(mbuf_mlock);
8208                 m_drain();
8209         }
8210
8211         return (err);
8212 }
8213
8214 #if DEBUG || DEVELOPMENT
8215
8216 static int mbtest_val;
8217 static int mbtest_running;
8218
8219 static void mbtest_thread(__unused void *arg)
8220 {
8221         int i;
8222
8223         printf("%s thread starting\n", __func__);
8224
8225         for (i = 0; i < 1000; i++) {
8226                 unsigned int needed = 100000;
8227                 struct mbuf *m1, *m2, *m3;
8228
8229                 if (njcl > 0) {
8230                         needed = 100000;
8231                         m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
8232                         m_freem_list(m3);
8233                 }
8234
8235                 needed = 100000;
8236                 m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
8237                 m_freem_list(m2);
8238
8239                 m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
8240                 m_freem_list(m1);
8241         }
8242
8243         printf("%s thread ending\n", __func__);
8244
8245         OSDecrementAtomic(&mbtest_running);
8246         wakeup_one((caddr_t)&mbtest_running);
8247 }
8248
8249 static void sysctl_mbtest(void)
8250 {
8251         /* We launch three threads - wait for all of them */
8252         OSIncrementAtomic(&mbtest_running);
8253         OSIncrementAtomic(&mbtest_running);
8254         OSIncrementAtomic(&mbtest_running);
8255
8256         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8257         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8258         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8259
8260         while (mbtest_running) {
8261                 msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
8262         }
8263 }
8264
8265 static int
8266 mbtest SYSCTL_HANDLER_ARGS
8267 {
8268 #pragma unused(arg1, arg2)
8269         int error = 0, val, oldval = mbtest_val;
8270
8271         val = oldval;
8272         error = sysctl_handle_int(oidp, &val, 0, req);
8273         if (error || !req->newptr)
8274                 return (error);
8275
8276         if (val != oldval)
8277                 sysctl_mbtest();
8278
8279         mbtest_val = val;
8280
8281         return (error);
8282 }
8283 #endif
8284
8285
8286 static void
8287 mtracelarge_register(size_t size)
8288 {
8289         int i;
8290         struct mtracelarge *trace;
8291         uintptr_t bt[MLEAK_STACK_DEPTH];
8292         unsigned int depth;
8293
8294         depth = backtrace(bt, MLEAK_STACK_DEPTH);
8295         /* Check if this entry is already on the list. */
8296         for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8297                 trace = &mtracelarge_table[i];
8298                 if (trace->size == size && trace->depth == depth &&
8299                     memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
8300                         return;
8301                 }
8302
8303         }
8304         for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8305                 trace = &mtracelarge_table[i];
8306                 if (size > trace->size) {
8307                         trace->depth = depth;
8308                         memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8309                         trace->size = size;
8310                         break;
8311                 }
8312         }
8313 }
8314
8315 SYSCTL_DECL(_kern_ipc);
8316 #if DEBUG || DEVELOPMENT
8317 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
8318     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I",
8319     "Toggle to test mbufs");
8320 #endif
8321 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
8322     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8323     0, 0, mbstat_sysctl, "S,mbstat", "");
8324 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
8325     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8326     0, 0, mb_stat_sysctl, "S,mb_stat", "");
8327 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
8328     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8329     0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
8330 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
8331     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8332     0, 0, mleak_table_sysctl, "S,mleak_table", "");
8333 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
8334     CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
8335 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
8336     CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
8337 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
8338     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
8339 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
8340     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
8341     m_drain_force_sysctl, "I",
8342     "Forces the mbuf garbage collection to run");
8343 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
8344     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
8345     "Minimum time interval between garbage collection");