bsd/kern/uipc_mbuf.c

   1 /*
   2  * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
  29 /*
  30  * Copyright (c) 1982, 1986, 1988, 1991, 1993
  31  *      The Regents of the University of California.  All rights reserved.
  32  *
  33  * Redistribution and use in source and binary forms, with or without
  34  * modification, are permitted provided that the following conditions
  35  * are met:
  36  * 1. Redistributions of source code must retain the above copyright
  37  *    notice, this list of conditions and the following disclaimer.
  38  * 2. Redistributions in binary form must reproduce the above copyright
  39  *    notice, this list of conditions and the following disclaimer in the
  40  *    documentation and/or other materials provided with the distribution.
  41  * 3. All advertising materials mentioning features or use of this software
  42  *    must display the following acknowledgement:
  43  *      This product includes software developed by the University of
  44  *      California, Berkeley and its contributors.
  45  * 4. Neither the name of the University nor the names of its contributors
  46  *    may be used to endorse or promote products derived from this software
  47  *    without specific prior written permission.
  48  *
  49  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  50  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  51  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  52  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  53  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  54  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  55  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  56  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  57  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  58  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  59  * SUCH DAMAGE.
  60  *
  61  *      @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
  62  */
  63 /*
  64  * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
  65  * support for mandatory and extensible security protections.  This notice
  66  * is included in support of clause 2.2 (b) of the Apple Public License,
  67  * Version 2.0.
  68  */
  69
  70 #include <sys/param.h>
  71 #include <sys/systm.h>
  72 #include <sys/malloc.h>
  73 #include <sys/mbuf.h>
  74 #include <sys/kernel.h>
  75 #include <sys/sysctl.h>
  76 #include <sys/syslog.h>
  77 #include <sys/protosw.h>
  78 #include <sys/domain.h>
  79 #include <sys/queue.h>
  80 #include <sys/proc.h>
  81
  82 #include <dev/random/randomdev.h>
  83
  84 #include <kern/kern_types.h>
  85 #include <kern/simple_lock.h>
  86 #include <kern/queue.h>
  87 #include <kern/sched_prim.h>
  88 #include <kern/backtrace.h>
  89 #include <kern/cpu_number.h>
  90 #include <kern/zalloc.h>
  91
  92 #include <libkern/OSAtomic.h>
  93 #include <libkern/OSDebug.h>
  94 #include <libkern/libkern.h>
  95
  96 #include <os/log.h>
  97
  98 #include <IOKit/IOMapper.h>
  99
 100 #include <machine/limits.h>
 101 #include <machine/machine_routines.h>
 102
 103 #if CONFIG_MACF_NET
 104 #include <security/mac_framework.h>
 105 #endif /* CONFIG_MACF_NET */
 106
 107 #include <sys/mcache.h>
 108 #include <net/ntstat.h>
 109
 110 /*
 111  * MBUF IMPLEMENTATION NOTES.
 112  *
 113  * There is a total of 5 per-CPU caches:
 114  *
 115  * MC_MBUF:
 116  *      This is a cache of rudimentary objects of MSIZE in size; each
 117  *      object represents an mbuf structure.  This cache preserves only
 118  *      the m_type field of the mbuf during its transactions.
 119  *
 120  * MC_CL:
 121  *      This is a cache of rudimentary objects of MCLBYTES in size; each
 122  *      object represents a mcluster structure.  This cache does not
 123  *      preserve the contents of the objects during its transactions.
 124  *
 125  * MC_BIGCL:
 126  *      This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 127  *      object represents a mbigcluster structure.  This cache does not
 128  *      preserve the contents of the objects during its transactions.
 129  *
 130  * MC_MBUF_CL:
 131  *      This is a cache of mbufs each having a cluster attached to it.
 132  *      It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 133  *      fields of the mbuf related to the external cluster are preserved
 134  *      during transactions.
 135  *
 136  * MC_MBUF_BIGCL:
 137  *      This is a cache of mbufs each having a big cluster attached to it.
 138  *      It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 139  *      fields of the mbuf related to the external cluster are preserved
 140  *      during transactions.
 141  *
 142  * OBJECT ALLOCATION:
 143  *
 144  * Allocation requests are handled first at the per-CPU (mcache) layer
 145  * before falling back to the slab layer.  Performance is optimal when
 146  * the request is satisfied at the CPU layer because global data/lock
 147  * never gets accessed.  When the slab layer is entered for allocation,
 148  * the slab freelist will be checked first for available objects before
 149  * the VM backing store is invoked.  Slab layer operations are serialized
 150  * for all of the caches as the mbuf global lock is held most of the time.
 151  * Allocation paths are different depending on the class of objects:
 152  *
 153  * a. Rudimentary object:
 154  *
 155  *      { m_get_common(), m_clattach(), m_mclget(),
 156  *        m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 157  *        composite object allocation }
 158  *                      |       ^
 159  *                      |       |
 160  *                      |       +-----------------------+
 161  *                      v                               |
 162  *         mcache_alloc/mcache_alloc_ext()      mbuf_slab_audit()
 163  *                      |                               ^
 164  *                      v                               |
 165  *                 [CPU cache] -------> (found?) -------+
 166  *                      |                               |
 167  *                      v                               |
 168  *               mbuf_slab_alloc()                      |
 169  *                      |                               |
 170  *                      v                               |
 171  *      +---------> [freelist] -------> (found?) -------+
 172  *      |               |
 173  *      |               v
 174  *      |           m_clalloc()
 175  *      |               |
 176  *      |               v
 177  *      +---<<---- kmem_mb_alloc()
 178  *
 179  * b. Composite object:
 180  *
 181  *      { m_getpackets_internal(), m_allocpacket_internal() }
 182  *                      |       ^
 183  *                      |       |
 184  *                      |       +------ (done) ---------+
 185  *                      v                               |
 186  *         mcache_alloc/mcache_alloc_ext()      mbuf_cslab_audit()
 187  *                      |                               ^
 188  *                      v                               |
 189  *                 [CPU cache] -------> (found?) -------+
 190  *                      |                               |
 191  *                      v                               |
 192  *               mbuf_cslab_alloc()                     |
 193  *                      |                               |
 194  *                      v                               |
 195  *                  [freelist] -------> (found?) -------+
 196  *                      |                               |
 197  *                      v                               |
 198  *              (rudimentary object)                    |
 199  *         mcache_alloc/mcache_alloc_ext() ------>>-----+
 200  *
 201  * Auditing notes: If auditing is enabled, buffers will be subjected to
 202  * integrity checks by the audit routine.  This is done by verifying their
 203  * contents against DEADBEEF (free) pattern before returning them to caller.
 204  * As part of this step, the routine will also record the transaction and
 205  * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 206  * also restore any constructed data structure fields if necessary.
 207  *
 208  * OBJECT DEALLOCATION:
 209  *
 210  * Freeing an object simply involves placing it into the CPU cache; this
 211  * pollutes the cache to benefit subsequent allocations.  The slab layer
 212  * will only be entered if the object is to be purged out of the cache.
 213  * During normal operations, this happens only when the CPU layer resizes
 214  * its bucket while it's adjusting to the allocation load.  Deallocation
 215  * paths are different depending on the class of objects:
 216  *
 217  * a. Rudimentary object:
 218  *
 219  *      { m_free(), m_freem_list(), composite object deallocation }
 220  *                      |       ^
 221  *                      |       |
 222  *                      |       +------ (done) ---------+
 223  *                      v                               |
 224  *         mcache_free/mcache_free_ext()                |
 225  *                      |                               |
 226  *                      v                               |
 227  *              mbuf_slab_audit()                       |
 228  *                      |                               |
 229  *                      v                               |
 230  *                 [CPU cache] ---> (not purging?) -----+
 231  *                      |                               |
 232  *                      v                               |
 233  *               mbuf_slab_free()                       |
 234  *                      |                               |
 235  *                      v                               |
 236  *                  [freelist] ----------->>------------+
 237  *       (objects get purged to VM only on demand)
 238  *
 239  * b. Composite object:
 240  *
 241  *      { m_free(), m_freem_list() }
 242  *                      |       ^
 243  *                      |       |
 244  *                      |       +------ (done) ---------+
 245  *                      v                               |
 246  *         mcache_free/mcache_free_ext()                |
 247  *                      |                               |
 248  *                      v                               |
 249  *              mbuf_cslab_audit()                      |
 250  *                      |                               |
 251  *                      v                               |
 252  *                 [CPU cache] ---> (not purging?) -----+
 253  *                      |                               |
 254  *                      v                               |
 255  *               mbuf_cslab_free()                      |
 256  *                      |                               |
 257  *                      v                               |
 258  *                  [freelist] ---> (not purging?) -----+
 259  *                      |                               |
 260  *                      v                               |
 261  *              (rudimentary object)                    |
 262  *         mcache_free/mcache_free_ext() ------->>------+
 263  *
 264  * Auditing notes: If auditing is enabled, the audit routine will save
 265  * any constructed data structure fields (if necessary) before filling the
 266  * contents of the buffers with DEADBEEF (free) pattern and recording the
 267  * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 268  * expected to contain the free pattern.
 269  *
 270  * DEBUGGING:
 271  *
 272  * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 273  * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 274  * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 275  * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 276  * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 277  * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 278  *
 279  * Each object is associated with exactly one mcache_audit_t structure that
 280  * contains the information related to its last buffer transaction.  Given
 281  * an address of an object, the audit structure can be retrieved by finding
 282  * the position of the object relative to the base address of the cluster:
 283  *
 284  *      +------------+                  +=============+
 285  *      | mbuf addr  |                  | mclaudit[i] |
 286  *      +------------+                  +=============+
 287  *            |                         | cl_audit[0] |
 288  *      i = MTOBG(addr)                 +-------------+
 289  *            |                 +-----> | cl_audit[1] | -----> mcache_audit_t
 290  *      b = BGTOM(i)            |       +-------------+
 291  *            |                 |       |     ...     |
 292  *      x = MCLIDX(b, addr)     |       +-------------+
 293  *            |                 |       | cl_audit[7] |
 294  *            +-----------------+       +-------------+
 295  *               (e.g. x == 1)
 296  *
 297  * The mclaudit[] array is allocated at initialization time, but its contents
 298  * get populated when the corresponding cluster is created.  Because a page
 299  * can be turned into NMBPG number of mbufs, we reserve enough space for the
 300  * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 301  * gets (or has not yet been) turned into mbufs will use only cl_audit[0] with
 302  * the remaining entries unused.  For a 16KB cluster, only one entry from the
 303  * first page is allocated and used for the entire object.
 304  */
 305
 306 /* TODO: should be in header file */
 307 /* kernel translator */
 308 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
 309 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
 310 extern vm_map_t mb_map;         /* special map */
 311 /* Allocation failure bookkeeping (presumably for kmem_mb_alloc() -- confirm). */
 312 static uint32_t mb_kmem_contig_failed;
 313 static uint32_t mb_kmem_failed;
 314 static uint32_t mb_kmem_one_failed;
 315 /* Timestamps of allocation failures. */
 316 static uint64_t mb_kmem_contig_failed_ts;
 317 static uint64_t mb_kmem_failed_ts;
 318 static uint64_t mb_kmem_one_failed_ts;
 319 static uint64_t mb_kmem_contig_failed_size;
 320 static uint64_t mb_kmem_failed_size;
 321 static uint32_t mb_kmem_stats[6];       /* 6 slots; 6 labels follow */
 322 static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
 323                                               "INVALID_ADDRESS",
 324                                               "RESOURCE_SHORTAGE",
 325                                               "NO_SPACE",
 326                                               "KERN_FAILURE",
 327                                               "OTHERS" };
 328
 329 /* Global lock */
 330 decl_lck_mtx_data(static, mbuf_mlock_data);
 331 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
 332 static lck_attr_t *mbuf_mlock_attr;
 333 static lck_grp_t *mbuf_mlock_grp;
 334 static lck_grp_attr_t *mbuf_mlock_grp_attr;
 335
 336 /* Back-end (common) layer */
 337 static uint64_t mb_expand_cnt;
 338 static uint64_t mb_expand_cl_cnt;
 339 static uint64_t mb_expand_cl_total;
 340 static uint64_t mb_expand_bigcl_cnt;
 341 static uint64_t mb_expand_bigcl_total;
 342 static uint64_t mb_expand_16kcl_cnt;
 343 static uint64_t mb_expand_16kcl_total;
 344 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
 345 static uint32_t mbuf_worker_run_cnt;
 346 static uint64_t mbuf_worker_last_runtime;
 347 static uint64_t mbuf_drain_last_runtime;
 348 static int mbuf_worker_ready;   /* worker thread is runnable */
 349 static int ncpu;                /* number of CPUs */
 350 static ppnum_t *mcl_paddr;      /* Array of cluster physical addresses */
 351 static ppnum_t mcl_pages;       /* Size of array (# physical pages) */
 352 static ppnum_t mcl_paddr_base;  /* Handle returned by IOMapper::iovmAlloc() */
 353 static mcache_t *ref_cache;     /* Cache of cluster reference & flags */
 354 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
 355 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
 356 static unsigned int mb_normalized; /* number of packets "normalized" */
 357
 358 #define MB_GROWTH_AGGRESSIVE    1       /* Threshold: 1/2 of total */
 359 #define MB_GROWTH_NORMAL        2       /* Threshold: 3/4 of total */
 360
 361 typedef enum {          /* values index mbuf_table[]; keep the order in sync */
 362         MC_MBUF = 0,    /* Regular mbuf */
 363         MC_CL,          /* Cluster */
 364         MC_BIGCL,       /* Large (4KB) cluster */
 365         MC_16KCL,       /* Jumbo (16KB) cluster; last non-composite class */
 366         MC_MBUF_CL,     /* Composite: mbuf + cluster */
 367         MC_MBUF_BIGCL,  /* Composite: mbuf + large (4KB) cluster */
 368         MC_MBUF_16KCL   /* Composite: mbuf + jumbo (16KB) cluster */
 369 } mbuf_class_t;
 370
 371 #define MBUF_CLASS_MIN          MC_MBUF         /* lowest valid class */
 372 #define MBUF_CLASS_MAX          MC_MBUF_16KCL   /* highest valid class */
 373 #define MBUF_CLASS_LAST         MC_16KCL        /* last non-composite class */
 374 #define MBUF_CLASS_VALID(c) \
 375         ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
 376 #define MBUF_CLASS_COMPOSITE(c) \
 377         ((int)(c) > MBUF_CLASS_LAST)    /* i.e. MC_MBUF_{CL,BIGCL,16KCL} */
 378
 379
 380 /*
 381  * mbuf specific mcache allocation request flags.
 382  */
 383 #define MCR_COMP        MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
 384
 385 /*
 386  * Per-cluster slab structure.
 387  *
 388  * A slab is a cluster control structure that contains one or more object
 389  * chunks; the available chunks are chained in the slab's freelist (sl_head).
 390  * Each time a chunk is taken out of the slab, the slab's reference count
 391  * gets incremented.  When all chunks have been taken out, the empty slab
 392  * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 393  * returned to a slab causes the slab's reference count to be decremented;
 394  * it also causes the slab to be reinserted back to class's slab list, if
 395  * it's not already done.
 396  *
 397  * Compartmentalizing of the object chunks into slabs allows us to easily
 398  * merge one or more slabs together when the adjacent slabs are idle, as
 399  * well as to convert or move a slab from one class to another; e.g. the
 400  * mbuf cluster slab can be converted to a regular cluster slab when all
 401  * mbufs in the slab have been freed.
 402  *
 403  * A slab may also span across multiple clusters for chunks larger than
 404  * a cluster's size.  In this case, only the slab of the first cluster is
 405  * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 406  * that they are part of the larger slab.
 407  *
 408  * Each slab controls a page of memory.
 409  */
 410 typedef struct mcl_slab {
 411         struct mcl_slab *sl_next;       /* neighboring slab */
 412         u_int8_t        sl_class;       /* controlling mbuf class */
 413         int8_t          sl_refcnt;      /* chunks currently taken out */
 414         int8_t          sl_chunks;      /* chunks (bufs) in this slab */
 415         u_int16_t       sl_flags;       /* SLF_* flags (see below) */
 416         u_int16_t       sl_len;         /* slab length */
 417         void            *sl_base;       /* base of allocated memory */
 418         void            *sl_head;       /* first free chunk (freelist head) */
 419         TAILQ_ENTRY(mcl_slab) sl_link;  /* linkage on the class's slab list */
 420 } mcl_slab_t;
 421
 422 #define SLF_MAPPED      0x0001          /* backed by a mapped page */
 423 #define SLF_PARTIAL     0x0002          /* part of another slab */
 424 #define SLF_DETACHED    0x0004          /* not in slab freelist */
 425
 426 /*
 427  * The array of slabs is broken into groups of arrays per 1MB of kernel
 428  * memory to reduce the footprint.  Each group is allocated on demand
 429  * whenever a new piece of memory mapped in from the VM crosses the 1MB
 430  * boundary.
 431  */
 432 #define NSLABSPMB       ((1 << MBSHIFT) >> PAGE_SHIFT)  /* pages per group */
 433 /* One slab group; slabstbl[] holds pointers to these. */
 434 typedef struct mcl_slabg {
 435         mcl_slab_t      *slg_slab;      /* group of slabs */
 436 } mcl_slabg_t;
 437
 438 /*
 439  * Number of slabs needed to control a 16KB cluster object.
 440  */
 441 #define NSLABSP16KB     (M16KCLBYTES >> PAGE_SHIFT)
 442
 443 /*
 444  * Per-cluster audit structure.
 445  */
 446 typedef struct {
 447         mcache_audit_t  **cl_audit;     /* array of audits */
 448 } mcl_audit_t;
 449 /* Who/when/where of an object's current and previous transaction. */
 450 typedef struct {
 451         struct thread   *msa_thread;    /* thread doing transaction */
 452         struct thread   *msa_pthread;   /* previous transaction thread */
 453         uint32_t        msa_tstamp;     /* transaction timestamp (ms) */
 454         uint32_t        msa_ptstamp;    /* prev transaction timestamp (ms) */
 455         uint16_t        msa_depth;      /* pc stack depth */
 456         uint16_t        msa_pdepth;     /* previous transaction pc stack depth */
 457         void            *msa_stack[MCACHE_STACK_DEPTH];  /* pc stack */
 458         void            *msa_pstack[MCACHE_STACK_DEPTH]; /* prev pc stack */
 459 } mcl_scratch_audit_t;
 460
 461 typedef struct {
 462         /*
 463          * Size of data from the beginning of an mbuf that covers m_hdr,
 464          * pkthdr and m_ext structures.  If auditing is enabled, we allocate
 465          * a shadow mbuf structure of this size inside each audit structure,
 466          * and the contents of the real mbuf gets copied into it when the mbuf
 467          * is freed.  This allows us to pattern-fill the mbuf for integrity
 468          * check, and to preserve any constructed mbuf fields (e.g. mbuf +
 469          * cluster cache case).  Note that we don't save the contents of
 470          * clusters when they are freed; we simply pattern-fill them.
 471          */
 472         u_int8_t                sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
 473         mcl_scratch_audit_t     sc_scratch __attribute__((aligned(8)));
 474 } mcl_saved_contents_t;
 475
 476 #define AUDIT_CONTENTS_SIZE     (sizeof (mcl_saved_contents_t))
 477 /* Accessors into the mcl_saved_contents_t at (_mca)->mca_contents. */
 478 #define MCA_SAVED_MBUF_PTR(_mca)                                        \
 479         ((struct mbuf *)(void *)((mcl_saved_contents_t *)               \
 480         (_mca)->mca_contents)->sc_mbuf)         /* the shadow mbuf */
 481 #define MCA_SAVED_MBUF_SIZE                                             \
 482         (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf)) /* bytes in sc_mbuf */
 483 #define MCA_SAVED_SCRATCH_PTR(_mca)                                     \
 484         (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
 485
 486 /*
 487  * mbuf specific mcache audit flags
 488  */
 489 #define MB_INUSE        0x01    /* object has not been returned to slab */
 490 #define MB_COMP_INUSE   0x02    /* object has not been returned to cslab */
 491 #define MB_SCVALID      0x04    /* object has valid saved contents */
 492
 493 /*
 494  * Each of the following two arrays holds up to nmbclusters elements.
 495  */
 496 static mcl_audit_t *mclaudit;   /* array of cluster audit information */
 497 static unsigned int maxclaudit; /* max # of entries in audit table */
 498 static mcl_slabg_t **slabstbl;  /* cluster slabs table */
 499 static unsigned int maxslabgrp; /* max # of entries in slabs table */
 500 static unsigned int slabgrp;    /* # of entries in slabs table */
 501
 502 /* Globals */
 503 int nclusters;                  /* # of clusters for non-jumbo (legacy) sizes */
 504 int njcl;                       /* # of clusters for jumbo sizes */
 505 int njclbytes;                  /* size of a jumbo cluster */
 506 unsigned char *mbutl;           /* first mapped cluster address */
 507 unsigned char *embutl;          /* ending virtual address of mclusters */
 508 int _max_linkhdr;               /* largest link-level header */
 509 int _max_protohdr;              /* largest protocol header */
 510 int max_hdr;                    /* largest link+protocol header */
 511 int max_datalen;                /* MHLEN - max_hdr */
 512
 513 static boolean_t mclverify;     /* debug: pattern-checking */
 514 static boolean_t mcltrace;      /* debug: stack tracing */
 515 static boolean_t mclfindleak;   /* debug: leak detection */
 516 static boolean_t mclexpleak;    /* debug: expose leak info to user space */
 517
 518 static struct timeval mb_start; /* beginning of time */
 519
 520 /* mbuf leak detection variables */
 521 static struct mleak_table mleak_table;
 522 static mleak_stat_t *mleak_stat;
 523 /* Bytes from the start of an mleak_stat_t up to ml_trace[n] (n entries). */
 524 #define MLEAK_STAT_SIZE(n) \
 525         __builtin_offsetof(mleak_stat_t, ml_trace[n])
 526
 527 struct mallocation {
 528         mcache_obj_t *element;  /* the alloc'ed element, NULL if unused */
 529         u_int32_t trace_index;  /* mtrace index for corresponding backtrace */
 530         u_int32_t count;        /* How many objects were requested */
 531         u_int64_t hitcount;     /* for determining hash effectiveness */
 532 };
 533 /* mleak_traces[] entry; mallocation.trace_index refers to one of these. */
 534 struct mtrace {
 535         u_int64_t       collisions;
 536         u_int64_t       hitcount;
 537         u_int64_t       allocs;
 538         u_int64_t       depth;  /* presumably # of addr[] slots used -- confirm */
 539         uintptr_t       addr[MLEAK_STACK_DEPTH];        /* backtrace addresses */
 540 };
 541
 542 /* Size must be a power of two for the zhash to be able to just mask off bits */
 543 #define MLEAK_ALLOCATION_MAP_NUM        512
 544 #define MLEAK_TRACE_MAP_NUM             256
 545
 546 /*
 547  * Sample factor for how often to record a trace.  This can be overridden
 548  * by the boot-arg mleak_sample_factor.
 549  */
 550 #define MLEAK_SAMPLE_FACTOR             500
 551
 552 /*
 553  * Number of top leakers recorded.
 554  */
 555 #define MLEAK_NUM_TRACES                5
 556
 557 #define MB_LEAK_SPACING_64 "                    "
 558 #define MB_LEAK_SPACING_32 "            "
 559
 560
 561 #define MB_LEAK_HDR_32  "\n\
 562     trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
 563     ----------  ----------  ----------  ----------  ---------- \n\
 564 "
 565
 566 #define MB_LEAK_HDR_64  "\n\
 567     trace [1]           trace [2]           trace [3]       \
 568         trace [4]           trace [5]      \n\
 569     ------------------  ------------------  ------------------  \
 570     ------------------  ------------------ \n\
 571 "
 572
 573 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
 574 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
 575
 576 /* Hashmaps of allocations and their corresponding traces */
 577 static struct mallocation *mleak_allocations;
 578 static struct mtrace *mleak_traces;
 579 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
 580
 581 /* Lock to protect mleak tables from concurrent modification */
 582 decl_lck_mtx_data(static, mleak_lock_data);
 583 static lck_mtx_t *mleak_lock = &mleak_lock_data;
 584 static lck_attr_t *mleak_lock_attr;
 585 static lck_grp_t *mleak_lock_grp;
 586 static lck_grp_attr_t *mleak_lock_grp_attr;
 587
 588 /* *Failed* large allocations. */
 589 struct mtracelarge {
 590         uint64_t        size;
 591         uint64_t        depth;
 592         uintptr_t       addr[MLEAK_STACK_DEPTH];
 593 };
 594
 595 #define MTRACELARGE_NUM_TRACES          5
 596 static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
 597
 598 static void mtracelarge_register(size_t size);
 599
 600 /* Lock to protect the completion callback table */
 601 static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
 602 static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
 603 static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
 604 decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
 605 lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
 606
 607 extern u_int32_t high_sb_max;
 608
 609 /* The minimum number of objects that are allocated, to start. */
 610 #define MINCL           32
 611 #define MINBIGCL        (MINCL >> 1)
 612 #define MIN16KCL        (MINCL >> 2)
 613
 614 /* Low watermarks (only map in pages once free counts go below) */
 615 #define MBIGCL_LOWAT    MINBIGCL
 616 #define M16KCL_LOWAT    MIN16KCL
 617
 618 typedef struct {
 619         mbuf_class_t    mtbl_class;     /* class type */
 620         mcache_t        *mtbl_cache;    /* mcache for this buffer class */
 621         TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
 622         mcache_obj_t    *mtbl_cobjlist; /* composite objects freelist */
 623         mb_class_stat_t *mtbl_stats;    /* statistics fetchable via sysctl */
 624         u_int32_t       mtbl_maxsize;   /* maximum buffer size */
 625         int             mtbl_minlimit;  /* minimum allowed */
 626         int             mtbl_maxlimit;  /* maximum allowed */
 627         u_int32_t       mtbl_wantpurge; /* purge during next reclaim */
 628         uint32_t        mtbl_avgtotal;  /* average total on iOS */
 629         u_int32_t       mtbl_expand;    /* worker should expand the class */
 630 } mbuf_table_t;
 631 /* Per-class shorthands; mbcl_* ones need mtbl_stats set (NULL initially). */
 632 #define m_class(c)      mbuf_table[c].mtbl_class
 633 #define m_cache(c)      mbuf_table[c].mtbl_cache
 634 #define m_slablist(c)   mbuf_table[c].mtbl_slablist
 635 #define m_cobjlist(c)   mbuf_table[c].mtbl_cobjlist
 636 #define m_maxsize(c)    mbuf_table[c].mtbl_maxsize
 637 #define m_minlimit(c)   mbuf_table[c].mtbl_minlimit
 638 #define m_maxlimit(c)   mbuf_table[c].mtbl_maxlimit
 639 #define m_wantpurge(c)  mbuf_table[c].mtbl_wantpurge
 640 #define m_cname(c)      mbuf_table[c].mtbl_stats->mbcl_cname
 641 #define m_size(c)       mbuf_table[c].mtbl_stats->mbcl_size
 642 #define m_total(c)      mbuf_table[c].mtbl_stats->mbcl_total
 643 #define m_active(c)     mbuf_table[c].mtbl_stats->mbcl_active
 644 #define m_infree(c)     mbuf_table[c].mtbl_stats->mbcl_infree
 645 #define m_slab_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_slab_cnt
 646 #define m_alloc_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
 647 #define m_free_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_free_cnt
 648 #define m_notified(c)   mbuf_table[c].mtbl_stats->mbcl_notified
 649 #define m_purge_cnt(c)  mbuf_table[c].mtbl_stats->mbcl_purge_cnt
 650 #define m_fail_cnt(c)   mbuf_table[c].mtbl_stats->mbcl_fail_cnt
 651 #define m_ctotal(c)     mbuf_table[c].mtbl_stats->mbcl_ctotal
 652 #define m_peak(c)       mbuf_table[c].mtbl_stats->mbcl_peak_reported
 653 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
 654 #define m_region_expand(c)      mbuf_table[c].mtbl_expand
 655
 656 static mbuf_table_t mbuf_table[] = {
             /*
              * Rows use positional initializers.  The only non-zero numeric
              * value in each row is mtbl_avgtotal; the size and limit fields
              * are left at zero here and are assigned at run time by
              * mbuf_table_init(), which also attaches mtbl_stats.  Rows are
              * looked up by class value (mbuf_table[c]), so each row is
              * expected to sit at the index equal to its class.
              */
 657         /*
 658          * The caches for mbufs, regular clusters and big clusters.
 659          * The average total values were based on data gathered by actual
 660          * usage patterns on iOS.
 661          */
 662         { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
 663             NULL, NULL, 0, 0, 0, 0, 3000, 0 },
 664         { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
 665             NULL, NULL, 0, 0, 0, 0, 2000, 0 },
 666         { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
 667             NULL, NULL, 0, 0, 0, 0, 1000, 0 },
 668         { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
 669             NULL, NULL, 0, 0, 0, 0, 200, 0 },
 670         /*
 671          * The following are special caches; they serve as intermediate
 672          * caches backed by the above rudimentary caches.  Each object
 673          * in the cache is an mbuf with a cluster attached to it.  Unlike
 674          * the above caches, these intermediate caches do not directly
 675          * deal with the slab structures; instead, the constructed
 676          * cached elements are simply stored in the freelists.
 677          */
 678         { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
 679         { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
 680         { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
 681 };
 682
     /*
      * Number of elements in array a.  Only meaningful when a is a true
      * array; applied to a pointer it divides the pointer size instead.
      */
 683 #define NELEM(a)        (sizeof (a) / sizeof ((a)[0]))
 684
 685
     /*
      * Return the mtbl_avgtotal value recorded in mbuf_table[] for class c.
      * c is used directly as the table index and is not range-checked.
      */
 686 static uint32_t
 687 m_avgtotal(mbuf_class_t c)
 688 {
 689         return (mbuf_table[c].mtbl_avgtotal);
 690 }
 691
 692 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
 693 static int mb_waiters;                  /* number of waiters */
 694
 695 boolean_t mb_peak_newreport = FALSE;
 696 boolean_t mb_peak_firstreport = FALSE;
 697
 698 /* generate a report by default after 1 week of uptime */
 699 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD        604800
 700
 701 #define MB_WDT_MAXTIME  10              /* # of secs before watchdog panic */
 702 static struct timeval mb_wdtstart;      /* watchdog start timestamp */
 703 static char *mbuf_dump_buf;
 704
 705 #define MBUF_DUMP_BUF_SIZE      4096
 706
 707 /*
 708  * mbuf watchdog is enabled by default on embedded platforms.  It is
 709  * also toggleable via the kern.ipc.mb_watchdog sysctl.
 710  * Garbage collection is also enabled by default on embedded platforms.
 711  * mb_drain_maxint controls the amount of time to wait (in seconds) before
 712  * consecutive calls to mbuf_drain().
 713  */
 714 #if CONFIG_EMBEDDED
 715 static unsigned int mb_watchdog = 1;
 716 static unsigned int mb_drain_maxint = 60;
 717 #else
 718 static unsigned int mb_watchdog = 0;
 719 static unsigned int mb_drain_maxint = 0;
 720 #endif /* CONFIG_EMBEDDED */
 721
 722 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
 723 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
 724
 725 /* Red zone */
 726 static u_int32_t mb_redzone_cookie;
 727 static void m_redzone_init(struct mbuf *);
 728 static void m_redzone_verify(struct mbuf *m);
 729
 730 /* The following are used to serialize m_clalloc() */
 731 static boolean_t mb_clalloc_busy;
 732 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
 733 static int mb_clalloc_waiters;
 734
 735 static void mbuf_mtypes_sync(boolean_t);
 736 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
 737 static void mbuf_stat_sync(void);
 738 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
 739 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
 740 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
 741 static char *mbuf_dump(void);
 742 static void mbuf_table_init(void);
 743 static inline void m_incref(struct mbuf *);
 744 static inline u_int16_t m_decref(struct mbuf *);
 745 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
 746 static void mbuf_worker_thread_init(void);
 747 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
 748 static void slab_free(mbuf_class_t, mcache_obj_t *);
 749 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
 750     unsigned int, int);
 751 static void mbuf_slab_free(void *, mcache_obj_t *, int);
 752 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
 753 static void mbuf_slab_notify(void *, u_int32_t);
 754 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
 755     unsigned int);
 756 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
 757 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
 758     unsigned int, int);
 759 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
 760 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
 761 static int freelist_populate(mbuf_class_t, unsigned int, int);
 762 static void freelist_init(mbuf_class_t);
 763 static boolean_t mbuf_cached_above(mbuf_class_t, int);
 764 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
 765 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
 766 static int m_howmany(int, size_t);
 767 static void mbuf_worker_thread(void);
 768 static void mbuf_watchdog(void);
 769 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
 770
 771 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
 772     size_t, unsigned int);
 773 static void mcl_audit_free(void *, unsigned int);
 774 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
 775 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
 776 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
 777     boolean_t);
 778 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
 779 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
 780 static void mcl_audit_scratch(mcache_audit_t *);
 781 static void mcl_audit_mcheck_panic(struct mbuf *);
 782 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
 783
 784 static void mleak_activate(void);
 785 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
 786 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
 787 static void mleak_free(mcache_obj_t *);
 788 static void mleak_sort_traces(void);
 789 static void mleak_update_stats(void);
 790
 791 static mcl_slab_t *slab_get(void *);
 792 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
 793     void *, void *, unsigned int, int, int);
 794 static void slab_insert(mcl_slab_t *, mbuf_class_t);
 795 static void slab_remove(mcl_slab_t *, mbuf_class_t);
 796 static boolean_t slab_inrange(mcl_slab_t *, void *);
 797 static void slab_nextptr_panic(mcl_slab_t *, void *);
 798 static void slab_detach(mcl_slab_t *);
 799 static boolean_t slab_is_detached(mcl_slab_t *);
 800
 801 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
 802 static struct mbuf *m_split0(struct mbuf *, int, int, int);
 803 __private_extern__ void mbuf_report_peak_usage(void);
 804 static boolean_t mbuf_report_usage(mbuf_class_t);
 805 #if DEBUG || DEVELOPMENT
 806 #define mbwdog_logger(fmt, ...)  _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
 807 static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
 808 static char *mbwdog_logging;
 809 const unsigned mbwdog_logging_size = 4096;
 810 static size_t mbwdog_logging_used;
 811 #else
 812 #define mbwdog_logger(fmt, ...)  do { } while (0)
 813 #endif
 814 static void mbuf_drain_locked(boolean_t);
 815
 816 /* flags for m_copyback0 */
 817 #define M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
 818 #define M_COPYBACK0_PRESERVE    0x0002  /* preserve original data */
 819 #define M_COPYBACK0_COW         0x0004  /* do copy-on-write */
 820 #define M_COPYBACK0_EXTEND      0x0008  /* extend chain */
 821
 822 /*
 823  * This flag is set for all mbufs that come out of and into the composite
 824  * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 825  * are marked with such a flag have clusters attached to them, and will be
 826  * treated differently when they are freed; instead of being placed back
 827  * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 828  * are placed back into the appropriate composite cache's freelist, and the
 829  * actual freeing is deferred until the composite objects are purged.  At
 830  * such a time, this flag will be cleared from the mbufs and the objects
 831  * will be freed into their own separate freelists.
 832  */
 833 #define EXTF_COMPOSITE  0x1
 834
 835 /*
 836  * This flag indicates that the external cluster is read-only, i.e. it is
 837  * or was referred to by more than one mbuf.  Once set, this flag is never
 838  * cleared.
 839  */
 840 #define EXTF_READONLY   0x2
 841 /*
 842  * This flag indicates that the external cluster is paired with the mbuf.
 843  * Pairing implies an external free routine defined which will be invoked
 844  * when the reference count drops to the minimum at m_free time.  This
 845  * flag is never cleared.
 846  */
 847 #define EXTF_PAIRED     0x4
 848
 849 #define EXTF_MASK       \
 850         (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
 851
     /*
      * Accessors for the fields of the structure returned by m_get_rfa(m);
      * each expands to the field itself.  MBUF_IS_COMPOSITE(m) holds when the
      * reference count equals the minimum reference count and, of the
      * EXTF_MASK bits, only EXTF_COMPOSITE is set.
      */
 852 #define MEXT_MINREF(m)          ((m_get_rfa(m))->minref)
 853 #define MEXT_REF(m)             ((m_get_rfa(m))->refcnt)
 854 #define MEXT_PREF(m)            ((m_get_rfa(m))->prefcnt)
 855 #define MEXT_FLAGS(m)           ((m_get_rfa(m))->flags)
 856 #define MEXT_PRIV(m)            ((m_get_rfa(m))->priv)
 857 #define MEXT_PMBUF(m)           ((m_get_rfa(m))->paired)
 858 #define MEXT_TOKEN(m)           ((m_get_rfa(m))->ext_token)
 859 #define MBUF_IS_COMPOSITE(m)                                            \
 860         (MEXT_REF(m) == MEXT_MINREF(m) &&                               \
 861         (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
 862 /*
 863  * This macro can be used to test if the mbuf is paired to an external
 864  * cluster.  The test for MEXT_PMBUF being equal to the mbuf in subject
 865  * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
 866  * and thus survives calls to m_free_paired.
 867  */
 868 #define MBUF_IS_PAIRED(m)                                               \
 869         (((m)->m_flags & M_EXT) &&                                      \
 870         (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED &&                   \
 871         MEXT_PMBUF(m) == (m))
 872
 873 /*
 874  * Macros used to verify the integrity of the mbuf.
      *
      * _MCHECK(m) panics unless the mbuf is of type MT_FREE or is paired
      * (see MBUF_IS_PAIRED).  When mclaudit is NULL it calls panic()
      * directly with the offending type and address; otherwise it hands
      * the mbuf to mcl_audit_mcheck_panic().
 875  */
 876 #define _MCHECK(m) {                                                    \
 877         if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) {             \
 878                 if (mclaudit == NULL)                                   \
 879                         panic("MCHECK: m_type=%d m=%p",                 \
 880                             (u_int16_t)(m)->m_type, m);                 \
 881                 else                                                    \
 882                         mcl_audit_mcheck_panic(m);                      \
 883         }                                                               \
 884 }
 885
     /*
      * MBUF_IN_MAP(addr) is true when addr lies in the half-open range
      * [mbutl, embutl).  MRANGE(addr) panics when it does not.  The address
      * is cast to void * for the %p conversion, and no literal "0x" is
      * written in front of it so the prefix is not printed twice.
      */
 886 #define MBUF_IN_MAP(addr)                                               \
 887         ((unsigned char *)(addr) >= mbutl &&                            \
 888         (unsigned char *)(addr) < embutl)
 889
 890 #define MRANGE(addr) {                                                  \
 891         if (!MBUF_IN_MAP(addr))                                         \
 892                 panic("MRANGE: address out of range %p", (void *)(addr)); \
 893 }
 894
 895 /*
 896  * Macro version of mtod.
 897  */
 898 #define MTOD(m, t)      ((t)((m)->m_data))
 899
 900 /*
 901  * Macros to convert between an address in the mbuf map and its page index
      * relative to mbutl: MTOPG maps an address to a page index, PGTOM maps a
      * page index back to the address of that page.  The argument is
      * parenthesized so that an expression argument is cast or shifted as a
      * whole rather than operator by operator.
 902  */
 903 #define MTOPG(x)        (((unsigned char *)(x) - mbutl) >> PAGE_SHIFT)
 904 #define PGTOM(x)        (mbutl + ((x) << PAGE_SHIFT))
 905
 906 /*
 907  * Index of mbuf m relative to base c: byte offset shifted by MSIZESHIFT.
 908  */
 909 #define MBPAGEIDX(c, m) \
 910         (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)
 911
 912 /*
 913  * Same thing for 2KB cluster index (byte offset shifted by MCLSHIFT).
 914  */
 915 #define CLPAGEIDX(c, m) \
 916         (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
 917
 918 /*
 919  * Same thing for 4KB cluster index (byte offset shifted by MBIGCLSHIFT).
 920  */
 921 #define BCLPAGEIDX(c, m) \
 922         (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
 923
 924 /*
 925  * Macros used during mbuf and cluster initialization.
      *
      * MBUF_INIT_PKTHDR clears the packet header fields of (m) listed below
      * and then runs the classifier, tag, scratch and red zone initializers.
 926  */
 927 #define MBUF_INIT_PKTHDR(m) {                                           \
 928         (m)->m_pkthdr.rcvif = NULL;                                     \
 929         (m)->m_pkthdr.pkt_hdr = NULL;                                   \
 930         (m)->m_pkthdr.len = 0;                                          \
 931         (m)->m_pkthdr.csum_flags = 0;                                   \
 932         (m)->m_pkthdr.csum_data = 0;                                    \
 933         (m)->m_pkthdr.vlan_tag = 0;                                     \
 934         m_classifier_init(m, 0);                                        \
 935         m_tag_init(m, 1);                                               \
 936         m_scratch_init(m);                                              \
 937         m_redzone_init(m);                                              \
 938 }
 939
     /*
      * Initialize (m) as an empty mbuf of the given type.  _MCHECK runs
      * first and panics if the mbuf is neither MT_FREE nor paired.  With a
      * zero pkthdr, m_data points at m_dat and no flags are set; otherwise
      * m_data points at m_pktdat, M_PKTHDR is set and the packet header is
      * initialized.
      */
 940 #define MBUF_INIT(m, pkthdr, type) {                                    \
 941         _MCHECK(m);                                                     \
 942         (m)->m_next = (m)->m_nextpkt = NULL;                            \
 943         (m)->m_len = 0;                                                 \
 944         (m)->m_type = type;                                             \
 945         if ((pkthdr) == 0) {                                            \
 946                 (m)->m_data = (m)->m_dat;                               \
 947                 (m)->m_flags = 0;                                       \
 948         } else {                                                        \
 949                 (m)->m_data = (m)->m_pktdat;                            \
 950                 (m)->m_flags = M_PKTHDR;                                \
 951                 MBUF_INIT_PKTHDR(m);                                    \
 952         }                                                               \
 953 }
 954
     /*
      * Attach external storage to (m): point m_data and ext_buf at buf, set
      * M_EXT, hand rfa, the free routine and its argument to m_set_ext(),
      * record the size, and then store min/ref/pref/flag/priv/pm into the
      * fields reached through m_get_rfa(m).  m_set_ext() is called before
      * those stores -- presumably because it is what makes m_get_rfa(m)
      * resolve to rfa; confirm before reordering.
      */
 955 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag,   \
 956     priv, pm) {                                                         \
 957         (m)->m_data = (m)->m_ext.ext_buf = (buf);                       \
 958         (m)->m_flags |= M_EXT;                                          \
 959         m_set_ext((m), (rfa), (free), (arg));                           \
 960         (m)->m_ext.ext_size = (size);                                   \
 961         MEXT_MINREF(m) = (min);                                         \
 962         MEXT_REF(m) = (ref);                                            \
 963         MEXT_PREF(m) = (pref);                                          \
 964         MEXT_FLAGS(m) = (flag);                                         \
 965         MEXT_PRIV(m) = (priv);                                          \
 966         MEXT_PMBUF(m) = (pm);                                           \
 967 }
 968
     /*
      * MEXT_INIT wrappers for MC_CL, MC_BIGCL and MC_16KCL clusters.  The
      * size comes from m_maxsize() of the class; the minimum reference
      * count, prefcnt and priv are 0 and no paired mbuf is recorded.  The
      * MC_CL variant passes no free routine, the other two pass m_bigfree
      * and m_16kfree respectively.
      */
 969 #define MBUF_CL_INIT(m, buf, rfa, ref, flag)    \
 970         MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0,         \
 971             ref, 0, flag, 0, NULL)
 972
 973 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
 974         MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
 975             ref, 0, flag, 0, NULL)
 976
 977 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
 978         MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
 979             ref, 0, flag, 0, NULL)
 980
 981 /*
 982  * Macro to convert BSD malloc sleep flag to mcache's
 983  */
 984 #define MSLEEPF(f)      ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
 985
 986 /*
 987  * The structure that holds all mbuf class statistics exportable via sysctl.
 988  * Similar to mbstat structure, the mb_stat structure is protected by the
 989  * global mbuf lock.  It contains additional information about the classes
 990  * that allows for a more accurate view of the state of the allocator.
 991  */
 992 struct mb_stat *mb_stat;
 993 struct omb_stat *omb_stat;      /* For backwards compatibility */
 994
     /*
      * Size in bytes of an mb_stat / omb_stat structure that carries n class
      * entries, i.e. the offset of mbs_class[n] within the structure.  Both
      * use __builtin_offsetof rather than arithmetic on a null pointer.
      */
 995 #define MB_STAT_SIZE(n) \
 996         __builtin_offsetof(mb_stat_t, mbs_class[n])
 997 #define OMB_STAT_SIZE(n) \
 998         __builtin_offsetof(struct omb_stat, mbs_class[n])
 999
1000 /*
1001  * The legacy structure holding all of the mbuf allocation statistics.
1002  * The actual statistics used by the kernel are stored in the mbuf_table
1003  * instead, and are updated atomically while the global mbuf lock is held.
1004  * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1005  * Unlike before, the kernel no longer relies on the contents of mbstat for
1006  * its operations (e.g. cluster expansion) because the structure is exposed
1007  * to outside and could possibly be modified, therefore making it unsafe.
1008  * With the exception of the mbstat.m_mtypes array (see below), all of the
1009  * statistics are updated as they change.
1010  */
1011 struct mbstat mbstat;
1012
1013 #define MBSTAT_MTYPES_MAX \
1014         (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1015
1016 /*
1017  * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1018  * atomically and stored in a per-CPU structure which is lock-free; this is
1019  * done in order to avoid writing to the global mbstat data structure which
1020  * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
1021  * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1022  * array and returned to the application.  Any updates for types greater than
1023  * or equal to MT_MAX would be done atomically to the mbstat; this slows down
1024  * performance but is okay since the kernel uses only up to MT_MAX-1 while
1025  * anything beyond that (up to type 255) is considered a corner case.
1026  */
1027 typedef struct {
1028         unsigned int    cpu_mtypes[MT_MAX];
1029 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
1030
1031 typedef struct {
1032         mtypes_cpu_t    mbs_cpu[1];
1033 } mbuf_mtypes_t;
1034
1035 static mbuf_mtypes_t *mbuf_mtypes;      /* per-CPU statistics */
1036
     /*
      * MBUF_MTYPES_SIZE(n) is the byte offset of mbs_cpu[n] within
      * mbuf_mtypes_t.  MTYPES_CPU(p) adds the offset for the CPU returned by
      * cpu_number() to p, yielding that CPU's slot.
      */
1037 #define MBUF_MTYPES_SIZE(n) \
1038         ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
1039
1040 #define MTYPES_CPU(p) \
1041         ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
1042
     /*
      * Add n to the counter of mbuf type `type'.  Types below MT_MAX update
      * the current CPU's 32-bit slot; types from MT_MAX up to the number of
      * elements in mbstat.m_mtypes[] update mbstat directly with a 16-bit
      * atomic add; any larger type is silently ignored.  The sub/inc/dec
      * forms below are expressed in terms of mtype_stat_add.
      */
1043 #define mtype_stat_add(type, n) {                                       \
1044         if ((unsigned)(type) < MT_MAX) {                                \
1045                 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);            \
1046                 atomic_add_32(&mbs->cpu_mtypes[type], n);               \
1047         } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {    \
1048                 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);    \
1049         }                                                               \
1050 }
1051
1052 #define mtype_stat_sub(t, n)    mtype_stat_add(t, -(n))
1053 #define mtype_stat_inc(t)       mtype_stat_add(t, 1)
1054 #define mtype_stat_dec(t)       mtype_stat_sub(t, 1)
1055
     /*
      * Sum the per-CPU type counters and store the totals into
      * mbstat.m_mtypes[0 .. MT_MAX-1].
      *
      * `locked' says whether the caller already holds mbuf_mlock.  When it
      * does not, the lock is taken only around the final stores into mbstat;
      * the per-CPU counters themselves are read without it.
      */
1056 static void
1057 mbuf_mtypes_sync(boolean_t locked)
1058 {
1059         int m, n;
1060         mtypes_cpu_t mtc;
1061
1062         if (locked)
1063                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1064
1065         bzero(&mtc, sizeof (mtc));
1066         for (m = 0; m < ncpu; m++) {
1067                 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
1068                 mtypes_cpu_t temp;
1069
                     /* Snapshot this CPU's counters, then accumulate them */
1070                 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
1071                     sizeof (temp.cpu_mtypes));
1072
1073                 for (n = 0; n < MT_MAX; n++)
1074                         mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
1075         }
1076         if (!locked)
1077                 lck_mtx_lock(mbuf_mlock);
1078         for (n = 0; n < MT_MAX; n++)
1079                 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1080         if (!locked)
1081                 lck_mtx_unlock(mbuf_mlock);
1082 }
1083
     /*
      * Sysctl handler that refreshes mbstat.m_mtypes[] from the per-CPU
      * counters and copies the whole mbstat structure out to the requester.
      * mbuf_mlock is not held here, hence FALSE is passed to the sync
      * routine.  Returns the result of SYSCTL_OUT.
      */
1084 static int
1085 mbstat_sysctl SYSCTL_HANDLER_ARGS
1086 {
1087 #pragma unused(oidp, arg1, arg2)
1088         mbuf_mtypes_sync(FALSE);
1089
1090         return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
1091 }
1092
     /*
      * Refresh the derived statistics of every mbuf_table[] entry: the cache
      * state, the number of objects held in the cache layer, the active
      * count, the waiter/retry counters and the class-specific total.
      * The caller must hold mbuf_mlock.
      */
1093 static void
1094 mbuf_stat_sync(void)
1095 {
1096         mb_class_stat_t *sp;
1097         mcache_cpu_t *ccp;
1098         mcache_t *cp;
1099         int k, m, bktsize;
1100
1101         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1102
1103         for (k = 0; k < NELEM(mbuf_table); k++) {
1104                 cp = m_cache(k);
                     /* The bucket size is sampled from CPU 0's cache */
1105                 ccp = &cp->mc_cpu[0];
1106                 bktsize = ccp->cc_bktsize;
1107                 sp = mbuf_table[k].mtbl_stats;
1108
                     /* First matching condition decides the reported state */
1109                 if (cp->mc_flags & MCF_NOCPUCACHE)
1110                         sp->mbcl_mc_state = MCS_DISABLED;
1111                 else if (cp->mc_purge_cnt > 0)
1112                         sp->mbcl_mc_state = MCS_PURGING;
1113                 else if (bktsize == 0)
1114                         sp->mbcl_mc_state = MCS_OFFLINE;
1115                 else
1116                         sp->mbcl_mc_state = MCS_ONLINE;
1117
                     /*
                      * Cached count: positive per-CPU cc_objs and cc_pobjs
                      * values plus mc_full.bl_total buckets of bktsize each.
                      */
1118                 sp->mbcl_mc_cached = 0;
1119                 for (m = 0; m < ncpu; m++) {
1120                         ccp = &cp->mc_cpu[m];
1121                         if (ccp->cc_objs > 0)
1122                                 sp->mbcl_mc_cached += ccp->cc_objs;
1123                         if (ccp->cc_pobjs > 0)
1124                                 sp->mbcl_mc_cached += ccp->cc_pobjs;
1125                 }
1126                 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
                     /* Active is whatever is neither cached nor in the freelist */
1127                 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1128                     sp->mbcl_infree;
1129
1130                 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1131                 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1132                 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1133
1134                 /* Calculate total count specific to each class */
1135                 sp->mbcl_ctotal = sp->mbcl_total;
1136                 switch (m_class(k)) {
1137                 case MC_MBUF:
1138                         /* Deduct mbufs used in composite caches */
                             /*
                              * NOTE(review): the total of MC_MBUF_16KCL is not
                              * deducted here, unlike the other two composite
                              * classes -- confirm whether that is intentional.
                              */
1139                         sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1140                             m_total(MC_MBUF_BIGCL));
1141                         break;
1142
1143                 case MC_CL:
1144                         /* Deduct clusters used in composite cache */
1145                         sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1146                         break;
1147
1148                 case MC_BIGCL:
1149                         /* Deduct clusters used in composite cache */
1150                         sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1151                         break;
1152
1153                 case MC_16KCL:
1154                         /* Deduct clusters used in composite cache */
1155                         sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1156                         break;
1157
1158                 default:
                             /* Composite classes keep ctotal equal to total */
1159                         break;
1160                 }
1161         }
1162 }
1163
     /*
      * Sysctl handler that returns the per-class statistics.
      *
      * The statistics are refreshed under mbuf_mlock.  For a requester that
      * is not a 64-bit process they are additionally converted, field by
      * field, into the legacy omb_stat layout while the lock is still held.
      * The copy-out itself happens after the lock has been dropped.
      * Returns the result of SYSCTL_OUT.
      */
1164 static int
1165 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1166 {
1167 #pragma unused(oidp, arg1, arg2)
1168         void *statp;
1169         int k, statsz, proc64 = proc_is64bit(req->p);
1170
1171         lck_mtx_lock(mbuf_mlock);
1172         mbuf_stat_sync();
1173
1174         if (!proc64) {
1175                 struct omb_class_stat *oc;
1176                 struct mb_class_stat *c;
1177
                     /* Mirror every class entry into the legacy structure */
1178                 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1179                 oc = &omb_stat->mbs_class[0];
1180                 c = &mb_stat->mbs_class[0];
1181                 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1182                         (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1183                             "%s", c->mbcl_cname);
1184                         oc->mbcl_size = c->mbcl_size;
1185                         oc->mbcl_total = c->mbcl_total;
1186                         oc->mbcl_active = c->mbcl_active;
1187                         oc->mbcl_infree = c->mbcl_infree;
1188                         oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1189                         oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1190                         oc->mbcl_free_cnt = c->mbcl_free_cnt;
1191                         oc->mbcl_notified = c->mbcl_notified;
1192                         oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1193                         oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1194                         oc->mbcl_ctotal = c->mbcl_ctotal;
1195                         oc->mbcl_release_cnt = c->mbcl_release_cnt;
1196                         oc->mbcl_mc_state = c->mbcl_mc_state;
1197                         oc->mbcl_mc_cached = c->mbcl_mc_cached;
1198                         oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1199                         oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1200                         oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1201                 }
1202                 statp = omb_stat;
1203                 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1204         } else {
1205                 statp = mb_stat;
1206                 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1207         }
1208
1209         lck_mtx_unlock(mbuf_mlock);
1210
1211         return (SYSCTL_OUT(req, statp, statsz));
1212 }
1213
     /*
      * Sysctl handler that copies out mleak_stat, sized for MLEAK_NUM_TRACES
      * traces.  Returns ENXIO unless both mclfindleak and mclexpleak are set.
      * The statistics are updated and copied out with mleak_lock held.
      */
1214 static int
1215 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1216 {
1217 #pragma unused(oidp, arg1, arg2)
1218         int i;
1219
1220         /* Ensure leak tracing turned on */
1221         if (!mclfindleak || !mclexpleak)
1222                 return (ENXIO);
1223
1224         lck_mtx_lock(mleak_lock);
1225         mleak_update_stats();
1226         i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1227         lck_mtx_unlock(mleak_lock);
1228
1229         return (i);
1230 }
1231
     /*
      * Sysctl handler that copies out the mleak_table structure with
      * mleak_lock held.  Returns ENXIO unless both mclfindleak and
      * mclexpleak are set, otherwise the result of SYSCTL_OUT.
      */
1232 static int
1233 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1234 {
1235 #pragma unused(oidp, arg1, arg2)
1236         int i = 0;
1237
1238         /* Ensure leak tracing turned on */
1239         if (!mclfindleak || !mclexpleak)
1240                 return (ENXIO);
1241
1242         lck_mtx_lock(mleak_lock);
1243         i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1244         lck_mtx_unlock(mleak_lock);
1245
1246         return (i);
1247 }
1248
     /*
      * Increment the external reference count of m by one.  The update is a
      * compare-and-swap retry loop on the 16-bit counter; the ASSERT catches
      * a wrap of the counter to zero.
      */
1249 static inline void
1250 m_incref(struct mbuf *m)
1251 {
1252         UInt16 old, new;
1253         volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1254
1255         do {
1256                 old = *addr;
1257                 new = old + 1;
1258                 ASSERT(new != 0);
1259         } while (!OSCompareAndSwap16(old, new, addr));
1260
1261         /*
1262          * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1263          * we don't clear the flag when the refcount goes back to the
1264          * minimum, to simplify code calling m_mclhasreference().
1265          */
1266         if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY))
1267                 (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1268 }
1269
     /*
      * Decrement the external reference count of m by one and return the
      * new value.  The update is a compare-and-swap retry loop on the 16-bit
      * counter; the ASSERT catches a decrement of a count that is already 0.
      */
1270 static inline u_int16_t
1271 m_decref(struct mbuf *m)
1272 {
1273         UInt16 old, new;
1274         volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1275
1276         do {
1277                 old = *addr;
1278                 new = old - 1;
1279                 ASSERT(old != 0);
1280         } while (!OSCompareAndSwap16(old, new, addr));
1281
1282         return (new);
1283 }
1284
1285 static void
1286 mbuf_table_init(void)
1287 {
1288         unsigned int b, c, s;
1289         int m, config_mbuf_jumbo = 0;
1290
1291         MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1292             M_TEMP, M_WAITOK | M_ZERO);
1293         VERIFY(omb_stat != NULL);
1294
1295         MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1296             M_TEMP, M_WAITOK | M_ZERO);
1297         VERIFY(mb_stat != NULL);
1298
1299         mb_stat->mbs_cnt = NELEM(mbuf_table);
1300         for (m = 0; m < NELEM(mbuf_table); m++)
1301                 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1302
1303 #if CONFIG_MBUF_JUMBO
1304         config_mbuf_jumbo = 1;
1305 #endif /* CONFIG_MBUF_JUMBO */
1306
1307         if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1308                 /*
1309                  * Set aside 1/3 of the mbuf cluster map for jumbo
1310                  * clusters; we do this only on platforms where jumbo
1311                  * cluster pool is enabled.
1312                  */
1313                 njcl = nmbclusters / 3;
1314                 njclbytes = M16KCLBYTES;
1315         }
1316
1317         /*
1318          * nclusters holds both the 2KB and 4KB pools, so ensure it's
1319          * a multiple of 4KB clusters.
1320          */
1321         nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1322         if (njcl > 0) {
1323                 /*
1324                  * Each jumbo cluster takes 8 2KB clusters, so make
1325                  * sure that the pool size is evenly divisible by 8;
1326                  * njcl is in 2KB unit, hence treated as such.
1327                  */
1328                 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1329
1330                 /* Update nclusters with rounded down value of njcl */
1331                 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1332         }
1333
1334         /*
1335          * njcl is valid only on platforms with 16KB jumbo clusters or
1336          * with 16KB pages, where it is configured to 1/3 of the pool
1337          * size.  On these platforms, the remaining is used for 2KB
1338          * and 4KB clusters.  On platforms without 16KB jumbo clusters,
1339          * the entire pool is used for both 2KB and 4KB clusters.  A 4KB
1340          * cluster can be split either into 16 mbufs, or into 2 2KB
1341          * clusters.
1342          *
1343          *  +---+---+------------ ... -----------+------- ... -------+
1344          *  | c | b |              s             |        njcl       |
1345          *  +---+---+------------ ... -----------+------- ... -------+
1346          *
1347          * 1/32th of the shared region is reserved for pure 2KB and 4KB
1348          * clusters (1/64th each.)
1349          */
1350         c = P2ROUNDDOWN((nclusters >> 6), NCLPG);       /* in 2KB unit */
1351         b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1352         s = nclusters - (c + (b << NCLPBGSHIFT));       /* in 2KB unit */
1353
1354         /*
1355          * 1/64th (c) is reserved for 2KB clusters.
1356          */
1357         m_minlimit(MC_CL) = c;
1358         m_maxlimit(MC_CL) = s + c;                      /* in 2KB unit */
1359         m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1360         (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1361
1362         /*
1363          * Another 1/64th (b) of the map is reserved for 4KB clusters.
1364          * It cannot be turned into 2KB clusters or mbufs.
1365          */
1366         m_minlimit(MC_BIGCL) = b;
1367         m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;  /* in 4KB unit */
1368         m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1369         (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1370
1371         /*
1372          * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1373          */
1374         m_minlimit(MC_MBUF) = 0;
1375         m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);       /* in mbuf unit */
1376         m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1377         (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1378
1379         /*
1380          * Set limits for the composite classes.
1381          */
1382         m_minlimit(MC_MBUF_CL) = 0;
1383         m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1384         m_maxsize(MC_MBUF_CL) = MCLBYTES;
1385         m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1386         (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1387
1388         m_minlimit(MC_MBUF_BIGCL) = 0;
1389         m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1390         m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1391         m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1392         (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1393
1394         /*
1395          * And for jumbo classes.
1396          */
1397         m_minlimit(MC_16KCL) = 0;
1398         m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);  /* in 16KB unit */
1399         m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1400         (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1401
1402         m_minlimit(MC_MBUF_16KCL) = 0;
1403         m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1404         m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1405         m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1406         (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1407
1408         /*
1409          * Initialize the legacy mbstat structure.
1410          */
1411         bzero(&mbstat, sizeof (mbstat));
1412         mbstat.m_msize = m_maxsize(MC_MBUF);
1413         mbstat.m_mclbytes = m_maxsize(MC_CL);
1414         mbstat.m_minclsize = MINCLSIZE;
1415         mbstat.m_mlen = MLEN;
1416         mbstat.m_mhlen = MHLEN;
1417         mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1418 }
1419
1420 #if defined(__LP64__)
1421 typedef struct ncl_tbl {
1422         uint64_t nt_maxmem;     /* memory (sane) size */
1423         uint32_t nt_mbpool;     /* mbuf pool size */
1424 } ncl_tbl_t;
1425
1426 /* Non-server */
1427 static ncl_tbl_t ncl_table[] = {
1428         { (1ULL << GBSHIFT)       /*  1 GB */,  (64 << MBSHIFT)  /*  64 MB */ },
1429         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1430         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1431         { 0, 0 }
1432 };
1433
1434 /* Server */
1435 static ncl_tbl_t ncl_table_srv[] = {
1436         { (1ULL << GBSHIFT)       /*  1 GB */,  (96 << MBSHIFT)  /*  96 MB */ },
1437         { (1ULL << (GBSHIFT + 2)) /*  4 GB */,  (128 << MBSHIFT) /* 128 MB */ },
1438         { (1ULL << (GBSHIFT + 3)) /*  8 GB */,  (160 << MBSHIFT) /* 160 MB */ },
1439         { (1ULL << (GBSHIFT + 4)) /* 16 GB */,  (192 << MBSHIFT) /* 192 MB */ },
1440         { (1ULL << (GBSHIFT + 5)) /* 32 GB */,  (256 << MBSHIFT) /* 256 MB */ },
1441         { (1ULL << (GBSHIFT + 6)) /* 64 GB */,  (384 << MBSHIFT) /* 384 MB */ },
1442         { 0, 0 }
1443 };
1444 #endif /* __LP64__ */
1445
1446 __private_extern__ unsigned int
1447 mbuf_default_ncl(int server, uint64_t mem)
1448 {
1449 #if !defined(__LP64__)
1450 #pragma unused(server)
1451         unsigned int n;
1452         /*
1453          * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1454          */
1455         if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1456                 n = 32768;
1457 #else
1458         unsigned int n, i;
1459         ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1460         /*
1461          * 64-bit kernel (mbuf pool size based on table).
1462          */
1463         n = tbl[0].nt_mbpool;
1464         for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1465                 if (mem < tbl[i].nt_maxmem)
1466                         break;
1467                 n = tbl[i].nt_mbpool;
1468         }
1469         n >>= MCLSHIFT;
1470 #endif /* !__LP64__ */
1471         return (n);
1472 }
1473
1474 __private_extern__ void
1475 mbinit(void)
1476 {
1477         unsigned int m;
1478         unsigned int initmcl = 0;
1479         void *buf;
1480         thread_t thread = THREAD_NULL;
1481
1482         microuptime(&mb_start);
1483
1484         /*
1485          * These MBUF_ values must be equal to their private counterparts.
1486          */
1487         _CASSERT(MBUF_EXT == M_EXT);
1488         _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1489         _CASSERT(MBUF_EOR == M_EOR);
1490         _CASSERT(MBUF_LOOP == M_LOOP);
1491         _CASSERT(MBUF_BCAST == M_BCAST);
1492         _CASSERT(MBUF_MCAST == M_MCAST);
1493         _CASSERT(MBUF_FRAG == M_FRAG);
1494         _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1495         _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1496         _CASSERT(MBUF_PROMISC == M_PROMISC);
1497         _CASSERT(MBUF_HASFCS == M_HASFCS);
1498
1499         _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1500         _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1501         _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1502         _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1503         _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1504         _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1505         _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1506         _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1507         _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1508         _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1509         _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1510         _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1511         _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1512         _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1513         _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1514
1515         _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1516         _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1517         _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1518         _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1519         _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1520         _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1521         _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1522         _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1523         _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1524         _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1525         _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1526         _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1527         _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1528         _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1529
1530         _CASSERT(MBUF_WAITOK == M_WAIT);
1531         _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1532         _CASSERT(MBUF_COPYALL == M_COPYALL);
1533
1534         _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1535         _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1536         _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1537         _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1538         _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1539         _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1540         _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1541         _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1542         _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
1543         _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1544         _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1545
1546         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1547         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1548         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1549         _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1550
1551         /* Module specific scratch space (32-bit alignment requirement) */
1552         _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1553             sizeof (uint32_t)));
1554
1555         /* Initialize random red zone cookie value */
1556         _CASSERT(sizeof (mb_redzone_cookie) ==
1557             sizeof (((struct pkthdr *)0)->redzone));
1558         read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1559         read_random(&mb_obscure_extref, sizeof (mb_obscure_extref));
1560         read_random(&mb_obscure_extfree, sizeof (mb_obscure_extfree));
1561         mb_obscure_extref |= 0x3;
1562         mb_obscure_extfree |= 0x3;
1563
1564         /* Make sure we don't save more than we should */
1565         _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1566
1567         if (nmbclusters == 0)
1568                 nmbclusters = NMBCLUSTERS;
1569
1570         /* This should be a sane (at least even) value by now */
1571         VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1572
1573         /* Setup the mbuf table */
1574         mbuf_table_init();
1575
1576         /* Global lock for common layer */
1577         mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1578         mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1579         mbuf_mlock_attr = lck_attr_alloc_init();
1580         lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1581
1582         /*
1583          * Allocate cluster slabs table:
1584          *
1585          *      maxslabgrp = (N * 2048) / (1024 * 1024)
1586          *
1587          * Where N is nmbclusters rounded up to the nearest 512.  This yields
1588          * mcl_slab_g_t units, each one representing a MB of memory.
1589          */
1590         maxslabgrp =
1591             (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1592         MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1593             M_TEMP, M_WAITOK | M_ZERO);
1594         VERIFY(slabstbl != NULL);
1595
1596         /*
1597          * Allocate audit structures, if needed:
1598          *
1599          *      maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1600          *
1601          * This yields mcl_audit_t units, each one representing a page.
1602          */
1603         PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1604         mbuf_debug |= mcache_getflags();
1605         if (mbuf_debug & MCF_DEBUG) {
1606                 int l;
1607                 mcl_audit_t *mclad;
1608                 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1609                 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1610                     M_TEMP, M_WAITOK | M_ZERO);
1611                 VERIFY(mclaudit != NULL);
1612                 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1613                         MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1614                             NMBPG * sizeof(mcache_audit_t *),
1615                             M_TEMP, M_WAITOK | M_ZERO);
1616                         VERIFY(mclad[l].cl_audit != NULL);
1617                 }
1618
1619                 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1620                     AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1621                 VERIFY(mcl_audit_con_cache != NULL);
1622         }
1623         mclverify = (mbuf_debug & MCF_VERIFY);
1624         mcltrace = (mbuf_debug & MCF_TRACE);
1625         mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1626         mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1627
1628         /* Enable mbuf leak logging, with a lock to protect the tables */
1629
1630         mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1631         mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1632         mleak_lock_attr = lck_attr_alloc_init();
1633         lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1634
1635         mleak_activate();
1636
1637         /*
1638          * Allocate structure for per-CPU statistics that's aligned
1639          * on the CPU cache boundary; this code assumes that we never
1640          * uninitialize this framework, since the original address
1641          * before alignment is not saved.
1642          */
1643         ncpu = ml_get_max_cpus();
1644         MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1645             M_TEMP, M_WAITOK);
1646         VERIFY(buf != NULL);
1647
1648         mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1649             CPU_CACHE_LINE_SIZE);
1650         bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1651
1652         /* Calculate the number of pages assigned to the cluster pool */
1653         mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1654         MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1655             M_TEMP, M_WAITOK);
1656         VERIFY(mcl_paddr != NULL);
1657
1658         /* Register with the I/O Bus mapper */
1659         mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1660         bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1661
1662         embutl = (mbutl + (nmbclusters * MCLBYTES));
1663         VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1664
1665         /* Prime up the freelist */
1666         PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1667         if (initmcl != 0) {
1668                 initmcl >>= NCLPBGSHIFT;        /* become a 4K unit */
1669                 if (initmcl > m_maxlimit(MC_BIGCL))
1670                         initmcl = m_maxlimit(MC_BIGCL);
1671         }
1672         if (initmcl < m_minlimit(MC_BIGCL))
1673                 initmcl = m_minlimit(MC_BIGCL);
1674
1675         lck_mtx_lock(mbuf_mlock);
1676
1677         /*
1678          * For classes with non-zero minimum limits, populate their freelists
1679          * so that m_total(class) is at least m_minlimit(class).
1680          */
1681         VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1682         freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1683         VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1684         freelist_init(m_class(MC_CL));
1685
1686         for (m = 0; m < NELEM(mbuf_table); m++) {
1687                 /* Make sure we didn't miss any */
1688                 VERIFY(m_minlimit(m_class(m)) == 0 ||
1689                     m_total(m_class(m)) >= m_minlimit(m_class(m)));
1690
1691                 /* populate the initial sizes and report from there on */
1692                 m_peak(m_class(m)) = m_total(m_class(m));
1693         }
1694         mb_peak_newreport = FALSE;
1695
1696         lck_mtx_unlock(mbuf_mlock);
1697
1698         (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1699             NULL, &thread);
1700         thread_deallocate(thread);
1701
1702         ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1703             0, 0, MCR_SLEEP);
1704
1705         /* Create the cache for each class */
1706         for (m = 0; m < NELEM(mbuf_table); m++) {
1707                 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1708                 u_int32_t flags;
1709
1710                 flags = mbuf_debug;
1711                 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1712                     m_class(m) == MC_MBUF_16KCL) {
1713                         allocfunc = mbuf_cslab_alloc;
1714                         freefunc = mbuf_cslab_free;
1715                         auditfunc = mbuf_cslab_audit;
1716                         logfunc = mleak_logger;
1717                 } else {
1718                         allocfunc = mbuf_slab_alloc;
1719                         freefunc = mbuf_slab_free;
1720                         auditfunc = mbuf_slab_audit;
1721                         logfunc = mleak_logger;
1722                 }
1723
1724                 /*
1725                  * Disable per-CPU caches for jumbo classes if there
1726                  * is no jumbo cluster pool available in the system.
1727                  * The cache itself is still created (but will never
1728                  * be populated) since it simplifies the code.
1729                  */
1730                 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1731                     njcl == 0)
1732                         flags |= MCF_NOCPUCACHE;
1733
1734                 if (!mclfindleak)
1735                         flags |= MCF_NOLEAKLOG;
1736
1737                 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1738                     allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1739                     (void *)(uintptr_t)m, flags, MCR_SLEEP);
1740         }
1741
1742         /*
1743          * Set the max limit on sb_max to be 1/16 th of the size of
1744          * memory allocated for mbuf clusters.
1745          */
1746         high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1747         if (high_sb_max < sb_max) {
1748                 /* sb_max is too large for this configuration, scale it down */
1749                 if (high_sb_max > (1 << MBSHIFT)) {
1750                         /* We have atleast 16 M of mbuf pool */
1751                         sb_max = high_sb_max;
1752                 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1753                         /*
1754                          * If we have more than 1M of mbufpool, cap the size of
1755                          * max sock buf at 1M
1756                          */
1757                         sb_max = high_sb_max = (1 << MBSHIFT);
1758                 } else {
1759                         sb_max = high_sb_max;
1760                 }
1761         }
1762
1763         /* allocate space for mbuf_dump_buf */
1764         MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1765         VERIFY(mbuf_dump_buf != NULL);
1766
1767         if (mbuf_debug & MCF_DEBUG) {
1768                 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1769                     (int)_MLEN, (int)_MHLEN);
1770         }
1771
1772         printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1773             (nmbclusters << MCLSHIFT) >> MBSHIFT,
1774             (nclusters << MCLSHIFT) >> MBSHIFT,
1775             (njcl << MCLSHIFT) >> MBSHIFT);
1776
1777         /* initialize lock form tx completion callback table */
1778         mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
1779         if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
1780                 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1781                 /* NOTREACHED */
1782         }
1783         mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
1784             mbuf_tx_compl_tbl_lck_grp_attr);
1785         if (mbuf_tx_compl_tbl_lck_grp == NULL) {
1786                 panic("%s: lck_grp_alloc_init failed", __func__);
1787                 /* NOTREACHED */
1788         }
1789         mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
1790         if (mbuf_tx_compl_tbl_lck_attr == NULL) {
1791                 panic("%s: lck_attr_alloc_init failed", __func__);
1792                 /* NOTREACHED */
1793         }
1794         lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
1795             mbuf_tx_compl_tbl_lck_attr);
1796
1797 }
1798
1799 /*
1800  * Obtain a slab of object(s) from the class's freelist.
1801  */
1802 static mcache_obj_t *
1803 slab_alloc(mbuf_class_t class, int wait)
1804 {
1805         mcl_slab_t *sp;
1806         mcache_obj_t *buf;
1807
1808         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1809
1810         /* This should always be NULL for us */
1811         VERIFY(m_cobjlist(class) == NULL);
1812
1813         /*
1814          * Treat composite objects as having longer lifespan by using
1815          * a slab from the reverse direction, in hoping that this could
1816          * reduce the probability of fragmentation for slabs that hold
1817          * more than one buffer chunks (e.g. mbuf slabs).  For other
1818          * slabs, this probably doesn't make much of a difference.
1819          */
1820         if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1821             && (wait & MCR_COMP))
1822                 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1823         else
1824                 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1825
1826         if (sp == NULL) {
1827                 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1828                 /* The slab list for this class is empty */
1829                 return (NULL);
1830         }
1831
1832         VERIFY(m_infree(class) > 0);
1833         VERIFY(!slab_is_detached(sp));
1834         VERIFY(sp->sl_class == class &&
1835             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1836         buf = sp->sl_head;
1837         VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1838         sp->sl_head = buf->obj_next;
1839         /* Increment slab reference */
1840         sp->sl_refcnt++;
1841
1842         VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1843
1844         if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1845                 slab_nextptr_panic(sp, sp->sl_head);
1846                 /* In case sl_head is in the map but not in the slab */
1847                 VERIFY(slab_inrange(sp, sp->sl_head));
1848                 /* NOTREACHED */
1849         }
1850
1851         if (mclaudit != NULL) {
1852                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1853                 mca->mca_uflags = 0;
1854                 /* Save contents on mbuf objects only */
1855                 if (class == MC_MBUF)
1856                         mca->mca_uflags |= MB_SCVALID;
1857         }
1858
1859         if (class == MC_CL) {
1860                 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1861                 /*
1862                  * A 2K cluster slab can have at most NCLPG references.
1863                  */
1864                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1865                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1866                 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1867         } else if (class == MC_BIGCL) {
1868                 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1869                     m_infree(MC_MBUF_BIGCL);
1870                 /*
1871                  * A 4K cluster slab can have NBCLPG references.
1872                  */
1873                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1874                     sp->sl_len == PAGE_SIZE &&
1875                     (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1876         } else if (class == MC_16KCL) {
1877                 mcl_slab_t *nsp;
1878                 int k;
1879
1880                 --m_infree(MC_16KCL);
1881                 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1882                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1883                 /*
1884                  * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1885                  * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1886                  * most 1 reference.
1887                  */
1888                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1889                         nsp = nsp->sl_next;
1890                         /* Next slab must already be present */
1891                         VERIFY(nsp != NULL);
1892                         nsp->sl_refcnt++;
1893                         VERIFY(!slab_is_detached(nsp));
1894                         VERIFY(nsp->sl_class == MC_16KCL &&
1895                             nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1896                             nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1897                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1898                             nsp->sl_head == NULL);
1899                 }
1900         } else {
1901                 VERIFY(class == MC_MBUF);
1902                 --m_infree(MC_MBUF);
1903                 /*
1904                  * If auditing is turned on, this check is
1905                  * deferred until later in mbuf_slab_audit().
1906                  */
1907                 if (mclaudit == NULL)
1908                         _MCHECK((struct mbuf *)buf);
1909                 /*
1910                  * Since we have incremented the reference count above,
1911                  * an mbuf slab (formerly a 4KB cluster slab that was cut
1912                  * up into mbufs) must have a reference count between 1
1913                  * and NMBPG at this point.
1914                  */
1915                 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1916                     sp->sl_chunks == NMBPG &&
1917                     sp->sl_len == PAGE_SIZE);
1918                 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1919         }
1920
1921         /* If empty, remove this slab from the class's freelist */
1922         if (sp->sl_head == NULL) {
1923                 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1924                 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1925                 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1926                 slab_remove(sp, class);
1927         }
1928
1929         return (buf);
1930 }
1931
1932 /*
1933  * Place a slab of object(s) back into a class's slab list.
1934  */
1935 static void
1936 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1937 {
1938         mcl_slab_t *sp;
1939         boolean_t reinit_supercl = false;
1940         mbuf_class_t super_class;
1941
1942         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1943
1944         VERIFY(class != MC_16KCL || njcl > 0);
1945         VERIFY(buf->obj_next == NULL);
1946
1947         /*
1948          * Synchronizing with m_clalloc, as it reads m_total, while we here
1949          * are modifying m_total.
1950          */
1951         while (mb_clalloc_busy) {
1952                 mb_clalloc_waiters++;
1953                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1954                     (PZERO-1), "m_clalloc", NULL);
1955                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1956         }
1957
1958         /* We are busy now; tell everyone else to go away */
1959         mb_clalloc_busy = TRUE;
1960
1961         sp = slab_get(buf);
1962         VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1963             (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1964
1965         /* Decrement slab reference */
1966         sp->sl_refcnt--;
1967
1968         if (class == MC_CL) {
1969                 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1970                 /*
1971                  * A slab that has been split for 2KB clusters can have
1972                  * at most 1 outstanding reference at this point.
1973                  */
1974                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1975                     sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1976                 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1977                     (slab_is_detached(sp) && sp->sl_head == NULL));
1978         } else if (class == MC_BIGCL) {
1979                 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1980
1981                 /* A 4KB cluster slab can have NBCLPG references at most */
1982                 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1983                 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1984                     (slab_is_detached(sp) && sp->sl_head == NULL));
1985         } else if (class == MC_16KCL) {
1986                 mcl_slab_t *nsp;
1987                 int k;
1988                 /*
1989                  * A 16KB cluster takes NSLABSP16KB slabs, all must
1990                  * now have 0 reference.
1991                  */
1992                 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1993                 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1994                     sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1995                 VERIFY(slab_is_detached(sp));
1996                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1997                         nsp = nsp->sl_next;
1998                         /* Next slab must already be present */
1999                         VERIFY(nsp != NULL);
2000                         nsp->sl_refcnt--;
2001                         VERIFY(slab_is_detached(nsp));
2002                         VERIFY(nsp->sl_class == MC_16KCL &&
2003                             (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
2004                             nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
2005                             nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2006                             nsp->sl_head == NULL);
2007                 }
2008         } else {
2009                 /*
2010                  * A slab that has been split for mbufs has at most
2011                  * NMBPG reference counts.  Since we have decremented
2012                  * one reference above, it must now be between 0 and
2013                  * NMBPG-1.
2014                  */
2015                 VERIFY(class == MC_MBUF);
2016                 VERIFY(sp->sl_refcnt >= 0 &&
2017                     sp->sl_refcnt <= (NMBPG - 1) &&
2018                     sp->sl_chunks == NMBPG &&
2019                     sp->sl_len == PAGE_SIZE);
2020                 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2021                     (slab_is_detached(sp) && sp->sl_head == NULL));
2022         }
2023
2024         /*
2025          * When auditing is enabled, ensure that the buffer still
2026          * contains the free pattern.  Otherwise it got corrupted
2027          * while at the CPU cache layer.
2028          */
2029         if (mclaudit != NULL) {
2030                 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2031                 if (mclverify) {
2032                         mcache_audit_free_verify(mca, buf, 0,
2033                             m_maxsize(class));
2034                 }
2035                 mca->mca_uflags &= ~MB_SCVALID;
2036         }
2037
2038         if (class == MC_CL) {
2039                 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2040                 buf->obj_next = sp->sl_head;
2041         } else if (class == MC_BIGCL) {
2042                 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2043                     m_infree(MC_MBUF_BIGCL);
2044                 buf->obj_next = sp->sl_head;
2045         } else if (class == MC_16KCL) {
2046                 ++m_infree(MC_16KCL);
2047         } else {
2048                 ++m_infree(MC_MBUF);
2049                 buf->obj_next = sp->sl_head;
2050         }
2051         sp->sl_head = buf;
2052
2053         /*
2054          * If a slab has been split to either one which holds 2KB clusters,
2055          * or one which holds mbufs, turn it back to one which holds a
2056          * 4 or 16 KB cluster depending on the page size.
2057          */
2058         if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2059                 super_class = MC_BIGCL;
2060         } else {
2061                 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2062                 super_class = MC_16KCL;
2063         }
2064         if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2065             m_total(class) >= (m_minlimit(class) + NMBPG) &&
2066             m_total(super_class) < m_maxlimit(super_class)) {
2067                 int i = NMBPG;
2068
2069                 m_total(MC_MBUF) -= NMBPG;
2070                 mbstat.m_mbufs = m_total(MC_MBUF);
2071                 m_infree(MC_MBUF) -= NMBPG;
2072                 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2073
2074                 while (i--) {
2075                         struct mbuf *m = sp->sl_head;
2076                         VERIFY(m != NULL);
2077                         sp->sl_head = m->m_next;
2078                         m->m_next = NULL;
2079                 }
2080                 reinit_supercl = true;
2081         } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2082             m_total(class) >=  (m_minlimit(class) + NCLPG) &&
2083             m_total(super_class) < m_maxlimit(super_class)) {
2084                 int i = NCLPG;
2085
2086                 m_total(MC_CL) -= NCLPG;
2087                 mbstat.m_clusters = m_total(MC_CL);
2088                 m_infree(MC_CL) -= NCLPG;
2089
2090                 while (i--) {
2091                         union mcluster *c = sp->sl_head;
2092                         VERIFY(c != NULL);
2093                         sp->sl_head = c->mcl_next;
2094                         c->mcl_next = NULL;
2095                 }
2096                 reinit_supercl = true;
2097         } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2098             sp->sl_refcnt == 0 &&
2099             m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2100             m_total(super_class) < m_maxlimit(super_class)) {
2101                 int i = NBCLPG;
2102
2103                 VERIFY(super_class == MC_16KCL);
2104                 m_total(MC_BIGCL) -= NBCLPG;
2105                 mbstat.m_bigclusters = m_total(MC_BIGCL);
2106                 m_infree(MC_BIGCL) -= NBCLPG;
2107
2108                 while (i--) {
2109                         union mbigcluster *bc = sp->sl_head;
2110                         VERIFY(bc != NULL);
2111                         sp->sl_head = bc->mbc_next;
2112                         bc->mbc_next = NULL;
2113                 }
2114                 reinit_supercl = true;
2115         }
2116
2117         if (reinit_supercl) {
2118                 VERIFY(sp->sl_head == NULL);
2119                 VERIFY(m_total(class) >= m_minlimit(class));
2120                 slab_remove(sp, class);
2121
2122                 /* Reinitialize it as a cluster for the super class */
2123                 m_total(super_class)++;
2124                 m_infree(super_class)++;
2125                 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2126                     sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2127
2128                 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2129                     sp->sl_base, PAGE_SIZE, 0, 1);
2130                 if (mclverify)
2131                         mcache_set_pattern(MCACHE_FREE_PATTERN,
2132                             (caddr_t)sp->sl_base, sp->sl_len);
2133                 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2134
2135                 if (super_class == MC_BIGCL) {
2136                         mbstat.m_bigclusters = m_total(MC_BIGCL);
2137                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2138                             m_infree(MC_MBUF_BIGCL);
2139                 }
2140
2141                 VERIFY(slab_is_detached(sp));
2142                 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2143
2144                 /* And finally switch class */
2145                 class = super_class;
2146         }
2147
2148         /* Reinsert the slab to the class's slab list */
2149         if (slab_is_detached(sp))
2150                 slab_insert(sp, class);
2151
2152         /* We're done; let others enter */
2153         mb_clalloc_busy = FALSE;
2154         if (mb_clalloc_waiters > 0) {
2155                 mb_clalloc_waiters = 0;
2156                 wakeup(mb_clalloc_waitchan);
2157         }
2158 }
2159
2160 /*
2161  * Common allocator for rudimentary objects called by the CPU cache layer
2162  * during an allocation request whenever there is no available element in the
2163  * bucket layer.  It returns one or more elements from the appropriate global
2164  * freelist.  If the freelist is empty, it will attempt to populate it and
2165  * retry the allocation.
2166  */
2167 static unsigned int
2168 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2169 {
2170         mbuf_class_t class = (mbuf_class_t)arg;
2171         unsigned int need = num;
2172         mcache_obj_t **list = *plist;
2173
2174         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2175         ASSERT(need > 0);
2176
2177         lck_mtx_lock(mbuf_mlock);
2178
2179         for (;;) {
2180                 if ((*list = slab_alloc(class, wait)) != NULL) {
2181                         (*list)->obj_next = NULL;
2182                         list = *plist = &(*list)->obj_next;
2183
2184                         if (--need == 0) {
2185                                 /*
2186                                  * If the number of elements in freelist has
2187                                  * dropped below low watermark, asynchronously
2188                                  * populate the freelist now rather than doing
2189                                  * it later when we run out of elements.
2190                                  */
2191                                 if (!mbuf_cached_above(class, wait) &&
2192                                     m_infree(class) < (m_total(class) >> 5)) {
2193                                         (void) freelist_populate(class, 1,
2194                                             M_DONTWAIT);
2195                                 }
2196                                 break;
2197                         }
2198                 } else {
2199                         VERIFY(m_infree(class) == 0 || class == MC_CL);
2200
2201                         (void) freelist_populate(class, 1,
2202                             (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2203
2204                         if (m_infree(class) > 0)
2205                                 continue;
2206
2207                         /* Check if there's anything at the cache layer */
2208                         if (mbuf_cached_above(class, wait))
2209                                 break;
2210
2211                         /* watchdog checkpoint */
2212                         mbuf_watchdog();
2213
2214                         /* We have nothing and cannot block; give up */
2215                         if (wait & MCR_NOSLEEP) {
2216                                 if (!(wait & MCR_TRYHARD)) {
2217                                         m_fail_cnt(class)++;
2218                                         mbstat.m_drops++;
2219                                         break;
2220                                 }
2221                         }
2222
2223                         /*
2224                          * If the freelist is still empty and the caller is
2225                          * willing to be blocked, sleep on the wait channel
2226                          * until an element is available.  Otherwise, if
2227                          * MCR_TRYHARD is set, do our best to satisfy the
2228                          * request without having to go to sleep.
2229                          */
2230                         if (mbuf_worker_ready &&
2231                             mbuf_sleep(class, need, wait))
2232                                 break;
2233
2234                         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2235                 }
2236         }
2237
2238         m_alloc_cnt(class) += num - need;
2239         lck_mtx_unlock(mbuf_mlock);
2240
2241         return (num - need);
2242 }
2243
2244 /*
2245  * Common de-allocator for rudimentary objects called by the CPU cache
2246  * layer when one or more elements need to be returned to the appropriate
2247  * global freelist.
2248  */
2249 static void
2250 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2251 {
2252         mbuf_class_t class = (mbuf_class_t)arg;
2253         mcache_obj_t *nlist;
2254         unsigned int num = 0;
2255         int w;
2256
2257         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2258
2259         lck_mtx_lock(mbuf_mlock);
2260
2261         for (;;) {
2262                 nlist = list->obj_next;
2263                 list->obj_next = NULL;
2264                 slab_free(class, list);
2265                 ++num;
2266                 if ((list = nlist) == NULL)
2267                         break;
2268         }
2269         m_free_cnt(class) += num;
2270
2271         if ((w = mb_waiters) > 0)
2272                 mb_waiters = 0;
2273         if (w) {
2274                 mbwdog_logger("waking up all threads");
2275         }
2276         lck_mtx_unlock(mbuf_mlock);
2277
2278         if (w != 0)
2279                 wakeup(mb_waitchan);
2280 }
2281
2282 /*
2283  * Common auditor for rudimentary objects called by the CPU cache layer
2284  * during an allocation or free request.  For the former, this is called
2285  * after the objects are obtained from either the bucket or slab layer
2286  * and before they are returned to the caller.  For the latter, this is
2287  * called immediately during free and before placing the objects into
2288  * the bucket or slab layer.
2289  */
2290 static void
2291 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2292 {
2293         mbuf_class_t class = (mbuf_class_t)arg;
2294         mcache_audit_t *mca;
2295
2296         ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2297
2298         while (list != NULL) {
2299                 lck_mtx_lock(mbuf_mlock);
2300                 mca = mcl_audit_buf2mca(class, list);
2301
2302                 /* Do the sanity checks */
2303                 if (class == MC_MBUF) {
2304                         mcl_audit_mbuf(mca, list, FALSE, alloc);
2305                         ASSERT(mca->mca_uflags & MB_SCVALID);
2306                 } else {
2307                         mcl_audit_cluster(mca, list, m_maxsize(class),
2308                             alloc, TRUE);
2309                         ASSERT(!(mca->mca_uflags & MB_SCVALID));
2310                 }
2311                 /* Record this transaction */
2312                 if (mcltrace)
2313                         mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2314
2315                 if (alloc)
2316                         mca->mca_uflags |= MB_INUSE;
2317                 else
2318                         mca->mca_uflags &= ~MB_INUSE;
2319                 /* Unpair the object (unconditionally) */
2320                 mca->mca_uptr = NULL;
2321                 lck_mtx_unlock(mbuf_mlock);
2322
2323                 list = list->obj_next;
2324         }
2325 }
2326
2327 /*
2328  * Common notify routine for all caches.  It is called by mcache when
2329  * one or more objects get freed.  We use this indication to trigger
2330  * the wakeup of any sleeping threads so that they can retry their
2331  * allocation requests.
2332  */
2333 static void
2334 mbuf_slab_notify(void *arg, u_int32_t reason)
2335 {
2336         mbuf_class_t class = (mbuf_class_t)arg;
2337         int w;
2338
2339         ASSERT(MBUF_CLASS_VALID(class));
2340
2341         if (reason != MCN_RETRYALLOC)
2342                 return;
2343
2344         lck_mtx_lock(mbuf_mlock);
2345         if ((w = mb_waiters) > 0) {
2346                 m_notified(class)++;
2347                 mb_waiters = 0;
2348         }
2349         if (w) {
2350                 mbwdog_logger("waking up all threads");
2351         }
2352         lck_mtx_unlock(mbuf_mlock);
2353
2354         if (w != 0)
2355                 wakeup(mb_waitchan);
2356 }
2357
2358 /*
2359  * Obtain object(s) from the composite class's freelist.
2360  */
2361 static unsigned int
2362 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2363 {
2364         unsigned int need = num;
2365         mcl_slab_t *sp, *clsp, *nsp;
2366         struct mbuf *m;
2367         mcache_obj_t **list = *plist;
2368         void *cl;
2369
2370         VERIFY(need > 0);
2371         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2372         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2373
2374         /* Get what we can from the freelist */
2375         while ((*list = m_cobjlist(class)) != NULL) {
2376                 MRANGE(*list);
2377
2378                 m = (struct mbuf *)*list;
2379                 sp = slab_get(m);
2380                 cl = m->m_ext.ext_buf;
2381                 clsp = slab_get(cl);
2382                 VERIFY(m->m_flags == M_EXT && cl != NULL);
2383                 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2384
2385                 if (class == MC_MBUF_CL) {
2386                         VERIFY(clsp->sl_refcnt >= 1 &&
2387                             clsp->sl_refcnt <= NCLPG);
2388                 } else {
2389                         VERIFY(clsp->sl_refcnt >= 1 &&
2390                             clsp->sl_refcnt <= NBCLPG);
2391                 }
2392
2393                 if (class == MC_MBUF_16KCL) {
2394                         int k;
2395                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2396                                 nsp = nsp->sl_next;
2397                                 /* Next slab must already be present */
2398                                 VERIFY(nsp != NULL);
2399                                 VERIFY(nsp->sl_refcnt == 1);
2400                         }
2401                 }
2402
2403                 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2404                     !MBUF_IN_MAP(m_cobjlist(class))) {
2405                         slab_nextptr_panic(sp, m_cobjlist(class));
2406                         /* NOTREACHED */
2407                 }
2408                 (*list)->obj_next = NULL;
2409                 list = *plist = &(*list)->obj_next;
2410
2411                 if (--need == 0)
2412                         break;
2413         }
2414         m_infree(class) -= (num - need);
2415
2416         return (num - need);
2417 }
2418
2419 /*
2420  * Place object(s) back into a composite class's freelist.
2421  */
2422 static unsigned int
2423 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2424 {
2425         mcache_obj_t *o, *tail;
2426         unsigned int num = 0;
2427         struct mbuf *m, *ms;
2428         mcache_audit_t *mca = NULL;
2429         mcache_obj_t *ref_list = NULL;
2430         mcl_slab_t *clsp, *nsp;
2431         void *cl;
2432         mbuf_class_t cl_class;
2433
2434         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2435         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2436         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2437
2438         if (class == MC_MBUF_CL) {
2439                 cl_class = MC_CL;
2440         } else if (class == MC_MBUF_BIGCL) {
2441                 cl_class = MC_BIGCL;
2442         } else {
2443                 VERIFY(class == MC_MBUF_16KCL);
2444                 cl_class = MC_16KCL;
2445         }
2446
2447         o = tail = list;
2448
2449         while ((m = ms = (struct mbuf *)o) != NULL) {
2450                 mcache_obj_t *rfa, *nexto = o->obj_next;
2451
2452                 /* Do the mbuf sanity checks */
2453                 if (mclaudit != NULL) {
2454                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2455                         if (mclverify) {
2456                                 mcache_audit_free_verify(mca, m, 0,
2457                                     m_maxsize(MC_MBUF));
2458                         }
2459                         ms = MCA_SAVED_MBUF_PTR(mca);
2460                 }
2461
2462                 /* Do the cluster sanity checks */
2463                 cl = ms->m_ext.ext_buf;
2464                 clsp = slab_get(cl);
2465                 if (mclverify) {
2466                         size_t size = m_maxsize(cl_class);
2467                         mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2468                             (mcache_obj_t *)cl), cl, 0, size);
2469                 }
2470                 VERIFY(ms->m_type == MT_FREE);
2471                 VERIFY(ms->m_flags == M_EXT);
2472                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2473                 if (cl_class == MC_CL) {
2474                         VERIFY(clsp->sl_refcnt >= 1 &&
2475                             clsp->sl_refcnt <= NCLPG);
2476                 } else {
2477                         VERIFY(clsp->sl_refcnt >= 1 &&
2478                             clsp->sl_refcnt <= NBCLPG);
2479                 }
2480                 if (cl_class == MC_16KCL) {
2481                         int k;
2482                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2483                                 nsp = nsp->sl_next;
2484                                 /* Next slab must already be present */
2485                                 VERIFY(nsp != NULL);
2486                                 VERIFY(nsp->sl_refcnt == 1);
2487                         }
2488                 }
2489
2490                 /*
2491                  * If we're asked to purge, restore the actual mbuf using
2492                  * contents of the shadow structure (if auditing is enabled)
2493                  * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2494                  * about to free it and the attached cluster into their caches.
2495                  */
2496                 if (purged) {
2497                         /* Restore constructed mbuf fields */
2498                         if (mclaudit != NULL)
2499                                 mcl_audit_restore_mbuf(m, mca, TRUE);
2500
2501                         MEXT_MINREF(m) = 0;
2502                         MEXT_REF(m) = 0;
2503                         MEXT_PREF(m) = 0;
2504                         MEXT_FLAGS(m) = 0;
2505                         MEXT_PRIV(m) = 0;
2506                         MEXT_PMBUF(m) = NULL;
2507                         MEXT_TOKEN(m) = 0;
2508
2509                         rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
2510                         m_set_ext(m, NULL, NULL, NULL);
2511                         rfa->obj_next = ref_list;
2512                         ref_list = rfa;
2513
2514                         m->m_type = MT_FREE;
2515                         m->m_flags = m->m_len = 0;
2516                         m->m_next = m->m_nextpkt = NULL;
2517
2518                         /* Save mbuf fields and make auditing happy */
2519                         if (mclaudit != NULL)
2520                                 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2521
2522                         VERIFY(m_total(class) > 0);
2523                         m_total(class)--;
2524
2525                         /* Free the mbuf */
2526                         o->obj_next = NULL;
2527                         slab_free(MC_MBUF, o);
2528
2529                         /* And free the cluster */
2530                         ((mcache_obj_t *)cl)->obj_next = NULL;
2531                         if (class == MC_MBUF_CL)
2532                                 slab_free(MC_CL, cl);
2533                         else if (class == MC_MBUF_BIGCL)
2534                                 slab_free(MC_BIGCL, cl);
2535                         else
2536                                 slab_free(MC_16KCL, cl);
2537                 }
2538
2539                 ++num;
2540                 tail = o;
2541                 o = nexto;
2542         }
2543
2544         if (!purged) {
2545                 tail->obj_next = m_cobjlist(class);
2546                 m_cobjlist(class) = list;
2547                 m_infree(class) += num;
2548         } else if (ref_list != NULL) {
2549                 mcache_free_ext(ref_cache, ref_list);
2550         }
2551
2552         return (num);
2553 }
2554
2555 /*
2556  * Common allocator for composite objects called by the CPU cache layer
2557  * during an allocation request whenever there is no available element in
2558  * the bucket layer.  It returns one or more composite elements from the
2559  * appropriate global freelist.  If the freelist is empty, it will attempt
2560  * to obtain the rudimentary objects from their caches and construct them
2561  * into composite mbuf + cluster objects.
2562  */
2563 static unsigned int
2564 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2565     int wait)
2566 {
2567         mbuf_class_t class = (mbuf_class_t)arg;
2568         mbuf_class_t cl_class = 0;
2569         unsigned int num = 0, cnum = 0, want = needed;
2570         mcache_obj_t *ref_list = NULL;
2571         mcache_obj_t *mp_list = NULL;
2572         mcache_obj_t *clp_list = NULL;
2573         mcache_obj_t **list;
2574         struct ext_ref *rfa;
2575         struct mbuf *m;
2576         void *cl;
2577
2578         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2579         ASSERT(needed > 0);
2580
2581         VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2582
2583         /* There should not be any slab for this class */
2584         VERIFY(m_slab_cnt(class) == 0 &&
2585             m_slablist(class).tqh_first == NULL &&
2586             m_slablist(class).tqh_last == NULL);
2587
2588         lck_mtx_lock(mbuf_mlock);
2589
2590         /* Try using the freelist first */
2591         num = cslab_alloc(class, plist, needed);
2592         list = *plist;
2593         if (num == needed) {
2594                 m_alloc_cnt(class) += num;
2595                 lck_mtx_unlock(mbuf_mlock);
2596                 return (needed);
2597         }
2598
2599         lck_mtx_unlock(mbuf_mlock);
2600
2601         /*
2602          * We could not satisfy the request using the freelist alone;
2603          * allocate from the appropriate rudimentary caches and use
2604          * whatever we can get to construct the composite objects.
2605          */
2606         needed -= num;
2607
2608         /*
2609          * Mark these allocation requests as coming from a composite cache.
2610          * Also, if the caller is willing to be blocked, mark the request
2611          * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2612          * slab layer waiting for the individual object when one or more
2613          * of the already-constructed composite objects are available.
2614          */
2615         wait |= MCR_COMP;
2616         if (!(wait & MCR_NOSLEEP))
2617                 wait |= MCR_FAILOK;
2618
2619         /* allocate mbufs */
2620         needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2621         if (needed == 0) {
2622                 ASSERT(mp_list == NULL);
2623                 goto fail;
2624         }
2625
2626         /* allocate clusters */
2627         if (class == MC_MBUF_CL) {
2628                 cl_class = MC_CL;
2629         } else if (class == MC_MBUF_BIGCL) {
2630                 cl_class = MC_BIGCL;
2631         } else {
2632                 VERIFY(class == MC_MBUF_16KCL);
2633                 cl_class = MC_16KCL;
2634         }
2635         needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2636         if (needed == 0) {
2637                 ASSERT(clp_list == NULL);
2638                 goto fail;
2639         }
2640
2641         needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2642         if (needed == 0) {
2643                 ASSERT(ref_list == NULL);
2644                 goto fail;
2645         }
2646
2647         /*
2648          * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2649          * overs will get freed accordingly before we return to caller.
2650          */
2651         for (cnum = 0; cnum < needed; cnum++) {
2652                 struct mbuf *ms;
2653
2654                 m = ms = (struct mbuf *)mp_list;
2655                 mp_list = mp_list->obj_next;
2656
2657                 cl = clp_list;
2658                 clp_list = clp_list->obj_next;
2659                 ((mcache_obj_t *)cl)->obj_next = NULL;
2660
2661                 rfa = (struct ext_ref *)ref_list;
2662                 ref_list = ref_list->obj_next;
2663                 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2664
2665                 /*
2666                  * If auditing is enabled, construct the shadow mbuf
2667                  * in the audit structure instead of in the actual one.
2668                  * mbuf_cslab_audit() will take care of restoring the
2669                  * contents after the integrity check.
2670                  */
2671                 if (mclaudit != NULL) {
2672                         mcache_audit_t *mca, *cl_mca;
2673
2674                         lck_mtx_lock(mbuf_mlock);
2675                         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2676                         ms = MCA_SAVED_MBUF_PTR(mca);
2677                         cl_mca = mcl_audit_buf2mca(cl_class,
2678                             (mcache_obj_t *)cl);
2679
2680                         /*
2681                          * Pair them up.  Note that this is done at the time
2682                          * the mbuf+cluster objects are constructed.  This
2683                          * information should be treated as "best effort"
2684                          * debugging hint since more than one mbufs can refer
2685                          * to a cluster.  In that case, the cluster might not
2686                          * be freed along with the mbuf it was paired with.
2687                          */
2688                         mca->mca_uptr = cl_mca;
2689                         cl_mca->mca_uptr = mca;
2690
2691                         ASSERT(mca->mca_uflags & MB_SCVALID);
2692                         ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2693                         lck_mtx_unlock(mbuf_mlock);
2694
2695                         /* Technically, they are in the freelist */
2696                         if (mclverify) {
2697                                 size_t size;
2698
2699                                 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2700                                     m_maxsize(MC_MBUF));
2701
2702                                 if (class == MC_MBUF_CL)
2703                                         size = m_maxsize(MC_CL);
2704                                 else if (class == MC_MBUF_BIGCL)
2705                                         size = m_maxsize(MC_BIGCL);
2706                                 else
2707                                         size = m_maxsize(MC_16KCL);
2708
2709                                 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2710                                     size);
2711                         }
2712                 }
2713
2714                 MBUF_INIT(ms, 0, MT_FREE);
2715                 if (class == MC_MBUF_16KCL) {
2716                         MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2717                 } else if (class == MC_MBUF_BIGCL) {
2718                         MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2719                 } else {
2720                         MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2721                 }
2722                 VERIFY(ms->m_flags == M_EXT);
2723                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2724
2725                 *list = (mcache_obj_t *)m;
2726                 (*list)->obj_next = NULL;
2727                 list = *plist = &(*list)->obj_next;
2728         }
2729
2730 fail:
2731         /*
2732          * Free up what's left of the above.
2733          */
2734         if (mp_list != NULL)
2735                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2736         if (clp_list != NULL)
2737                 mcache_free_ext(m_cache(cl_class), clp_list);
2738         if (ref_list != NULL)
2739                 mcache_free_ext(ref_cache, ref_list);
2740
2741         lck_mtx_lock(mbuf_mlock);
2742         if (num > 0 || cnum > 0) {
2743                 m_total(class) += cnum;
2744                 VERIFY(m_total(class) <= m_maxlimit(class));
2745                 m_alloc_cnt(class) += num + cnum;
2746         }
2747         if ((num + cnum) < want)
2748                 m_fail_cnt(class) += (want - (num + cnum));
2749         lck_mtx_unlock(mbuf_mlock);
2750
2751         return (num + cnum);
2752 }
2753
2754 /*
2755  * Common de-allocator for composite objects called by the CPU cache
2756  * layer when one or more elements need to be returned to the appropriate
2757  * global freelist.
2758  */
2759 static void
2760 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2761 {
2762         mbuf_class_t class = (mbuf_class_t)arg;
2763         unsigned int num;
2764         int w;
2765
2766         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2767
2768         lck_mtx_lock(mbuf_mlock);
2769
2770         num = cslab_free(class, list, purged);
2771         m_free_cnt(class) += num;
2772
2773         if ((w = mb_waiters) > 0)
2774                 mb_waiters = 0;
2775         if (w) {
2776                 mbwdog_logger("waking up all threads");
2777         }
2778
2779         lck_mtx_unlock(mbuf_mlock);
2780
2781         if (w != 0)
2782                 wakeup(mb_waitchan);
2783 }
2784
2785 /*
2786  * Common auditor for composite objects called by the CPU cache layer
2787  * during an allocation or free request.  For the former, this is called
2788  * after the objects are obtained from either the bucket or slab layer
2789  * and before they are returned to the caller.  For the latter, this is
2790  * called immediately during free and before placing the objects into
2791  * the bucket or slab layer.
2792  */
2793 static void
2794 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2795 {
2796         mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2797         mcache_audit_t *mca;
2798         struct mbuf *m, *ms;
2799         mcl_slab_t *clsp, *nsp;
2800         size_t cl_size;
2801         void *cl;
2802
2803         ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2804         if (class == MC_MBUF_CL)
2805                 cl_class = MC_CL;
2806         else if (class == MC_MBUF_BIGCL)
2807                 cl_class = MC_BIGCL;
2808         else
2809                 cl_class = MC_16KCL;
2810         cl_size = m_maxsize(cl_class);
2811
2812         while ((m = ms = (struct mbuf *)list) != NULL) {
2813                 lck_mtx_lock(mbuf_mlock);
2814                 /* Do the mbuf sanity checks and record its transaction */
2815                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2816                 mcl_audit_mbuf(mca, m, TRUE, alloc);
2817                 if (mcltrace)
2818                         mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2819
2820                 if (alloc)
2821                         mca->mca_uflags |= MB_COMP_INUSE;
2822                 else
2823                         mca->mca_uflags &= ~MB_COMP_INUSE;
2824
2825                 /*
2826                  * Use the shadow mbuf in the audit structure if we are
2827                  * freeing, since the contents of the actual mbuf has been
2828                  * pattern-filled by the above call to mcl_audit_mbuf().
2829                  */
2830                 if (!alloc && mclverify)
2831                         ms = MCA_SAVED_MBUF_PTR(mca);
2832
2833                 /* Do the cluster sanity checks and record its transaction */
2834                 cl = ms->m_ext.ext_buf;
2835                 clsp = slab_get(cl);
2836                 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2837                 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2838                 if (class == MC_MBUF_CL)
2839                         VERIFY(clsp->sl_refcnt >= 1 &&
2840                             clsp->sl_refcnt <= NCLPG);
2841                 else
2842                         VERIFY(clsp->sl_refcnt >= 1 &&
2843                             clsp->sl_refcnt <= NBCLPG);
2844
2845                 if (class == MC_MBUF_16KCL) {
2846                         int k;
2847                         for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2848                                 nsp = nsp->sl_next;
2849                                 /* Next slab must already be present */
2850                                 VERIFY(nsp != NULL);
2851                                 VERIFY(nsp->sl_refcnt == 1);
2852                         }
2853                 }
2854
2855
2856                 mca = mcl_audit_buf2mca(cl_class, cl);
2857                 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2858                 if (mcltrace)
2859                         mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2860
2861                 if (alloc)
2862                         mca->mca_uflags |= MB_COMP_INUSE;
2863                 else
2864                         mca->mca_uflags &= ~MB_COMP_INUSE;
2865                 lck_mtx_unlock(mbuf_mlock);
2866
2867                 list = list->obj_next;
2868         }
2869 }
2870
2871 static void
2872 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2873                  uint64_t alloc_size, kern_return_t error)
2874 {
2875
2876         *cnt = *cnt + 1;
2877         *ts = net_uptime();
2878         if (size) {
2879                 *size = alloc_size;
2880         }
2881         _CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]) ==
2882             sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[0]));
2883         switch (error) {
2884         case KERN_SUCCESS:
2885                 break;
2886         case KERN_INVALID_ARGUMENT:
2887                 mb_kmem_stats[0]++;
2888                 break;
2889         case KERN_INVALID_ADDRESS:
2890                 mb_kmem_stats[1]++;
2891                 break;
2892         case KERN_RESOURCE_SHORTAGE:
2893                 mb_kmem_stats[2]++;
2894                 break;
2895         case KERN_NO_SPACE:
2896                 mb_kmem_stats[3]++;
2897                 break;
2898         case KERN_FAILURE:
2899                 mb_kmem_stats[4]++;
2900                 break;
2901         default:
2902                 mb_kmem_stats[5]++;
2903                 break;
2904         }
2905 }
2906
2907 /*
2908  * Allocate some number of mbuf clusters and place on cluster freelist.
2909  */
2910 static int
2911 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2912 {
2913         int i, count = 0;
2914         vm_size_t size = 0;
2915         int numpages = 0, large_buffer;
2916         vm_offset_t page = 0;
2917         mcache_audit_t *mca_list = NULL;
2918         mcache_obj_t *con_list = NULL;
2919         mcl_slab_t *sp;
2920         mbuf_class_t class;
2921         kern_return_t error;
2922
2923         /* Set if a buffer allocation needs allocation of multiple pages */
2924         large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2925                 PAGE_SIZE < M16KCLBYTES);
2926         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2927             bufsize == m_maxsize(MC_16KCL));
2928
2929         VERIFY((bufsize == PAGE_SIZE) ||
2930             (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2931
2932         if (bufsize == m_size(MC_BIGCL))
2933                 class = MC_BIGCL;
2934         else
2935                 class = MC_16KCL;
2936
2937         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2938
2939         /*
2940          * Multiple threads may attempt to populate the cluster map one
2941          * after another.  Since we drop the lock below prior to acquiring
2942          * the physical page(s), our view of the cluster map may no longer
2943          * be accurate, and we could end up over-committing the pages beyond
2944          * the maximum allowed for each class.  To prevent it, this entire
2945          * operation (including the page mapping) is serialized.
2946          */
2947         while (mb_clalloc_busy) {
2948                 mb_clalloc_waiters++;
2949                 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2950                     (PZERO-1), "m_clalloc", NULL);
2951                 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2952         }
2953
2954         /* We are busy now; tell everyone else to go away */
2955         mb_clalloc_busy = TRUE;
2956
2957         /*
2958          * Honor the caller's wish to block or not block.  We have a way
2959          * to grow the pool asynchronously using the mbuf worker thread.
2960          */
2961         i = m_howmany(num, bufsize);
2962         if (i <= 0 || (wait & M_DONTWAIT))
2963                 goto out;
2964
2965         lck_mtx_unlock(mbuf_mlock);
2966
2967         size = round_page(i * bufsize);
2968         page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2969
2970         /*
2971          * If we did ask for "n" 16KB physically contiguous chunks
2972          * and didn't get them, then please try again without this
2973          * restriction.
2974          */
2975         net_update_uptime();
2976         if (large_buffer && page == 0) {
2977                 m_vm_error_stats(&mb_kmem_contig_failed,
2978                     &mb_kmem_contig_failed_ts,
2979                     &mb_kmem_contig_failed_size,
2980                     size, error);
2981                 page = kmem_mb_alloc(mb_map, size, 0, &error);
2982         }
2983
2984         if (page == 0) {
2985                 m_vm_error_stats(&mb_kmem_failed,
2986                     &mb_kmem_failed_ts,
2987                     &mb_kmem_failed_size,
2988                     size, error);
2989 #if PAGE_SIZE == 4096
2990                 if (bufsize == m_maxsize(MC_BIGCL)) {
2991 #else
2992                 if (bufsize >= m_maxsize(MC_BIGCL)) {
2993 #endif
2994                         /* Try for 1 page if failed */
2995                         size = PAGE_SIZE;
2996                         page = kmem_mb_alloc(mb_map, size, 0, &error);
2997                         if (page == 0) {
2998                                 m_vm_error_stats(&mb_kmem_one_failed,
2999                                     &mb_kmem_one_failed_ts,
3000                                     NULL, size, error);
3001                         }
3002                 }
3003
3004                 if (page == 0) {
3005                         lck_mtx_lock(mbuf_mlock);
3006                         goto out;
3007                 }
3008         }
3009
3010         VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3011         numpages = size / PAGE_SIZE;
3012
3013         /* If auditing is enabled, allocate the audit structures now */
3014         if (mclaudit != NULL) {
3015                 int needed;
3016
3017                 /*
3018                  * Yes, I realize this is a waste of memory for clusters
3019                  * that never get transformed into mbufs, as we may end
3020                  * up with NMBPG-1 unused audit structures per cluster.
3021                  * But doing so tremendously simplifies the allocation
3022                  * strategy, since at this point we are not holding the
3023                  * mbuf lock and the caller is okay to be blocked.
3024                  */
3025                 if (bufsize == PAGE_SIZE) {
3026                         needed = numpages * NMBPG;
3027
3028                         i = mcache_alloc_ext(mcl_audit_con_cache,
3029                             &con_list, needed, MCR_SLEEP);
3030
3031                         VERIFY(con_list != NULL && i == needed);
3032                 } else {
3033                         /*
3034                          * if multiple 4K pages are being used for a
3035                          * 16K cluster
3036                          */
3037                         needed = numpages / NSLABSP16KB;
3038                 }
3039
3040                 i = mcache_alloc_ext(mcache_audit_cache,
3041                     (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3042
3043                 VERIFY(mca_list != NULL && i == needed);
3044         }
3045
3046         lck_mtx_lock(mbuf_mlock);
3047
3048         for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3049                 ppnum_t offset =
3050                     ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3051                 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3052
3053                 /*
3054                  * If there is a mapper the appropriate I/O page is
3055                  * returned; zero out the page to discard its past
3056                  * contents to prevent exposing leftover kernel memory.
3057                  */
3058                 VERIFY(offset < mcl_pages);
3059                 if (mcl_paddr_base != 0) {
3060                         bzero((void *)(uintptr_t) page, PAGE_SIZE);
3061                         new_page = IOMapperInsertPage(mcl_paddr_base,
3062                             offset, new_page);
3063                 }
3064                 mcl_paddr[offset] = new_page;
3065
3066                 /* Pattern-fill this fresh page */
3067                 if (mclverify) {
3068                         mcache_set_pattern(MCACHE_FREE_PATTERN,
3069                             (caddr_t)page, PAGE_SIZE);
3070                 }
3071                 if (bufsize == PAGE_SIZE) {
3072                         mcache_obj_t *buf;
3073                         /* One for the entire page */
3074                         sp = slab_get((void *)page);
3075                         if (mclaudit != NULL) {
3076                                 mcl_audit_init((void *)page,
3077                                     &mca_list, &con_list,
3078                                     AUDIT_CONTENTS_SIZE, NMBPG);
3079                         }
3080                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3081                         slab_init(sp, class, SLF_MAPPED, (void *)page,
3082                             (void *)page, PAGE_SIZE, 0, 1);
3083                         buf = (mcache_obj_t *)page;
3084                         buf->obj_next = NULL;
3085
3086                         /* Insert this slab */
3087                         slab_insert(sp, class);
3088
3089                         /* Update stats now since slab_get drops the lock */
3090                         ++m_infree(class);
3091                         ++m_total(class);
3092                         VERIFY(m_total(class) <= m_maxlimit(class));
3093                         if (class == MC_BIGCL) {
3094                                 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3095                                     m_infree(MC_MBUF_BIGCL);
3096                                 mbstat.m_bigclusters = m_total(MC_BIGCL);
3097                         }
3098                         ++count;
3099                 } else if ((bufsize > PAGE_SIZE) &&
3100                     (i % NSLABSP16KB) == 0) {
3101                         union m16kcluster *m16kcl = (union m16kcluster *)page;
3102                         mcl_slab_t *nsp;
3103                         int k;
3104
3105                         /* One for the entire 16KB */
3106                         sp = slab_get(m16kcl);
3107                         if (mclaudit != NULL)
3108                                 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3109
3110                         VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3111                         slab_init(sp, MC_16KCL, SLF_MAPPED,
3112                             m16kcl, m16kcl, bufsize, 0, 1);
3113                         m16kcl->m16kcl_next = NULL;
3114
3115                         /*
3116                          * 2nd-Nth page's slab is part of the first one,
3117                          * where N is NSLABSP16KB.
3118                          */
3119                         for (k = 1; k < NSLABSP16KB; k++) {
3120                                 nsp = slab_get(((union mbigcluster *)page) + k);
3121                                 VERIFY(nsp->sl_refcnt == 0 &&
3122                                     nsp->sl_flags == 0);
3123                                 slab_init(nsp, MC_16KCL,
3124                                     SLF_MAPPED | SLF_PARTIAL,
3125                                     m16kcl, NULL, 0, 0, 0);
3126                         }
3127                         /* Insert this slab */
3128                         slab_insert(sp, MC_16KCL);
3129
3130                         /* Update stats now since slab_get drops the lock */
3131                         ++m_infree(MC_16KCL);
3132                         ++m_total(MC_16KCL);
3133                         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3134                         ++count;
3135                 }
3136         }
3137         VERIFY(mca_list == NULL && con_list == NULL);
3138
3139         if (!mb_peak_newreport && mbuf_report_usage(class))
3140                 mb_peak_newreport = TRUE;
3141
3142         /* We're done; let others enter */
3143         mb_clalloc_busy = FALSE;
3144         if (mb_clalloc_waiters > 0) {
3145                 mb_clalloc_waiters = 0;
3146                 wakeup(mb_clalloc_waitchan);
3147         }
3148
3149         return (count);
3150 out:
3151         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3152
3153         mtracelarge_register(size);
3154
3155         /* We're done; let others enter */
3156         mb_clalloc_busy = FALSE;
3157         if (mb_clalloc_waiters > 0) {
3158                 mb_clalloc_waiters = 0;
3159                 wakeup(mb_clalloc_waitchan);
3160         }
3161
3162         /*
3163          * When non-blocking we kick a thread if we have to grow the
3164          * pool or if the number of free clusters is less than requested.
3165          */
3166         if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3167                 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3168                     m_cname(class), i);
3169                 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3170                 mbuf_worker_needs_wakeup = FALSE;
3171         }
3172         if (class == MC_BIGCL) {
3173                 if (i > 0) {
3174                         /*
3175                          * Remember total number of 4KB clusters needed
3176                          * at this time.
3177                          */
3178                         i += m_total(MC_BIGCL);
3179                         if (i > m_region_expand(MC_BIGCL)) {
3180                                 m_region_expand(MC_BIGCL) = i;
3181                         }
3182                 }
3183                 if (m_infree(MC_BIGCL) >= num)
3184                         return (1);
3185         } else {
3186                 if (i > 0) {
3187                         /*
3188                          * Remember total number of 16KB clusters needed
3189                          * at this time.
3190                          */
3191                         i += m_total(MC_16KCL);
3192                         if (i > m_region_expand(MC_16KCL)) {
3193                                 m_region_expand(MC_16KCL) = i;
3194                         }
3195                 }
3196                 if (m_infree(MC_16KCL) >= num)
3197                         return (1);
3198         }
3199         return (0);
3200 }
3201
3202 /*
3203  * Populate the global freelist of the corresponding buffer class.
3204  */
3205 static int
3206 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
3207 {
3208         mcache_obj_t *o = NULL;
3209         int i, numpages = 0, count;
3210         mbuf_class_t super_class;
3211
3212         VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
3213             class == MC_16KCL);
3214
3215         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3216
3217         VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
3218             PAGE_SIZE == m_maxsize(MC_16KCL));
3219
3220         if (m_maxsize(class) >= PAGE_SIZE)
3221                 return(m_clalloc(num, wait, m_maxsize(class)) != 0);
3222
3223         /*
3224          * The rest of the function will allocate pages and will slice
3225          * them up into the right size
3226          */
3227
3228         numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
3229
3230         /* Currently assume that pages are 4K or 16K */
3231         if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3232                 super_class = MC_BIGCL;
3233         else
3234                 super_class = MC_16KCL;
3235
3236         i = m_clalloc(numpages, wait, m_maxsize(super_class));
3237
3238         /* how many objects will we cut the page into? */
3239         int numobj = PAGE_SIZE / m_maxsize(class);
3240
3241         for (count = 0; count < numpages; count++) {
3242                 /* respect totals, minlimit, maxlimit */
3243                 if (m_total(super_class) <= m_minlimit(super_class) ||
3244                     m_total(class) >= m_maxlimit(class))
3245                         break;
3246
3247                 if ((o = slab_alloc(super_class, wait)) == NULL)
3248                         break;
3249
3250                 struct mbuf *m = (struct mbuf *)o;
3251                 union mcluster *c = (union mcluster *)o;
3252                 union mbigcluster *mbc = (union mbigcluster *)o;
3253                 mcl_slab_t *sp = slab_get(o);
3254                 mcache_audit_t *mca = NULL;
3255
3256                 /*
3257                  * since one full page will be converted to MC_MBUF or
3258                  * MC_CL, verify that the reference count will match that
3259                  * assumption
3260                  */
3261                 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
3262                 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
3263                 /*
3264                  * Make sure that the cluster is unmolested
3265                  * while in freelist
3266                  */
3267                 if (mclverify) {
3268                         mca = mcl_audit_buf2mca(super_class,
3269                             (mcache_obj_t *)o);
3270                         mcache_audit_free_verify(mca,
3271                             (mcache_obj_t *)o, 0, m_maxsize(super_class));
3272                 }
3273
3274                 /* Reinitialize it as an mbuf or 2K or 4K slab */
3275                 slab_init(sp, class, sp->sl_flags,
3276                     sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
3277
3278                 VERIFY(sp->sl_head == NULL);
3279
3280                 VERIFY(m_total(super_class) >= 1);
3281                 m_total(super_class)--;
3282
3283                 if (super_class == MC_BIGCL)
3284                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3285
3286                 m_total(class) += numobj;
3287                 VERIFY(m_total(class) <= m_maxlimit(class));
3288                 m_infree(class) += numobj;
3289
3290                 if (!mb_peak_newreport && mbuf_report_usage(class))
3291                         mb_peak_newreport = TRUE;
3292
3293                 i = numobj;
3294                 if (class == MC_MBUF) {
3295                         mbstat.m_mbufs = m_total(MC_MBUF);
3296                         mtype_stat_add(MT_FREE, NMBPG);
3297                         while (i--) {
3298                                 /*
3299                                  * If auditing is enabled, construct the
3300                                  * shadow mbuf in the audit structure
3301                                  * instead of the actual one.
3302                                  * mbuf_slab_audit() will take care of
3303                                  * restoring the contents after the
3304                                  * integrity check.
3305                                  */
3306                                 if (mclaudit != NULL) {
3307                                         struct mbuf *ms;
3308                                         mca = mcl_audit_buf2mca(MC_MBUF,
3309                                             (mcache_obj_t *)m);
3310                                         ms = MCA_SAVED_MBUF_PTR(mca);
3311                                         ms->m_type = MT_FREE;
3312                                 } else {
3313                                         m->m_type = MT_FREE;
3314                                 }
3315                                 m->m_next = sp->sl_head;
3316                                 sp->sl_head = (void *)m++;
3317                         }
3318                 } else if (class == MC_CL) { /* MC_CL */
3319                         mbstat.m_clfree =
3320                             m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3321                         mbstat.m_clusters = m_total(MC_CL);
3322                         while (i--) {
3323                                 c->mcl_next = sp->sl_head;
3324                                 sp->sl_head = (void *)c++;
3325                         }
3326                 } else {
3327                         VERIFY(class == MC_BIGCL);
3328                         mbstat.m_bigclusters = m_total(MC_BIGCL);
3329                         mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3330                             m_infree(MC_MBUF_BIGCL);
3331                         while (i--) {
3332                                 mbc->mbc_next = sp->sl_head;
3333                                 sp->sl_head = (void *)mbc++;
3334                         }
3335                 }
3336
3337                 /* Insert into the mbuf or 2k or 4k slab list */
3338                 slab_insert(sp, class);
3339
3340                 if ((i = mb_waiters) > 0)
3341                         mb_waiters = 0;
3342                 if (i != 0) {
3343                         mbwdog_logger("waking up all threads");
3344                         wakeup(mb_waitchan);
3345                 }
3346         }
3347         return (count != 0);
3348 }
3349
3350 /*
3351  * For each class, initialize the freelist to hold m_minlimit() objects.
3352  */
3353 static void
3354 freelist_init(mbuf_class_t class)
3355 {
3356         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3357
3358         VERIFY(class == MC_CL || class == MC_BIGCL);
3359         VERIFY(m_total(class) == 0);
3360         VERIFY(m_minlimit(class) > 0);
3361
3362         while (m_total(class) < m_minlimit(class))
3363                 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3364
3365         VERIFY(m_total(class) >= m_minlimit(class));
3366 }
3367
3368 /*
3369  * (Inaccurately) check if it might be worth a trip back to the
3370  * mcache layer due the availability of objects there.  We'll
3371  * end up back here if there's nothing up there.
3372  */
3373 static boolean_t
3374 mbuf_cached_above(mbuf_class_t class, int wait)
3375 {
3376         switch (class) {
3377         case MC_MBUF:
3378                 if (wait & MCR_COMP)
3379                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3380                             !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3381                 break;
3382
3383         case MC_CL:
3384                 if (wait & MCR_COMP)
3385                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3386                 break;
3387
3388         case MC_BIGCL:
3389                 if (wait & MCR_COMP)
3390                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3391                 break;
3392
3393         case MC_16KCL:
3394                 if (wait & MCR_COMP)
3395                         return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3396                 break;
3397
3398         case MC_MBUF_CL:
3399         case MC_MBUF_BIGCL:
3400         case MC_MBUF_16KCL:
3401                 break;
3402
3403         default:
3404                 VERIFY(0);
3405                 /* NOTREACHED */
3406         }
3407
3408         return (!mcache_bkt_isempty(m_cache(class)));
3409 }
3410
3411 /*
3412  * If possible, convert constructed objects to raw ones.
3413  */
3414 static boolean_t
3415 mbuf_steal(mbuf_class_t class, unsigned int num)
3416 {
3417         mcache_obj_t *top = NULL;
3418         mcache_obj_t **list = &top;
3419         unsigned int tot = 0;
3420
3421         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3422
3423         switch (class) {
3424         case MC_MBUF:
3425         case MC_CL:
3426         case MC_BIGCL:
3427         case MC_16KCL:
3428                 return (FALSE);
3429
3430         case MC_MBUF_CL:
3431         case MC_MBUF_BIGCL:
3432         case MC_MBUF_16KCL:
3433                 /* Get the required number of constructed objects if possible */
3434                 if (m_infree(class) > m_minlimit(class)) {
3435                         tot = cslab_alloc(class, &list,
3436                             MIN(num, m_infree(class)));
3437                 }
3438
3439                 /* And destroy them to get back the raw objects */
3440                 if (top != NULL)
3441                         (void) cslab_free(class, top, 1);
3442                 break;
3443
3444         default:
3445                 VERIFY(0);
3446                 /* NOTREACHED */
3447         }
3448
3449         return (tot == num);
3450 }
3451
3452 static void
3453 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3454 {
3455         int m, bmap = 0;
3456
3457         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3458
3459         VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3460         VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3461         VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3462
3463         /*
3464          * This logic can be made smarter; for now, simply mark
3465          * all other related classes as potential victims.
3466          */
3467         switch (class) {
3468         case MC_MBUF:
3469                 m_wantpurge(MC_CL)++;
3470                 m_wantpurge(MC_BIGCL)++;
3471                 m_wantpurge(MC_MBUF_CL)++;
3472                 m_wantpurge(MC_MBUF_BIGCL)++;
3473                 break;
3474
3475         case MC_CL:
3476                 m_wantpurge(MC_MBUF)++;
3477                 m_wantpurge(MC_BIGCL)++;
3478                 m_wantpurge(MC_MBUF_BIGCL)++;
3479                 if (!comp)
3480                         m_wantpurge(MC_MBUF_CL)++;
3481                 break;
3482
3483         case MC_BIGCL:
3484                 m_wantpurge(MC_MBUF)++;
3485                 m_wantpurge(MC_CL)++;
3486                 m_wantpurge(MC_MBUF_CL)++;
3487                 if (!comp)
3488                         m_wantpurge(MC_MBUF_BIGCL)++;
3489                 break;
3490
3491         case MC_16KCL:
3492                 if (!comp)
3493                         m_wantpurge(MC_MBUF_16KCL)++;
3494                 break;
3495
3496         default:
3497                 VERIFY(0);
3498                 /* NOTREACHED */
3499         }
3500
3501         /*
3502          * Run through each marked class and check if we really need to
3503          * purge (and therefore temporarily disable) the per-CPU caches
3504          * layer used by the class.  If so, remember the classes since
3505          * we are going to drop the lock below prior to purging.
3506          */
3507         for (m = 0; m < NELEM(mbuf_table); m++) {
3508                 if (m_wantpurge(m) > 0) {
3509                         m_wantpurge(m) = 0;
3510                         /*
3511                          * Try hard to steal the required number of objects
3512                          * from the freelist of other mbuf classes.  Only
3513                          * purge and disable the per-CPU caches layer when
3514                          * we don't have enough; it's the last resort.
3515                          */
3516                         if (!mbuf_steal(m, num))
3517                                 bmap |= (1 << m);
3518                 }
3519         }
3520
3521         lck_mtx_unlock(mbuf_mlock);
3522
3523         if (bmap != 0) {
3524                 /* signal the domains to drain */
3525                 net_drain_domains();
3526
3527                 /* Sigh; we have no other choices but to ask mcache to purge */
3528                 for (m = 0; m < NELEM(mbuf_table); m++) {
3529                         if ((bmap & (1 << m)) &&
3530                             mcache_purge_cache(m_cache(m), TRUE)) {
3531                                 lck_mtx_lock(mbuf_mlock);
3532                                 m_purge_cnt(m)++;
3533                                 mbstat.m_drain++;
3534                                 lck_mtx_unlock(mbuf_mlock);
3535                         }
3536                 }
3537         } else {
3538                 /*
3539                  * Request mcache to reap extra elements from all of its caches;
3540                  * note that all reaps are serialized and happen only at a fixed
3541                  * interval.
3542                  */
3543                 mcache_reap();
3544         }
3545         lck_mtx_lock(mbuf_mlock);
3546 }
3547
3548 static inline struct mbuf *
3549 m_get_common(int wait, short type, int hdr)
3550 {
3551         struct mbuf *m;
3552         int mcflags = MSLEEPF(wait);
3553
3554         /* Is this due to a non-blocking retry?  If so, then try harder */
3555         if (mcflags & MCR_NOSLEEP)
3556                 mcflags |= MCR_TRYHARD;
3557
3558         m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3559         if (m != NULL) {
3560                 MBUF_INIT(m, hdr, type);
3561                 mtype_stat_inc(type);
3562                 mtype_stat_dec(MT_FREE);
3563 #if CONFIG_MACF_NET
3564                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3565                         m_free(m);
3566                         return (NULL);
3567                 }
3568 #endif /* MAC_NET */
3569         }
3570         return (m);
3571 }
3572
/*
 * Space allocation routines; these are also available as macros
 * for critical paths.
 *
 * _M_GET / _M_GETHDR      call m_get_common() with its hdr argument
 *                         set to 0 / 1 respectively
 * _M_RETRY / _M_RETRYHDR  plain aliases of _M_GET / _M_GETHDR
 * _MGET / _MGETHDR        statement-style forms that store the result
 *                         of _M_GET / _M_GETHDR into "m"
 */
#define _M_GET(wait, type)      m_get_common(wait, type, 0)
#define _M_GETHDR(wait, type)   m_get_common(wait, type, 1)
#define _M_RETRY(wait, type)    _M_GET(wait, type)
#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
#define _MGET(m, how, type)     ((m) = _M_GET(how, type))
#define _MGETHDR(m, how, type)  ((m) = _M_GETHDR(how, type))
3583
/*
 * Function form of _M_GET(): allocate an mbuf of the given type.
 * Returns NULL if the allocation fails.
 */
struct mbuf *
m_get(int wait, int type)
{
        struct mbuf *m;

        _MGET(m, wait, type);
        return (m);
}
3589
/*
 * Function form of _M_GETHDR(): allocate an mbuf of the given type
 * with the hdr argument of m_get_common() set.  Returns NULL if the
 * allocation fails.
 */
struct mbuf *
m_gethdr(int wait, int type)
{
        struct mbuf *m;

        _MGETHDR(m, wait, type);
        return (m);
}
3595
/*
 * Function form of _M_RETRY(), which is the same as _M_GET().
 * Returns NULL if the allocation fails.
 */
struct mbuf *
m_retry(int wait, int type)
{
        struct mbuf *m;

        m = _M_RETRY(wait, type);
        return (m);
}
3601
/*
 * Function form of _M_RETRYHDR(), which is the same as _M_GETHDR().
 * Returns NULL if the allocation fails.
 */
struct mbuf *
m_retryhdr(int wait, int type)
{
        struct mbuf *m;

        m = _M_RETRYHDR(wait, type);
        return (m);
}
3607
3608 struct mbuf *
3609 m_getclr(int wait, int type)
3610 {
3611         struct mbuf *m;
3612
3613         _MGET(m, wait, type);
3614         if (m != NULL)
3615                 bzero(MTOD(m, caddr_t), MLEN);
3616         return (m);
3617 }
3618
3619 static int
3620 m_free_paired(struct mbuf *m)
3621 {
3622         VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
3623
3624         membar_sync();
3625         if (MEXT_PMBUF(m) == m) {
3626                 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
3627                 int16_t oprefcnt, prefcnt;
3628
3629                 /*
3630                  * Paired ref count might be negative in case we lose
3631                  * against another thread clearing MEXT_PMBUF, in the
3632                  * event it occurs after the above memory barrier sync.
3633                  * In that case just ignore as things have been unpaired.
3634                  */
3635                 do {
3636                         oprefcnt = *addr;
3637                         prefcnt = oprefcnt - 1;
3638                 } while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
3639
3640                 if (prefcnt > 1) {
3641                         return (1);
3642                 } else if (prefcnt == 1) {
3643                         (*(m_get_ext_free(m)))(m->m_ext.ext_buf,
3644                             m->m_ext.ext_size, m_get_ext_arg(m));
3645                         return (1);
3646                 } else if (prefcnt == 0) {
3647                         VERIFY(MBUF_IS_PAIRED(m));
3648
3649                         /*
3650                          * Restore minref to its natural value, so that
3651                          * the caller will be able to free the cluster
3652                          * as appropriate.
3653                          */
3654                         MEXT_MINREF(m) = 0;
3655
3656                         /*
3657                          * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
3658                          * as it is immutable.  atomic_set_ptr also causes
3659                          * memory barrier sync.
3660                          */
3661                         atomic_set_ptr(&MEXT_PMBUF(m), NULL);
3662
3663                         switch (m->m_ext.ext_size) {
3664                         case MCLBYTES:
3665                                 m_set_ext(m, m_get_rfa(m), NULL, NULL);
3666                                 break;
3667
3668                         case MBIGCLBYTES:
3669                                 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
3670                                 break;
3671
3672                         case M16KCLBYTES:
3673                                 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
3674                                 break;
3675
3676                         default:
3677                                 VERIFY(0);
3678                                 /* NOTREACHED */
3679                         }
3680                 }
3681         }
3682
3683         /*
3684          * Tell caller the unpair has occurred, and that the reference
3685          * count on the external cluster held for the paired mbuf should
3686          * now be dropped.
3687          */
3688         return (0);
3689 }
3690
3691 struct mbuf *
3692 m_free(struct mbuf *m)
3693 {
3694         struct mbuf *n = m->m_next;
3695
3696         if (m->m_type == MT_FREE)
3697                 panic("m_free: freeing an already freed mbuf");
3698
3699         if (m->m_flags & M_PKTHDR) {
3700                 /* Check for scratch area overflow */
3701                 m_redzone_verify(m);
3702                 /* Free the aux data and tags if there is any */
3703                 m_tag_delete_chain(m, NULL);
3704
3705                 m_do_tx_compl_callback(m, NULL);
3706         }
3707
3708         if (m->m_flags & M_EXT) {
3709                 u_int16_t refcnt;
3710                 u_int32_t composite;
3711                 m_ext_free_func_t m_free_func;
3712
3713                 if (MBUF_IS_PAIRED(m) && m_free_paired(m))
3714                         return (n);
3715
3716                 refcnt = m_decref(m);
3717                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3718                 m_free_func = m_get_ext_free(m);
3719
3720                 if (refcnt == MEXT_MINREF(m) && !composite) {
3721                         if (m_free_func == NULL) {
3722                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3723                         } else if (m_free_func == m_bigfree) {
3724                                 mcache_free(m_cache(MC_BIGCL),
3725                                     m->m_ext.ext_buf);
3726                         } else if (m_free_func == m_16kfree) {
3727                                 mcache_free(m_cache(MC_16KCL),
3728                                     m->m_ext.ext_buf);
3729                         } else {
3730                                 (*m_free_func)(m->m_ext.ext_buf,
3731                                     m->m_ext.ext_size, m_get_ext_arg(m));
3732                         }
3733                         mcache_free(ref_cache, m_get_rfa(m));
3734                         m_set_ext(m, NULL, NULL, NULL);
3735                 } else if (refcnt == MEXT_MINREF(m) && composite) {
3736                         VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3737                         VERIFY(m->m_type != MT_FREE);
3738
3739                         mtype_stat_dec(m->m_type);
3740                         mtype_stat_inc(MT_FREE);
3741
3742                         m->m_type = MT_FREE;
3743                         m->m_flags = M_EXT;
3744                         m->m_len = 0;
3745                         m->m_next = m->m_nextpkt = NULL;
3746
3747                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3748
3749                         /* "Free" into the intermediate cache */
3750                         if (m_free_func == NULL) {
3751                                 mcache_free(m_cache(MC_MBUF_CL), m);
3752                         } else if (m_free_func == m_bigfree) {
3753                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3754                         } else {
3755                                 VERIFY(m_free_func == m_16kfree);
3756                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3757                         }
3758                         return (n);
3759                 }
3760         }
3761
3762         if (m->m_type != MT_FREE) {
3763                 mtype_stat_dec(m->m_type);
3764                 mtype_stat_inc(MT_FREE);
3765         }
3766
3767         m->m_type = MT_FREE;
3768         m->m_flags = m->m_len = 0;
3769         m->m_next = m->m_nextpkt = NULL;
3770
3771         mcache_free(m_cache(MC_MBUF), m);
3772
3773         return (n);
3774 }
3775
3776 __private_extern__ struct mbuf *
3777 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3778     void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3779     int wait, int pair)
3780 {
3781         struct ext_ref *rfa = NULL;
3782
3783         /*
3784          * If pairing is requested and an existing mbuf is provided, reject
3785          * it if it's already been paired to another cluster.  Otherwise,
3786          * allocate a new one or free any existing below.
3787          */
3788         if ((m != NULL && MBUF_IS_PAIRED(m)) ||
3789             (m == NULL && (m = _M_GETHDR(wait, type)) == NULL))
3790                 return (NULL);
3791
3792         if (m->m_flags & M_EXT) {
3793                 u_int16_t refcnt;
3794                 u_int32_t composite;
3795                 m_ext_free_func_t m_free_func;
3796
3797                 refcnt = m_decref(m);
3798                 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3799                 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
3800                 m_free_func = m_get_ext_free(m);
3801                 if (refcnt == MEXT_MINREF(m) && !composite) {
3802                         if (m_free_func == NULL) {
3803                                 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3804                         } else if (m_free_func == m_bigfree) {
3805                                 mcache_free(m_cache(MC_BIGCL),
3806                                     m->m_ext.ext_buf);
3807                         } else if (m_free_func == m_16kfree) {
3808                                 mcache_free(m_cache(MC_16KCL),
3809                                     m->m_ext.ext_buf);
3810                         } else {
3811                                 (*m_free_func)(m->m_ext.ext_buf,
3812                                     m->m_ext.ext_size, m_get_ext_arg(m));
3813                         }
3814                         /* Re-use the reference structure */
3815                         rfa = m_get_rfa(m);
3816                 } else if (refcnt == MEXT_MINREF(m) && composite) {
3817                         VERIFY(m->m_type != MT_FREE);
3818
3819                         mtype_stat_dec(m->m_type);
3820                         mtype_stat_inc(MT_FREE);
3821
3822                         m->m_type = MT_FREE;
3823                         m->m_flags = M_EXT;
3824                         m->m_len = 0;
3825                         m->m_next = m->m_nextpkt = NULL;
3826
3827                         MEXT_FLAGS(m) &= ~EXTF_READONLY;
3828
3829                         /* "Free" into the intermediate cache */
3830                         if (m_free_func == NULL) {
3831                                 mcache_free(m_cache(MC_MBUF_CL), m);
3832                         } else if (m_free_func == m_bigfree) {
3833                                 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3834                         } else {
3835                                 VERIFY(m_free_func == m_16kfree);
3836                                 mcache_free(m_cache(MC_MBUF_16KCL), m);
3837                         }
3838                         /*
3839                          * Allocate a new mbuf, since we didn't divorce
3840                          * the composite mbuf + cluster pair above.
3841                          */
3842                         if ((m = _M_GETHDR(wait, type)) == NULL)
3843                                 return (NULL);
3844                 }
3845         }
3846
3847         if (rfa == NULL &&
3848             (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3849                 m_free(m);
3850                 return (NULL);
3851         }
3852
3853         if (!pair) {
3854                 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
3855                     0, 1, 0, 0, 0, NULL);
3856         } else {
3857                 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
3858                     1, 1, 1, EXTF_PAIRED, 0, m);
3859         }
3860
3861         return (m);
3862 }
3863
3864 /*
3865  * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3866  * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3867  */
3868 struct mbuf *
3869 m_getcl(int wait, int type, int flags)
3870 {
3871         struct mbuf *m;
3872         int mcflags = MSLEEPF(wait);
3873         int hdr = (flags & M_PKTHDR);
3874
3875         /* Is this due to a non-blocking retry?  If so, then try harder */
3876         if (mcflags & MCR_NOSLEEP)
3877                 mcflags |= MCR_TRYHARD;
3878
3879         m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3880         if (m != NULL) {
3881                 u_int16_t flag;
3882                 struct ext_ref *rfa;
3883                 void *cl;
3884
3885                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3886                 cl = m->m_ext.ext_buf;
3887                 rfa = m_get_rfa(m);
3888
3889                 ASSERT(cl != NULL && rfa != NULL);
3890                 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3891
3892                 flag = MEXT_FLAGS(m);
3893
3894                 MBUF_INIT(m, hdr, type);
3895                 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3896
3897                 mtype_stat_inc(type);
3898                 mtype_stat_dec(MT_FREE);
3899 #if CONFIG_MACF_NET
3900                 if (hdr && mac_init_mbuf(m, wait) != 0) {
3901                         m_freem(m);
3902                         return (NULL);
3903                 }
3904 #endif /* MAC_NET */
3905         }
3906         return (m);
3907 }
3908
3909 /* m_mclget() add an mbuf cluster to a normal mbuf */
3910 struct mbuf *
3911 m_mclget(struct mbuf *m, int wait)
3912 {
3913         struct ext_ref *rfa;
3914
3915         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3916                 return (m);
3917
3918         m->m_ext.ext_buf = m_mclalloc(wait);
3919         if (m->m_ext.ext_buf != NULL) {
3920                 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3921         } else {
3922                 mcache_free(ref_cache, rfa);
3923         }
3924         return (m);
3925 }
3926
3927 /* Allocate an mbuf cluster */
3928 caddr_t
3929 m_mclalloc(int wait)
3930 {
3931         int mcflags = MSLEEPF(wait);
3932
3933         /* Is this due to a non-blocking retry?  If so, then try harder */
3934         if (mcflags & MCR_NOSLEEP)
3935                 mcflags |= MCR_TRYHARD;
3936
3937         return (mcache_alloc(m_cache(MC_CL), mcflags));
3938 }
3939
3940 /* Free an mbuf cluster */
3941 void
3942 m_mclfree(caddr_t p)
3943 {
3944         mcache_free(m_cache(MC_CL), p);
3945 }
3946
3947 /*
3948  * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3949  * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3950  */
3951 int
3952 m_mclhasreference(struct mbuf *m)
3953 {
3954         if (!(m->m_flags & M_EXT))
3955                 return (0);
3956
3957         ASSERT(m_get_rfa(m) != NULL);
3958
3959         return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3960 }
3961
3962 __private_extern__ caddr_t
3963 m_bigalloc(int wait)
3964 {
3965         int mcflags = MSLEEPF(wait);
3966
3967         /* Is this due to a non-blocking retry?  If so, then try harder */
3968         if (mcflags & MCR_NOSLEEP)
3969                 mcflags |= MCR_TRYHARD;
3970
3971         return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3972 }
3973
3974 __private_extern__ void
3975 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3976 {
3977         mcache_free(m_cache(MC_BIGCL), p);
3978 }
3979
3980 /* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3981 __private_extern__ struct mbuf *
3982 m_mbigget(struct mbuf *m, int wait)
3983 {
3984         struct ext_ref *rfa;
3985
3986         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3987                 return (m);
3988
3989         m->m_ext.ext_buf =  m_bigalloc(wait);
3990         if (m->m_ext.ext_buf != NULL) {
3991                 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3992         } else {
3993                 mcache_free(ref_cache, rfa);
3994         }
3995         return (m);
3996 }
3997
3998 __private_extern__ caddr_t
3999 m_16kalloc(int wait)
4000 {
4001         int mcflags = MSLEEPF(wait);
4002
4003         /* Is this due to a non-blocking retry?  If so, then try harder */
4004         if (mcflags & MCR_NOSLEEP)
4005                 mcflags |= MCR_TRYHARD;
4006
4007         return (mcache_alloc(m_cache(MC_16KCL), mcflags));
4008 }
4009
4010 __private_extern__ void
4011 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4012 {
4013         mcache_free(m_cache(MC_16KCL), p);
4014 }
4015
4016 /* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
4017 __private_extern__ struct mbuf *
4018 m_m16kget(struct mbuf *m, int wait)
4019 {
4020         struct ext_ref *rfa;
4021
4022         if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
4023                 return (m);
4024
4025         m->m_ext.ext_buf =  m_16kalloc(wait);
4026         if (m->m_ext.ext_buf != NULL) {
4027                 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4028         } else {
4029                 mcache_free(ref_cache, rfa);
4030         }
4031         return (m);
4032 }
4033
/*
 * "Move" mbuf pkthdr from "from" to "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 *
 * The packet header structure is copied wholesale into "to", after
 * which the classifier info, tags and scratch area of "from" are
 * reset.  Only the M_EXT bit of the destination's flags survives;
 * everything else comes from the source's M_COPYFLAGS bits.
 */
void
m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
{
        VERIFY(from->m_flags & M_PKTHDR);

        /* Check for scratch area overflow */
        m_redzone_verify(from);

        if (to->m_flags & M_PKTHDR) {
                /* Check for scratch area overflow */
                m_redzone_verify(to);
                /* We will be taking over the tags of 'to' */
                m_tag_delete_chain(to, NULL);
        }
        to->m_pkthdr = from->m_pkthdr;          /* especially tags */
        m_classifier_init(from, 0);             /* purge classifier info */
        m_tag_init(from, 1);                    /* purge all tags from src */
        m_scratch_init(from);                   /* clear src scratch area */
        to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
        /* Without an external buffer, data starts at the pkthdr data area */
        if ((to->m_flags & M_EXT) == 0)
                to->m_data = to->m_pktdat;
        m_redzone_init(to);                     /* setup red zone on dst */
}
4061
/*
 * Duplicate "from"'s mbuf pkthdr in "to".
 * "from" must have M_PKTHDR set, and "to" must be empty.
 * In particular, this does a deep copy of the packet tags.
 *
 * Unlike m_copy_pkthdr(), nothing in "from" is reset here.  The
 * return value is whatever m_tag_copy_chain() returns for the given
 * "how" argument.
 */
static int
m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
{
        VERIFY(from->m_flags & M_PKTHDR);

        /* Check for scratch area overflow */
        m_redzone_verify(from);

        if (to->m_flags & M_PKTHDR) {
                /* Check for scratch area overflow */
                m_redzone_verify(to);
                /* We will be taking over the tags of 'to' */
                m_tag_delete_chain(to, NULL);
        }
        /* Keep only the destination's M_EXT bit; the rest comes from src */
        to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
        /* Without an external buffer, data starts at the pkthdr data area */
        if ((to->m_flags & M_EXT) == 0)
                to->m_data = to->m_pktdat;
        to->m_pkthdr = from->m_pkthdr;
        m_redzone_init(to);                     /* setup red zone on dst */
        m_tag_init(to, 0);                      /* preserve dst static tags */
        return (m_tag_copy_chain(to, from, how));
}
4089
4090 void
4091 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4092 {
4093         memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4094 #if PF_ECN
4095         m_pftag(to)->pftag_hdr = NULL;
4096         m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
4097 #endif /* PF_ECN */
4098 }
4099
4100 void
4101 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4102 {
4103         VERIFY(m->m_flags & M_PKTHDR);
4104
4105         m->m_pkthdr.pkt_proto = 0;
4106         m->m_pkthdr.pkt_flowsrc = 0;
4107         m->m_pkthdr.pkt_flowid = 0;
4108         m->m_pkthdr.pkt_flags &= pktf_mask;     /* caller-defined mask */
4109         /* preserve service class and interface info for loopback packets */
4110         if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
4111                 (void) m_set_service_class(m, MBUF_SC_BE);
4112         if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4113                 m->m_pkthdr.pkt_ifainfo = 0;
4114         /*
4115          * Preserve timestamp if requested
4116          */
4117         if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID))
4118                 m->m_pkthdr.pkt_timestamp = 0;
4119 }
4120
4121 void
4122 m_copy_classifier(struct mbuf *to, struct mbuf *from)
4123 {
4124         VERIFY(to->m_flags & M_PKTHDR);
4125         VERIFY(from->m_flags & M_PKTHDR);
4126
4127         to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4128         to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4129         to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4130         to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4131         (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4132         to->m_pkthdr.pkt_ifainfo  = from->m_pkthdr.pkt_ifainfo;
4133 }
4134
4135 /*
4136  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4137  * if wantall is not set, return whatever number were available.  Set up the
4138  * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4139  * are chained on the m_nextpkt field.  Any packets requested beyond this
4140  * are chained onto the last packet header's m_next field.  The size of
4141  * the cluster is controlled by the parameter bufsize.
4142  */
4143 __private_extern__ struct mbuf *
4144 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4145     int wait, int wantall, size_t bufsize)
4146 {
4147         struct mbuf *m;
4148         struct mbuf **np, *top;
4149         unsigned int pnum, needed = *num_needed;
4150         mcache_obj_t *mp_list = NULL;
4151         int mcflags = MSLEEPF(wait);
4152         u_int16_t flag;
4153         struct ext_ref *rfa;
4154         mcache_t *cp;
4155         void *cl;
4156
4157         ASSERT(bufsize == m_maxsize(MC_CL) ||
4158             bufsize == m_maxsize(MC_BIGCL) ||
4159             bufsize == m_maxsize(MC_16KCL));
4160
4161         /*
4162          * Caller must first check for njcl because this
4163          * routine is internal and not exposed/used via KPI.
4164          */
4165         VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4166
4167         top = NULL;
4168         np = &top;
4169         pnum = 0;
4170
4171         /*
4172          * The caller doesn't want all the requested buffers; only some.
4173          * Try hard to get what we can, but don't block.  This effectively
4174          * overrides MCR_SLEEP, since this thread will not go to sleep
4175          * if we can't get all the buffers.
4176          */
4177         if (!wantall || (mcflags & MCR_NOSLEEP))
4178                 mcflags |= MCR_TRYHARD;
4179
4180         /* Allocate the composite mbuf + cluster elements from the cache */
4181         if (bufsize == m_maxsize(MC_CL))
4182                 cp = m_cache(MC_MBUF_CL);
4183         else if (bufsize == m_maxsize(MC_BIGCL))
4184                 cp = m_cache(MC_MBUF_BIGCL);
4185         else
4186                 cp = m_cache(MC_MBUF_16KCL);
4187         needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4188
4189         for (pnum = 0; pnum < needed; pnum++) {
4190                 m = (struct mbuf *)mp_list;
4191                 mp_list = mp_list->obj_next;
4192
4193                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4194                 cl = m->m_ext.ext_buf;
4195                 rfa = m_get_rfa(m);
4196
4197                 ASSERT(cl != NULL && rfa != NULL);
4198                 VERIFY(MBUF_IS_COMPOSITE(m));
4199
4200                 flag = MEXT_FLAGS(m);
4201
4202                 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4203                 if (bufsize == m_maxsize(MC_16KCL)) {
4204                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4205                 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4206                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4207                 } else {
4208                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4209                 }
4210
4211                 if (num_with_pkthdrs > 0) {
4212                         --num_with_pkthdrs;
4213 #if CONFIG_MACF_NET
4214                         if (mac_mbuf_label_init(m, wait) != 0) {
4215                                 m_freem(m);
4216                                 break;
4217                         }
4218 #endif /* MAC_NET */
4219                 }
4220
4221                 *np = m;
4222                 if (num_with_pkthdrs > 0)
4223                         np = &m->m_nextpkt;
4224                 else
4225                         np = &m->m_next;
4226         }
4227         ASSERT(pnum != *num_needed || mp_list == NULL);
4228         if (mp_list != NULL)
4229                 mcache_free_ext(cp, mp_list);
4230
4231         if (pnum > 0) {
4232                 mtype_stat_add(MT_DATA, pnum);
4233                 mtype_stat_sub(MT_FREE, pnum);
4234         }
4235
4236         if (wantall && (pnum != *num_needed)) {
4237                 if (top != NULL)
4238                         m_freem_list(top);
4239                 return (NULL);
4240         }
4241
4242         if (pnum > *num_needed) {
4243                 printf("%s: File a radar related to <rdar://10146739>. \
4244                         needed = %u, pnum = %u, num_needed = %u \n",
4245                         __func__, needed, pnum, *num_needed);
4246         }
4247
4248         *num_needed = pnum;
4249         return (top);
4250 }
4251
/*
 * Return a list of mbufs linked by m_nextpkt.  Try for numlist, and if
 * wantall is not set, return whatever number were available.  The size of
 * each mbuf in the list is controlled by the parameter packetlen.  Each
 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
 * in the chain is called a segment.  If maxsegments is not NULL and the
 * value pointed to is not zero, this specifies the maximum number of
 * segments for a chain of mbufs.  If maxsegments is NULL or the value
 * pointed to is zero, the caller does not have any restriction on the
 * number of segments.  The actual number of segments of an mbuf chain is
 * returned in the value pointed to by maxsegments.
 */
4264 __private_extern__ struct mbuf *
4265 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4266     unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4267 {
4268         struct mbuf **np, *top, *first = NULL;
4269         size_t bufsize, r_bufsize;
4270         unsigned int num = 0;
4271         unsigned int nsegs = 0;
4272         unsigned int needed, resid;
4273         int mcflags = MSLEEPF(wait);
4274         mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4275         mcache_t *cp = NULL, *rcp = NULL;
4276
4277         if (*numlist == 0)
4278                 return (NULL);
4279
4280         top = NULL;
4281         np = &top;
4282
4283         if (wantsize == 0) {
4284                 if (packetlen <= MINCLSIZE) {
4285                         bufsize = packetlen;
4286                 } else if (packetlen > m_maxsize(MC_CL)) {
4287                         /* Use 4KB if jumbo cluster pool isn't available */
4288                         if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
4289                                 bufsize = m_maxsize(MC_BIGCL);
4290                         else
4291                                 bufsize = m_maxsize(MC_16KCL);
4292                 } else {
4293                         bufsize = m_maxsize(MC_CL);
4294                 }
4295         } else if (wantsize == m_maxsize(MC_CL) ||
4296             wantsize == m_maxsize(MC_BIGCL) ||
4297             (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4298                 bufsize = wantsize;
4299         } else {
4300                 return (NULL);
4301         }
4302
4303         if (bufsize <= MHLEN) {
4304                 nsegs = 1;
4305         } else if (bufsize <= MINCLSIZE) {
4306                 if (maxsegments != NULL && *maxsegments == 1) {
4307                         bufsize = m_maxsize(MC_CL);
4308                         nsegs = 1;
4309                 } else {
4310                         nsegs = 2;
4311                 }
4312         } else if (bufsize == m_maxsize(MC_16KCL)) {
4313                 VERIFY(njcl > 0);
4314                 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4315         } else if (bufsize == m_maxsize(MC_BIGCL)) {
4316                 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4317         } else {
4318                 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4319         }
4320         if (maxsegments != NULL) {
4321                 if (*maxsegments && nsegs > *maxsegments) {
4322                         *maxsegments = nsegs;
4323                         return (NULL);
4324                 }
4325                 *maxsegments = nsegs;
4326         }
4327
4328         /*
4329          * The caller doesn't want all the requested buffers; only some.
4330          * Try hard to get what we can, but don't block.  This effectively
4331          * overrides MCR_SLEEP, since this thread will not go to sleep
4332          * if we can't get all the buffers.
4333          */
4334         if (!wantall || (mcflags & MCR_NOSLEEP))
4335                 mcflags |= MCR_TRYHARD;
4336
4337         /*
4338          * Simple case where all elements in the lists/chains are mbufs.
4339          * Unless bufsize is greater than MHLEN, each segment chain is made
4340          * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
4341          * of 2 mbufs; the second one is used for the residual data, i.e.
4342          * the remaining data that cannot fit into the first mbuf.
4343          */
4344         if (bufsize <= MINCLSIZE) {
4345                 /* Allocate the elements in one shot from the mbuf cache */
4346                 ASSERT(bufsize <= MHLEN || nsegs == 2);
4347                 cp = m_cache(MC_MBUF);
4348                 needed = mcache_alloc_ext(cp, &mp_list,
4349                     (*numlist) * nsegs, mcflags);
4350
4351                 /*
4352                  * The number of elements must be even if we are to use an
4353                  * mbuf (instead of a cluster) to store the residual data.
4354                  * If we couldn't allocate the requested number of mbufs,
4355                  * trim the number down (if it's odd) in order to avoid
4356                  * creating a partial segment chain.
4357                  */
4358                 if (bufsize > MHLEN && (needed & 0x1))
4359                         needed--;
4360
4361                 while (num < needed) {
4362                         struct mbuf *m;
4363
4364                         m = (struct mbuf *)mp_list;
4365                         mp_list = mp_list->obj_next;
4366                         ASSERT(m != NULL);
4367
4368                         MBUF_INIT(m, 1, MT_DATA);
4369 #if CONFIG_MACF_NET
4370                         if (mac_init_mbuf(m, wait) != 0) {
4371                                 m_free(m);
4372                                 break;
4373                         }
4374 #endif /* MAC_NET */
4375                         num++;
4376                         if (bufsize > MHLEN) {
4377                                 /* A second mbuf for this segment chain */
4378                                 m->m_next = (struct mbuf *)mp_list;
4379                                 mp_list = mp_list->obj_next;
4380                                 ASSERT(m->m_next != NULL);
4381
4382                                 MBUF_INIT(m->m_next, 0, MT_DATA);
4383                                 num++;
4384                         }
4385                         *np = m;
4386                         np = &m->m_nextpkt;
4387                 }
4388                 ASSERT(num != *numlist || mp_list == NULL);
4389
4390                 if (num > 0) {
4391                         mtype_stat_add(MT_DATA, num);
4392                         mtype_stat_sub(MT_FREE, num);
4393                 }
4394                 num /= nsegs;
4395
4396                 /* We've got them all; return to caller */
4397                 if (num == *numlist)
4398                         return (top);
4399
4400                 goto fail;
4401         }
4402
4403         /*
4404          * Complex cases where elements are made up of one or more composite
4405          * mbufs + cluster, depending on packetlen.  Each N-segment chain can
4406          * be illustrated as follows:
4407          *
4408          * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4409          *
4410          * Every composite mbuf + cluster element comes from the intermediate
4411          * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
4412          * the last composite element will come from the MC_MBUF_CL cache,
4413          * unless the residual data is larger than 2KB where we use the
4414          * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
4415          * data is defined as extra data beyond the first element that cannot
4416          * fit into the previous element, i.e. there is no residual data if
4417          * the chain only has 1 segment.
4418          */
4419         r_bufsize = bufsize;
4420         resid = packetlen > bufsize ? packetlen % bufsize : 0;
4421         if (resid > 0) {
4422                 /* There is residual data; figure out the cluster size */
4423                 if (wantsize == 0 && packetlen > MINCLSIZE) {
4424                         /*
4425                          * Caller didn't request that all of the segments
4426                          * in the chain use the same cluster size; use the
4427                          * smaller of the cluster sizes.
4428                          */
4429                         if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4430                                 r_bufsize = m_maxsize(MC_16KCL);
4431                         else if (resid > m_maxsize(MC_CL))
4432                                 r_bufsize = m_maxsize(MC_BIGCL);
4433                         else
4434                                 r_bufsize = m_maxsize(MC_CL);
4435                 } else {
4436                         /* Use the same cluster size as the other segments */
4437                         resid = 0;
4438                 }
4439         }
4440
4441         needed = *numlist;
4442         if (resid > 0) {
4443                 /*
4444                  * Attempt to allocate composite mbuf + cluster elements for
4445                  * the residual data in each chain; record the number of such
4446                  * elements that can be allocated so that we know how many
4447                  * segment chains we can afford to create.
4448                  */
4449                 if (r_bufsize <= m_maxsize(MC_CL))
4450                         rcp = m_cache(MC_MBUF_CL);
4451                 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4452                         rcp = m_cache(MC_MBUF_BIGCL);
4453                 else
4454                         rcp = m_cache(MC_MBUF_16KCL);
4455                 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4456
4457                 if (needed == 0)
4458                         goto fail;
4459
4460                 /* This is temporarily reduced for calculation */
4461                 ASSERT(nsegs > 1);
4462                 nsegs--;
4463         }
4464
4465         /*
4466          * Attempt to allocate the rest of the composite mbuf + cluster
4467          * elements for the number of segment chains that we need.
4468          */
4469         if (bufsize <= m_maxsize(MC_CL))
4470                 cp = m_cache(MC_MBUF_CL);
4471         else if (bufsize <= m_maxsize(MC_BIGCL))
4472                 cp = m_cache(MC_MBUF_BIGCL);
4473         else
4474                 cp = m_cache(MC_MBUF_16KCL);
4475         needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4476
4477         /* Round it down to avoid creating a partial segment chain */
4478         needed = (needed / nsegs) * nsegs;
4479         if (needed == 0)
4480                 goto fail;
4481
4482         if (resid > 0) {
4483                 /*
4484                  * We're about to construct the chain(s); take into account
4485                  * the number of segments we have created above to hold the
4486                  * residual data for each chain, as well as restore the
4487                  * original count of segments per chain.
4488                  */
4489                 ASSERT(nsegs > 0);
4490                 needed += needed / nsegs;
4491                 nsegs++;
4492         }
4493
4494         for (;;) {
4495                 struct mbuf *m;
4496                 u_int16_t flag;
4497                 struct ext_ref *rfa;
4498                 void *cl;
4499                 int pkthdr;
4500                 m_ext_free_func_t m_free_func;
4501
4502                 ++num;
4503                 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4504                         m = (struct mbuf *)mp_list;
4505                         mp_list = mp_list->obj_next;
4506                 } else {
4507                         m = (struct mbuf *)rmp_list;
4508                         rmp_list = rmp_list->obj_next;
4509                 }
4510                 m_free_func = m_get_ext_free(m);
4511                 ASSERT(m != NULL);
4512                 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4513                 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4514                     m_free_func == m_16kfree);
4515
4516                 cl = m->m_ext.ext_buf;
4517                 rfa = m_get_rfa(m);
4518
4519                 ASSERT(cl != NULL && rfa != NULL);
4520                 VERIFY(MBUF_IS_COMPOSITE(m));
4521
4522                 flag = MEXT_FLAGS(m);
4523
4524                 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4525                 if (pkthdr)
4526                         first = m;
4527                 MBUF_INIT(m, pkthdr, MT_DATA);
4528                 if (m_free_func == m_16kfree) {
4529                         MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4530                 } else if (m_free_func == m_bigfree) {
4531                         MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4532                 } else {
4533                         MBUF_CL_INIT(m, cl, rfa, 1, flag);
4534                 }
4535 #if CONFIG_MACF_NET
4536                 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4537                         --num;
4538                         m_freem(m);
4539                         break;
4540                 }
4541 #endif /* MAC_NET */
4542
4543                 *np = m;
4544                 if ((num % nsegs) == 0)
4545                         np = &first->m_nextpkt;
4546                 else
4547                         np = &m->m_next;
4548
4549                 if (num == needed)
4550                         break;
4551         }
4552
4553         if (num > 0) {
4554                 mtype_stat_add(MT_DATA, num);
4555                 mtype_stat_sub(MT_FREE, num);
4556         }
4557
4558         num /= nsegs;
4559
4560         /* We've got them all; return to caller */
4561         if (num == *numlist) {
4562                 ASSERT(mp_list == NULL && rmp_list == NULL);
4563                 return (top);
4564         }
4565
4566 fail:
4567         /* Free up what's left of the above */
4568         if (mp_list != NULL)
4569                 mcache_free_ext(cp, mp_list);
4570         if (rmp_list != NULL)
4571                 mcache_free_ext(rcp, rmp_list);
4572         if (wantall && top != NULL) {
4573                 m_freem(top);
4574                 return (NULL);
4575         }
4576         *numlist = num;
4577         return (top);
4578 }
4579
4580 /*
4581  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4582  * packets on receive ring.
4583  */
4584 __private_extern__ struct mbuf *
4585 m_getpacket_how(int wait)
4586 {
4587         unsigned int num_needed = 1;
4588
4589         return (m_getpackets_internal(&num_needed, 1, wait, 1,
4590             m_maxsize(MC_CL)));
4591 }
4592
4593 /*
4594  * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4595  * packets on receive ring.
4596  */
4597 struct mbuf *
4598 m_getpacket(void)
4599 {
4600         unsigned int num_needed = 1;
4601
4602         return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4603             m_maxsize(MC_CL)));
4604 }
4605
4606 /*
4607  * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4608  * if this can't be met, return whatever number were available.  Set up the
4609  * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4610  * are chained on the m_nextpkt field.  Any packets requested beyond this are
4611  * chained onto the last packet header's m_next field.
4612  */
4613 struct mbuf *
4614 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4615 {
4616         unsigned int n = num_needed;
4617
4618         return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4619             m_maxsize(MC_CL)));
4620 }
4621
4622 /*
4623  * Return a list of mbuf hdrs set up as packet hdrs chained together
4624  * on the m_nextpkt field
4625  */
4626 struct mbuf *
4627 m_getpackethdrs(int num_needed, int how)
4628 {
4629         struct mbuf *m;
4630         struct mbuf **np, *top;
4631
4632         top = NULL;
4633         np = &top;
4634
4635         while (num_needed--) {
4636                 m = _M_RETRYHDR(how, MT_DATA);
4637                 if (m == NULL)
4638                         break;
4639
4640                 *np = m;
4641                 np = &m->m_nextpkt;
4642         }
4643
4644         return (top);
4645 }
4646
4647 /*
4648  * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4649  * of packets freed.  Used by the drivers.
      *
      * Nothing is handed back to a cache while the list is being walked.
      * Each object to be released is pushed onto one of the local lists
      * declared below, and the statistics for the common mbuf types are
      * tallied in local counters.  After the walk, every non-empty list is
      * released with a single mcache_free_ext() call and every non-zero
      * counter is applied with a single mtype_stat_add()/mtype_stat_sub().
      *
      * Panics if an mbuf on the list already has type MT_FREE.
4650  */
4651 int
4652 m_freem_list(struct mbuf *m)
4653 {
4654         struct mbuf *nextpkt;
             /*
              * Deferred-free lists and the cache each one is returned to at
              * the end of this function:
              *   mp_list      -> MC_MBUF
              *   mcl_list     -> MC_CL
              *   mbc_list     -> MC_BIGCL
              *   m16k_list    -> MC_16KCL
              *   m_mcl_list   -> MC_MBUF_CL
              *   m_mbc_list   -> MC_MBUF_BIGCL
              *   m_m16k_list  -> MC_MBUF_16KCL
              *   ref_list     -> ref_cache
              */
4655         mcache_obj_t *mp_list = NULL;
4656         mcache_obj_t *mcl_list = NULL;
4657         mcache_obj_t *mbc_list = NULL;
4658         mcache_obj_t *m16k_list = NULL;
4659         mcache_obj_t *m_mcl_list = NULL;
4660         mcache_obj_t *m_mbc_list = NULL;
4661         mcache_obj_t *m_m16k_list = NULL;
4662         mcache_obj_t *ref_list = NULL;
4663         int pktcount = 0;
             /* Per-type tallies, applied to the global statistics at the end */
4664         int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4665
             /* Outer loop: one iteration per packet (m_nextpkt link) */
4666         while (m != NULL) {
4667                 pktcount++;
4668
                     /* Unlink this packet before walking its m_next chain */
4669                 nextpkt = m->m_nextpkt;
4670                 m->m_nextpkt = NULL;
4671
                     /* Inner loop: one iteration per mbuf of the packet */
4672                 while (m != NULL) {
4673                         struct mbuf *next = m->m_next;
4674                         mcache_obj_t *o, *rfa;
4675                         u_int32_t composite;
4676                         u_int16_t refcnt;
4677                         m_ext_free_func_t m_free_func;
4678
4679                         if (m->m_type == MT_FREE)
4680                                 panic("m_free: freeing an already freed mbuf");
4681
4682                         if (m->m_flags & M_PKTHDR) {
4683                                 /* Check for scratch area overflow */
4684                                 m_redzone_verify(m);
4685                                 /* Free the aux data and tags if there is any */
4686                                 m_tag_delete_chain(m, NULL);
4687                         }
4688
                             /* No external storage: only the mbuf is released */
4689                         if (!(m->m_flags & M_EXT)) {
4690                                 mt_free++;
4691                                 goto simple_free;
4692                         }
4693
                             /*
                              * NOTE(review): a non-zero return from
                              * m_free_paired() presumably means the paired
                              * mbuf needs no further handling here; the code
                              * moves on without counting it in mt_free or
                              * queueing anything.  Confirm against
                              * m_free_paired().
                              */
4694                         if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4695                                 m = next;
4696                                 continue;
4697                         }
4698
4699                         mt_free++;
4700
                             /*
                              * Drop this mbuf's reference on the external
                              * buffer.  What follows depends on the resulting
                              * count and on the EXTF_COMPOSITE flag.
                              */
4701                         o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4702                         refcnt = m_decref(m);
4703                         composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4704                         m_free_func = m_get_ext_free(m);
4705                         if (refcnt == MEXT_MINREF(m) && !composite) {
                                     /*
                                      * Non-composite buffer at its minimum
                                      * reference count: queue the buffer on the
                                      * list selected by its free routine, or
                                      * call any other free routine right away.
                                      * Then queue the ext_ref object and clear
                                      * the mbuf's ext fields.  The mbuf itself
                                      * is released at simple_free below.
                                      */
4706                                 if (m_free_func == NULL) {
4707                                         o->obj_next = mcl_list;
4708                                         mcl_list = o;
4709                                 } else if (m_free_func == m_bigfree) {
4710                                         o->obj_next = mbc_list;
4711                                         mbc_list = o;
4712                                 } else if (m_free_func == m_16kfree) {
4713                                         o->obj_next = m16k_list;
4714                                         m16k_list = o;
4715                                 } else {
4716                                         (*(m_free_func))((caddr_t)o,
4717                                             m->m_ext.ext_size,
4718                                             m_get_ext_arg(m));
4719                                 }
4720                                 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4721                                 rfa->obj_next = ref_list;
4722                                 ref_list = rfa;
4723                                 m_set_ext(m, NULL, NULL, NULL);
4724                         } else if (refcnt == MEXT_MINREF(m) && composite) {
                                     /*
                                      * Composite mbuf at its minimum reference
                                      * count: the mbuf is reset to the
                                      * MT_FREE / M_EXT state and queued as a
                                      * whole on the composite list selected by
                                      * its free routine.  The ext fields are
                                      * left in place.
                                      */
4725                                 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4726                                 VERIFY(m->m_type != MT_FREE);
4727                                 /*
4728                                  * Amortize the costs of atomic operations
4729                                  * by doing them at the end, if possible.
4730                                  */
4731                                 if (m->m_type == MT_DATA)
4732                                         mt_data++;
4733                                 else if (m->m_type == MT_HEADER)
4734                                         mt_header++;
4735                                 else if (m->m_type == MT_SONAME)
4736                                         mt_soname++;
4737                                 else if (m->m_type == MT_TAG)
4738                                         mt_tag++;
4739                                 else
4740                                         mtype_stat_dec(m->m_type);
4741
4742                                 m->m_type = MT_FREE;
4743                                 m->m_flags = M_EXT;
4744                                 m->m_len = 0;
4745                                 m->m_next = m->m_nextpkt = NULL;
4746
4747                                 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4748
4749                                 /* "Free" into the intermediate cache */
4750                                 o = (mcache_obj_t *)m;
4751                                 if (m_free_func == NULL) {
4752                                         o->obj_next = m_mcl_list;
4753                                         m_mcl_list = o;
4754                                 } else if (m_free_func == m_bigfree) {
4755                                         o->obj_next = m_mbc_list;
4756                                         m_mbc_list = o;
4757                                 } else {
4758                                         VERIFY(m_free_func == m_16kfree);
4759                                         o->obj_next = m_m16k_list;
4760                                         m_m16k_list = o;
4761                                 }
4762                                 m = next;
4763                                 continue;
4764                         }
                             /*
                              * Reached by the goto for mbufs without M_EXT,
                              * by fall-through after the non-composite case
                              * above, and when refcnt != MEXT_MINREF(m), in
                              * which case the external buffer is not queued
                              * and only the mbuf is released.
                              */
4765 simple_free:
4766                         /*
4767                          * Amortize the costs of atomic operations
4768                          * by doing them at the end, if possible.
4769                          */
4770                         if (m->m_type == MT_DATA)
4771                                 mt_data++;
4772                         else if (m->m_type == MT_HEADER)
4773                                 mt_header++;
4774                         else if (m->m_type == MT_SONAME)
4775                                 mt_soname++;
4776                         else if (m->m_type == MT_TAG)
4777                                 mt_tag++;
4778                         else if (m->m_type != MT_FREE)
4779                                 mtype_stat_dec(m->m_type);
4780
                             /* Reset the mbuf and queue it for MC_MBUF */
4781                         m->m_type = MT_FREE;
4782                         m->m_flags = m->m_len = 0;
4783                         m->m_next = m->m_nextpkt = NULL;
4784
4785                         ((mcache_obj_t *)m)->obj_next = mp_list;
4786                         mp_list = (mcache_obj_t *)m;
4787
4788                         m = next;
4789                 }
4790
4791                 m = nextpkt;
4792         }
4793
             /* Apply the statistics tallied during the walk */
4794         if (mt_free > 0)
4795                 mtype_stat_add(MT_FREE, mt_free);
4796         if (mt_data > 0)
4797                 mtype_stat_sub(MT_DATA, mt_data);
4798         if (mt_header > 0)
4799                 mtype_stat_sub(MT_HEADER, mt_header);
4800         if (mt_soname > 0)
4801                 mtype_stat_sub(MT_SONAME, mt_soname);
4802         if (mt_tag > 0)
4803                 mtype_stat_sub(MT_TAG, mt_tag);
4804
             /* Return every non-empty list to its cache in one call each */
4805         if (mp_list != NULL)
4806                 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4807         if (mcl_list != NULL)
4808                 mcache_free_ext(m_cache(MC_CL), mcl_list);
4809         if (mbc_list != NULL)
4810                 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4811         if (m16k_list != NULL)
4812                 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4813         if (m_mcl_list != NULL)
4814                 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4815         if (m_mbc_list != NULL)
4816                 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4817         if (m_m16k_list != NULL)
4818                 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4819         if (ref_list != NULL)
4820                 mcache_free_ext(ref_cache, ref_list);
4821
4822         return (pktcount);
4823 }
4824
4825 void
4826 m_freem(struct mbuf *m)
4827 {
4828         while (m != NULL)
4829                 m = m_free(m);
4830 }
4831
4832 /*
4833  * Mbuffer utility routines.
4834  */
4835 /*
4836  * Set the m_data pointer of a newly allocated mbuf to place an object of the
4837  * specified size at the end of the mbuf, longword aligned.
4838  *
4839  * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
4840  * separate macros, each asserting that it was called at the proper moment.
4841  * This required callers to themselves test the storage type and call the
4842  * right one.  Rather than require callers to be aware of those layout
4843  * decisions, we centralize here.
4844  */
4845 void
4846 m_align(struct mbuf *m, int len)
4847 {
4848         int adjust = 0;
4849
4850         /* At this point data must point to start */
4851         VERIFY(m->m_data == M_START(m));
4852         VERIFY(len >= 0);
4853         VERIFY(len <= M_SIZE(m));
4854         adjust = M_SIZE(m) - len;
4855         m->m_data += adjust &~ (sizeof(long) - 1);
4856 }
4857
4858 /*
4859  * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4860  * copy junk along.  Does not adjust packet header length.
4861  */
4862 struct mbuf *
4863 m_prepend(struct mbuf *m, int len, int how)
4864 {
4865         struct mbuf *mn;
4866
4867         _MGET(mn, how, m->m_type);
4868         if (mn == NULL) {
4869                 m_freem(m);
4870                 return (NULL);
4871         }
4872         if (m->m_flags & M_PKTHDR) {
4873                 M_COPY_PKTHDR(mn, m);
4874                 m->m_flags &= ~M_PKTHDR;
4875         }
4876         mn->m_next = m;
4877         m = mn;
4878         if (m->m_flags & M_PKTHDR) {
4879                 VERIFY(len <= MHLEN);
4880                 MH_ALIGN(m, len);
4881         } else {
4882                 VERIFY(len <= MLEN);
4883                 M_ALIGN(m, len);
4884         }
4885         m->m_len = len;
4886         return (m);
4887 }
4888
4889 /*
4890  * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4891  * chain, copy junk along, and adjust length.
4892  */
4893 struct mbuf *
4894 m_prepend_2(struct mbuf *m, int len, int how, int align)
4895 {
4896         if (M_LEADINGSPACE(m) >= len &&
4897             (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4898                 m->m_data -= len;
4899                 m->m_len += len;
4900         } else {
4901                 m = m_prepend(m, len, how);
4902         }
4903         if ((m) && (m->m_flags & M_PKTHDR))
4904                 m->m_pkthdr.len += len;
4905         return (m);
4906 }
4907
4908 /*
4909  * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4910  * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4911  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
      *
      * "mode" selects how the packet header reaches the copy:
      * M_COPYM_MOVE_HDR and M_COPYM_MUST_MOVE_HDR use M_COPY_PKTHDR(), while
      * M_COPYM_COPY_HDR and M_COPYM_MUST_COPY_HDR use m_dup_pkthdr().  A header
      * is transferred when off0 is 0 and the first mbuf has M_PKTHDR, or
      * whenever one of the two MUST modes is given (in which case the first
      * mbuf is required to have M_PKTHDR).
      *
      * Data held in external storage (M_EXT) is not duplicated: the new mbuf
      * gets a copy of the source's m_ext and m_incref() is called on the
      * source.  All other data is copied with bcopy().
      *
      * Returns the head of the new chain.  Returns NULL, with MCFail
      * incremented, when an allocation or m_dup_pkthdr() fails (anything
      * built so far is freed) or when nothing was copied.  Panics on a
      * negative off0 or len, when off0 lies beyond the end of the chain, and
      * when the chain ends before "len" bytes were copied unless len is
      * M_COPYALL.
4912  */
      /* Count of copy requests in this file that ended without a copy */
4913 int MCFail;
4914
4915 struct mbuf *
4916 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4917 {
4918         struct mbuf *n, *mhdr = NULL, **np;
4919         int off = off0;
4920         struct mbuf *top;
4921         int copyhdr = 0;
4922
4923         if (off < 0 || len < 0)
4924                 panic("m_copym: invalid offset %d or len %d", off, len);
4925
             /* The MUST modes are only valid on a chain with a packet header */
4926         VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4927             mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4928
             /* Remember the header mbuf before "m" is advanced below */
4929         if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4930             mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4931                 mhdr = m;
4932                 copyhdr = 1;
4933         }
4934
             /* Skip whole mbufs until "off" falls inside the current one */
4935         while (off >= m->m_len) {
4936                 if (m->m_next == NULL)
4937                         panic("m_copym: invalid mbuf chain");
4938                 off -= m->m_len;
4939                 m = m->m_next;
4940         }
4941         np = &top;
4942         top = NULL;
4943
             /* One new mbuf is produced per iteration */
4944         while (len > 0) {
4945                 if (m == NULL) {
4946                         if (len != M_COPYALL)
4947                                 panic("m_copym: len != M_COPYALL");
4948                         break;
4949                 }
4950
4951                 if (copyhdr)
4952                         n = _M_RETRYHDR(wait, m->m_type);
4953                 else
4954                         n = _M_RETRY(wait, m->m_type);
                     /* Linked in before the NULL check; harmless when n is NULL */
4955                 *np = n;
4956
4957                 if (n == NULL)
4958                         goto nospace;
4959
4960                 if (copyhdr != 0) {
4961                         if ((mode == M_COPYM_MOVE_HDR) ||
4962                             (mode == M_COPYM_MUST_MOVE_HDR)) {
4963                                 M_COPY_PKTHDR(n, mhdr);
4964                         } else if ((mode == M_COPYM_COPY_HDR) ||
4965                             (mode == M_COPYM_MUST_COPY_HDR)) {
4966                                 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4967                                         goto nospace;
4968                         }
                             /* Make the header length describe the copy */
4969                         if (len == M_COPYALL)
4970                                 n->m_pkthdr.len -= off0;
4971                         else
4972                                 n->m_pkthdr.len = len;
4973                         copyhdr = 0;
4974                         /*
4975                          * There is no data to copy from the packet header
4976                          * mbuf if it is empty or it lies entirely before the
4977                          * starting offset; in that case "n" carries only the
                              * header and the data goes into the next new mbuf.
                              */
4978                         if (mhdr != m) {
4979                                 np = &n->m_next;
4980                                 continue;
4981                         }
4982                 }
4983                 n->m_len = MIN(len, (m->m_len - off));
4984                 if (m->m_flags & M_EXT) {
                             /* Share the external buffer instead of copying it */
4985                         n->m_ext = m->m_ext;
4986                         m_incref(m);
4987                         n->m_data = m->m_data + off;
4988                         n->m_flags |= M_EXT;
4989                 } else {
4990                         /*
4991                          * Limit to the capacity of the destination
                              *
                              * NOTE(review): if n->m_len is actually reduced
                              * here, the code below still moves on to
                              * m->m_next, so the rest of this source mbuf's
                              * data would not be copied.  Presumably a
                              * non-M_EXT source never holds more than the
                              * destination can; confirm.
4992                          */
4993                         if (n->m_flags & M_PKTHDR)
4994                                 n->m_len = MIN(n->m_len, MHLEN);
4995                         else
4996                                 n->m_len = MIN(n->m_len, MLEN);
4997
4998                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4999                                 panic("%s n %p copy overflow",
5000                                         __func__, n);
5001
5002                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5003                             (unsigned)n->m_len);
5004                 }
                     /* With M_COPYALL the loop ends when the chain runs out */
5005                 if (len != M_COPYALL)
5006                         len -= n->m_len;
5007                 off = 0;
5008                 m = m->m_next;
5009                 np = &n->m_next;
5010         }
5011
5012         if (top == NULL)
5013                 MCFail++;
5014
5015         return (top);
5016 nospace:
5017
             /* Release whatever part of the copy was already built */
5018         m_freem(top);
5019         MCFail++;
5020         return (NULL);
5021 }
5022
5023
5024 struct mbuf *
5025 m_copym(struct mbuf *m, int off0, int len, int wait)
5026 {
5027         return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
5028 }
5029
5030 /*
5031  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
5032  * within this routine also, the last mbuf and offset accessed are passed
5033  * out and can be passed back in to avoid having to rescan the entire mbuf
5034  * list (normally hung off of the socket)
      *
      * If m_lastm is non-NULL and *m_lastm is non-NULL, copying starts at mbuf
      * *m_lastm, offset *m_off, instead of walking off0 bytes from m0.  When
      * the copy completes and both m_lastm and m_off are non-NULL, they are
      * updated to the position right after the last byte copied.
      *
      * All mbufs for the copy are obtained up front with one
      * mcache_alloc_ext() call on the MC_MBUF cache.  The first one becomes an
      * MT_HEADER mbuf at the head of the result and receives no data; the
      * remaining ones mirror the source mbufs.  Data in external storage
      * (M_EXT) is shared via m_incref() rather than copied.
      *
      * Returns the new chain, or NULL with MCFail incremented when the
      * allocation or m_dup_pkthdr() fails.
      *
      * NOTE(review): unlike m_copym_mode(), off0/len0 are not checked for
      * negative values, the chain walks below do not guard against running
      * off the end of the chain (other than through ASSERT), and the MUST
      * modes do not force a header copy here.
5035  */
5036 struct mbuf *
5037 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
5038     struct mbuf **m_lastm, int *m_off, uint32_t mode)
5039 {
5040         struct mbuf *m = m0, *n, **np = NULL;
5041         int off = off0, len = len0;
5042         struct mbuf *top = NULL;
5043         int mcflags = MSLEEPF(wait);
5044         int copyhdr = 0;
5045         int type = 0;
5046         mcache_obj_t *list = NULL;
5047         int needed = 0;
5048
5049         if (off == 0 && (m->m_flags & M_PKTHDR))
5050                 copyhdr = 1;
5051
             /*
              * Resume from the caller's saved position if there is one.
              * NOTE(review): m_off is dereferenced here without a NULL check;
              * callers passing m_lastm presumably always pass m_off as well.
              */
5052         if (m_lastm != NULL && *m_lastm != NULL) {
5053                 m = *m_lastm;
5054                 off = *m_off;
5055         } else {
5056                 while (off >= m->m_len) {
5057                         off -= m->m_len;
5058                         m = m->m_next;
5059                 }
5060         }
5061
             /*
              * First pass: count how many source mbufs the requested range
              * spans.  Only the first one is entered at offset "off".
              */
5062         n = m;
5063         while (len > 0) {
5064                 needed++;
5065                 ASSERT(n != NULL);
5066                 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
5067                 n = n->m_next;
5068         }
             /* One more for the leading header mbuf */
5069         needed++;
5070         len = len0;
5071
5072         /*
5073          * If the caller doesn't want to be put to sleep, mark it with
5074          * MCR_TRYHARD so that we may reclaim buffers from other places
5075          * before giving up.
5076          */
5077         if (mcflags & MCR_NOSLEEP)
5078                 mcflags |= MCR_TRYHARD;
5079
             /* All or nothing: a partial allocation is treated as failure */
5080         if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
5081             mcflags) != needed)
5082                 goto nospace;
5083
             /* From here on "needed" counts the data mbufs placed in the copy */
5084         needed = 0;
5085         while (len > 0) {
                     /* Take the next preallocated mbuf off the list */
5086                 n = (struct mbuf *)list;
5087                 list = list->obj_next;
5088                 ASSERT(n != NULL && m != NULL);
5089
5090                 type = (top == NULL) ? MT_HEADER : m->m_type;
5091                 MBUF_INIT(n, (top == NULL), type);
5092 #if CONFIG_MACF_NET
5093                 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
5094                         mtype_stat_inc(MT_HEADER);
5095                         mtype_stat_dec(MT_FREE);
5096                         m_free(n);
5097                         goto nospace;
5098                 }
5099 #endif /* MAC_NET */
5100
5101                 if (top == NULL) {
                             /*
                              * The first mbuf only heads the chain; no source
                              * data is consumed in this iteration.
                              */
5102                         top = n;
5103                         np = &top->m_next;
5104                         continue;
5105                 } else {
5106                         needed++;
5107                         *np = n;
5108                 }
5109
5110                 if (copyhdr) {
5111                         if ((mode == M_COPYM_MOVE_HDR) ||
5112                             (mode == M_COPYM_MUST_MOVE_HDR)) {
5113                                 M_COPY_PKTHDR(n, m);
5114                         } else if ((mode == M_COPYM_COPY_HDR) ||
5115                             (mode == M_COPYM_MUST_COPY_HDR)) {
5116                                 if (m_dup_pkthdr(n, m, wait) == 0)
5117                                         goto nospace;
5118                         }
5119                         n->m_pkthdr.len = len;
5120                         copyhdr = 0;
5121                 }
5122                 n->m_len = MIN(len, (m->m_len - off));
5123
5124                 if (m->m_flags & M_EXT) {
                             /* Share the external buffer instead of copying it */
5125                         n->m_ext = m->m_ext;
5126                         m_incref(m);
5127                         n->m_data = m->m_data + off;
5128                         n->m_flags |= M_EXT;
5129                 } else {
5130                         if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
5131                                 panic("%s n %p copy overflow",
5132                                         __func__, n);
5133
5134                         bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5135                             (unsigned)n->m_len);
5136                 }
5137                 len -= n->m_len;
5138
5139                 if (len == 0) {
                             /*
                              * Done.  Report where the copy stopped: the next
                              * mbuf if this one was used up, otherwise this
                              * mbuf and the offset just past the copied bytes.
                              */
5140                         if (m_lastm != NULL && m_off != NULL) {
5141                                 if ((off + n->m_len) == m->m_len) {
5142                                         *m_lastm = m->m_next;
5143                                         *m_off  = 0;
5144                                 } else {
5145                                         *m_lastm = m;
5146                                         *m_off  = off + n->m_len;
5147                                 }
5148                         }
5149                         break;
5150                 }
5151                 off = 0;
5152                 m = m->m_next;
5153                 np = &n->m_next;
5154         }
5155
             /*
              * Account for the header mbuf and the data mbufs in one go.
              * "type" holds the type given to the last mbuf initialized above,
              * and all data mbufs are counted under it.
              */
5156         mtype_stat_inc(MT_HEADER);
5157         mtype_stat_add(type, needed);
5158         mtype_stat_sub(MT_FREE, needed + 1);
5159
5160         ASSERT(list == NULL);
5161         return (top);
5162
5163 nospace:
             /* Return unused preallocated mbufs and free any partial copy */
5164         if (list != NULL)
5165                 mcache_free_ext(m_cache(MC_MBUF), list);
5166         if (top != NULL)
5167                 m_freem(top);
5168         MCFail++;
5169         return (NULL);
5170 }
5171
5172 /*
5173  * Copy data from an mbuf chain starting "off" bytes from the beginning,
5174  * continuing for "len" bytes, into the indicated buffer.
5175  */
5176 void
5177 m_copydata(struct mbuf *m, int off, int len, void *vp)
5178 {
5179         int off0 = off, len0 = len;
5180         struct mbuf *m0 = m;
5181         unsigned count;
5182         char *cp = vp;
5183
5184         if (__improbable(off < 0 || len < 0)) {
5185                 panic("%s: invalid offset %d or len %d", __func__, off, len);
5186                 /* NOTREACHED */
5187         }
5188
5189         while (off > 0) {
5190                 if (__improbable(m == NULL)) {
5191                         panic("%s: invalid mbuf chain %p [off %d, len %d]",
5192                             __func__, m0, off0, len0);
5193                         /* NOTREACHED */
5194                 }
5195                 if (off < m->m_len)
5196                         break;
5197                 off -= m->m_len;
5198                 m = m->m_next;
5199         }
5200         while (len > 0) {
5201                 if (__improbable(m == NULL)) {
5202                         panic("%s: invalid mbuf chain %p [off %d, len %d]",
5203                             __func__, m0, off0, len0);
5204                         /* NOTREACHED */
5205                 }
5206                 count = MIN(m->m_len - off, len);
5207                 bcopy(MTOD(m, caddr_t) + off, cp, count);
5208                 len -= count;
5209                 cp += count;
5210                 off = 0;
5211                 m = m->m_next;
5212         }
5213 }
5214
5215 /*
5216  * Concatenate mbuf chain n to m.  Both chains must be of the same type
5217  * (e.g. MT_DATA).  Any m_pkthdr is not updated.
5218  */
5219 void
5220 m_cat(struct mbuf *m, struct mbuf *n)
5221 {
5222         while (m->m_next)
5223                 m = m->m_next;
5224         while (n) {
5225                 if ((m->m_flags & M_EXT) ||
5226                     m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5227                         /* just join the two chains */
5228                         m->m_next = n;
5229                         return;
5230                 }
5231                 /* splat the data from one into the other */
5232                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5233                     (u_int)n->m_len);
5234                 m->m_len += n->m_len;
5235                 n = m_free(n);
5236         }
5237 }
5238
5239 void
5240 m_adj(struct mbuf *mp, int req_len)
5241 {
5242         int len = req_len;
5243         struct mbuf *m;
5244         int count;
5245
5246         if ((m = mp) == NULL)
5247                 return;
5248         if (len >= 0) {
5249                 /*
5250                  * Trim from head.
5251                  */
5252                 while (m != NULL && len > 0) {
5253                         if (m->m_len <= len) {
5254                                 len -= m->m_len;
5255                                 m->m_len = 0;
5256                                 m = m->m_next;
5257                         } else {
5258                                 m->m_len -= len;
5259                                 m->m_data += len;
5260                                 len = 0;
5261                         }
5262                 }
5263                 m = mp;
5264                 if (m->m_flags & M_PKTHDR)
5265                         m->m_pkthdr.len -= (req_len - len);
5266         } else {
5267                 /*
5268                  * Trim from tail.  Scan the mbuf chain,
5269                  * calculating its length and finding the last mbuf.
5270                  * If the adjustment only affects this mbuf, then just
5271                  * adjust and return.  Otherwise, rescan and truncate
5272                  * after the remaining size.
5273                  */
5274                 len = -len;
5275                 count = 0;
5276                 for (;;) {
5277                         count += m->m_len;
5278                         if (m->m_next == (struct mbuf *)0)
5279                                 break;
5280                         m = m->m_next;
5281                 }
5282                 if (m->m_len >= len) {
5283                         m->m_len -= len;
5284                         m = mp;
5285                         if (m->m_flags & M_PKTHDR)
5286                                 m->m_pkthdr.len -= len;
5287                         return;
5288                 }
5289                 count -= len;
5290                 if (count < 0)
5291                         count = 0;
5292                 /*
5293                  * Correct length for chain is "count".
5294                  * Find the mbuf with last data, adjust its length,
5295                  * and toss data from remaining mbufs on chain.
5296                  */
5297                 m = mp;
5298                 if (m->m_flags & M_PKTHDR)
5299                         m->m_pkthdr.len = count;
5300                 for (; m; m = m->m_next) {
5301                         if (m->m_len >= count) {
5302                                 m->m_len = count;
5303                                 break;
5304                         }
5305                         count -= m->m_len;
5306                 }
5307                 while ((m = m->m_next))
5308                         m->m_len = 0;
5309         }
5310 }
5311
5312 /*
5313  * Rearange an mbuf chain so that len bytes are contiguous
5314  * and in the data area of an mbuf (so that mtod and dtom
5315  * will work for a structure of size len).  Returns the resulting
5316  * mbuf chain on success, frees it and returns null on failure.
5317  * If there is room, it will add up to max_protohdr-len extra bytes to the
5318  * contiguous region in an attempt to avoid being called next time.
5319  */
5320 int MPFail;
5321
5322 struct mbuf *
5323 m_pullup(struct mbuf *n, int len)
5324 {
5325         struct mbuf *m;
5326         int count;
5327         int space;
5328
5329         /* check invalid arguments */
5330         if (n == NULL) {
5331                  panic("%s: n == NULL", __func__);
5332         }
5333         if (len < 0) {
5334                 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
5335                     __func__, len);
5336                 goto bad;
5337         }
5338         if (len > MLEN) {
5339                 os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
5340                     __func__, len);
5341                 goto bad;
5342         }
5343         if ((n->m_flags & M_EXT) == 0 &&
5344             n->m_data >= &n->m_dat[MLEN]) {
5345                 os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
5346                     __func__);
5347                 goto bad;
5348         }
5349
5350         /*
5351          * If first mbuf has no cluster, and has room for len bytes
5352          * without shifting current data, pullup into it,
5353          * otherwise allocate a new mbuf to prepend to the chain.
5354          */
5355         if ((n->m_flags & M_EXT) == 0 &&
5356             len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
5357                 if (n->m_len >= len)
5358                         return (n);
5359                 m = n;
5360                 n = n->m_next;
5361                 len -= m->m_len;
5362         } else {
5363                 if (len > MHLEN)
5364                         goto bad;
5365                 _MGET(m, M_DONTWAIT, n->m_type);
5366                 if (m == 0)
5367                         goto bad;
5368                 m->m_len = 0;
5369                 if (n->m_flags & M_PKTHDR) {
5370                         M_COPY_PKTHDR(m, n);
5371                         n->m_flags &= ~M_PKTHDR;
5372                 }
5373         }
5374         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5375         do {
5376                 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5377                 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5378                     (unsigned)count);
5379                 len -= count;
5380                 m->m_len += count;
5381                 n->m_len -= count;
5382                 space -= count;
5383                 if (n->m_len != 0)
5384                         n->m_data += count;
5385                 else
5386                         n = m_free(n);
5387         } while (len > 0 && n != NULL);
5388         if (len > 0) {
5389                 (void) m_free(m);
5390                 goto bad;
5391         }
5392         m->m_next = n;
5393         return (m);
5394 bad:
5395         m_freem(n);
5396         MPFail++;
5397         return (0);
5398 }
5399
5400 /*
5401  * Like m_pullup(), except a new mbuf is always allocated, and we allow
5402  * the amount of empty space before the data in the new mbuf to be specified
5403  * (in the event that the caller expects to prepend later).
5404  */
5405 __private_extern__ int MSFail = 0;
5406
5407 __private_extern__ struct mbuf *
5408 m_copyup(struct mbuf *n, int len, int dstoff)
5409 {
5410         struct mbuf *m;
5411         int count, space;
5412
5413         if (len > (MHLEN - dstoff))
5414                 goto bad;
5415         MGET(m, M_DONTWAIT, n->m_type);
5416         if (m == NULL)
5417                 goto bad;
5418         m->m_len = 0;
5419         if (n->m_flags & M_PKTHDR) {
5420                 m_copy_pkthdr(m, n);
5421                 n->m_flags &= ~M_PKTHDR;
5422         }
5423         m->m_data += dstoff;
5424         space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5425         do {
5426                 count = min(min(max(len, max_protohdr), space), n->m_len);
5427                 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5428                     (unsigned)count);
5429                 len -= count;
5430                 m->m_len += count;
5431                 n->m_len -= count;
5432                 space -= count;
5433                 if (n->m_len)
5434                         n->m_data += count;
5435                 else
5436                         n = m_free(n);
5437         } while (len > 0 && n);
5438         if (len > 0) {
5439                 (void) m_free(m);
5440                 goto bad;
5441         }
5442         m->m_next = n;
5443         return (m);
5444 bad:
5445         m_freem(n);
5446         MSFail++;
5447         return (NULL);
5448 }
5449
/*
 * Split an mbuf chain in two after the first len0 bytes and return the
 * tail.  Returns NULL on failure, in which case an attempt is made to
 * leave the chain in its original state.
 *
 * This is m_split0() with copyhdr set, so that when m0 carries a packet
 * header the returned tail gets a packet header of its own.
 */
struct mbuf *
m_split(struct mbuf *m0, int len0, int wait)
{
	struct mbuf *tail;

	tail = m_split0(m0, len0, wait, 1);
	return (tail);
}
5460
5461 static struct mbuf *
5462 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5463 {
5464         struct mbuf *m, *n;
5465         unsigned len = len0, remain;
5466
5467         /*
5468          * First iterate to the mbuf which contains the first byte of
5469          * data at offset len0
5470          */
5471         for (m = m0; m && len > m->m_len; m = m->m_next)
5472                 len -= m->m_len;
5473         if (m == NULL)
5474                 return (NULL);
5475         /*
5476          * len effectively is now the offset in the current
5477          * mbuf where we have to perform split.
5478          *
5479          * remain becomes the tail length.
5480          * Note that len can also be == m->m_len
5481          */
5482         remain = m->m_len - len;
5483
5484         /*
5485          * If current mbuf len contains the entire remaining offset len,
5486          * just make the second mbuf chain pointing to next mbuf onwards
5487          * and return after making necessary adjustments
5488          */
5489         if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
5490                 _MGETHDR(n, wait, m0->m_type);
5491                 if (n == NULL)
5492                         return (NULL);
5493                 n->m_next = m->m_next;
5494                 m->m_next = NULL;
5495                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5496                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5497                 m0->m_pkthdr.len = len0;
5498                 return (n);
5499         } if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5500                 _MGETHDR(n, wait, m0->m_type);
5501                 if (n == NULL)
5502                         return (NULL);
5503                 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5504                 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5505                 m0->m_pkthdr.len = len0;
5506
5507                 /*
5508                  * If current points to external storage
5509                  * then it can be shared by making last mbuf
5510                  * of head chain and first mbuf of current chain
5511                  * pointing to different data offsets
5512                  */
5513                 if (m->m_flags & M_EXT)
5514                         goto extpacket;
5515                 if (remain > MHLEN) {
5516                         /* m can't be the lead packet */
5517                         MH_ALIGN(n, 0);
5518                         n->m_next = m_split(m, len, wait);
5519                         if (n->m_next == NULL) {
5520                                 (void) m_free(n);
5521                                 return (NULL);
5522                         } else
5523                                 return (n);
5524                 } else
5525                         MH_ALIGN(n, remain);
5526         } else if (remain == 0) {
5527                 n = m->m_next;
5528                 m->m_next = NULL;
5529                 return (n);
5530         } else {
5531                 _MGET(n, wait, m->m_type);
5532                 if (n == NULL)
5533                         return (NULL);
5534
5535                 if ((m->m_flags & M_EXT) == 0) {
5536                         VERIFY(remain <= MLEN);
5537                         M_ALIGN(n, remain);
5538                 }
5539         }
5540 extpacket:
5541         if (m->m_flags & M_EXT) {
5542                 n->m_flags |= M_EXT;
5543                 n->m_ext = m->m_ext;
5544                 m_incref(m);
5545                 n->m_data = m->m_data + len;
5546         } else {
5547                 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5548         }
5549         n->m_len = remain;
5550         m->m_len = len;
5551         n->m_next = m->m_next;
5552         m->m_next = NULL;
5553         return (n);
5554 }
5555
5556 /*
5557  * Routine to copy from device local memory into mbufs.
5558  */
5559 struct mbuf *
5560 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5561     void (*copy)(const void *, void *, size_t))
5562 {
5563         struct mbuf *m;
5564         struct mbuf *top = NULL, **mp = &top;
5565         int off = off0, len;
5566         char *cp;
5567         char *epkt;
5568
5569         cp = buf;
5570         epkt = cp + totlen;
5571         if (off) {
5572                 /*
5573                  * If 'off' is non-zero, packet is trailer-encapsulated,
5574                  * so we have to skip the type and length fields.
5575                  */
5576                 cp += off + 2 * sizeof (u_int16_t);
5577                 totlen -= 2 * sizeof (u_int16_t);
5578         }
5579         _MGETHDR(m, M_DONTWAIT, MT_DATA);
5580         if (m == NULL)
5581                 return (NULL);
5582         m->m_pkthdr.rcvif = ifp;
5583         m->m_pkthdr.len = totlen;
5584         m->m_len = MHLEN;
5585
5586         while (totlen > 0) {
5587                 if (top != NULL) {
5588                         _MGET(m, M_DONTWAIT, MT_DATA);
5589                         if (m == NULL) {
5590                                 m_freem(top);
5591                                 return (NULL);
5592                         }
5593                         m->m_len = MLEN;
5594                 }
5595                 len = MIN(totlen, epkt - cp);
5596                 if (len >= MINCLSIZE) {
5597                         MCLGET(m, M_DONTWAIT);
5598                         if (m->m_flags & M_EXT) {
5599                                 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5600                         } else {
5601                                 /* give up when it's out of cluster mbufs */
5602                                 if (top != NULL)
5603                                         m_freem(top);
5604                                 m_freem(m);
5605                                 return (NULL);
5606                         }
5607                 } else {
5608                         /*
5609                          * Place initial small packet/header at end of mbuf.
5610                          */
5611                         if (len < m->m_len) {
5612                                 if (top == NULL &&
5613                                     len + max_linkhdr <= m->m_len)
5614                                         m->m_data += max_linkhdr;
5615                                 m->m_len = len;
5616                         } else {
5617                                 len = m->m_len;
5618                         }
5619                 }
5620                 if (copy)
5621                         copy(cp, MTOD(m, caddr_t), (unsigned)len);
5622                 else
5623                         bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5624                 cp += len;
5625                 *mp = m;
5626                 mp = &m->m_next;
5627                 totlen -= len;
5628                 if (cp == epkt)
5629                         cp = buf;
5630         }
5631         return (top);
5632 }
5633
5634 #ifndef MBUF_GROWTH_NORMAL_THRESH
5635 #define MBUF_GROWTH_NORMAL_THRESH 25
5636 #endif
5637
5638 /*
5639  * Cluster freelist allocation check.
5640  */
5641 static int
5642 m_howmany(int num, size_t bufsize)
5643 {
5644         int i = 0, j = 0;
5645         u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5646         u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5647         u_int32_t sumclusters, freeclusters;
5648         u_int32_t percent_pool, percent_kmem;
5649         u_int32_t mb_growth, mb_growth_thresh;
5650
5651         VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5652             bufsize == m_maxsize(MC_16KCL));
5653
5654         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5655
5656         /* Numbers in 2K cluster units */
5657         m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5658         m_clusters = m_total(MC_CL);
5659         m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5660         m_16kclusters = m_total(MC_16KCL);
5661         sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5662
5663         m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5664         m_clfree = m_infree(MC_CL);
5665         m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5666         m_16kclfree = m_infree(MC_16KCL);
5667         freeclusters = m_mbfree + m_clfree + m_bigclfree;
5668
5669         /* Bail if we've maxed out the mbuf memory map */
5670         if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5671             (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5672             (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5673                 mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
5674                     sumclusters, nclusters,
5675                     (m_16kclusters << NCLPJCLSHIFT), njcl);
5676                 return (0);
5677         }
5678
5679         if (bufsize == m_maxsize(MC_BIGCL)) {
5680                 /* Under minimum */
5681                 if (m_bigclusters < m_minlimit(MC_BIGCL))
5682                         return (m_minlimit(MC_BIGCL) - m_bigclusters);
5683
5684                 percent_pool =
5685                     ((sumclusters - freeclusters) * 100) / sumclusters;
5686                 percent_kmem = (sumclusters * 100) / nclusters;
5687
5688                 /*
5689                  * If a light/normal user, grow conservatively (75%)
5690                  * If a heavy user, grow aggressively (50%)
5691                  */
5692                 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5693                         mb_growth = MB_GROWTH_NORMAL;
5694                 else
5695                         mb_growth = MB_GROWTH_AGGRESSIVE;
5696
5697                 if (percent_kmem < 5) {
5698                         /* For initial allocations */
5699                         i = num;
5700                 } else {
5701                         /* Return if >= MBIGCL_LOWAT clusters available */
5702                         if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5703                             m_total(MC_BIGCL) >=
5704                             MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5705                                 return (0);
5706
5707                         /* Ensure at least num clusters are accessible */
5708                         if (num >= m_infree(MC_BIGCL))
5709                                 i = num - m_infree(MC_BIGCL);
5710                         if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5711                                 j = num - (m_total(MC_BIGCL) -
5712                                     m_minlimit(MC_BIGCL));
5713
5714                         i = MAX(i, j);
5715
5716                         /*
5717                          * Grow pool if percent_pool > 75 (normal growth)
5718                          * or percent_pool > 50 (aggressive growth).
5719                          */
5720                         mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5721                         if (percent_pool > mb_growth_thresh)
5722                                 j = ((sumclusters + num) >> mb_growth) -
5723                                     freeclusters;
5724                         i = MAX(i, j);
5725                 }
5726
5727                 /* Check to ensure we didn't go over limits */
5728                 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5729                         i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5730                 if ((i << 1) + sumclusters >= nclusters)
5731                         i = (nclusters - sumclusters) >> 1;
5732                 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5733                 VERIFY(sumclusters + (i << 1) <= nclusters);
5734
5735         } else { /* 16K CL */
5736                 VERIFY(njcl > 0);
5737                 /* Ensure at least num clusters are available */
5738                 if (num >= m_16kclfree)
5739                         i = num - m_16kclfree;
5740
5741                 /* Always grow 16KCL pool aggressively */
5742                 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5743                         j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5744                 i = MAX(i, j);
5745
5746                 /* Check to ensure we don't go over limit */
5747                 if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL))
5748                         i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5749         }
5750         return (i);
5751 }
5752 /*
5753  * Return the number of bytes in the mbuf chain, m.
5754  */
5755 unsigned int
5756 m_length(struct mbuf *m)
5757 {
5758         struct mbuf *m0;
5759         unsigned int pktlen;
5760
5761         if (m->m_flags & M_PKTHDR)
5762                 return (m->m_pkthdr.len);
5763
5764         pktlen = 0;
5765         for (m0 = m; m0 != NULL; m0 = m0->m_next)
5766                 pktlen += m0->m_len;
5767         return (pktlen);
5768 }
5769
5770 /*
5771  * Copy data from a buffer back into the indicated mbuf chain,
5772  * starting "off" bytes from the beginning, extending the mbuf
5773  * chain if necessary.
5774  */
5775 void
5776 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5777 {
5778 #if DEBUG
5779         struct mbuf *origm = m0;
5780         int error;
5781 #endif /* DEBUG */
5782
5783         if (m0 == NULL)
5784                 return;
5785
5786 #if DEBUG
5787         error =
5788 #endif /* DEBUG */
5789         m_copyback0(&m0, off, len, cp,
5790             M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5791
5792 #if DEBUG
5793         if (error != 0 || (m0 != NULL && origm != m0))
5794                 panic("m_copyback");
5795 #endif /* DEBUG */
5796 }
5797
5798 struct mbuf *
5799 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5800 {
5801         int error;
5802
5803         /* don't support chain expansion */
5804         VERIFY(off + len <= m_length(m0));
5805
5806         error = m_copyback0(&m0, off, len, cp,
5807             M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5808         if (error) {
5809                 /*
5810                  * no way to recover from partial success.
5811                  * just free the chain.
5812                  */
5813                 m_freem(m0);
5814                 return (NULL);
5815         }
5816         return (m0);
5817 }
5818
5819 /*
5820  * m_makewritable: ensure the specified range writable.
5821  */
5822 int
5823 m_makewritable(struct mbuf **mp, int off, int len, int how)
5824 {
5825         int error;
5826 #if DEBUG
5827         struct mbuf *n;
5828         int origlen, reslen;
5829
5830         origlen = m_length(*mp);
5831 #endif /* DEBUG */
5832
5833 #if 0 /* M_COPYALL is large enough */
5834         if (len == M_COPYALL)
5835                 len = m_length(*mp) - off; /* XXX */
5836 #endif
5837
5838         error = m_copyback0(mp, off, len, NULL,
5839             M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5840
5841 #if DEBUG
5842         reslen = 0;
5843         for (n = *mp; n; n = n->m_next)
5844                 reslen += n->m_len;
5845         if (origlen != reslen)
5846                 panic("m_makewritable: length changed");
5847         if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5848                 panic("m_makewritable: inconsist");
5849 #endif /* DEBUG */
5850
5851         return (error);
5852 }
5853
/*
 * m_copyback0: common worker behind m_copyback(), m_copyback_cow() and
 * m_makewritable().
 *
 * Operates on "len" bytes of the chain *mp0 starting "off" bytes from its
 * beginning.  "flags" selects what is done:
 *
 *   M_COPYBACK0_COPYBACK  copy "len" bytes from "vp" into the chain
 *                         ("vp" must be non-NULL).
 *   M_COPYBACK0_PRESERVE  keep the existing data when an mbuf has to be
 *                         replaced ("vp" must be NULL).
 *   M_COPYBACK0_EXTEND    grow the chain when the range runs past its
 *                         end; the gap before "off" is zero-filled.
 *   M_COPYBACK0_COW       mbufs for which m_mclhasreference() is true
 *                         are replaced by newly allocated ones rather
 *                         than written to.
 *
 * EXTEND and COW may not both be set.  "how" is the allocation flag
 * passed to the mbuf allocator.
 *
 * Returns 0, or ENOBUFS when an allocation fails while replacing a
 * referenced mbuf.  Note that running out of chain without EXTEND, or an
 * allocation failure while extending, ends the operation early through
 * "out" and still returns 0.  *mp0 may be changed when the first mbuf
 * of the chain is replaced.
 */
5854 static int
5855 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5856     int how)
5857 {
5858         int mlen;
5859         struct mbuf *m, *n;
5860         struct mbuf **mp;
5861         int totlen = 0;
5862         const char *cp = vp;
5863
5864         VERIFY(mp0 != NULL);
5865         VERIFY(*mp0 != NULL);
5866         VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5867         VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5868
5869         /*
5870          * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5871          * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5872          */
5873
5874         VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5875
             /*
              * Phase 1: skip whole mbufs until "off" falls inside (or at
              * the end of) the current one.  "mp" tracks the link that
              * points at "m" so that "m" can be replaced later.
              */
5876         mp = mp0;
5877         m = *mp;
5878         while (off > (mlen = m->m_len)) {
5879                 off -= mlen;
5880                 totlen += mlen;
5881                 if (m->m_next == NULL) {
5882                         int tspace;
                             /*
                              * "extend" is also entered by goto from the
                              * copy loop below when the chain ends before
                              * "len" bytes have been handled.
                              */
5883 extend:
5884                         if (!(flags & M_COPYBACK0_EXTEND))
5885                                 goto out;
5886
5887                         /*
5888                          * try to make some space at the end of "m".
5889                          */
5890
5891                         mlen = m->m_len;
5892                         if (off + len >= MINCLSIZE &&
5893                             !(m->m_flags & M_EXT) && m->m_len == 0) {
5894                                 MCLGET(m, how);
5895                         }
5896                         tspace = M_TRAILINGSPACE(m);
5897                         if (tspace > 0) {
5898                                 tspace = MIN(tspace, off + len);
5899                                 VERIFY(tspace > 0);
5900                                 bzero(mtod(m, char *) + m->m_len,
5901                                     MIN(off, tspace));
5902                                 m->m_len += tspace;
                                     /*
                                      * undo the accounting done for "m" and
                                      * look at it again now that it is longer.
                                      */
5903                                 off += mlen;
5904                                 totlen -= mlen;
5905                                 continue;
5906                         }
5907
5908                         /*
5909                          * need to allocate an mbuf.
5910                          */
5911
5912                         if (off + len >= MINCLSIZE) {
5913                                 n = m_getcl(how, m->m_type, 0);
5914                         } else {
5915                                 n = _M_GET(how, m->m_type);
5916                         }
5917                         if (n == NULL) {
                                     /* allocation failed: "out" still returns 0 */
5918                                 goto out;
5919                         }
5920                         n->m_len = 0;
5921                         n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5922                         bzero(mtod(n, char *), MIN(n->m_len, off));
5923                         m->m_next = n;
5924                 }
5925                 mp = &m->m_next;
5926                 m = m->m_next;
5927         }
             /*
              * Phase 2: process "len" bytes starting at offset "off" of "m".
              */
5928         while (len > 0) {
5929                 mlen = m->m_len - off;
5930                 if (mlen != 0 && m_mclhasreference(m)) {
5931                         char *datap;
5932                         int eatlen;
5933
5934                         /*
5935                          * this mbuf is read-only.
5936                          * allocate a new writable mbuf and try again.
5937                          */
5938
5939 #if DIAGNOSTIC
5940                         if (!(flags & M_COPYBACK0_COW))
5941                                 panic("m_copyback0: read-only");
5942 #endif /* DIAGNOSTIC */
5943
5944                         /*
5945                          * if we're going to write into the middle of
5946                          * a mbuf, split it first.
5947                          */
5948                         if (off > 0 && len < mlen) {
5949                                 n = m_split0(m, off, how, 0);
5950                                 if (n == NULL)
5951                                         goto enobufs;
5952                                 m->m_next = n;
5953                                 mp = &m->m_next;
5954                                 m = n;
5955                                 off = 0;
5956                                 continue;
5957                         }
5958
5959                         /*
5960                          * XXX TODO coalesce into the trailingspace of
5961                          * the previous mbuf when possible.
5962                          */
5963
5964                         /*
5965                          * allocate a new mbuf.  copy packet header if needed.
5966                          */
5967                         n = _M_GET(how, m->m_type);
5968                         if (n == NULL)
5969                                 goto enobufs;
5970                         if (off == 0 && (m->m_flags & M_PKTHDR)) {
5971                                 M_COPY_PKTHDR(n, m);
5972                                 n->m_len = MHLEN;
5973                         } else {
5974                                 if (len >= MINCLSIZE)
5975                                         MCLGET(n, M_DONTWAIT);
5976                                 n->m_len =
5977                                     (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5978                         }
5979                         if (n->m_len > len)
5980                                 n->m_len = len;
5981
5982                         /*
5983                          * free the region which has been overwritten.
5984                          * copying data from old mbufs if requested.
5985                          */
5986                         if (flags & M_COPYBACK0_PRESERVE)
5987                                 datap = mtod(n, char *);
5988                         else
5989                                 datap = NULL;
                             /* "eatlen": bytes of old data that "n" takes over */
5990                         eatlen = n->m_len;
5991                         VERIFY(off == 0 || eatlen >= mlen);
5992                         if (off > 0) {
                                     /*
                                      * keep the first "off" bytes in "m" and
                                      * link "n" right behind it.
                                      */
5993                                 VERIFY(len >= mlen);
5994                                 m->m_len = off;
5995                                 m->m_next = n;
5996                                 if (datap) {
5997                                         m_copydata(m, off, mlen, datap);
5998                                         datap += mlen;
5999                                 }
6000                                 eatlen -= mlen;
6001                                 mp = &m->m_next;
6002                                 m = m->m_next;
6003                         }
                             /*
                              * strip covered bytes from the front of the
                              * following referenced mbufs of the same type,
                              * freeing those that become empty.
                              */
6004                         while (m != NULL && m_mclhasreference(m) &&
6005                             n->m_type == m->m_type && eatlen > 0) {
6006                                 mlen = MIN(eatlen, m->m_len);
6007                                 if (datap) {
6008                                         m_copydata(m, 0, mlen, datap);
6009                                         datap += mlen;
6010                                 }
6011                                 m->m_data += mlen;
6012                                 m->m_len -= mlen;
6013                                 eatlen -= mlen;
6014                                 if (m->m_len == 0)
6015                                         *mp = m = m_free(m);
6016                         }
                             /* shrink "n" by whatever it could not take over */
6017                         if (eatlen > 0)
6018                                 n->m_len -= eatlen;
6019                         n->m_next = m;
6020                         *mp = m = n;
6021                         continue;
6022                 }
6023                 mlen = MIN(mlen, len);
6024                 if (flags & M_COPYBACK0_COPYBACK) {
6025                         bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
6026                         cp += mlen;
6027                 }
6028                 len -= mlen;
6029                 mlen += off;
6030                 off = 0;
6031                 totlen += mlen;
6032                 if (len == 0)
6033                         break;
6034                 if (m->m_next == NULL) {
6035                         goto extend;
6036                 }
6037                 mp = &m->m_next;
6038                 m = m->m_next;
6039         }
             /*
              * "out": if the chain has a packet header and now extends past
              * its recorded length, update that length.
              */
6040 out:
6041         if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
6042                 VERIFY(flags & M_COPYBACK0_EXTEND);
6043                 m->m_pkthdr.len = totlen;
6044         }
6045
6046         return (0);
6047
6048 enobufs:
6049         return (ENOBUFS);
6050 }
6051
6052 uint64_t
6053 mcl_to_paddr(char *addr)
6054 {
6055         vm_offset_t base_phys;
6056
6057         if (!MBUF_IN_MAP(addr))
6058                 return (0);
6059         base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
6060
6061         if (base_phys == 0)
6062                 return (0);
6063         return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
6064 }
6065
/*
 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
 * And really copy the thing.  That way, we don't "precompute" checksums
 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
 * small packets, don't dup into a cluster.  That way received packets
 * don't take up too much room in the sockbuf (cf. sbspace()).
 *
 * m	head of the chain to duplicate; dereferenced unconditionally, so it
 *	must not be NULL.
 * how	allocation flag handed to every allocator call made here.
 *
 * Returns the head of the new chain, or NULL if any allocation fails (any
 * partially built copy is freed).  The source chain is not modified.
 */
/* Incremented every time m_dup() fails to produce a chain. */
int MDFail;

struct mbuf *
m_dup(struct mbuf *m, int how)
{
        struct mbuf *n, **np;
        struct mbuf *top;
        int copyhdr = 0;

        /* np always points at the link where the next copy gets attached */
        np = &top;
        top = NULL;
        if (m->m_flags & M_PKTHDR)
                copyhdr = 1;

        /*
         * Quick check: if we have one mbuf and its data fits in an
         *  mbuf with packet header, just copy and go.
         */
        if (m->m_next == NULL) {
                /* Then just move the data into an mbuf and be done... */
                if (copyhdr) {
                        if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
                                if ((n = _M_GETHDR(how, m->m_type)) == NULL)
                                        return (NULL);
                                n->m_len = m->m_len;
                                /*
                                 * NOTE(review): the return value of
                                 * m_dup_pkthdr() is not checked here.
                                 */
                                m_dup_pkthdr(n, m, how);
                                bcopy(m->m_data, n->m_data, m->m_len);
                                return (n);
                        }
                } else if (m->m_len <= MLEN) {
                        if ((n = _M_GET(how, m->m_type)) == NULL)
                                return (NULL);
                        bcopy(m->m_data, n->m_data, m->m_len);
                        n->m_len = m->m_len;
                        return (n);
                }
        }
        /* General case: copy the chain one mbuf at a time */
        while (m != NULL) {
#if BLUE_DEBUG
                printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
                    m->m_data);
#endif
                /* Only the first copy is allocated with a packet header */
                if (copyhdr)
                        n = _M_GETHDR(how, m->m_type);
                else
                        n = _M_GET(how, m->m_type);
                if (n == NULL)
                        goto nospace;
                if (m->m_flags & M_EXT) {
                        /*
                         * Source uses external storage: attach the smallest
                         * cluster class whose maximum size holds m_len.
                         * The 16KB class is tried only when njcl > 0.
                         * NOTE(review): m_mbigget()/m_m16kget() results are
                         * dereferenced below, so they are presumably never
                         * NULL -- confirm.
                         */
                        if (m->m_len <= m_maxsize(MC_CL))
                                MCLGET(n, how);
                        else if (m->m_len <= m_maxsize(MC_BIGCL))
                                n = m_mbigget(n, how);
                        else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
                                n = m_m16kget(n, how);
                        /* No cluster attached (or none large enough): fail */
                        if (!(n->m_flags & M_EXT)) {
                                (void) m_free(n);
                                goto nospace;
                        }
                }
                *np = n;
                if (copyhdr) {
                        /* Don't use M_COPY_PKTHDR: preserve m_data */
                        /* NOTE(review): return value ignored here as well */
                        m_dup_pkthdr(n, m, how);
                        copyhdr = 0;
                        if (!(n->m_flags & M_EXT))
                                n->m_data = n->m_pktdat;
                }
                n->m_len = m->m_len;
                /*
                 * Get the dup on the same bdry as the original
                 * Assume that the two mbufs have the same offset to data area
                 * (up to word boundaries)
                 */
                bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
                m = m->m_next;
                np = &n->m_next;
#if BLUE_DEBUG
                printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
                    n->m_data);
#endif
        }

        if (top == NULL)
                MDFail++;
        return (top);

nospace:
        /* Release whatever part of the copy was already linked under top */
        m_freem(top);
        MDFail++;
        return (NULL);
}
6165
/*
 * True when mbuf m has external storage (M_EXT) and its data region
 * [m_data, m_data + m_len) extends beyond the page it starts in:
 *   - m_data is page-aligned and m_len is larger than PAGE_SIZE, or
 *   - m_data is not page-aligned and the next page boundary lies before
 *     the end of the data.
 * The argument is evaluated several times; pass a side-effect-free
 * expression.
 */
#define MBUF_MULTIPAGES(m)                                              \
        (((m)->m_flags & M_EXT) &&                                      \
        ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE)                          \
        && (m)->m_len > PAGE_SIZE) ||                                   \
        (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) &&                       \
        P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6172
/*
 * Split a single M_EXT mbuf whose data crosses page boundaries into a
 * chain of mbufs, each describing a piece of the same data that ends at
 * or before the next page boundary.
 *
 * m	the mbuf to split; must satisfy MBUF_MULTIPAGES() and have no
 *	successor (both are VERIFY'd).  It becomes the first element of
 *	the returned chain.
 * last	out: set to the final mbuf of the resulting chain, or to NULL on
 *	failure.
 *
 * Returns the head of the chain, or NULL if an mbuf allocation fails, in
 * which case everything built so far (including m) is freed.
 */
static struct mbuf *
m_expand(struct mbuf *m, struct mbuf **last)
{
        struct mbuf *top = NULL;
        struct mbuf **nm = &top;
        uintptr_t data0, data;
        unsigned int len0, len;

        VERIFY(MBUF_MULTIPAGES(m));
        VERIFY(m->m_next == NULL);
        /* data0/len0 track the part of the data not yet assigned */
        data0 = (uintptr_t)m->m_data;
        len0 = m->m_len;
        *last = top;

        for (;;) {
                struct mbuf *n;

                /* Pick the largest piece that stays within the current page */
                data = data0;
                if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
                        len = PAGE_SIZE;
                else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
                    P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
                        len = P2ROUNDUP(data, PAGE_SIZE) - data;
                else
                        len = len0;

                VERIFY(len > 0);
                VERIFY(m->m_flags & M_EXT);
                m->m_data = (void *)data;
                m->m_len = len;

                /* Append m to the chain and remember it as the tail */
                *nm = *last = m;
                nm = &m->m_next;
                m->m_next = NULL;

                data0 += len;
                len0 -= len;
                if (len0 == 0)
                        break;

                /* More data remains: get another mbuf to describe it */
                n = _M_RETRY(M_DONTWAIT, MT_DATA);
                if (n == NULL) {
                        /* m is already linked under top, so this frees it too */
                        m_freem(top);
                        top = *last = NULL;
                        break;
                }

                /*
                 * The new mbuf refers to the same external storage as m;
                 * m_incref() presumably accounts for the extra reference
                 * (confirm against its definition).
                 */
                n->m_ext = m->m_ext;
                m_incref(m);
                n->m_flags |= M_EXT;
                m = n;
        }
        return (top);
}
6227
6228 struct mbuf *
6229 m_normalize(struct mbuf *m)
6230 {
6231         struct mbuf *top = NULL;
6232         struct mbuf **nm = &top;
6233         boolean_t expanded = FALSE;
6234
6235         while (m != NULL) {
6236                 struct mbuf *n;
6237
6238                 n = m->m_next;
6239                 m->m_next = NULL;
6240
6241                 /* Does the data cross one or more page boundaries? */
6242                 if (MBUF_MULTIPAGES(m)) {
6243                         struct mbuf *last;
6244                         if ((m = m_expand(m, &last)) == NULL) {
6245                                 m_freem(n);
6246                                 m_freem(top);
6247                                 top = NULL;
6248                                 break;
6249                         }
6250                         *nm = m;
6251                         nm = &last->m_next;
6252                         expanded = TRUE;
6253                 } else {
6254                         *nm = m;
6255                         nm = &m->m_next;
6256                 }
6257                 m = n;
6258         }
6259         if (expanded)
6260                 atomic_add_32(&mb_normalized, 1);
6261         return (top);
6262 }
6263
6264 /*
6265  * Append the specified data to the indicated mbuf chain,
6266  * Extend the mbuf chain if the new data does not fit in
6267  * existing space.
6268  *
6269  * Return 1 if able to complete the job; otherwise 0.
6270  */
6271 int
6272 m_append(struct mbuf *m0, int len, caddr_t cp)
6273 {
6274         struct mbuf *m, *n;
6275         int remainder, space;
6276
6277         for (m = m0; m->m_next != NULL; m = m->m_next)
6278                 ;
6279         remainder = len;
6280         space = M_TRAILINGSPACE(m);
6281         if (space > 0) {
6282                 /*
6283                  * Copy into available space.
6284                  */
6285                 if (space > remainder)
6286                         space = remainder;
6287                 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6288                 m->m_len += space;
6289                 cp += space;
6290                 remainder -= space;
6291         }
6292         while (remainder > 0) {
6293                 /*
6294                  * Allocate a new mbuf; could check space
6295                  * and allocate a cluster instead.
6296                  */
6297                 n = m_get(M_WAITOK, m->m_type);
6298                 if (n == NULL)
6299                         break;
6300                 n->m_len = min(MLEN, remainder);
6301                 bcopy(cp, mtod(n, caddr_t), n->m_len);
6302                 cp += n->m_len;
6303                 remainder -= n->m_len;
6304                 m->m_next = n;
6305                 m = n;
6306         }
6307         if (m0->m_flags & M_PKTHDR)
6308                 m0->m_pkthdr.len += len - remainder;
6309         return (remainder == 0);
6310 }
6311
6312 struct mbuf *
6313 m_last(struct mbuf *m)
6314 {
6315         while (m->m_next != NULL)
6316                 m = m->m_next;
6317         return (m);
6318 }
6319
6320 unsigned int
6321 m_fixhdr(struct mbuf *m0)
6322 {
6323         u_int len;
6324
6325         VERIFY(m0->m_flags & M_PKTHDR);
6326
6327         len = m_length2(m0, NULL);
6328         m0->m_pkthdr.len = len;
6329         return (len);
6330 }
6331
6332 unsigned int
6333 m_length2(struct mbuf *m0, struct mbuf **last)
6334 {
6335         struct mbuf *m;
6336         u_int len;
6337
6338         len = 0;
6339         for (m = m0; m != NULL; m = m->m_next) {
6340                 len += m->m_len;
6341                 if (m->m_next == NULL)
6342                         break;
6343         }
6344         if (last != NULL)
6345                 *last = m;
6346         return (len);
6347 }
6348
6349 /*
6350  * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6351  * and clusters.  If allocation fails and this cannot be completed, NULL will
6352  * be returned, but the passed in chain will be unchanged.  Upon success,
6353  * the original chain will be freed, and the new chain will be returned.
6354  *
6355  * If a non-packet header is passed in, the original mbuf (chain?) will
6356  * be returned unharmed.
6357  *
6358  * If offset is specfied, the first mbuf in the chain will have a leading
6359  * space of the amount stated by the "off" parameter.
6360  *
6361  * This routine requires that the m_pkthdr.header field of the original
6362  * mbuf chain is cleared by the caller.
6363  */
6364 struct mbuf *
6365 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
6366 {
6367         struct mbuf *m_new = NULL, *m_final = NULL;
6368         int progress = 0, length, pktlen;
6369
6370         if (!(m0->m_flags & M_PKTHDR))
6371                 return (m0);
6372
6373         VERIFY(off < MHLEN);
6374         m_fixhdr(m0); /* Needed sanity check */
6375
6376         pktlen = m0->m_pkthdr.len + off;
6377         if (pktlen > MHLEN)
6378                 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6379         else
6380                 m_final = m_gethdr(how, MT_DATA);
6381
6382         if (m_final == NULL)
6383                 goto nospace;
6384
6385         if (off > 0) {
6386                 pktlen -= off;
6387                 m_final->m_data += off;
6388         }
6389
6390         /*
6391          * Caller must have handled the contents pointed to by this
6392          * pointer before coming here, as otherwise it will point to
6393          * the original mbuf which will get freed upon success.
6394          */
6395         VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6396
6397         if (m_dup_pkthdr(m_final, m0, how) == 0)
6398                 goto nospace;
6399
6400         m_new = m_final;
6401
6402         while (progress < pktlen) {
6403                 length = pktlen - progress;
6404                 if (length > MCLBYTES)
6405                         length = MCLBYTES;
6406                 length -= ((m_new == m_final) ? off : 0);
6407                 if (length < 0)
6408                         goto nospace;
6409
6410                 if (m_new == NULL) {
6411                         if (length > MLEN)
6412                                 m_new = m_getcl(how, MT_DATA, 0);
6413                         else
6414                                 m_new = m_get(how, MT_DATA);
6415                         if (m_new == NULL)
6416                                 goto nospace;
6417                 }
6418
6419                 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6420                 progress += length;
6421                 m_new->m_len = length;
6422                 if (m_new != m_final)
6423                         m_cat(m_final, m_new);
6424                 m_new = NULL;
6425         }
6426         m_freem(m0);
6427         m0 = m_final;
6428         return (m0);
6429 nospace:
6430         if (m_final)
6431                 m_freem(m_final);
6432         return (NULL);
6433 }
6434
/*
 * Defragment the chain m0 with no leading space in the first mbuf.
 * Equivalent to m_defrag_offset() with an offset of 0; see that routine
 * for the return value and ownership rules.
 */
struct mbuf *
m_defrag(struct mbuf *m0, int how)
{
        struct mbuf *packed;

        packed = m_defrag_offset(m0, 0, how);
        return (packed);
}
6440
6441 void
6442 m_mchtype(struct mbuf *m, int t)
6443 {
6444         mtype_stat_inc(t);
6445         mtype_stat_dec(m->m_type);
6446         (m)->m_type = t;
6447 }
6448
6449 void *
6450 m_mtod(struct mbuf *m)
6451 {
6452         return (MTOD(m, void *));
6453 }
6454
6455 struct mbuf *
6456 m_dtom(void *x)
6457 {
6458         return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6459 }
6460
/*
 * Function wrapper around the _MCHECK() macro, applied to m.  What the
 * macro checks is defined elsewhere and not visible from here.
 */
void
m_mcheck(struct mbuf *m)
{
        _MCHECK(m);
}
6466
6467 /*
6468  * Return a pointer to mbuf/offset of location in mbuf chain.
6469  */
6470 struct mbuf *
6471 m_getptr(struct mbuf *m, int loc, int *off)
6472 {
6473
6474         while (loc >= 0) {
6475                 /* Normal end of search. */
6476                 if (m->m_len > loc) {
6477                         *off = loc;
6478                         return (m);
6479                 } else {
6480                         loc -= m->m_len;
6481                         if (m->m_next == NULL) {
6482                                 if (loc == 0) {
6483                                         /* Point at the end of valid data. */
6484                                         *off = m->m_len;
6485                                         return (m);
6486                                 }
6487                                 return (NULL);
6488                         }
6489                         m = m->m_next;
6490                 }
6491         }
6492         return (NULL);
6493 }
6494
6495 /*
6496  * Inform the corresponding mcache(s) that there's a waiter below.
6497  */
6498 static void
6499 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6500 {
6501         mcache_waiter_inc(m_cache(class));
6502         if (comp) {
6503                 if (class == MC_CL) {
6504                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6505                 } else if (class == MC_BIGCL) {
6506                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6507                 } else if (class == MC_16KCL) {
6508                         mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6509                 } else {
6510                         mcache_waiter_inc(m_cache(MC_MBUF_CL));
6511                         mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6512                 }
6513         }
6514 }
6515
6516 /*
6517  * Inform the corresponding mcache(s) that there's no more waiter below.
6518  */
6519 static void
6520 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6521 {
6522         mcache_waiter_dec(m_cache(class));
6523         if (comp) {
6524                 if (class == MC_CL) {
6525                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6526                 } else if (class == MC_BIGCL) {
6527                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6528                 } else if (class == MC_16KCL) {
6529                         mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6530                 } else {
6531                         mcache_waiter_dec(m_cache(MC_MBUF_CL));
6532                         mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6533                 }
6534         }
6535 }
6536
6537 /*
6538  * Called during slab (blocking and non-blocking) allocation.  If there
6539  * is at least one waiter, and the time since the first waiter is blocked
6540  * is greater than the watchdog timeout, panic the system.
6541  */
6542 static void
6543 mbuf_watchdog(void)
6544 {
6545         struct timeval now;
6546         unsigned int since;
6547
6548         if (mb_waiters == 0 || !mb_watchdog)
6549                 return;
6550
6551         microuptime(&now);
6552         since = now.tv_sec - mb_wdtstart.tv_sec;
6553         if (since >= MB_WDT_MAXTIME) {
6554                 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6555                     mb_waiters, since, mbuf_dump());
6556                 /* NOTREACHED */
6557         }
6558 }
6559
/*
 * Called during blocking allocation.  Returns TRUE if one or more objects
 * are available at the per-CPU caches layer and that allocation should be
 * retried at that level.
 *
 * class	mbuf class the caller is trying to allocate from.
 * num		number of objects wanted; passed to m_reclaim() and added
 *		to the class's pending expansion request.
 * wait		MCR_* flags; MCR_COMP, MCR_TRYHARD and MCR_NOSLEEP are
 *		examined here.
 *
 * Must be called with mbuf_mlock held (asserted).  The lock is handed to
 * msleep() while waiting.  A FALSE return means either that free objects
 * of this class exist (m_infree(class) > 0) or that nothing became
 * available.
 */
static boolean_t
mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
{
        boolean_t mcache_retry = FALSE;

        LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

        /* Check if there's anything at the cache layer */
        if (mbuf_cached_above(class, wait)) {
                mcache_retry = TRUE;
                goto done;
        }

        /* Nothing?  Then try hard to get it from somewhere */
        m_reclaim(class, num, (wait & MCR_COMP));

        /* We tried hard and got something? */
        if (m_infree(class) > 0) {
                mbstat.m_wait++;
                goto done;
        } else if (mbuf_cached_above(class, wait)) {
                mbstat.m_wait++;
                mcache_retry = TRUE;
                goto done;
        } else if (wait & MCR_TRYHARD) {
                /* MCR_TRYHARD: ask for a retry instead of going to sleep */
                mcache_retry = TRUE;
                goto done;
        }

        /*
         * There's really nothing for us right now; inform the
         * cache(s) that there is a waiter below and go to sleep.
         */
        mbuf_waiter_inc(class, (wait & MCR_COMP));

        /* Sleeping is about to happen, so the caller must allow it */
        VERIFY(!(wait & MCR_NOSLEEP));

        /*
         * If this is the first waiter, arm the watchdog timer.  Otherwise
         * check if we need to panic the system due to watchdog timeout.
         */
        if (mb_waiters == 0)
                microuptime(&mb_wdtstart);
        else
                mbuf_watchdog();

        mb_waiters++;
        /* Record an expansion request for the worker thread to act on */
        m_region_expand(class) += m_total(class) + num;
        /* wake up the worker thread */
        if (mbuf_worker_ready &&
            mbuf_worker_needs_wakeup) {
                wakeup((caddr_t)&mbuf_worker_needs_wakeup);
                mbuf_worker_needs_wakeup = FALSE;
        }
        mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
        (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
        mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));

        /* We are now up; stop getting notified until next round */
        mbuf_waiter_dec(class, (wait & MCR_COMP));

        /* We waited and got something */
        if (m_infree(class) > 0) {
                mbstat.m_wait++;
                goto done;
        } else if (mbuf_cached_above(class, wait)) {
                mbstat.m_wait++;
                mcache_retry = TRUE;
        }
done:
        return (mcache_retry);
}
6637
/*
 * Body of the mbuf worker thread; never returns.
 *
 * Each pass, with mbuf_mlock held:
 *   1. If the cluster totals have reached the configured limit, drain
 *      and reclaim (which classes are examined depends on PAGE_SIZE).
 *   2. For MC_CL, MC_BIGCL and MC_16KCL, turn a pending
 *      m_region_expand() request into a freelist_populate() call, capped
 *      so the class total does not exceed its m_maxlimit().
 *   3. Populate MC_MBUF until there are at least as many mbufs as
 *      clusters, or until freelist_populate() returns 0.
 *   4. Set mbuf_worker_needs_wakeup, record the run time, drop the lock
 *      and block on &mbuf_worker_needs_wakeup.
 *
 * thread_block() is given this function as its continuation, so the
 * thread presumably restarts from the top when woken rather than
 * returning into the loop -- confirm against thread_block() semantics.
 */
__attribute__((noreturn))
static void
mbuf_worker_thread(void)
{
        int mbuf_expand;

        while (1) {
                lck_mtx_lock(mbuf_mlock);
                mbwdog_logger("worker thread running");
                mbuf_worker_run_cnt++;
                mbuf_expand = 0;
                /*
                 * Allocations are based on page size, so if we have depleted
                 * the reserved spaces, try to free mbufs from the major classes.
                 */
#if PAGE_SIZE == 4096
                /* Express the mbuf and big-cluster totals in cluster units */
                uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
                uint32_t m_clusters = m_total(MC_CL);
                uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
                uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
                if (sumclusters >= nclusters) {
                        mbwdog_logger("reclaiming bigcl");
                        mbuf_drain_locked(TRUE);
                        m_reclaim(MC_BIGCL, 4, FALSE);
                }
#else
                uint32_t m_16kclusters = m_total(MC_16KCL);
                if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
                        mbwdog_logger("reclaiming 16kcl");
                        mbuf_drain_locked(TRUE);
                        m_reclaim(MC_16KCL, 4, FALSE);
                }
#endif
                if (m_region_expand(MC_CL) > 0) {
                        int n;
                        mb_expand_cl_cnt++;
                        /* Adjust to current number of cluster in use */
                        n = m_region_expand(MC_CL) -
                            (m_total(MC_CL) - m_infree(MC_CL));
                        /* Never grow the class beyond its maximum */
                        if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
                                n = m_maxlimit(MC_CL) - m_total(MC_CL);
                        if (n > 0) {
                                mb_expand_cl_total += n;
                        }
                        /* The request is consumed whether or not we expand */
                        m_region_expand(MC_CL) = 0;

                        if (n > 0) {
                                mbwdog_logger("expanding MC_CL by %d", n);
                                freelist_populate(MC_CL, n, M_WAIT);
                        }
                }
                if (m_region_expand(MC_BIGCL) > 0) {
                        int n;
                        mb_expand_bigcl_cnt++;
                        /* Adjust to current number of 4 KB cluster in use */
                        n = m_region_expand(MC_BIGCL) -
                            (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
                        /* Never grow the class beyond its maximum */
                        if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
                                n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
                        if (n > 0) {
                                mb_expand_bigcl_total += n;
                        }
                        m_region_expand(MC_BIGCL) = 0;

                        if (n > 0) {
                                mbwdog_logger("expanding MC_BIGCL by %d", n);
                                freelist_populate(MC_BIGCL, n, M_WAIT);
                        }
                }
                if (m_region_expand(MC_16KCL) > 0) {
                        int n;
                        mb_expand_16kcl_cnt++;
                        /* Adjust to current number of 16 KB cluster in use */
                        n = m_region_expand(MC_16KCL) -
                            (m_total(MC_16KCL) - m_infree(MC_16KCL));
                        /* Never grow the class beyond its maximum */
                        if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
                                n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
                        if (n > 0) {
                                mb_expand_16kcl_total += n;
                        }
                        m_region_expand(MC_16KCL) = 0;

                        if (n > 0) {
                                mbwdog_logger("expanding MC_16KCL by %d", n);
                                (void) freelist_populate(MC_16KCL, n, M_WAIT);
                        }
                }

                /*
                 * Because we can run out of memory before filling the mbuf
                 * map, we should not allocate more clusters than there are
                 * mbufs -- otherwise we could have a large number of useless
                 * clusters allocated.
                 */
                mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
                    m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
                    m_total(MC_16KCL));
                uint32_t total_mbufs = m_total(MC_MBUF);
                uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
                    m_total(MC_16KCL);
                if (total_mbufs < total_clusters) {
                        mbwdog_logger("expanding MC_MBUF by %d",
                                total_clusters - total_mbufs);
                }
                /* Populate one unit at a time, re-reading the totals each round */
                while (total_mbufs < total_clusters) {
                        mb_expand_cnt++;
                        if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
                                break;
                        total_mbufs = m_total(MC_MBUF);
                        total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
                            m_total(MC_16KCL);
                }

                /* Tell mbuf_sleep() that a wakeup() is required from now on */
                mbuf_worker_needs_wakeup = TRUE;
                /*
                 * If there's a deadlock and we're not sending / receiving
                 * packets, net_uptime() won't be updated.  Update it here
                 * so we are sure it's correct.
                 */
                net_update_uptime();
                mbuf_worker_last_runtime = net_uptime();
                /* Register the wait before dropping the lock, then block */
                assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
                    THREAD_UNINT);
                mbwdog_logger("worker thread sleeping");
                lck_mtx_unlock(mbuf_mlock);
                (void) thread_block((thread_continue_t)mbuf_worker_thread);
        }
}
6766
6767 __attribute__((noreturn))
6768 static void
6769 mbuf_worker_thread_init(void)
6770 {
6771         mbuf_worker_ready++;
6772         mbuf_worker_thread();
6773 }
6774
6775 static mcl_slab_t *
6776 slab_get(void *buf)
6777 {
6778         mcl_slabg_t *slg;
6779         unsigned int ix, k;
6780
6781         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6782
6783         VERIFY(MBUF_IN_MAP(buf));
6784         ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6785         VERIFY(ix < maxslabgrp);
6786
6787         if ((slg = slabstbl[ix]) == NULL) {
6788                 /*
6789                  * In the current implementation, we never shrink the slabs
6790                  * table; if we attempt to reallocate a cluster group when
6791                  * it's already allocated, panic since this is a sign of a
6792                  * memory corruption (slabstbl[ix] got nullified).
6793                  */
6794                 ++slabgrp;
6795                 VERIFY(ix < slabgrp);
6796                 /*
6797                  * Slabs expansion can only be done single threaded; when
6798                  * we get here, it must be as a result of m_clalloc() which
6799                  * is serialized and therefore mb_clalloc_busy must be set.
6800                  */
6801                 VERIFY(mb_clalloc_busy);
6802                 lck_mtx_unlock(mbuf_mlock);
6803
6804                 /* This is a new buffer; create the slabs group for it */
6805                 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6806                     M_WAITOK | M_ZERO);
6807                 MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
6808                     M_TEMP, M_WAITOK | M_ZERO);
6809                 VERIFY(slg != NULL && slg->slg_slab != NULL);
6810
6811                 lck_mtx_lock(mbuf_mlock);
6812                 /*
6813                  * No other thread could have gone into m_clalloc() after
6814                  * we dropped the lock above, so verify that it's true.
6815                  */
6816                 VERIFY(mb_clalloc_busy);
6817
6818                 slabstbl[ix] = slg;
6819
6820                 /* Chain each slab in the group to its forward neighbor */
6821                 for (k = 1; k < NSLABSPMB; k++)
6822                         slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6823                 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6824
6825                 /* And chain the last slab in the previous group to this */
6826                 if (ix > 0) {
6827                         VERIFY(slabstbl[ix - 1]->
6828                             slg_slab[NSLABSPMB - 1].sl_next == NULL);
6829                         slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6830                             &slg->slg_slab[0];
6831                 }
6832         }
6833
6834         ix = MTOPG(buf) % NSLABSPMB;
6835         VERIFY(ix < NSLABSPMB);
6836
6837         return (&slg->slg_slab[ix]);
6838 }
6839
6840 static void
6841 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6842     void *base, void *head, unsigned int len, int refcnt, int chunks)
6843 {
6844         sp->sl_class = class;
6845         sp->sl_flags = flags;
6846         sp->sl_base = base;
6847         sp->sl_head = head;
6848         sp->sl_len = len;
6849         sp->sl_refcnt = refcnt;
6850         sp->sl_chunks = chunks;
6851         slab_detach(sp);
6852 }
6853
6854 static void
6855 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6856 {
6857         VERIFY(slab_is_detached(sp));
6858         m_slab_cnt(class)++;
6859         TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6860         sp->sl_flags &= ~SLF_DETACHED;
6861
6862         /*
6863          * If a buffer spans multiple contiguous pages then mark them as
6864          * detached too
6865          */
6866         if (class == MC_16KCL) {
6867                 int k;
6868                 for (k = 1; k < NSLABSP16KB; k++) {
6869                         sp = sp->sl_next;
6870                         /* Next slab must already be present */
6871                         VERIFY(sp != NULL && slab_is_detached(sp));
6872                         sp->sl_flags &= ~SLF_DETACHED;
6873                 }
6874         }
6875 }
6876
6877 static void
6878 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6879 {
6880         int k;
6881         VERIFY(!slab_is_detached(sp));
6882         VERIFY(m_slab_cnt(class) > 0);
6883         m_slab_cnt(class)--;
6884         TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6885         slab_detach(sp);
6886         if (class == MC_16KCL) {
6887                 for (k = 1; k < NSLABSP16KB; k++) {
6888                         sp = sp->sl_next;
6889                         /* Next slab must already be present */
6890                         VERIFY(sp != NULL);
6891                         VERIFY(!slab_is_detached(sp));
6892                         slab_detach(sp);
6893                 }
6894         }
6895 }
6896
6897 static boolean_t
6898 slab_inrange(mcl_slab_t *sp, void *buf)
6899 {
6900         return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6901             (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6902 }
6903
6904 #undef panic
6905
6906 static void
6907 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6908 {
6909         int i;
6910         unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6911         uintptr_t buf = (uintptr_t)sp->sl_base;
6912
6913         for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6914                 void *next = ((mcache_obj_t *)buf)->obj_next;
6915                 if (next != addr)
6916                         continue;
6917                 if (!mclverify) {
6918                         if (next != NULL && !MBUF_IN_MAP(next)) {
6919                                 mcache_t *cp = m_cache(sp->sl_class);
6920                                 panic("%s: %s buffer %p in slab %p modified "
6921                                     "after free at offset 0: %p out of range "
6922                                     "[%p-%p)\n", __func__, cp->mc_name,
6923                                     (void *)buf, sp, next, mbutl, embutl);
6924                                 /* NOTREACHED */
6925                         }
6926                 } else {
6927                         mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6928                             (mcache_obj_t *)buf);
6929                         mcl_audit_verify_nextptr(next, mca);
6930                 }
6931         }
6932 }
6933
6934 static void
6935 slab_detach(mcl_slab_t *sp)
6936 {
6937         sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6938         sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6939         sp->sl_flags |= SLF_DETACHED;
6940 }
6941
6942 static boolean_t
6943 slab_is_detached(mcl_slab_t *sp)
6944 {
6945         return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6946             (intptr_t)sp->sl_link.tqe_prev == -1 &&
6947             (sp->sl_flags & SLF_DETACHED));
6948 }
6949
6950 static void
6951 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6952     mcache_obj_t **con_list, size_t con_size, unsigned int num)
6953 {
6954         mcache_audit_t *mca, *mca_tail;
6955         mcache_obj_t *con = NULL;
6956         boolean_t save_contents = (con_list != NULL);
6957         unsigned int i, ix;
6958
6959         ASSERT(num <= NMBPG);
6960         ASSERT(con_list == NULL || con_size != 0);
6961
6962         ix = MTOPG(buf);
6963         VERIFY(ix < maxclaudit);
6964
6965         /* Make sure we haven't been here before */
6966         for (i = 0; i < num; i++)
6967                 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6968
6969         mca = mca_tail = *mca_list;
6970         if (save_contents)
6971                 con = *con_list;
6972
6973         for (i = 0; i < num; i++) {
6974                 mcache_audit_t *next;
6975
6976                 next = mca->mca_next;
6977                 bzero(mca, sizeof (*mca));
6978                 mca->mca_next = next;
6979                 mclaudit[ix].cl_audit[i] = mca;
6980
6981                 /* Attach the contents buffer if requested */
6982                 if (save_contents) {
6983                         mcl_saved_contents_t *msc =
6984                             (mcl_saved_contents_t *)(void *)con;
6985
6986                         VERIFY(msc != NULL);
6987                         VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6988                         VERIFY(con_size == sizeof (*msc));
6989                         mca->mca_contents_size = con_size;
6990                         mca->mca_contents = msc;
6991                         con = con->obj_next;
6992                         bzero(mca->mca_contents, mca->mca_contents_size);
6993                 }
6994
6995                 mca_tail = mca;
6996                 mca = mca->mca_next;
6997         }
6998
6999         if (save_contents)
7000                 *con_list = con;
7001
7002         *mca_list = mca_tail->mca_next;
7003         mca_tail->mca_next = NULL;
7004 }
7005
7006 static void
7007 mcl_audit_free(void *buf, unsigned int num)
7008 {
7009         unsigned int i, ix;
7010         mcache_audit_t *mca, *mca_list;
7011
7012         ix = MTOPG(buf);
7013         VERIFY(ix < maxclaudit);
7014
7015         if (mclaudit[ix].cl_audit[0] != NULL) {
7016                 mca_list = mclaudit[ix].cl_audit[0];
7017                 for (i = 0; i < num; i++) {
7018                         mca = mclaudit[ix].cl_audit[i];
7019                         mclaudit[ix].cl_audit[i] = NULL;
7020                         if (mca->mca_contents)
7021                                 mcache_free(mcl_audit_con_cache,
7022                                     mca->mca_contents);
7023                 }
7024                 mcache_free_ext(mcache_audit_cache,
7025                     (mcache_obj_t *)mca_list);
7026         }
7027 }
7028
7029 /*
7030  * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
7031  * the corresponding audit structure for that buffer.
7032  */
7033 static mcache_audit_t *
7034 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
7035 {
7036         mcache_audit_t *mca = NULL;
7037         int ix = MTOPG(mobj), m_idx = 0;
7038         unsigned char *page_addr;
7039
7040         VERIFY(ix < maxclaudit);
7041         VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
7042
7043         page_addr = PGTOM(ix);
7044
7045         switch (class) {
7046         case MC_MBUF:
7047                 /*
7048                  * For the mbuf case, find the index of the page
7049                  * used by the mbuf and use that index to locate the
7050                  * base address of the page.  Then find out the
7051                  * mbuf index relative to the page base and use
7052                  * it to locate the audit structure.
7053                  */
7054                 m_idx = MBPAGEIDX(page_addr, mobj);
7055                 VERIFY(m_idx < (int)NMBPG);
7056                 mca = mclaudit[ix].cl_audit[m_idx];
7057                 break;
7058
7059         case MC_CL:
7060                 /*
7061                  * Same thing as above, but for 2KB clusters in a page.
7062                  */
7063                 m_idx = CLPAGEIDX(page_addr, mobj);
7064                 VERIFY(m_idx < (int)NCLPG);
7065                 mca = mclaudit[ix].cl_audit[m_idx];
7066                 break;
7067
7068         case MC_BIGCL:
7069                 m_idx = BCLPAGEIDX(page_addr, mobj);
7070                 VERIFY(m_idx < (int)NBCLPG);
7071                 mca = mclaudit[ix].cl_audit[m_idx];
7072                 break;
7073         case MC_16KCL:
7074                 /*
7075                  * Same as above, but only return the first element.
7076                  */
7077                 mca = mclaudit[ix].cl_audit[0];
7078                 break;
7079
7080         default:
7081                 VERIFY(0);
7082                 /* NOTREACHED */
7083         }
7084
7085         return (mca);
7086 }
7087
7088 static void
7089 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
7090     boolean_t alloc)
7091 {
7092         struct mbuf *m = addr;
7093         mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
7094
7095         VERIFY(mca->mca_contents != NULL &&
7096             mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7097
7098         if (mclverify)
7099                 mcl_audit_verify_nextptr(next, mca);
7100
7101         if (!alloc) {
7102                 /* Save constructed mbuf fields */
7103                 mcl_audit_save_mbuf(m, mca);
7104                 if (mclverify) {
7105                         mcache_set_pattern(MCACHE_FREE_PATTERN, m,
7106                             m_maxsize(MC_MBUF));
7107                 }
7108                 ((mcache_obj_t *)m)->obj_next = next;
7109                 return;
7110         }
7111
7112         /* Check if the buffer has been corrupted while in freelist */
7113         if (mclverify) {
7114                 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
7115         }
7116         /* Restore constructed mbuf fields */
7117         mcl_audit_restore_mbuf(m, mca, composite);
7118 }
7119
7120 static void
7121 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
7122 {
7123         struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
7124
7125         if (composite) {
7126                 struct mbuf *next = m->m_next;
7127                 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
7128                     MBUF_IS_COMPOSITE(ms));
7129                 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7130                 /*
7131                  * We could have hand-picked the mbuf fields and restore
7132                  * them individually, but that will be a maintenance
7133                  * headache.  Instead, restore everything that was saved;
7134                  * the mbuf layer will recheck and reinitialize anyway.
7135                  */
7136                 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
7137                 m->m_next = next;
7138         } else {
7139                 /*
7140                  * For a regular mbuf (no cluster attached) there's nothing
7141                  * to restore other than the type field, which is expected
7142                  * to be MT_FREE.
7143                  */
7144                 m->m_type = ms->m_type;
7145         }
7146         _MCHECK(m);
7147 }
7148
7149 static void
7150 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
7151 {
7152         VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7153         _MCHECK(m);
7154         bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
7155 }
7156
7157 static void
7158 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
7159     boolean_t save_next)
7160 {
7161         mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
7162
7163         if (!alloc) {
7164                 if (mclverify) {
7165                         mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7166                 }
7167                 if (save_next) {
7168                         mcl_audit_verify_nextptr(next, mca);
7169                         ((mcache_obj_t *)addr)->obj_next = next;
7170                 }
7171         } else if (mclverify) {
7172                 /* Check if the buffer has been corrupted while in freelist */
7173                 mcl_audit_verify_nextptr(next, mca);
7174                 mcache_audit_free_verify_set(mca, addr, 0, size);
7175         }
7176 }
7177
7178 static void
7179 mcl_audit_scratch(mcache_audit_t *mca)
7180 {
7181         void *stack[MCACHE_STACK_DEPTH + 1];
7182         mcl_scratch_audit_t *msa;
7183         struct timeval now;
7184
7185         VERIFY(mca->mca_contents != NULL);
7186         msa = MCA_SAVED_SCRATCH_PTR(mca);
7187
7188         msa->msa_pthread = msa->msa_thread;
7189         msa->msa_thread = current_thread();
7190         bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
7191         msa->msa_pdepth = msa->msa_depth;
7192         bzero(stack, sizeof (stack));
7193         msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
7194         bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
7195
7196         msa->msa_ptstamp = msa->msa_tstamp;
7197         microuptime(&now);
7198         /* tstamp is in ms relative to base_ts */
7199         msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
7200         if ((now.tv_sec - mb_start.tv_sec) > 0)
7201                 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
7202 }
7203
7204 static void
7205 mcl_audit_mcheck_panic(struct mbuf *m)
7206 {
7207         mcache_audit_t *mca;
7208
7209         MRANGE(m);
7210         mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7211
7212         panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
7213             m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
7214         /* NOTREACHED */
7215 }
7216
7217 static void
7218 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7219 {
7220         if (next != NULL && !MBUF_IN_MAP(next) &&
7221             (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7222                 panic("mcl_audit: buffer %p modified after free at offset 0: "
7223                     "%p out of range [%p-%p)\n%s\n",
7224                     mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
7225                 /* NOTREACHED */
7226         }
7227 }
7228
7229 /* This function turns on mbuf leak detection */
7230 static void
7231 mleak_activate(void)
7232 {
7233         mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7234         PE_parse_boot_argn("mleak_sample_factor",
7235             &mleak_table.mleak_sample_factor,
7236             sizeof (mleak_table.mleak_sample_factor));
7237
7238         if (mleak_table.mleak_sample_factor == 0)
7239                 mclfindleak = 0;
7240
7241         if (mclfindleak == 0)
7242                 return;
7243
7244         vm_size_t alloc_size =
7245             mleak_alloc_buckets * sizeof (struct mallocation);
7246         vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
7247
7248         MALLOC(mleak_allocations, struct mallocation *, alloc_size,
7249             M_TEMP, M_WAITOK | M_ZERO);
7250         VERIFY(mleak_allocations != NULL);
7251
7252         MALLOC(mleak_traces, struct mtrace *, trace_size,
7253             M_TEMP, M_WAITOK | M_ZERO);
7254         VERIFY(mleak_traces != NULL);
7255
7256         MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7257             M_TEMP, M_WAITOK | M_ZERO);
7258         VERIFY(mleak_stat != NULL);
7259         mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7260 #ifdef __LP64__
7261         mleak_stat->ml_isaddr64 = 1;
7262 #endif /* __LP64__ */
7263 }
7264
7265 static void
7266 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7267 {
7268         int temp;
7269
7270         if (mclfindleak == 0)
7271                 return;
7272
7273         if (!alloc)
7274                 return (mleak_free(addr));
7275
7276         temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7277
7278         if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7279                 uintptr_t bt[MLEAK_STACK_DEPTH];
7280                 int logged = backtrace(bt, MLEAK_STACK_DEPTH);
7281                 mleak_log(bt, addr, logged, num);
7282         }
7283 }
7284
7285 /*
7286  * This function records the allocation in the mleak_allocations table
7287  * and the backtrace in the mleak_traces table; if allocation slot is in use,
7288  * replace old allocation with new one if the trace slot is in use, return
7289  * (or increment refcount if same trace).
7290  */
7291 static boolean_t
7292 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7293 {
7294         struct mallocation *allocation;
7295         struct mtrace *trace;
7296         uint32_t trace_index;
7297
7298         /* Quit if someone else modifying the tables */
7299         if (!lck_mtx_try_lock_spin(mleak_lock)) {
7300                 mleak_table.total_conflicts++;
7301                 return (FALSE);
7302         }
7303
7304         allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7305             mleak_alloc_buckets)];
7306         trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7307         trace = &mleak_traces[trace_index];
7308
7309         VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7310         VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7311
7312         allocation->hitcount++;
7313         trace->hitcount++;
7314
7315         /*
7316          * If the allocation bucket we want is occupied
7317          * and the occupier has the same trace, just bail.
7318          */
7319         if (allocation->element != NULL &&
7320             trace_index == allocation->trace_index) {
7321                 mleak_table.alloc_collisions++;
7322                 lck_mtx_unlock(mleak_lock);
7323                 return (TRUE);
7324         }
7325
7326         /*
7327          * Store the backtrace in the traces array;
7328          * Size of zero = trace bucket is free.
7329          */
7330         if (trace->allocs > 0 &&
7331             bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
7332                 /* Different, unique trace, but the same hash! Bail out. */
7333                 trace->collisions++;
7334                 mleak_table.trace_collisions++;
7335                 lck_mtx_unlock(mleak_lock);
7336                 return (TRUE);
7337         } else if (trace->allocs > 0) {
7338                 /* Same trace, already added, so increment refcount */
7339                 trace->allocs++;
7340         } else {
7341                 /* Found an unused trace bucket, so record the trace here */
7342                 if (trace->depth != 0) {
7343                         /* this slot previously used but not currently in use */
7344                         mleak_table.trace_overwrites++;
7345                 }
7346                 mleak_table.trace_recorded++;
7347                 trace->allocs = 1;
7348                 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
7349                 trace->depth = depth;
7350                 trace->collisions = 0;
7351         }
7352
7353         /* Step 2: Store the allocation record in the allocations array */
7354         if (allocation->element != NULL) {
7355                 /*
7356                  * Replace an existing allocation.  No need to preserve
7357                  * because only a subset of the allocations are being
7358                  * recorded anyway.
7359                  */
7360                 mleak_table.alloc_collisions++;
7361         } else if (allocation->trace_index != 0) {
7362                 mleak_table.alloc_overwrites++;
7363         }
7364         allocation->element = addr;
7365         allocation->trace_index = trace_index;
7366         allocation->count = num;
7367         mleak_table.alloc_recorded++;
7368         mleak_table.outstanding_allocs++;
7369
7370         lck_mtx_unlock(mleak_lock);
7371         return (TRUE);
7372 }
7373
7374 static void
7375 mleak_free(mcache_obj_t *addr)
7376 {
7377         while (addr != NULL) {
7378                 struct mallocation *allocation = &mleak_allocations
7379                     [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7380
7381                 if (allocation->element == addr &&
7382                     allocation->trace_index < mleak_trace_buckets) {
7383                         lck_mtx_lock_spin(mleak_lock);
7384                         if (allocation->element == addr &&
7385                             allocation->trace_index < mleak_trace_buckets) {
7386                                 struct mtrace *trace;
7387                                 trace = &mleak_traces[allocation->trace_index];
7388                                 /* allocs = 0 means trace bucket is unused */
7389                                 if (trace->allocs > 0)
7390                                         trace->allocs--;
7391                                 if (trace->allocs == 0)
7392                                         trace->depth = 0;
7393                                 /* NULL element means alloc bucket is unused */
7394                                 allocation->element = NULL;
7395                                 mleak_table.outstanding_allocs--;
7396                         }
7397                         lck_mtx_unlock(mleak_lock);
7398                 }
7399                 addr = addr->obj_next;
7400         }
7401 }
7402
7403 static void
7404 mleak_sort_traces()
7405 {
7406         int i, j, k;
7407         struct mtrace *swap;
7408
7409         for(i = 0; i < MLEAK_NUM_TRACES; i++)
7410                 mleak_top_trace[i] = NULL;
7411
7412         for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
7413         {
7414                 if (mleak_traces[i].allocs <= 0)
7415                         continue;
7416
7417                 mleak_top_trace[j] = &mleak_traces[i];
7418                 for (k = j; k > 0; k--) {
7419                         if (mleak_top_trace[k]->allocs <=
7420                             mleak_top_trace[k-1]->allocs)
7421                                 break;
7422
7423                         swap = mleak_top_trace[k-1];
7424                         mleak_top_trace[k-1] = mleak_top_trace[k];
7425                         mleak_top_trace[k] = swap;
7426                 }
7427                 j++;
7428         }
7429
7430         j--;
7431         for(; i < mleak_trace_buckets; i++) {
7432                 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
7433                         continue;
7434
7435                 mleak_top_trace[j] = &mleak_traces[i];
7436
7437                 for (k = j; k > 0; k--) {
7438                         if (mleak_top_trace[k]->allocs <=
7439                             mleak_top_trace[k-1]->allocs)
7440                                 break;
7441
7442                         swap = mleak_top_trace[k-1];
7443                         mleak_top_trace[k-1] = mleak_top_trace[k];
7444                         mleak_top_trace[k] = swap;
7445                 }
7446         }
7447 }
7448
7449 static void
7450 mleak_update_stats()
7451 {
7452         mleak_trace_stat_t *mltr;
7453         int i;
7454
7455         VERIFY(mleak_stat != NULL);
7456 #ifdef __LP64__
7457         VERIFY(mleak_stat->ml_isaddr64);
7458 #else
7459         VERIFY(!mleak_stat->ml_isaddr64);
7460 #endif /* !__LP64__ */
7461         VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7462
7463         mleak_sort_traces();
7464
7465         mltr = &mleak_stat->ml_trace[0];
7466         bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
7467         for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7468                 int j;
7469
7470                 if (mleak_top_trace[i] == NULL ||
7471                     mleak_top_trace[i]->allocs == 0)
7472                         continue;
7473
7474                 mltr->mltr_collisions   = mleak_top_trace[i]->collisions;
7475                 mltr->mltr_hitcount     = mleak_top_trace[i]->hitcount;
7476                 mltr->mltr_allocs       = mleak_top_trace[i]->allocs;
7477                 mltr->mltr_depth        = mleak_top_trace[i]->depth;
7478
7479                 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7480                 for (j = 0; j < mltr->mltr_depth; j++)
7481                         mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7482
7483                 mltr++;
7484         }
7485 }
7486
7487 static struct mbtypes {
7488         int             mt_type;
7489         const char      *mt_name;
7490 } mbtypes[] = {
7491         { MT_DATA,      "data" },
7492         { MT_OOBDATA,   "oob data" },
7493         { MT_CONTROL,   "ancillary data" },
7494         { MT_HEADER,    "packet headers" },
7495         { MT_SOCKET,    "socket structures" },
7496         { MT_PCB,       "protocol control blocks" },
7497         { MT_RTABLE,    "routing table entries" },
7498         { MT_HTABLE,    "IMP host table entries" },
7499         { MT_ATABLE,    "address resolution tables" },
7500         { MT_FTABLE,    "fragment reassembly queue headers" },
7501         { MT_SONAME,    "socket names and addresses" },
7502         { MT_SOOPTS,    "socket options" },
7503         { MT_RIGHTS,    "access rights" },
7504         { MT_IFADDR,    "interface addresses" },
7505         { MT_TAG,       "packet tags" },
7506         { 0,            NULL }
7507 };
7508
/*
 * Account for the `k' characters just produced by snprintf(): shrink the
 * remaining space `clen' and, if less than one byte is left, jump to the
 * caller's `done' label; otherwise advance the write cursor `c'.
 *
 * Relies on the locals `c', `k', `clen' and the label `done' being in
 * scope at the point of use.  Wrapped in do { } while (0) so that the
 * macro plus its trailing semicolon forms exactly one statement.
 */
#define MBUF_DUMP_BUF_CHK() do {        \
        clen -= k;                      \
        if (clen < 1)                   \
                goto done;              \
        c += k;                         \
} while (0)
7515
7516 static char *
7517 mbuf_dump(void)
7518 {
7519         unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
7520             totreturned = 0;
7521         u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
7522         u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
7523         u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
7524         int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7525         uint8_t seen[256];
7526         struct mbtypes *mp;
7527         mb_class_stat_t *sp;
7528         mleak_trace_stat_t *mltr;
7529         char *c = mbuf_dump_buf;
7530         int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
7531         bool printed_banner = false;
7532
7533         mbuf_dump_buf[0] = '\0';
7534
7535         /* synchronize all statistics in the mbuf table */
7536         mbuf_stat_sync();
7537         mbuf_mtypes_sync(TRUE);
7538
7539         sp = &mb_stat->mbs_class[0];
7540         for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7541                 u_int32_t mem;
7542
7543                 if (m_class(i) == MC_MBUF) {
7544                         m_mbufs = sp->mbcl_active;
7545                 } else if (m_class(i) == MC_CL) {
7546                         m_clfree = sp->mbcl_total - sp->mbcl_active;
7547                 } else if (m_class(i) == MC_BIGCL) {
7548                         m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7549                 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7550                         m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7551                         m_16kclusters = sp->mbcl_total;
7552                 } else if (m_class(i) == MC_MBUF_CL) {
7553                         m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7554                 } else if (m_class(i) == MC_MBUF_BIGCL) {
7555                         m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7556                 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7557                         m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7558                 }
7559
7560                 mem = sp->mbcl_ctotal * sp->mbcl_size;
7561                 totmem += mem;
7562                 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7563                     sp->mbcl_size;
7564                 totreturned += sp->mbcl_release_cnt;
7565
7566         }
7567
7568         /* adjust free counts to include composite caches */
7569         m_clfree += m_mbufclfree;
7570         m_bigclfree += m_mbufbigclfree;
7571         m_16kclfree += m_mbuf16kclfree;
7572
7573         totmbufs = 0;
7574         for (mp = mbtypes; mp->mt_name != NULL; mp++)
7575                 totmbufs += mbstat.m_mtypes[mp->mt_type];
7576         if (totmbufs > m_mbufs)
7577                 totmbufs = m_mbufs;
7578         k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7579         MBUF_DUMP_BUF_CHK();
7580
7581         bzero(&seen, sizeof (seen));
7582         for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7583                 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7584                         seen[mp->mt_type] = 1;
7585                         k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7586                             mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7587                         MBUF_DUMP_BUF_CHK();
7588                 }
7589         }
7590         seen[MT_FREE] = 1;
7591         for (i = 0; i < nmbtypes; i++)
7592                 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7593                         k = snprintf(c, clen, "\t%u mbufs allocated to "
7594                             "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7595                         MBUF_DUMP_BUF_CHK();
7596                 }
7597         if ((m_mbufs - totmbufs) > 0) {
7598                 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7599                     m_mbufs - totmbufs);
7600                 MBUF_DUMP_BUF_CHK();
7601         }
7602         k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7603             "%u/%u mbuf 4KB clusters in use\n",
7604             (unsigned int)(mbstat.m_clusters - m_clfree),
7605             (unsigned int)mbstat.m_clusters,
7606             (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7607             (unsigned int)mbstat.m_bigclusters);
7608         MBUF_DUMP_BUF_CHK();
7609
7610         if (njcl > 0) {
7611                 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7612                     m_16kclusters - m_16kclfree, m_16kclusters,
7613                     njclbytes / 1024);
7614                 MBUF_DUMP_BUF_CHK();
7615         }
7616         totused = totmem - totfree;
7617         if (totmem == 0) {
7618                 totpct = 0;
7619         } else if (totused < (ULONG_MAX / 100)) {
7620                 totpct = (totused * 100) / totmem;
7621         } else {
7622                 u_long totmem1 = totmem / 100;
7623                 u_long totused1 = totused / 100;
7624                 totpct = (totused1 * 100) / totmem1;
7625         }
7626         k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7627             "in use)\n", totmem / 1024, totpct);
7628         MBUF_DUMP_BUF_CHK();
7629         k = snprintf(c, clen, "%lu KB returned to the system\n",
7630             totreturned / 1024);
7631         MBUF_DUMP_BUF_CHK();
7632
7633         net_update_uptime();
7634         k = snprintf(c, clen,
7635             "VM allocation failures: contiguous %u, normal %u, one page %u\n",
7636             mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
7637         MBUF_DUMP_BUF_CHK();
7638         if (mb_kmem_contig_failed_ts || mb_kmem_failed_ts ||
7639             mb_kmem_one_failed_ts) {
7640                 k = snprintf(c, clen,
7641                     "VM allocation failure timestamps: contiguous %llu "
7642                     "(size %llu), normal %llu (size %llu), one page %llu "
7643                     "(now %llu)\n",
7644                     mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
7645                     mb_kmem_failed_ts, mb_kmem_failed_size,
7646                     mb_kmem_one_failed_ts, net_uptime());
7647                 MBUF_DUMP_BUF_CHK();
7648                 k = snprintf(c, clen,
7649                     "VM return codes: ");
7650                 MBUF_DUMP_BUF_CHK();
7651                 for (i = 0;
7652                      i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]);
7653                      i++) {
7654                         k = snprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
7655                             mb_kmem_stats[i]);
7656                         MBUF_DUMP_BUF_CHK();
7657                 }
7658                 k = snprintf(c, clen, "\n");
7659                 MBUF_DUMP_BUF_CHK();
7660         }
7661         k = snprintf(c, clen,
7662             "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
7663             "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
7664             mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
7665             mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
7666             mb_expand_16kcl_total);
7667         MBUF_DUMP_BUF_CHK();
7668         if (mbuf_worker_last_runtime != 0) {
7669                 k = snprintf(c, clen, "worker thread last run time: "
7670                     "%llu (%llu seconds ago)\n",
7671                     mbuf_worker_last_runtime,
7672                     net_uptime() - mbuf_worker_last_runtime);
7673                 MBUF_DUMP_BUF_CHK();
7674         }
7675         if (mbuf_drain_last_runtime != 0) {
7676                 k = snprintf(c, clen, "drain routine last run time: "
7677                     "%llu (%llu seconds ago)\n",
7678                     mbuf_drain_last_runtime,
7679                     net_uptime() - mbuf_drain_last_runtime);
7680                 MBUF_DUMP_BUF_CHK();
7681         }
7682
7683 #if DEBUG || DEVELOPMENT
7684         k = snprintf(c, clen, "\nworker thread log:\n%s\n", mbwdog_logging);
7685         MBUF_DUMP_BUF_CHK();
7686 #endif
7687
7688         for (j = 0; j < MTRACELARGE_NUM_TRACES; j++) {
7689                 struct mtracelarge *trace = &mtracelarge_table[j];
7690                 if (trace->size == 0 || trace->depth == 0)
7691                         continue;
7692                 if (printed_banner == false) {
7693                         k = snprintf(c, clen,
7694                             "\nlargest allocation failure backtraces:\n");
7695                         MBUF_DUMP_BUF_CHK();
7696                         printed_banner = true;
7697                 }
7698                 k = snprintf(c, clen, "size %llu: < ", trace->size);
7699                 MBUF_DUMP_BUF_CHK();
7700                 for (i = 0; i < trace->depth; i++) {
7701                         if (mleak_stat->ml_isaddr64) {
7702                                 k = snprintf(c, clen, "0x%0llx ",
7703                                     (uint64_t)VM_KERNEL_UNSLIDE(
7704                                             trace->addr[i]));
7705                         } else {
7706                                 k = snprintf(c, clen,
7707                                     "0x%08x ",
7708                                     (uint32_t)VM_KERNEL_UNSLIDE(
7709                                             trace->addr[i]));
7710                         }
7711                         MBUF_DUMP_BUF_CHK();
7712                 }
7713                 k = snprintf(c, clen, ">\n");
7714                 MBUF_DUMP_BUF_CHK();
7715         }
7716
7717         /* mbuf leak detection statistics */
7718         mleak_update_stats();
7719
7720         k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7721         MBUF_DUMP_BUF_CHK();
7722         k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7723             mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7724             mleak_table.mleak_sample_factor);
7725         MBUF_DUMP_BUF_CHK();
7726         k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7727             mleak_table.outstanding_allocs);
7728         MBUF_DUMP_BUF_CHK();
7729         k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7730             mleak_table.alloc_recorded, mleak_table.trace_recorded);
7731         MBUF_DUMP_BUF_CHK();
7732         k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7733             mleak_table.alloc_collisions, mleak_table.trace_collisions);
7734         MBUF_DUMP_BUF_CHK();
7735         k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7736             mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7737         MBUF_DUMP_BUF_CHK();
7738         k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7739             mleak_table.total_conflicts);
7740         MBUF_DUMP_BUF_CHK();
7741
7742         k = snprintf(c, clen, "top %d outstanding traces:\n",
7743             mleak_stat->ml_cnt);
7744         MBUF_DUMP_BUF_CHK();
7745         for (i = 0; i < mleak_stat->ml_cnt; i++) {
7746                 mltr = &mleak_stat->ml_trace[i];
7747                 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7748                     "%llu hit(s), %llu collision(s)\n", (i + 1),
7749                     mltr->mltr_allocs, mltr->mltr_hitcount,
7750                     mltr->mltr_collisions);
7751                 MBUF_DUMP_BUF_CHK();
7752         }
7753
7754         if (mleak_stat->ml_isaddr64)
7755                 k = snprintf(c, clen, MB_LEAK_HDR_64);
7756         else
7757                 k = snprintf(c, clen, MB_LEAK_HDR_32);
7758         MBUF_DUMP_BUF_CHK();
7759
7760         for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7761                 k = snprintf(c, clen, "%2d: ", (i + 1));
7762                 MBUF_DUMP_BUF_CHK();
7763                 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7764                         mltr = &mleak_stat->ml_trace[j];
7765                         if (i < mltr->mltr_depth) {
7766                                 if (mleak_stat->ml_isaddr64) {
7767                                         k = snprintf(c, clen, "0x%0llx  ",
7768                                             (uint64_t)VM_KERNEL_UNSLIDE(
7769                                                 mltr->mltr_addr[i]));
7770                                 } else {
7771                                         k = snprintf(c, clen,
7772                                             "0x%08x  ",
7773                                             (uint32_t)VM_KERNEL_UNSLIDE(
7774                                                 mltr->mltr_addr[i]));
7775                                 }
7776                         } else {
7777                                 if (mleak_stat->ml_isaddr64)
7778                                         k = snprintf(c, clen,
7779                                             MB_LEAK_SPACING_64);
7780                                 else
7781                                         k = snprintf(c, clen,
7782                                             MB_LEAK_SPACING_32);
7783                         }
7784                         MBUF_DUMP_BUF_CHK();
7785                 }
7786                 k = snprintf(c, clen, "\n");
7787                 MBUF_DUMP_BUF_CHK();
7788         }
7789 done:
7790         return (mbuf_dump_buf);
7791 }
7792
7793 #undef MBUF_DUMP_BUF_CHK
7794
7795 /*
7796  * Convert between a regular and a packet header mbuf.  Caller is responsible
7797  * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7798  */
7799 int
7800 m_reinit(struct mbuf *m, int hdr)
7801 {
7802         int ret = 0;
7803
7804         if (hdr) {
7805                 VERIFY(!(m->m_flags & M_PKTHDR));
7806                 if (!(m->m_flags & M_EXT) &&
7807                     (m->m_data != m->m_dat || m->m_len > 0)) {
7808                         /*
7809                          * If there's no external cluster attached and the
7810                          * mbuf appears to contain user data, we cannot
7811                          * safely convert this to a packet header mbuf,
7812                          * as the packet header structure might overlap
7813                          * with the data.
7814                          */
7815                         printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7816                             "m_data %llx (expected %llx), "
7817                             "m_len %d (expected 0)\n",
7818                             __func__,
7819                             (uint64_t)VM_KERNEL_ADDRPERM(m),
7820                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7821                             (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7822                         ret = EBUSY;
7823                 } else {
7824                         VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7825                         m->m_flags |= M_PKTHDR;
7826                         MBUF_INIT_PKTHDR(m);
7827                 }
7828         } else {
7829                 /* Check for scratch area overflow */
7830                 m_redzone_verify(m);
7831                 /* Free the aux data and tags if there is any */
7832                 m_tag_delete_chain(m, NULL);
7833                 m->m_flags &= ~M_PKTHDR;
7834         }
7835
7836         return (ret);
7837 }
7838
7839 int
7840 m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
7841 {
7842         ASSERT(m->m_flags & M_EXT);
7843         return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
7844 }
7845
7846 uint32_t
7847 m_ext_get_prop(struct mbuf *m)
7848 {
7849         ASSERT(m->m_flags & M_EXT);
7850         return (MEXT_PRIV(m));
7851 }
7852
/*
 * Report whether a paired mbuf is active, i.e. whether MEXT_PREF(m) is
 * above MEXT_MINREF(m).  An mbuf that is not paired always yields 1.
 */
int
m_ext_paired_is_active(struct mbuf *m)
{
        if (!MBUF_IS_PAIRED(m))
                return (1);

        return (MEXT_PREF(m) > MEXT_MINREF(m));
}
7858
7859 void
7860 m_ext_paired_activate(struct mbuf *m)
7861 {
7862         struct ext_ref *rfa;
7863         int hdr, type;
7864         caddr_t extbuf;
7865         m_ext_free_func_t extfree;
7866         u_int extsize;
7867
7868         VERIFY(MBUF_IS_PAIRED(m));
7869         VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
7870         VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
7871
7872         hdr = (m->m_flags & M_PKTHDR);
7873         type = m->m_type;
7874         extbuf = m->m_ext.ext_buf;
7875         extfree = m_get_ext_free(m);
7876         extsize = m->m_ext.ext_size;
7877         rfa = m_get_rfa(m);
7878
7879         VERIFY(extbuf != NULL && rfa != NULL);
7880
7881         /*
7882          * Safe to reinitialize packet header tags, since it's
7883          * already taken care of at m_free() time.  Similar to
7884          * what's done in m_clattach() for the cluster.  Bump
7885          * up MEXT_PREF to indicate activation.
7886          */
7887         MBUF_INIT(m, hdr, type);
7888         MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
7889             1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
7890 }
7891
7892 void
7893 m_scratch_init(struct mbuf *m)
7894 {
7895         struct pkthdr *pkt = &m->m_pkthdr;
7896
7897         VERIFY(m->m_flags & M_PKTHDR);
7898
7899         /* See comments in <rdar://problem/14040693> */
7900         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7901                 panic_plain("Invalid attempt to modify guarded module-private "
7902                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7903                 /* NOTREACHED */
7904         }
7905
7906         bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7907 }
7908
7909 /*
7910  * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7911  * xnu that intend on utilizing the module-private area should directly
7912  * refer to the pkt_mpriv structure in the pkthdr.  They are also expected
7913  * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7914  * to handing it off to another module, respectively.
7915  */
7916 u_int32_t
7917 m_scratch_get(struct mbuf *m, u_int8_t **p)
7918 {
7919         struct pkthdr *pkt = &m->m_pkthdr;
7920
7921         VERIFY(m->m_flags & M_PKTHDR);
7922
7923         /* See comments in <rdar://problem/14040693> */
7924         if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7925                 panic_plain("Invalid attempt to access guarded module-private "
7926                     "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7927                 /* NOTREACHED */
7928         }
7929
7930         if (mcltrace) {
7931                 mcache_audit_t *mca;
7932
7933                 lck_mtx_lock(mbuf_mlock);
7934                 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7935                 if (mca->mca_uflags & MB_SCVALID)
7936                         mcl_audit_scratch(mca);
7937                 lck_mtx_unlock(mbuf_mlock);
7938         }
7939
7940         *p = (u_int8_t *)&pkt->pkt_mpriv;
7941         return (sizeof (pkt->pkt_mpriv));
7942 }
7943
7944 static void
7945 m_redzone_init(struct mbuf *m)
7946 {
7947         VERIFY(m->m_flags & M_PKTHDR);
7948         /*
7949          * Each mbuf has a unique red zone pattern, which is a XOR
7950          * of the red zone cookie and the address of the mbuf.
7951          */
7952         m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7953 }
7954
7955 static void
7956 m_redzone_verify(struct mbuf *m)
7957 {
7958         u_int32_t mb_redzone;
7959
7960         VERIFY(m->m_flags & M_PKTHDR);
7961
7962         mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7963         if (m->m_pkthdr.redzone != mb_redzone) {
7964                 panic("mbuf %p redzone violation with value 0x%x "
7965                     "(instead of 0x%x, using cookie 0x%x)\n",
7966                     m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7967                 /* NOTREACHED */
7968         }
7969 }
7970
7971 __private_extern__ inline void
7972 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
7973     caddr_t ext_arg)
7974 {
7975         VERIFY(m->m_flags & M_EXT);
7976         if (rfa != NULL) {
7977                 m->m_ext.ext_refflags =
7978                     (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
7979                 if (ext_free != NULL) {
7980                         rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
7981                             mb_obscure_extfree;
7982                         m->m_ext.ext_free = (m_ext_free_func_t)
7983                             (((uintptr_t)ext_free) ^ rfa->ext_token);
7984                         if (ext_arg != NULL) {
7985                                 m->m_ext.ext_arg =
7986                                     (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
7987                         } else {
7988                                 m->m_ext.ext_arg = NULL;
7989                         }
7990                 } else {
7991                         rfa->ext_token = 0;
7992                         m->m_ext.ext_free = NULL;
7993                         m->m_ext.ext_arg = NULL;
7994                 }
7995         } else {
7996                 /*
7997                  * If we are going to loose the cookie in ext_token by
7998                  * resetting the rfa, we should use the global cookie
7999                  * to obscure the ext_free and ext_arg pointers.
8000                  */
8001                 if (ext_free != NULL) {
8002                         m->m_ext.ext_free =
8003                             (m_ext_free_func_t)((uintptr_t)ext_free ^
8004                             mb_obscure_extfree);
8005                         if (ext_arg != NULL) {
8006                                 m->m_ext.ext_arg =
8007                                     (caddr_t)((uintptr_t)ext_arg ^
8008                                     mb_obscure_extfree);
8009                         } else {
8010                                 m->m_ext.ext_arg = NULL;
8011                         }
8012                 } else {
8013                         m->m_ext.ext_free = NULL;
8014                         m->m_ext.ext_arg = NULL;
8015                 }
8016                 m->m_ext.ext_refflags = NULL;
8017         }
8018 }
8019
8020 __private_extern__ inline struct ext_ref *
8021 m_get_rfa(struct mbuf *m)
8022 {
8023         if (m->m_ext.ext_refflags == NULL)
8024                 return (NULL);
8025         else
8026                 return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
8027 }
8028
8029 __private_extern__ inline m_ext_free_func_t
8030 m_get_ext_free(struct mbuf *m)
8031 {
8032         struct ext_ref *rfa;
8033         if (m->m_ext.ext_free == NULL)
8034                 return (NULL);
8035
8036         rfa = m_get_rfa(m);
8037         if (rfa == NULL)
8038                 return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
8039         else
8040                 return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
8041                     ^ rfa->ext_token));
8042 }
8043
8044 __private_extern__ inline caddr_t
8045 m_get_ext_arg(struct mbuf *m)
8046 {
8047         struct ext_ref *rfa;
8048         if (m->m_ext.ext_arg == NULL)
8049                 return (NULL);
8050
8051         rfa = m_get_rfa(m);
8052         if (rfa == NULL) {
8053                 return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
8054         } else {
8055                 return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
8056                     rfa->ext_token));
8057         }
8058 }
8059
8060 /*
8061  * Send a report of mbuf usage if the usage is at least 6% of max limit
8062  * or if there has been at least 3% increase since the last report.
8063  *
8064  * The values 6% and 3% are chosen so that we can do simple arithmetic
8065  * with shift operations.
8066  */
8067 static boolean_t
8068 mbuf_report_usage(mbuf_class_t cl)
8069 {
8070         /* if a report is already in progress, nothing to do */
8071         if (mb_peak_newreport)
8072                 return (TRUE);
8073
8074         if (m_total(cl) > m_peak(cl) &&
8075             m_total(cl) >= (m_maxlimit(cl) >> 4) &&
8076             (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
8077                 return (TRUE);
8078         return (FALSE);
8079 }
8080
8081 __private_extern__ void
8082 mbuf_report_peak_usage(void)
8083 {
8084         int i = 0;
8085         u_int64_t uptime;
8086         struct nstat_sysinfo_data ns_data;
8087         uint32_t memreleased = 0;
8088         static uint32_t prevmemreleased;
8089
8090         uptime = net_uptime();
8091         lck_mtx_lock(mbuf_mlock);
8092
8093         /* Generate an initial report after 1 week of uptime */
8094         if (!mb_peak_firstreport &&
8095             uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
8096                 mb_peak_newreport = TRUE;
8097                 mb_peak_firstreport = TRUE;
8098         }
8099
8100         if (!mb_peak_newreport) {
8101                 lck_mtx_unlock(mbuf_mlock);
8102                 return;
8103         }
8104
8105         /*
8106          * Since a report is being generated before 1 week,
8107          * we do not need to force another one later
8108          */
8109         if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
8110                 mb_peak_firstreport = TRUE;
8111
8112         for (i = 0; i < NELEM(mbuf_table); i++) {
8113                 m_peak(m_class(i)) = m_total(m_class(i));
8114                 memreleased += m_release_cnt(i);
8115         }
8116         memreleased = memreleased - prevmemreleased;
8117         prevmemreleased = memreleased;
8118         mb_peak_newreport = FALSE;
8119         lck_mtx_unlock(mbuf_mlock);
8120
8121         bzero(&ns_data, sizeof(ns_data));
8122         ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
8123         ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
8124         ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
8125         ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
8126         ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
8127         ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
8128         ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
8129         ns_data.u.mb_stats.draincnt = mbstat.m_drain;
8130         ns_data.u.mb_stats.memreleased = memreleased;
8131         ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
8132
8133         nstat_sysinfo_send_data(&ns_data);
8134
8135         /*
8136          * Reset the floor whenever we report a new
8137          * peak to track the trend (increase peek usage
8138          * is not a leak if mbufs get released
8139          * between reports and the floor stays low)
8140          */
8141         total_sbmb_cnt_floor = total_sbmb_cnt_peak;
8142 }
8143
8144 /*
8145  * Simple routine to avoid taking the lock when we can't run the
8146  * mbuf drain.
8147  */
8148 static int
8149 mbuf_drain_checks(boolean_t ignore_waiters)
8150 {
8151
8152         if (mb_drain_maxint == 0)
8153                 return 0;
8154         if (!ignore_waiters && mb_waiters != 0)
8155                 return 0;
8156
8157         return 1;
8158 }
8159
8160 /*
8161  * Called by the VM when there's memory pressure or when we exhausted
8162  * the 4k/16k reserved space.
8163  */
8164 static void
8165 mbuf_drain_locked(boolean_t ignore_waiters)
8166 {
8167         mbuf_class_t mc;
8168         mcl_slab_t *sp, *sp_tmp, *nsp;
8169         unsigned int num, k, interval, released = 0;
8170         unsigned long total_mem = 0, use_mem = 0;
8171         boolean_t ret, purge_caches = FALSE;
8172         ppnum_t offset;
8173         mcache_obj_t *obj;
8174         unsigned long per;
8175         static unsigned char scratch[32];
8176         static ppnum_t scratch_pa = 0;
8177
8178         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8179         if (!mbuf_drain_checks(ignore_waiters))
8180                 return;
8181         if (scratch_pa == 0) {
8182                 bzero(scratch, sizeof(scratch));
8183                 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
8184                 VERIFY(scratch_pa);
8185         } else if (mclverify) {
8186                 /*
8187                  * Panic if a driver wrote to our scratch memory.
8188                  */
8189                 for (k = 0; k < sizeof(scratch); k++)
8190                         if (scratch[k])
8191                                 panic("suspect DMA to freed address");
8192         }
8193         /*
8194          * Don't free memory too often as that could cause excessive
8195          * waiting times for mbufs.  Purge caches if we were asked to drain
8196          * in the last 5 minutes.
8197          */
8198         if (mbuf_drain_last_runtime != 0) {
8199                 interval = net_uptime() - mbuf_drain_last_runtime;
8200                 if (interval <= mb_drain_maxint) {
8201                         return;
8202                 }
8203                 if (interval <= mb_drain_maxint * 5)
8204                         purge_caches = TRUE;
8205         }
8206         mbuf_drain_last_runtime = net_uptime();
8207         /*
8208          * Don't free any memory if we're using 60% or more.
8209          */
8210         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8211                 total_mem += m_total(mc) * m_maxsize(mc);
8212                 use_mem += m_active(mc) * m_maxsize(mc);
8213         }
8214         per = (use_mem * 100) / total_mem;
8215         if (per >= 60) {
8216                 return;
8217         }
8218         /*
8219          * Purge all the caches.  This effectively disables
8220          * caching for a few seconds, but the mbuf worker thread will
8221          * re-enable them again.
8222          */
8223         if (purge_caches == TRUE)
8224                 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8225                         if (m_total(mc) < m_avgtotal(mc))
8226                                 continue;
8227                         lck_mtx_unlock(mbuf_mlock);
8228                         ret = mcache_purge_cache(m_cache(mc), FALSE);
8229                         lck_mtx_lock(mbuf_mlock);
8230                         if (ret == TRUE)
8231                                 m_purge_cnt(mc)++;
8232                 }
8233         /*
8234          * Move the objects from the composite class freelist to
8235          * the rudimentary slabs list, but keep at least 10% of the average
8236          * total in the freelist.
8237          */
8238         for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8239                 while (m_cobjlist(mc) &&
8240                     m_total(mc) < m_avgtotal(mc) &&
8241                     m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8242                         obj = m_cobjlist(mc);
8243                         m_cobjlist(mc) = obj->obj_next;
8244                         obj->obj_next = NULL;
8245                         num = cslab_free(mc, obj, 1);
8246                         VERIFY(num == 1);
8247                         m_free_cnt(mc)++;
8248                         m_infree(mc)--;
8249                         /* cslab_free() handles m_total */
8250                 }
8251         }
8252         /*
8253          * Free the buffers present in the slab list up to 10% of the total
8254          * average per class.
8255          *
8256          * We walk the list backwards in an attempt to reduce fragmentation.
8257          */
8258         for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8259                 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8260                         /*
8261                          * Process only unused slabs occupying memory.
8262                          */
8263                         if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8264                             sp->sl_base == NULL)
8265                                 continue;
8266                         if (m_total(mc) < m_avgtotal(mc) ||
8267                             m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
8268                                 break;
8269                         slab_remove(sp, mc);
8270                         switch (mc) {
8271                         case MC_MBUF:
8272                                 m_infree(mc) -= NMBPG;
8273                                 m_total(mc) -= NMBPG;
8274                                 if (mclaudit != NULL)
8275                                         mcl_audit_free(sp->sl_base, NMBPG);
8276                                 break;
8277                         case MC_CL:
8278                                 m_infree(mc) -= NCLPG;
8279                                 m_total(mc) -= NCLPG;
8280                                 if (mclaudit != NULL)
8281                                         mcl_audit_free(sp->sl_base, NMBPG);
8282                                 break;
8283                         case MC_BIGCL:
8284                         {
8285                                 m_infree(mc) -= NBCLPG;
8286                                 m_total(mc) -= NBCLPG;
8287                                 if (mclaudit != NULL)
8288                                         mcl_audit_free(sp->sl_base, NMBPG);
8289                                 break;
8290                         }
8291                         case MC_16KCL:
8292                                 m_infree(mc)--;
8293                                 m_total(mc)--;
8294                                 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8295                                         nsp = nsp->sl_next;
8296                                         VERIFY(nsp->sl_refcnt == 0 &&
8297                                             nsp->sl_base != NULL &&
8298                                             nsp->sl_len == 0);
8299                                         slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8300                                             0);
8301                                         nsp->sl_flags = 0;
8302                                 }
8303                                 if (mclaudit != NULL) {
8304                                         if (sp->sl_len == PAGE_SIZE) {
8305                                                 mcl_audit_free(sp->sl_base,
8306                                                     NMBPG);
8307                                         } else {
8308                                                 mcl_audit_free(sp->sl_base, 1);
8309                                         }
8310                                 }
8311                                 break;
8312                         default:
8313                                 /*
8314                                  * The composite classes have their own
8315                                  * freelist (m_cobjlist), so we only
8316                                  * process rudimentary classes here.
8317                                  */
8318                                 VERIFY(0);
8319                         }
8320                         m_release_cnt(mc) += m_size(mc);
8321                         released += m_size(mc);
8322                         VERIFY(sp->sl_base != NULL &&
8323                             sp->sl_len >= PAGE_SIZE);
8324                         offset = MTOPG(sp->sl_base);
8325                         /*
8326                          * Make sure the IOMapper points to a valid, but
8327                          * bogus, address.  This should prevent further DMA
8328                          * accesses to freed memory.
8329                          */
8330                         IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8331                         mcl_paddr[offset] = 0;
8332                         kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8333                             sp->sl_len);
8334                         slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8335                         sp->sl_flags = 0;
8336                 }
8337         }
8338         mbstat.m_drain++;
8339         mbstat.m_bigclusters = m_total(MC_BIGCL);
8340         mbstat.m_clusters = m_total(MC_CL);
8341         mbstat.m_mbufs = m_total(MC_MBUF);
8342         mbuf_stat_sync();
8343         mbuf_mtypes_sync(TRUE);
8344 }
8345
8346 __private_extern__ void
8347 mbuf_drain(boolean_t ignore_waiters)
8348 {
8349         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
8350         if (!mbuf_drain_checks(ignore_waiters))
8351                 return;
8352         lck_mtx_lock(mbuf_mlock);
8353         mbuf_drain_locked(ignore_waiters);
8354         lck_mtx_unlock(mbuf_mlock);
8355 }
8356
8357
8358 static int
8359 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8360 {
8361 #pragma unused(arg1, arg2)
8362         int val = 0, err;
8363
8364         err = sysctl_handle_int(oidp, &val, 0, req);
8365         if (err != 0 || req->newptr == USER_ADDR_NULL)
8366                 return (err);
8367         if (val) {
8368                 mbuf_drain(TRUE);
8369         }
8370
8371         return (err);
8372 }
8373
8374 #if DEBUG || DEVELOPMENT
8375 static void
8376 _mbwdog_logger(const char *func, const int line, const char *fmt, ...)
8377 {
8378         va_list ap;
8379         struct timeval now;
8380         char str[384], p[256];
8381         int len;
8382
8383         LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8384         if (mbwdog_logging == NULL) {
8385                 mbwdog_logging = _MALLOC(mbwdog_logging_size,
8386                     M_TEMP, M_ZERO|M_NOWAIT);
8387                 if (mbwdog_logging == NULL)
8388                         return;
8389         }
8390         va_start(ap, fmt);
8391         vsnprintf(p, sizeof(p), fmt, ap);
8392         va_end(ap);
8393         microuptime(&now);
8394         len = snprintf(str, sizeof(str),
8395             "\n%ld.%d (%d/%llx) %s:%d %s",
8396             now.tv_sec, now.tv_usec,
8397             current_proc()->p_pid,
8398             (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
8399             func, line, p);
8400         if (len < 0)
8401                 return;
8402         if (mbwdog_logging_used + len > mbwdog_logging_size) {
8403                 mbwdog_logging_used = mbwdog_logging_used / 2;
8404                 memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
8405                     mbwdog_logging_size - mbwdog_logging_used);
8406                 mbwdog_logging[mbwdog_logging_used] = 0;
8407         }
8408         strlcat(mbwdog_logging, str, mbwdog_logging_size);
8409         mbwdog_logging_used += len;
8410 }
8411
8412 static int
8413 sysctl_mbwdog_log SYSCTL_HANDLER_ARGS
8414 {
8415 #pragma unused(oidp, arg1, arg2)
8416     return SYSCTL_OUT(req, mbwdog_logging, mbwdog_logging_used);
8417 }
8418 SYSCTL_DECL(_kern_ipc);
8419 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log,
8420     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
8421     0, 0, sysctl_mbwdog_log, "A", "");
8422
8423 static int mbtest_val;
8424 static int mbtest_running;
8425
8426 static void mbtest_thread(__unused void *arg)
8427 {
8428         int i;
8429         int scale_down = 1;
8430         int iterations = 250;
8431         int allocations = nmbclusters;
8432         iterations = iterations / scale_down;
8433         allocations = allocations / scale_down;
8434         printf("%s thread starting\n", __func__);
8435         for (i = 0; i < iterations; i++) {
8436                 unsigned int needed = allocations;
8437                 struct mbuf *m1, *m2, *m3;
8438
8439                 if (njcl > 0) {
8440                         needed = allocations;
8441                         m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
8442                         m_freem_list(m3);
8443                 }
8444
8445                 needed = allocations;
8446                 m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
8447                 m_freem_list(m2);
8448
8449                 m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
8450                 m_freem_list(m1);
8451         }
8452
8453         printf("%s thread ending\n", __func__);
8454
8455         OSDecrementAtomic(&mbtest_running);
8456         wakeup_one((caddr_t)&mbtest_running);
8457 }
8458
8459 static void sysctl_mbtest(void)
8460 {
8461         /* We launch three threads - wait for all of them */
8462         OSIncrementAtomic(&mbtest_running);
8463         OSIncrementAtomic(&mbtest_running);
8464         OSIncrementAtomic(&mbtest_running);
8465
8466         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8467         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8468         thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8469
8470         while (mbtest_running) {
8471                 msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
8472         }
8473 }
8474
8475 static int
8476 mbtest SYSCTL_HANDLER_ARGS
8477 {
8478 #pragma unused(arg1, arg2)
8479         int error = 0, val, oldval = mbtest_val;
8480
8481         val = oldval;
8482         error = sysctl_handle_int(oidp, &val, 0, req);
8483         if (error || !req->newptr)
8484                 return (error);
8485
8486         if (val != oldval)
8487                 sysctl_mbtest();
8488
8489         mbtest_val = val;
8490
8491         return (error);
8492 }
8493 #endif // DEBUG || DEVELOPMENT
8494
8495 static void
8496 mtracelarge_register(size_t size)
8497 {
8498         int i;
8499         struct mtracelarge *trace;
8500         uintptr_t bt[MLEAK_STACK_DEPTH];
8501         unsigned int depth;
8502
8503         depth = backtrace(bt, MLEAK_STACK_DEPTH);
8504         /* Check if this entry is already on the list. */
8505         for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8506                 trace = &mtracelarge_table[i];
8507                 if (trace->size == size && trace->depth == depth &&
8508                     memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
8509                         return;
8510                 }
8511
8512         }
8513         for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
8514                 trace = &mtracelarge_table[i];
8515                 if (size > trace->size) {
8516                         trace->depth = depth;
8517                         memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
8518                         trace->size = size;
8519                         break;
8520                 }
8521         }
8522 }
8523
8524 SYSCTL_DECL(_kern_ipc);
8525 #if DEBUG || DEVELOPMENT
8526 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
8527     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I",
8528     "Toggle to test mbufs");
8529 #endif
8530 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
8531     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8532     0, 0, mbstat_sysctl, "S,mbstat", "");
8533 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
8534     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8535     0, 0, mb_stat_sysctl, "S,mb_stat", "");
8536 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
8537     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8538     0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
8539 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
8540     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8541     0, 0, mleak_table_sysctl, "S,mleak_table", "");
8542 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
8543     CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
8544 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
8545     CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
8546 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
8547     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
8548 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
8549     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
8550     m_drain_force_sysctl, "I",
8551     "Forces the mbuf garbage collection to run");
8552 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
8553     CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
8554     "Minimum time interval between garbage collection");