]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/uipc_mbuf.c
xnu-2422.110.17.tar.gz
[apple/xnu.git] / bsd / kern / uipc_mbuf.c
CommitLineData
1c79356b 1/*
39236c6e 2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55
A
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
2d21ac55
A
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
1c79356b
A
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/malloc.h>
73#include <sys/mbuf.h>
74#include <sys/kernel.h>
91447636 75#include <sys/sysctl.h>
1c79356b
A
76#include <sys/syslog.h>
77#include <sys/protosw.h>
78#include <sys/domain.h>
2d21ac55 79#include <sys/queue.h>
b0d623f7 80#include <sys/proc.h>
1c79356b 81
39236c6e
A
82#include <dev/random/randomdev.h>
83
9bccf70c 84#include <kern/kern_types.h>
2d21ac55
A
85#include <kern/simple_lock.h>
86#include <kern/queue.h>
9bccf70c 87#include <kern/sched_prim.h>
2d21ac55 88#include <kern/cpu_number.h>
6d2010ae 89#include <kern/zalloc.h>
2d21ac55
A
90
91#include <libkern/OSAtomic.h>
39236c6e 92#include <libkern/OSDebug.h>
2d21ac55 93#include <libkern/libkern.h>
9bccf70c 94
55e303ae
A
95#include <IOKit/IOMapper.h>
96
2d21ac55
A
97#include <machine/limits.h>
98#include <machine/machine_routines.h>
55e303ae 99
2d21ac55
A
100#if CONFIG_MACF_NET
101#include <security/mac_framework.h>
102#endif /* MAC_NET */
103
104#include <sys/mcache.h>
1c79356b 105
2d21ac55
A
106/*
107 * MBUF IMPLEMENTATION NOTES.
108 *
109 * There is a total of 5 per-CPU caches:
110 *
111 * MC_MBUF:
112 * This is a cache of rudimentary objects of MSIZE in size; each
113 * object represents an mbuf structure. This cache preserves only
114 * the m_type field of the mbuf during its transactions.
115 *
116 * MC_CL:
117 * This is a cache of rudimentary objects of MCLBYTES in size; each
118 * object represents a mcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_BIGCL:
6d2010ae 122 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
2d21ac55
A
123 * object represents a mbigcluster structure. This cache does not
124 * preserve the contents of the objects during its transaction.
125 *
126 * MC_MBUF_CL:
127 * This is a cache of mbufs each having a cluster attached to it.
128 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
129 * fields of the mbuf related to the external cluster are preserved
130 * during transactions.
131 *
132 * MC_MBUF_BIGCL:
133 * This is a cache of mbufs each having a big cluster attached to it.
134 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
135 * fields of the mbuf related to the external cluster are preserved
136 * during transactions.
137 *
138 * OBJECT ALLOCATION:
139 *
140 * Allocation requests are handled first at the per-CPU (mcache) layer
141 * before falling back to the slab layer. Performance is optimal when
142 * the request is satisfied at the CPU layer because global data/lock
143 * never gets accessed. When the slab layer is entered for allocation,
144 * the slab freelist will be checked first for available objects before
145 * the VM backing store is invoked. Slab layer operations are serialized
146 * for all of the caches as the mbuf global lock is held most of the time.
147 * Allocation paths are different depending on the class of objects:
148 *
149 * a. Rudimentary object:
150 *
151 * { m_get_common(), m_clattach(), m_mclget(),
152 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
153 * composite object allocation }
154 * | ^
155 * | |
156 * | +-----------------------+
157 * v |
158 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
159 * | ^
160 * v |
161 * [CPU cache] -------> (found?) -------+
162 * | |
163 * v |
164 * mbuf_slab_alloc() |
165 * | |
166 * v |
167 * +---------> [freelist] -------> (found?) -------+
168 * | |
169 * | v
170 * | m_clalloc()
171 * | |
172 * | v
173 * +---<<---- kmem_mb_alloc()
174 *
175 * b. Composite object:
176 *
177 * { m_getpackets_internal(), m_allocpacket_internal() }
178 * | ^
179 * | |
180 * | +------ (done) ---------+
181 * v |
182 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
183 * | ^
184 * v |
185 * [CPU cache] -------> (found?) -------+
186 * | |
187 * v |
188 * mbuf_cslab_alloc() |
189 * | |
190 * v |
191 * [freelist] -------> (found?) -------+
192 * | |
193 * v |
194 * (rudimentary object) |
195 * mcache_alloc/mcache_alloc_ext() ------>>-----+
196 *
197 * Auditing notes: If auditing is enabled, buffers will be subjected to
198 * integrity checks by the audit routine. This is done by verifying their
199 * contents against DEADBEEF (free) pattern before returning them to caller.
200 * As part of this step, the routine will also record the transaction and
201 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
202 * also restore any constructed data structure fields if necessary.
203 *
204 * OBJECT DEALLOCATION:
205 *
206 * Freeing an object simply involves placing it into the CPU cache; this
207 * pollutes the cache to benefit subsequent allocations. The slab layer
208 * will only be entered if the object is to be purged out of the cache.
209 * During normal operations, this happens only when the CPU layer resizes
210 * its bucket while it's adjusting to the allocation load. Deallocation
211 * paths are different depending on the class of objects:
212 *
213 * a. Rudimentary object:
214 *
215 * { m_free(), m_freem_list(), composite object deallocation }
216 * | ^
217 * | |
218 * | +------ (done) ---------+
219 * v |
220 * mcache_free/mcache_free_ext() |
221 * | |
222 * v |
223 * mbuf_slab_audit() |
224 * | |
225 * v |
226 * [CPU cache] ---> (not purging?) -----+
227 * | |
228 * v |
229 * mbuf_slab_free() |
230 * | |
231 * v |
232 * [freelist] ----------->>------------+
233 * (objects never get purged to VM)
234 *
235 * b. Composite object:
236 *
237 * { m_free(), m_freem_list() }
238 * | ^
239 * | |
240 * | +------ (done) ---------+
241 * v |
242 * mcache_free/mcache_free_ext() |
243 * | |
244 * v |
245 * mbuf_cslab_audit() |
246 * | |
247 * v |
248 * [CPU cache] ---> (not purging?) -----+
249 * | |
250 * v |
251 * mbuf_cslab_free() |
252 * | |
253 * v |
254 * [freelist] ---> (not purging?) -----+
255 * | |
256 * v |
257 * (rudimentary object) |
258 * mcache_free/mcache_free_ext() ------->>------+
259 *
260 * Auditing notes: If auditing is enabled, the audit routine will save
261 * any constructed data structure fields (if necessary) before filling the
262 * contents of the buffers with DEADBEEF (free) pattern and recording the
263 * transaction. Buffers that are freed (whether at CPU or slab layer) are
264 * expected to contain the free pattern.
265 *
266 * DEBUGGING:
267 *
268 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
269 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
270 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
6d2010ae
A
271 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
272 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
273 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
2d21ac55
A
274 *
275 * Each object is associated with exactly one mcache_audit_t structure that
276 * contains the information related to its last buffer transaction. Given
277 * an address of an object, the audit structure can be retrieved by finding
278 * the position of the object relevant to the base address of the cluster:
279 *
280 * +------------+ +=============+
281 * | mbuf addr | | mclaudit[i] |
282 * +------------+ +=============+
283 * | | cl_audit[0] |
6d2010ae 284 * i = MTOBG(addr) +-------------+
2d21ac55 285 * | +-----> | cl_audit[1] | -----> mcache_audit_t
6d2010ae 286 * b = BGTOM(i) | +-------------+
2d21ac55
A
287 * | | | ... |
288 * x = MCLIDX(b, addr) | +-------------+
289 * | | | cl_audit[7] |
290 * +-----------------+ +-------------+
291 * (e.g. x == 1)
292 *
293 * The mclaudit[] array is allocated at initialization time, but its contents
6d2010ae
A
294 * get populated when the corresponding cluster is created. Because a page
295 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
296 * mbufs so that there is a 1-to-1 mapping between them. A page that never
2d21ac55 297 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
6d2010ae
A
298 * remaining entries unused. For 16KB cluster, only one entry from the first
299 * page is allocated and used for the entire object.
2d21ac55 300 */
91447636 301
2d21ac55
A
302/* TODO: should be in header file */
303/* kernel translater */
b0d623f7 304extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
2d21ac55 305extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
1c79356b 306extern vm_map_t mb_map; /* special map */
2d21ac55
A
307
308/* Global lock */
316670eb
A
309decl_lck_mtx_data(static, mbuf_mlock_data);
310static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
2d21ac55
A
311static lck_attr_t *mbuf_mlock_attr;
312static lck_grp_t *mbuf_mlock_grp;
313static lck_grp_attr_t *mbuf_mlock_grp_attr;
314
315/* Back-end (common) layer */
316static void *mbuf_worker_run; /* wait channel for worker thread */
317static int mbuf_worker_ready; /* worker thread is runnable */
318static int mbuf_expand_mcl; /* number of cluster creation requets */
319static int mbuf_expand_big; /* number of big cluster creation requests */
6d2010ae 320static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
2d21ac55 321static int ncpu; /* number of CPUs */
b0d623f7
A
322static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
323static ppnum_t mcl_pages; /* Size of array (# physical pages) */
55e303ae 324static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
2d21ac55
A
325static mcache_t *ref_cache; /* Cache of cluster reference & flags */
326static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
327static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
328static unsigned int mb_normalized; /* number of packets "normalized" */
b0d623f7
A
329
330#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
6d2010ae 331#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
2d21ac55
A
332
333typedef enum {
334 MC_MBUF = 0, /* Regular mbuf */
335 MC_CL, /* Cluster */
6d2010ae
A
336 MC_BIGCL, /* Large (4KB) cluster */
337 MC_16KCL, /* Jumbo (16KB) cluster */
2d21ac55 338 MC_MBUF_CL, /* mbuf + cluster */
6d2010ae
A
339 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
340 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
2d21ac55
A
341} mbuf_class_t;
342
343#define MBUF_CLASS_MIN MC_MBUF
344#define MBUF_CLASS_MAX MC_MBUF_16KCL
345#define MBUF_CLASS_LAST MC_16KCL
346#define MBUF_CLASS_VALID(c) \
347 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
348#define MBUF_CLASS_COMPOSITE(c) \
349 ((int)(c) > MBUF_CLASS_LAST)
91447636 350
9bccf70c 351
2d21ac55
A
352/*
353 * mbuf specific mcache allocation request flags.
354 */
355#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
9bccf70c 356
2d21ac55
A
357/*
358 * Per-cluster slab structure.
359 *
360 * A slab is a cluster control structure that contains one or more object
361 * chunks; the available chunks are chained in the slab's freelist (sl_head).
362 * Each time a chunk is taken out of the slab, the slab's reference count
363 * gets incremented. When all chunks have been taken out, the empty slab
364 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
365 * returned to a slab causes the slab's reference count to be decremented;
366 * it also causes the slab to be reinserted back to class's slab list, if
367 * it's not already done.
368 *
369 * Compartmentalizing of the object chunks into slabs allows us to easily
370 * merge one or more slabs together when the adjacent slabs are idle, as
371 * well as to convert or move a slab from one class to another; e.g. the
372 * mbuf cluster slab can be converted to a regular cluster slab when all
373 * mbufs in the slab have been freed.
374 *
375 * A slab may also span across multiple clusters for chunks larger than
376 * a cluster's size. In this case, only the slab of the first cluster is
377 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
378 * that they are part of the larger slab.
6d2010ae
A
379 *
380 * Each slab controls a page of memory.
2d21ac55
A
381 */
382typedef struct mcl_slab {
383 struct mcl_slab *sl_next; /* neighboring slab */
384 u_int8_t sl_class; /* controlling mbuf class */
385 int8_t sl_refcnt; /* outstanding allocations */
386 int8_t sl_chunks; /* chunks (bufs) in this slab */
387 u_int16_t sl_flags; /* slab flags (see below) */
388 u_int16_t sl_len; /* slab length */
389 void *sl_base; /* base of allocated memory */
390 void *sl_head; /* first free buffer */
391 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
392} mcl_slab_t;
393
394#define SLF_MAPPED 0x0001 /* backed by a mapped page */
395#define SLF_PARTIAL 0x0002 /* part of another slab */
396#define SLF_DETACHED 0x0004 /* not in slab freelist */
1c79356b 397
2d21ac55
A
398/*
399 * The array of slabs are broken into groups of arrays per 1MB of kernel
400 * memory to reduce the footprint. Each group is allocated on demand
401 * whenever a new piece of memory mapped in from the VM crosses the 1MB
402 * boundary.
403 */
6d2010ae 404#define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
91447636 405
2d21ac55
A
406typedef struct mcl_slabg {
407 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
408} mcl_slabg_t;
1c79356b 409
6d2010ae
A
410/*
411 * Number of slabs needed to control a 16KB cluster object.
412 */
413#define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
414
2d21ac55
A
415/*
416 * Per-cluster audit structure.
417 */
418typedef struct {
6d2010ae 419 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
2d21ac55 420} mcl_audit_t;
91447636 421
39236c6e
A
422typedef struct {
423 struct thread *msa_thread; /* thread doing transaction */
424 struct thread *msa_pthread; /* previous transaction thread */
425 uint32_t msa_tstamp; /* transaction timestamp (ms) */
426 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
427 uint16_t msa_depth; /* pc stack depth */
428 uint16_t msa_pdepth; /* previous transaction pc stack */
429 void *msa_stack[MCACHE_STACK_DEPTH];
430 void *msa_pstack[MCACHE_STACK_DEPTH];
431} mcl_scratch_audit_t;
432
433typedef struct {
434 /*
435 * Size of data from the beginning of an mbuf that covers m_hdr,
436 * pkthdr and m_ext structures. If auditing is enabled, we allocate
437 * a shadow mbuf structure of this size inside each audit structure,
438 * and the contents of the real mbuf gets copied into it when the mbuf
439 * is freed. This allows us to pattern-fill the mbuf for integrity
440 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
441 * cluster cache case). Note that we don't save the contents of
442 * clusters when they are freed; we simply pattern-fill them.
443 */
444 u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
445 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
446} mcl_saved_contents_t;
447
448#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
449
450#define MCA_SAVED_MBUF_PTR(_mca) \
451 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
452 (_mca)->mca_contents)->sc_mbuf)
453#define MCA_SAVED_MBUF_SIZE \
454 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
455#define MCA_SAVED_SCRATCH_PTR(_mca) \
456 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
fa4905b1 457
2d21ac55
A
458/*
459 * mbuf specific mcache audit flags
460 */
461#define MB_INUSE 0x01 /* object has not been returned to slab */
462#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
463#define MB_SCVALID 0x04 /* object has valid saved contents */
fa4905b1 464
2d21ac55
A
465/*
466 * Each of the following two arrays hold up to nmbclusters elements.
467 */
468static mcl_audit_t *mclaudit; /* array of cluster audit information */
6d2010ae 469static unsigned int maxclaudit; /* max # of entries in audit table */
2d21ac55
A
470static mcl_slabg_t **slabstbl; /* cluster slabs table */
471static unsigned int maxslabgrp; /* max # of entries in slabs table */
472static unsigned int slabgrp; /* # of entries in slabs table */
473
474/* Globals */
475int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
476int njcl; /* # of clusters for jumbo sizes */
477int njclbytes; /* size of a jumbo cluster */
6d2010ae
A
478union mbigcluster *mbutl; /* first mapped cluster address */
479union mbigcluster *embutl; /* ending virtual address of mclusters */
316670eb
A
480int _max_linkhdr; /* largest link-level header */
481int _max_protohdr; /* largest protocol header */
2d21ac55
A
482int max_hdr; /* largest link+protocol header */
483int max_datalen; /* MHLEN - max_hdr */
484
6d2010ae
A
485static boolean_t mclverify; /* debug: pattern-checking */
486static boolean_t mcltrace; /* debug: stack tracing */
487static boolean_t mclfindleak; /* debug: leak detection */
316670eb 488static boolean_t mclexpleak; /* debug: expose leak info to user space */
6d2010ae 489
39236c6e
A
490static struct timeval mb_start; /* beginning of time */
491
6d2010ae
A
492/* mbuf leak detection variables */
493static struct mleak_table mleak_table;
494static mleak_stat_t *mleak_stat;
495
496#define MLEAK_STAT_SIZE(n) \
497 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
498
499struct mallocation {
500 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
501 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
502 u_int32_t count; /* How many objects were requested */
503 u_int64_t hitcount; /* for determining hash effectiveness */
504};
505
506struct mtrace {
507 u_int64_t collisions;
508 u_int64_t hitcount;
509 u_int64_t allocs;
510 u_int64_t depth;
511 uintptr_t addr[MLEAK_STACK_DEPTH];
512};
513
514/* Size must be a power of two for the zhash to be able to just mask off bits */
515#define MLEAK_ALLOCATION_MAP_NUM 512
516#define MLEAK_TRACE_MAP_NUM 256
517
518/*
519 * Sample factor for how often to record a trace. This is overwritable
520 * by the boot-arg mleak_sample_factor.
521 */
522#define MLEAK_SAMPLE_FACTOR 500
523
524/*
525 * Number of top leakers recorded.
526 */
527#define MLEAK_NUM_TRACES 5
528
316670eb
A
529#define MB_LEAK_SPACING_64 " "
530#define MB_LEAK_SPACING_32 " "
531
532
533#define MB_LEAK_HDR_32 "\n\
534 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
535 ---------- ---------- ---------- ---------- ---------- \n\
536"
537
538#define MB_LEAK_HDR_64 "\n\
539 trace [1] trace [2] trace [3] \
540 trace [4] trace [5] \n\
541 ------------------ ------------------ ------------------ \
542 ------------------ ------------------ \n\
543"
544
6d2010ae
A
545static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
546static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
547
548/* Hashmaps of allocations and their corresponding traces */
549static struct mallocation *mleak_allocations;
550static struct mtrace *mleak_traces;
551static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
552
553/* Lock to protect mleak tables from concurrent modification */
316670eb
A
554decl_lck_mtx_data(static, mleak_lock_data);
555static lck_mtx_t *mleak_lock = &mleak_lock_data;
6d2010ae
A
556static lck_attr_t *mleak_lock_attr;
557static lck_grp_t *mleak_lock_grp;
558static lck_grp_attr_t *mleak_lock_grp_attr;
559
b0d623f7
A
560extern u_int32_t high_sb_max;
561
2d21ac55
A
562/* The minimum number of objects that are allocated, to start. */
563#define MINCL 32
564#define MINBIGCL (MINCL >> 1)
565#define MIN16KCL (MINCL >> 2)
566
567/* Low watermarks (only map in pages once free counts go below) */
2d21ac55
A
568#define MBIGCL_LOWAT MINBIGCL
569#define M16KCL_LOWAT MIN16KCL
570
571typedef struct {
572 mbuf_class_t mtbl_class; /* class type */
573 mcache_t *mtbl_cache; /* mcache for this buffer class */
574 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
575 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
576 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
577 u_int32_t mtbl_maxsize; /* maximum buffer size */
578 int mtbl_minlimit; /* minimum allowed */
579 int mtbl_maxlimit; /* maximum allowed */
580 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
581} mbuf_table_t;
582
583#define m_class(c) mbuf_table[c].mtbl_class
584#define m_cache(c) mbuf_table[c].mtbl_cache
585#define m_slablist(c) mbuf_table[c].mtbl_slablist
586#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
587#define m_maxsize(c) mbuf_table[c].mtbl_maxsize
588#define m_minlimit(c) mbuf_table[c].mtbl_minlimit
589#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
590#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
591#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
592#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
593#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
594#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
595#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
596#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
597#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
598#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
599#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
600#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
601#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
602#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
603
604static mbuf_table_t mbuf_table[] = {
605 /*
606 * The caches for mbufs, regular clusters and big clusters.
607 */
608 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
609 NULL, NULL, 0, 0, 0, 0 },
610 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
611 NULL, NULL, 0, 0, 0, 0 },
612 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
613 NULL, NULL, 0, 0, 0, 0 },
614 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
615 NULL, NULL, 0, 0, 0, 0 },
616 /*
617 * The following are special caches; they serve as intermediate
618 * caches backed by the above rudimentary caches. Each object
619 * in the cache is an mbuf with a cluster attached to it. Unlike
620 * the above caches, these intermediate caches do not directly
621 * deal with the slab structures; instead, the constructed
622 * cached elements are simply stored in the freelists.
623 */
624 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
625 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
626 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
627};
628
629#define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
630
631static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
6d2010ae
A
632static int mb_waiters; /* number of waiters */
633
634#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
635static struct timeval mb_wdtstart; /* watchdog start timestamp */
316670eb
A
636static char *mbuf_dump_buf;
637
638#define MBUF_DUMP_BUF_SIZE 2048
6d2010ae
A
639
640/*
641 * mbuf watchdog is enabled by default on embedded platforms. It is
642 * also toggeable via the kern.ipc.mb_watchdog sysctl.
643 */
6d2010ae 644static unsigned int mb_watchdog = 0;
39236c6e
A
645
646/* Red zone */
647static u_int32_t mb_redzone_cookie;
648static void m_redzone_init(struct mbuf *);
649static void m_redzone_verify(struct mbuf *m);
2d21ac55
A
650
651/* The following are used to serialize m_clalloc() */
652static boolean_t mb_clalloc_busy;
653static void *mb_clalloc_waitchan = &mb_clalloc_busy;
654static int mb_clalloc_waiters;
655
6d2010ae 656static void mbuf_mtypes_sync(boolean_t);
2d21ac55 657static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
6d2010ae 658static void mbuf_stat_sync(void);
2d21ac55 659static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
6d2010ae
A
660static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
661static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
662static char *mbuf_dump(void);
2d21ac55
A
663static void mbuf_table_init(void);
664static inline void m_incref(struct mbuf *);
665static inline u_int32_t m_decref(struct mbuf *);
666static int m_clalloc(const u_int32_t, const int, const u_int32_t);
667static void mbuf_worker_thread_init(void);
668static mcache_obj_t *slab_alloc(mbuf_class_t, int);
669static void slab_free(mbuf_class_t, mcache_obj_t *);
670static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
671 unsigned int, int);
672static void mbuf_slab_free(void *, mcache_obj_t *, int);
673static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
674static void mbuf_slab_notify(void *, u_int32_t);
675static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
676 unsigned int);
677static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
678static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
679 unsigned int, int);
680static void mbuf_cslab_free(void *, mcache_obj_t *, int);
681static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
682static int freelist_populate(mbuf_class_t, unsigned int, int);
6d2010ae 683static void freelist_init(mbuf_class_t);
2d21ac55
A
684static boolean_t mbuf_cached_above(mbuf_class_t, int);
685static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
686static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
687static int m_howmany(int, size_t);
688static void mbuf_worker_thread(void);
6d2010ae 689static void mbuf_watchdog(void);
2d21ac55
A
690static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
691
692static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
693 size_t, unsigned int);
694static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
695static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
696static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
697 boolean_t);
698static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
699static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
39236c6e 700static void mcl_audit_scratch(mcache_audit_t *);
2d21ac55
A
701static void mcl_audit_mcheck_panic(struct mbuf *);
702static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
703
6d2010ae
A
704static void mleak_activate(void);
705static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
706static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
707static void mleak_free(mcache_obj_t *);
316670eb
A
708static void mleak_sort_traces(void);
709static void mleak_update_stats(void);
6d2010ae 710
2d21ac55
A
711static mcl_slab_t *slab_get(void *);
712static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
713 void *, void *, unsigned int, int, int);
714static void slab_insert(mcl_slab_t *, mbuf_class_t);
715static void slab_remove(mcl_slab_t *, mbuf_class_t);
716static boolean_t slab_inrange(mcl_slab_t *, void *);
717static void slab_nextptr_panic(mcl_slab_t *, void *);
718static void slab_detach(mcl_slab_t *);
719static boolean_t slab_is_detached(mcl_slab_t *);
720
b0d623f7
A
721static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
722static struct mbuf *m_split0(struct mbuf *, int, int, int);
723
724/* flags for m_copyback0 */
725#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
726#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
727#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
728#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
729
2d21ac55
A
730/*
731 * This flag is set for all mbufs that come out of and into the composite
732 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
733 * are marked with such a flag have clusters attached to them, and will be
734 * treated differently when they are freed; instead of being placed back
735 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
736 * are placed back into the appropriate composite cache's freelist, and the
737 * actual freeing is deferred until the composite objects are purged. At
738 * such a time, this flag will be cleared from the mbufs and the objects
739 * will be freed into their own separate freelists.
740 */
741#define EXTF_COMPOSITE 0x1
1c79356b 742
6d2010ae
A
743/*
744 * This flag indicates that the external cluster is read-only, i.e. it is
745 * or was referred to by more than one mbufs. Once set, this flag is never
746 * cleared.
747 */
748#define EXTF_READONLY 0x2
749#define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
750
2d21ac55
A
751#define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
752#define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
753#define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
754#define MBUF_IS_COMPOSITE(m) \
6d2010ae 755 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
1c79356b 756
2d21ac55
A
757/*
758 * Macros used to verify the integrity of the mbuf.
759 */
760#define _MCHECK(m) { \
761 if ((m)->m_type != MT_FREE) { \
762 if (mclaudit == NULL) \
763 panic("MCHECK: m_type=%d m=%p", \
764 (u_int16_t)(m)->m_type, m); \
765 else \
766 mcl_audit_mcheck_panic(m); \
767 } \
768}
55e303ae 769
2d21ac55
A
770#define MBUF_IN_MAP(addr) \
771 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
55e303ae 772
2d21ac55
A
773#define MRANGE(addr) { \
774 if (!MBUF_IN_MAP(addr)) \
775 panic("MRANGE: address out of range 0x%p", addr); \
1c79356b
A
776}
777
778/*
2d21ac55 779 * Macro version of mtod.
1c79356b 780 */
2d21ac55 781#define MTOD(m, t) ((t)((m)->m_data))
1c79356b 782
2d21ac55 783/*
6d2010ae
A
784 * Macros to obtain (4KB) cluster index and base cluster address.
785 */
786
787#define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
788#define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
789
790/*
791 * Macro to find the mbuf index relative to a base.
2d21ac55 792 */
6d2010ae 793#define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
1c79356b 794
2d21ac55 795/*
6d2010ae 796 * Same thing for 2KB cluster index.
2d21ac55 797 */
6d2010ae 798#define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
91447636 799
2d21ac55
A
800/*
801 * Macros used during mbuf and cluster initialization.
802 */
39236c6e
A
803#define MBUF_INIT_PKTHDR(m) { \
804 (m)->m_pkthdr.rcvif = NULL; \
805 (m)->m_pkthdr.pkt_hdr = NULL; \
806 (m)->m_pkthdr.len = 0; \
807 (m)->m_pkthdr.csum_flags = 0; \
808 (m)->m_pkthdr.csum_data = 0; \
809 (m)->m_pkthdr.vlan_tag = 0; \
810 m_classifier_init(m, 0); \
811 m_tag_init(m, 1); \
812 m_scratch_init(m); \
813 m_redzone_init(m); \
814}
815
2d21ac55
A
816#define MBUF_INIT(m, pkthdr, type) { \
817 _MCHECK(m); \
818 (m)->m_next = (m)->m_nextpkt = NULL; \
819 (m)->m_len = 0; \
820 (m)->m_type = type; \
821 if ((pkthdr) == 0) { \
822 (m)->m_data = (m)->m_dat; \
823 (m)->m_flags = 0; \
824 } else { \
825 (m)->m_data = (m)->m_pktdat; \
826 (m)->m_flags = M_PKTHDR; \
39236c6e 827 MBUF_INIT_PKTHDR(m); \
2d21ac55
A
828 } \
829}
91447636 830
2d21ac55
A
831#define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
832 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
833 (m)->m_flags |= M_EXT; \
834 (m)->m_ext.ext_size = (size); \
835 (m)->m_ext.ext_free = (free); \
836 (m)->m_ext.ext_arg = (arg); \
837 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
838 &(m)->m_ext.ext_refs; \
839 MEXT_RFA(m) = (rfa); \
840 MEXT_REF(m) = (ref); \
841 MEXT_FLAGS(m) = (flag); \
1c79356b
A
842}
843
2d21ac55
A
844#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
845 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
846
847#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
848 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
849
850#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
851 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
852
1c79356b 853/*
2d21ac55 854 * Macro to convert BSD malloc sleep flag to mcache's
1c79356b 855 */
2d21ac55 856#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
1c79356b 857
2d21ac55
A
858/*
859 * The structure that holds all mbuf class statistics exportable via sysctl.
860 * Similar to mbstat structure, the mb_stat structure is protected by the
861 * global mbuf lock. It contains additional information about the classes
862 * that allows for a more accurate view of the state of the allocator.
863 */
864struct mb_stat *mb_stat;
b0d623f7 865struct omb_stat *omb_stat; /* For backwards compatibility */
1c79356b 866
2d21ac55
A
867#define MB_STAT_SIZE(n) \
868 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
b0d623f7
A
869#define OMB_STAT_SIZE(n) \
870 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
1c79356b
A
871
872/*
2d21ac55
A
873 * The legacy structure holding all of the mbuf allocation statistics.
874 * The actual statistics used by the kernel are stored in the mbuf_table
875 * instead, and are updated atomically while the global mbuf lock is held.
876 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
877 * Unlike before, the kernel no longer relies on the contents of mbstat for
878 * its operations (e.g. cluster expansion) because the structure is exposed
879 * to outside and could possibly be modified, therefore making it unsafe.
880 * With the exception of the mbstat.m_mtypes array (see below), all of the
881 * statistics are updated as they change.
1c79356b 882 */
2d21ac55 883struct mbstat mbstat;
1c79356b 884
2d21ac55
A
885#define MBSTAT_MTYPES_MAX \
886 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1c79356b
A
887
888/*
2d21ac55
A
889 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
890 * atomically and stored in a per-CPU structure which is lock-free; this is
891 * done in order to avoid writing to the global mbstat data structure which
892 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
893 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
894 * array and returned to the application. Any updates for types greater or
895 * equal than MT_MAX would be done atomically to the mbstat; this slows down
896 * performance but is okay since the kernel uses only up to MT_MAX-1 while
897 * anything beyond that (up to type 255) is considered a corner case.
1c79356b 898 */
2d21ac55
A
899typedef struct {
900 unsigned int cpu_mtypes[MT_MAX];
39236c6e 901} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
1c79356b 902
2d21ac55
A
903typedef struct {
904 mtypes_cpu_t mbs_cpu[1];
905} mbuf_mtypes_t;
1c79356b 906
2d21ac55
A
907static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
908
909#define MBUF_MTYPES_SIZE(n) \
910 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
911
912#define MTYPES_CPU(p) \
316670eb 913 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
2d21ac55 914
2d21ac55
A
915#define mtype_stat_add(type, n) { \
916 if ((unsigned)(type) < MT_MAX) { \
917 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
918 atomic_add_32(&mbs->cpu_mtypes[type], n); \
6d2010ae
A
919 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
920 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
2d21ac55 921 } \
1c79356b
A
922}
923
2d21ac55
A
924#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
925#define mtype_stat_inc(t) mtype_stat_add(t, 1)
926#define mtype_stat_dec(t) mtype_stat_sub(t, 1)
91447636 927
6d2010ae
A
928static void
929mbuf_mtypes_sync(boolean_t locked)
2d21ac55 930{
2d21ac55
A
931 int m, n;
932 mtypes_cpu_t mtc;
1c79356b 933
6d2010ae
A
934 if (locked)
935 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
936
2d21ac55
A
937 bzero(&mtc, sizeof (mtc));
938 for (m = 0; m < ncpu; m++) {
939 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
940 mtypes_cpu_t temp;
9bccf70c 941
2d21ac55
A
942 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
943 sizeof (temp.cpu_mtypes));
91447636 944
2d21ac55
A
945 for (n = 0; n < MT_MAX; n++)
946 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
947 }
6d2010ae
A
948 if (!locked)
949 lck_mtx_lock(mbuf_mlock);
2d21ac55
A
950 for (n = 0; n < MT_MAX; n++)
951 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
6d2010ae
A
952 if (!locked)
953 lck_mtx_unlock(mbuf_mlock);
1c79356b
A
954}
955
2d21ac55 956static int
6d2010ae 957mbstat_sysctl SYSCTL_HANDLER_ARGS
1c79356b 958{
2d21ac55 959#pragma unused(oidp, arg1, arg2)
6d2010ae
A
960 mbuf_mtypes_sync(FALSE);
961
962 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
963}
964
965static void
966mbuf_stat_sync(void)
967{
2d21ac55 968 mb_class_stat_t *sp;
6d2010ae
A
969 mcache_cpu_t *ccp;
970 mcache_t *cp;
971 int k, m, bktsize;
972
973 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2d21ac55 974
2d21ac55
A
975 for (k = 0; k < NELEM(mbuf_table); k++) {
976 cp = m_cache(k);
977 ccp = &cp->mc_cpu[0];
978 bktsize = ccp->cc_bktsize;
979 sp = mbuf_table[k].mtbl_stats;
980
981 if (cp->mc_flags & MCF_NOCPUCACHE)
982 sp->mbcl_mc_state = MCS_DISABLED;
983 else if (cp->mc_purge_cnt > 0)
984 sp->mbcl_mc_state = MCS_PURGING;
985 else if (bktsize == 0)
986 sp->mbcl_mc_state = MCS_OFFLINE;
987 else
988 sp->mbcl_mc_state = MCS_ONLINE;
989
990 sp->mbcl_mc_cached = 0;
991 for (m = 0; m < ncpu; m++) {
992 ccp = &cp->mc_cpu[m];
993 if (ccp->cc_objs > 0)
994 sp->mbcl_mc_cached += ccp->cc_objs;
995 if (ccp->cc_pobjs > 0)
996 sp->mbcl_mc_cached += ccp->cc_pobjs;
997 }
998 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
999 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1000 sp->mbcl_infree;
1001
1002 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1003 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1004 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1005
1006 /* Calculate total count specific to each class */
1007 sp->mbcl_ctotal = sp->mbcl_total;
1008 switch (m_class(k)) {
1009 case MC_MBUF:
1010 /* Deduct mbufs used in composite caches */
1011 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1012 m_total(MC_MBUF_BIGCL));
1013 break;
91447636 1014
2d21ac55 1015 case MC_CL:
6d2010ae
A
1016 /* Deduct clusters used in composite cache */
1017 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
2d21ac55 1018 break;
91447636 1019
2d21ac55
A
1020 case MC_BIGCL:
1021 /* Deduct clusters used in composite cache */
1022 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1023 break;
1c79356b 1024
2d21ac55
A
1025 case MC_16KCL:
1026 /* Deduct clusters used in composite cache */
1027 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1028 break;
1029
1030 default:
1031 break;
1032 }
1033 }
6d2010ae
A
1034}
1035
1036static int
1037mb_stat_sysctl SYSCTL_HANDLER_ARGS
1038{
1039#pragma unused(oidp, arg1, arg2)
1040 void *statp;
1041 int k, statsz, proc64 = proc_is64bit(req->p);
1042
1043 lck_mtx_lock(mbuf_mlock);
1044 mbuf_stat_sync();
b0d623f7
A
1045
1046 if (!proc64) {
1047 struct omb_class_stat *oc;
1048 struct mb_class_stat *c;
1049
1050 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1051 oc = &omb_stat->mbs_class[0];
1052 c = &mb_stat->mbs_class[0];
1053 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1054 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1055 "%s", c->mbcl_cname);
1056 oc->mbcl_size = c->mbcl_size;
1057 oc->mbcl_total = c->mbcl_total;
1058 oc->mbcl_active = c->mbcl_active;
1059 oc->mbcl_infree = c->mbcl_infree;
1060 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1061 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1062 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1063 oc->mbcl_notified = c->mbcl_notified;
1064 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1065 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1066 oc->mbcl_ctotal = c->mbcl_ctotal;
1067 oc->mbcl_mc_state = c->mbcl_mc_state;
1068 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1069 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1070 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1071 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1072 }
1073 statp = omb_stat;
1074 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1075 } else {
1076 statp = mb_stat;
1077 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1078 }
1079
2d21ac55 1080 lck_mtx_unlock(mbuf_mlock);
9bccf70c 1081
b0d623f7 1082 return (SYSCTL_OUT(req, statp, statsz));
2d21ac55 1083}
91447636 1084
6d2010ae
A
1085static int
1086mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1087{
1088#pragma unused(oidp, arg1, arg2)
6d2010ae
A
1089 int i;
1090
1091 /* Ensure leak tracing turned on */
316670eb 1092 if (!mclfindleak || !mclexpleak)
6d2010ae
A
1093 return (ENXIO);
1094
6d2010ae 1095 lck_mtx_lock(mleak_lock);
316670eb 1096 mleak_update_stats();
6d2010ae
A
1097 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1098 lck_mtx_unlock(mleak_lock);
1099
1100 return (i);
1101}
1102
1103static int
1104mleak_table_sysctl SYSCTL_HANDLER_ARGS
1105{
1106#pragma unused(oidp, arg1, arg2)
1107 int i = 0;
1108
1109 /* Ensure leak tracing turned on */
316670eb 1110 if (!mclfindleak || !mclexpleak)
6d2010ae
A
1111 return (ENXIO);
1112
1113 lck_mtx_lock(mleak_lock);
1114 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1115 lck_mtx_unlock(mleak_lock);
1116
1117 return (i);
1118}
1119
2d21ac55
A
1120static inline void
1121m_incref(struct mbuf *m)
1122{
1123 UInt32 old, new;
1124 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
91447636 1125
2d21ac55
A
1126 do {
1127 old = *addr;
1128 new = old + 1;
1129 ASSERT(new != 0);
1130 } while (!OSCompareAndSwap(old, new, addr));
6d2010ae
A
1131
1132 /*
1133 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1134 * we don't clear the flag when the refcount goes back to 1
1135 * to simplify code calling m_mclhasreference().
1136 */
1137 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1138 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1c79356b
A
1139}
1140
2d21ac55
A
1141static inline u_int32_t
1142m_decref(struct mbuf *m)
1c79356b 1143{
2d21ac55
A
1144 UInt32 old, new;
1145 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1c79356b 1146
2d21ac55
A
1147 do {
1148 old = *addr;
1149 new = old - 1;
1150 ASSERT(old != 0);
1151 } while (!OSCompareAndSwap(old, new, addr));
1152
1153 return (new);
1c79356b
A
1154}
1155
2d21ac55
A
1156static void
1157mbuf_table_init(void)
1c79356b 1158{
6d2010ae 1159 unsigned int b, c, s;
2d21ac55 1160 int m;
91447636 1161
b0d623f7
A
1162 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1163 M_TEMP, M_WAITOK | M_ZERO);
1164 VERIFY(omb_stat != NULL);
1165
2d21ac55
A
1166 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1167 M_TEMP, M_WAITOK | M_ZERO);
1168 VERIFY(mb_stat != NULL);
1c79356b 1169
2d21ac55
A
1170 mb_stat->mbs_cnt = NELEM(mbuf_table);
1171 for (m = 0; m < NELEM(mbuf_table); m++)
1172 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1c79356b 1173
2d21ac55
A
1174#if CONFIG_MBUF_JUMBO
1175 /*
1176 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1177 * this only on platforms where jumbo cluster pool is enabled.
1178 */
1179 njcl = nmbclusters / 3;
1180 njclbytes = M16KCLBYTES;
1181#endif /* CONFIG_MBUF_JUMBO */
9bccf70c 1182
2d21ac55 1183 /*
6d2010ae
A
1184 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1185 * a multiple of 4KB clusters.
2d21ac55 1186 */
6d2010ae 1187 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
2d21ac55
A
1188 if (njcl > 0) {
1189 /*
6d2010ae
A
1190 * Each jumbo cluster takes 8 2KB clusters, so make
1191 * sure that the pool size is evenly divisible by 8;
1192 * njcl is in 2KB unit, hence treated as such.
2d21ac55
A
1193 */
1194 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1c79356b 1195
6d2010ae
A
1196 /* Update nclusters with rounded down value of njcl */
1197 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
9bccf70c 1198 }
2d21ac55
A
1199
1200 /*
6d2010ae
A
1201 * njcl is valid only on platforms with 16KB jumbo clusters, where
1202 * it is configured to 1/3 of the pool size. On these platforms,
1203 * the remaining is used for 2KB and 4KB clusters. On platforms
1204 * without 16KB jumbo clusters, the entire pool is used for both
1205 * 2KB and 4KB clusters. A 4KB cluster can either be splitted into
1206 * 16 mbufs, or into 2 2KB clusters.
1207 *
1208 * +---+---+------------ ... -----------+------- ... -------+
1209 * | c | b | s | njcl |
1210 * +---+---+------------ ... -----------+------- ... -------+
1211 *
1212 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1213 * clusters (1/64th each.)
1214 */
1215 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1216 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1217 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
1218
1219 /*
1220 * 1/64th (c) is reserved for 2KB clusters.
2d21ac55 1221 */
6d2010ae
A
1222 m_minlimit(MC_CL) = c;
1223 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
2d21ac55
A
1224 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1225 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1226
1227 /*
6d2010ae
A
1228 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1229 * It cannot be turned into 2KB clusters or mbufs.
2d21ac55 1230 */
6d2010ae
A
1231 m_minlimit(MC_BIGCL) = b;
1232 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1233 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1234 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
2d21ac55
A
1235
1236 /*
6d2010ae 1237 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
2d21ac55 1238 */
6d2010ae
A
1239 m_minlimit(MC_MBUF) = 0;
1240 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1241 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1242 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
2d21ac55
A
1243
1244 /*
1245 * Set limits for the composite classes.
1246 */
1247 m_minlimit(MC_MBUF_CL) = 0;
6d2010ae 1248 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
2d21ac55
A
1249 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1250 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1251 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1252
1253 m_minlimit(MC_MBUF_BIGCL) = 0;
1254 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
6d2010ae 1255 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
2d21ac55
A
1256 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1257 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1258
1259 /*
1260 * And for jumbo classes.
1261 */
1262 m_minlimit(MC_16KCL) = 0;
6d2010ae 1263 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
2d21ac55
A
1264 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1265 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1266
1267 m_minlimit(MC_MBUF_16KCL) = 0;
1268 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1269 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1270 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1271 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1272
1273 /*
1274 * Initialize the legacy mbstat structure.
1275 */
1276 bzero(&mbstat, sizeof (mbstat));
1277 mbstat.m_msize = m_maxsize(MC_MBUF);
1278 mbstat.m_mclbytes = m_maxsize(MC_CL);
1279 mbstat.m_minclsize = MINCLSIZE;
1280 mbstat.m_mlen = MLEN;
1281 mbstat.m_mhlen = MHLEN;
1282 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1283}
1284
b0d623f7
A
1285#if defined(__LP64__)
1286typedef struct ncl_tbl {
1287 uint64_t nt_maxmem; /* memory (sane) size */
1288 uint32_t nt_mbpool; /* mbuf pool size */
1289} ncl_tbl_t;
1290
1291/* Non-server */
1292static ncl_tbl_t ncl_table[] = {
316670eb 1293 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
b0d623f7
A
1294 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1295 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1296 { 0, 0 }
1297};
1298
1299/* Server */
1300static ncl_tbl_t ncl_table_srv[] = {
316670eb 1301 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
b0d623f7
A
1302 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1303 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1304 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1305 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1306 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1307 { 0, 0 }
1308};
1309#endif /* __LP64__ */
1310
1311__private_extern__ unsigned int
6d2010ae 1312mbuf_default_ncl(int server, uint64_t mem)
b0d623f7
A
1313{
1314#if !defined(__LP64__)
6d2010ae 1315#pragma unused(server)
b0d623f7
A
1316 unsigned int n;
1317 /*
1318 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1319 */
6d2010ae
A
1320 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1321 n = 32768;
b0d623f7
A
1322#else
1323 unsigned int n, i;
6d2010ae 1324 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
b0d623f7
A
1325 /*
1326 * 64-bit kernel (mbuf pool size based on table).
1327 */
1328 n = tbl[0].nt_mbpool;
1329 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1330 if (mem < tbl[i].nt_maxmem)
1331 break;
1332 n = tbl[i].nt_mbpool;
1333 }
1334 n >>= MCLSHIFT;
1335#endif /* !__LP64__ */
1336 return (n);
1337}
1338
2d21ac55
A
1339__private_extern__ void
1340mbinit(void)
1341{
1342 unsigned int m;
6d2010ae 1343 unsigned int initmcl = 0;
2d21ac55 1344 void *buf;
b0d623f7 1345 thread_t thread = THREAD_NULL;
2d21ac55 1346
39236c6e
A
1347 microuptime(&mb_start);
1348
316670eb
A
1349 /*
1350 * These MBUF_ values must be equal to their private counterparts.
1351 */
1352 _CASSERT(MBUF_EXT == M_EXT);
1353 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1354 _CASSERT(MBUF_EOR == M_EOR);
1355 _CASSERT(MBUF_LOOP == M_LOOP);
1356 _CASSERT(MBUF_BCAST == M_BCAST);
1357 _CASSERT(MBUF_MCAST == M_MCAST);
1358 _CASSERT(MBUF_FRAG == M_FRAG);
1359 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1360 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1361 _CASSERT(MBUF_PROMISC == M_PROMISC);
1362 _CASSERT(MBUF_HASFCS == M_HASFCS);
1363
1364 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1365 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1366 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1367 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1368 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1369 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1370 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1371 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1372 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1373 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1374 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1375 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1376 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1377 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1378 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1379
1380 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1381 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
39236c6e 1382 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
316670eb
A
1383 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1384 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1385 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1386 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1387 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1388 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1389 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1390 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1391 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1392 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1393
1394 _CASSERT(MBUF_WAITOK == M_WAIT);
1395 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1396 _CASSERT(MBUF_COPYALL == M_COPYALL);
1397
316670eb
A
1398 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1399 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1400 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1401 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1402 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1403 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1404 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1405 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1406 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1407 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1408
1409 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1410 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1411 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1412 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1413
39236c6e
A
1414 /* Module specific scratch space (32-bit alignment requirement) */
1415 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1416 sizeof (uint32_t)));
1417
1418 /* Initialize random red zone cookie value */
1419 _CASSERT(sizeof (mb_redzone_cookie) ==
1420 sizeof (((struct pkthdr *)0)->redzone));
1421 read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1422
1423 /* Make sure we don't save more than we should */
1424 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1425
2d21ac55
A
1426 if (nmbclusters == 0)
1427 nmbclusters = NMBCLUSTERS;
1428
6d2010ae
A
1429 /* This should be a sane (at least even) value by now */
1430 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1431
2d21ac55
A
1432 /* Setup the mbuf table */
1433 mbuf_table_init();
1434
1435 /* Global lock for common layer */
1436 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1437 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1438 mbuf_mlock_attr = lck_attr_alloc_init();
316670eb 1439 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
2d21ac55 1440
6d2010ae
A
1441 /*
1442 * Allocate cluster slabs table:
1443 *
1444 * maxslabgrp = (N * 2048) / (1024 * 1024)
1445 *
1446 * Where N is nmbclusters rounded up to the nearest 512. This yields
1447 * mcl_slab_g_t units, each one representing a MB of memory.
1448 */
1449 maxslabgrp =
1450 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
2d21ac55
A
1451 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1452 M_TEMP, M_WAITOK | M_ZERO);
1453 VERIFY(slabstbl != NULL);
1454
6d2010ae
A
1455 /*
1456 * Allocate audit structures, if needed:
1457 *
1458 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1459 *
1460 * This yields mcl_audit_t units, each one representing a page.
1461 */
593a1d5f 1462 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
2d21ac55 1463 mbuf_debug |= mcache_getflags();
6d2010ae
A
1464 if (mbuf_debug & MCF_DEBUG) {
1465 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1466 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1467 M_TEMP, M_WAITOK | M_ZERO);
2d21ac55
A
1468 VERIFY(mclaudit != NULL);
1469
1470 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
39236c6e 1471 AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
2d21ac55
A
1472 VERIFY(mcl_audit_con_cache != NULL);
1473 }
6d2010ae
A
1474 mclverify = (mbuf_debug & MCF_VERIFY);
1475 mcltrace = (mbuf_debug & MCF_TRACE);
1476 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
316670eb 1477 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
6d2010ae
A
1478
1479 /* Enable mbuf leak logging, with a lock to protect the tables */
1480
1481 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1482 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1483 mleak_lock_attr = lck_attr_alloc_init();
316670eb 1484 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
6d2010ae
A
1485
1486 mleak_activate();
2d21ac55
A
1487
1488 /* Calculate the number of pages assigned to the cluster pool */
b0d623f7
A
1489 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
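 /*
 * Illustrative sizing, assuming 4 KB pages (CLBYTES == 4096):
 * 32768 2 KB clusters occupy 64 MB, so mcl_pages above comes
 * to 16384 physical page entries.
 */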
1490 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1491 M_TEMP, M_WAITOK);
2d21ac55
A
1492 VERIFY(mcl_paddr != NULL);
1493
1494 /* Register with the I/O Bus mapper */
1495 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
b0d623f7 1496 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
2d21ac55 1497
6d2010ae 1498 embutl = (union mbigcluster *)
316670eb 1499 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
6d2010ae 1500 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
2d21ac55 1501
6d2010ae 1502 /* Prime up the freelist */
593a1d5f 1503 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
6d2010ae
A
1504 if (initmcl != 0) {
1505 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1506 if (initmcl > m_maxlimit(MC_BIGCL))
1507 initmcl = m_maxlimit(MC_BIGCL);
1508 }
1509 if (initmcl < m_minlimit(MC_BIGCL))
1510 initmcl = m_minlimit(MC_BIGCL);
2d21ac55
A
1511
1512 lck_mtx_lock(mbuf_mlock);
1513
6d2010ae
A
1514 /*
1515 * For classes with non-zero minimum limits, populate their freelists
1516 * so that m_total(class) is at least m_minlimit(class).
1517 */
1518 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1519 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1520 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1521 freelist_init(m_class(MC_CL));
1522
1523 for (m = 0; m < NELEM(mbuf_table); m++) {
1524 /* Make sure we didn't miss any */
1525 VERIFY(m_minlimit(m_class(m)) == 0 ||
1526 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1527 }
2d21ac55
A
1528
1529 lck_mtx_unlock(mbuf_mlock);
1530
6d2010ae
A
1531 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1532 NULL, &thread);
b0d623f7 1533 thread_deallocate(thread);
2d21ac55
A
1534
1535 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1536 0, 0, MCR_SLEEP);
1537
1538 /* Create the cache for each class */
1539 for (m = 0; m < NELEM(mbuf_table); m++) {
6d2010ae 1540 void *allocfunc, *freefunc, *auditfunc, *logfunc;
2d21ac55
A
1541 u_int32_t flags;
1542
1543 flags = mbuf_debug;
1544 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1545 m_class(m) == MC_MBUF_16KCL) {
1546 allocfunc = mbuf_cslab_alloc;
1547 freefunc = mbuf_cslab_free;
1548 auditfunc = mbuf_cslab_audit;
6d2010ae 1549 logfunc = mleak_logger;
2d21ac55
A
1550 } else {
1551 allocfunc = mbuf_slab_alloc;
1552 freefunc = mbuf_slab_free;
1553 auditfunc = mbuf_slab_audit;
6d2010ae 1554 logfunc = mleak_logger;
2d21ac55
A
1555 }
1556
1557 /*
1558 * Disable per-CPU caches for jumbo classes if there
1559 * is no jumbo cluster pool available in the system.
1560 * The cache itself is still created (but will never
1561 * be populated) since it simplifies the code.
1562 */
1563 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1564 njcl == 0)
1565 flags |= MCF_NOCPUCACHE;
1566
6d2010ae
A
1567 if (!mclfindleak)
1568 flags |= MCF_NOLEAKLOG;
1569
2d21ac55 1570 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
6d2010ae 1571 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
b0d623f7 1572 (void *)(uintptr_t)m, flags, MCR_SLEEP);
2d21ac55
A
1573 }
1574
1575 /*
1576 * Allocate structure for per-CPU statistics that's aligned
1577 * on the CPU cache boundary; this code assumes that we never
1578 * uninitialize this framework, since the original address
1579 * before alignment is not saved.
1580 */
1581 ncpu = ml_get_max_cpus();
39236c6e 1582 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
2d21ac55
A
1583 M_TEMP, M_WAITOK);
1584 VERIFY(buf != NULL);
1585
39236c6e
A
1586 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1587 CPU_CACHE_LINE_SIZE);
2d21ac55
A
1588 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1589
6d2010ae
A
1590 /*
1591 * Set the max limit on sb_max to be 1/16th of the size of
b0d623f7
A
1592 * memory allocated for mbuf clusters.
1593 */
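 /*
 * Shifting by (MCLSHIFT - 4) divides the cluster pool size by 16;
 * e.g. 32768 two-kilobyte clusters (a 64 MB pool) yield a 4 MB
 * ceiling for sb_max.
 */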
6d2010ae 1594 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
b0d623f7
A
1595 if (high_sb_max < sb_max) {
1596 /* sb_max is too large for this configuration, scale it down */
6d2010ae 1597 if (high_sb_max > (1 << MBSHIFT)) {
b0d623f7
A
1598 /* We have at least 16 MB of mbuf pool */
1599 sb_max = high_sb_max;
1600 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
6d2010ae
A
1601 /*
1602 * If we have more than 1 MB of mbuf pool, cap the size of
b0d623f7 1603 * the maximum socket buffer at 1 MB
6d2010ae 1604 */
b0d623f7
A
1605 sb_max = high_sb_max = (1 << MBSHIFT);
1606 } else {
1607 sb_max = high_sb_max;
1608 }
1609 }
1610
316670eb
A
1611 /* allocate space for mbuf_dump_buf */
1612 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1613 VERIFY(mbuf_dump_buf != NULL);
1614
39236c6e
A
1615 if (mbuf_debug & MCF_DEBUG) {
1616 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1617 (int)_MLEN, (int)_MHLEN);
1618 }
1619
1620 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
6d2010ae
A
1621 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1622 (nclusters << MCLSHIFT) >> MBSHIFT,
1623 (njcl << MCLSHIFT) >> MBSHIFT);
2d21ac55
A
1624}
1625
1626/*
1627 * Obtain a slab of object(s) from the class's freelist.
1628 */
1629static mcache_obj_t *
1630slab_alloc(mbuf_class_t class, int wait)
1631{
1632 mcl_slab_t *sp;
1633 mcache_obj_t *buf;
1634
1635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1636
1637 VERIFY(class != MC_16KCL || njcl > 0);
1638
1639 /* This should always be NULL for us */
1640 VERIFY(m_cobjlist(class) == NULL);
1641
1642 /*
1643 * Treat composite objects as having a longer lifespan by using
1644 * a slab from the reverse direction, in the hope that this
1645 * reduces the probability of fragmentation for slabs that hold
1646 * more than one buffer chunk (e.g. mbuf slabs). For other
1647 * slabs, this probably doesn't make much of a difference.
1648 */
6d2010ae 1649 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
2d21ac55
A
1650 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1651 else
1652 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1653
1654 if (sp == NULL) {
1655 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1656 /* The slab list for this class is empty */
1657 return (NULL);
1658 }
1659
1660 VERIFY(m_infree(class) > 0);
1661 VERIFY(!slab_is_detached(sp));
1662 VERIFY(sp->sl_class == class &&
1663 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1664 buf = sp->sl_head;
1665 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1666
1667 if (class == MC_MBUF) {
1668 sp->sl_head = buf->obj_next;
6d2010ae
A
1669 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1670 } else if (class == MC_CL) {
1671 sp->sl_head = buf->obj_next;
1672 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
2d21ac55
A
1673 } else {
1674 sp->sl_head = NULL;
1675 }
1676 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1677 slab_nextptr_panic(sp, sp->sl_head);
1678 /* In case sl_head is in the map but not in the slab */
1679 VERIFY(slab_inrange(sp, sp->sl_head));
1680 /* NOTREACHED */
1681 }
1682
1683 /* Increment slab reference */
1684 sp->sl_refcnt++;
1685
1686 if (mclaudit != NULL) {
1687 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1688 mca->mca_uflags = 0;
1689 /* Save contents on mbuf objects only */
1690 if (class == MC_MBUF)
1691 mca->mca_uflags |= MB_SCVALID;
1692 }
1693
1694 if (class == MC_CL) {
1695 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1696 /*
6d2010ae 1697 * A 2K cluster slab can have at most NCLPBG references.
2d21ac55 1698 */
6d2010ae
A
1699 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1700 sp->sl_chunks == NCLPBG &&
1701 sp->sl_len == m_maxsize(MC_BIGCL));
1702 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
2d21ac55 1703 } else if (class == MC_BIGCL) {
2d21ac55
A
1704 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1705 m_infree(MC_MBUF_BIGCL);
1706 /*
6d2010ae 1707 * A 4K cluster slab can have at most 1 reference.
2d21ac55
A
1708 */
1709 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
6d2010ae 1710 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2d21ac55
A
1711 } else if (class == MC_16KCL) {
1712 mcl_slab_t *nsp;
1713 int k;
1714
1715 --m_infree(MC_16KCL);
1716 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
6d2010ae 1717 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2d21ac55 1718 /*
6d2010ae
A
1719 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1720 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1721 * most 1 reference.
2d21ac55 1722 */
6d2010ae 1723 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
1724 nsp = nsp->sl_next;
1725 /* Next slab must already be present */
1726 VERIFY(nsp != NULL);
1727 nsp->sl_refcnt++;
1728 VERIFY(!slab_is_detached(nsp));
1729 VERIFY(nsp->sl_class == MC_16KCL &&
1730 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1731 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1732 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1733 nsp->sl_head == NULL);
1734 }
1735 } else {
6d2010ae 1736 VERIFY(class == MC_MBUF);
2d21ac55
A
1737 --m_infree(MC_MBUF);
1738 /*
1739 * If auditing is turned on, this check is
1740 * deferred until later in mbuf_slab_audit().
1741 */
1742 if (mclaudit == NULL)
1743 _MCHECK((struct mbuf *)buf);
1744 /*
1745 * Since we have incremented the reference count above,
6d2010ae 1746 * an mbuf slab (formerly a 4KB cluster slab that was cut
2d21ac55 1747 * up into mbufs) must have a reference count between 1
6d2010ae 1748 * and NMBPBG at this point.
2d21ac55 1749 */
6d2010ae
A
1750 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1751 sp->sl_chunks == NMBPBG &&
1752 sp->sl_len == m_maxsize(MC_BIGCL));
1753 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
2d21ac55
A
1754 }
1755
1756 /* If empty, remove this slab from the class's freelist */
1757 if (sp->sl_head == NULL) {
6d2010ae
A
1758 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1759 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
2d21ac55
A
1760 slab_remove(sp, class);
1761 }
1762
1763 return (buf);
1764}
1765
1766/*
1767 * Place a slab of object(s) back into a class's slab list.
1768 */
1769static void
1770slab_free(mbuf_class_t class, mcache_obj_t *buf)
1771{
1772 mcl_slab_t *sp;
1773
1774 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1775
1776 VERIFY(class != MC_16KCL || njcl > 0);
1777 VERIFY(buf->obj_next == NULL);
1778 sp = slab_get(buf);
1779 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1780 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1781
1782 /* Decrement slab reference */
1783 sp->sl_refcnt--;
1784
6d2010ae 1785 if (class == MC_CL) {
2d21ac55
A
1786 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1787 /*
6d2010ae
A
1788 * A slab that has been split into 2KB clusters can have
1789 * at most 1 outstanding reference at this point.
1790 */
1791 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1792 sp->sl_chunks == NCLPBG &&
1793 sp->sl_len == m_maxsize(MC_BIGCL));
1794 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1795 (slab_is_detached(sp) && sp->sl_head == NULL));
1796 } else if (class == MC_BIGCL) {
1797 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1798 /*
1799 * A 4KB cluster slab can have at most 1 reference
2d21ac55
A
1800 * which must be 0 at this point.
1801 */
1802 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1803 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1804 VERIFY(slab_is_detached(sp));
2d21ac55
A
1805 } else if (class == MC_16KCL) {
1806 mcl_slab_t *nsp;
1807 int k;
1808 /*
6d2010ae 1809 * A 16KB cluster takes NSLABSP16KB slabs, all of which must
2d21ac55
A
1810 * now have a reference count of 0.
1811 */
6d2010ae 1812 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
2d21ac55 1813 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
6d2010ae 1814 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2d21ac55 1815 VERIFY(slab_is_detached(sp));
6d2010ae 1816 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
1817 nsp = nsp->sl_next;
1818 /* Next slab must already be present */
1819 VERIFY(nsp != NULL);
1820 nsp->sl_refcnt--;
1821 VERIFY(slab_is_detached(nsp));
1822 VERIFY(nsp->sl_class == MC_16KCL &&
1823 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1824 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1825 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1826 nsp->sl_head == NULL);
1827 }
1828 } else {
1829 /*
6d2010ae
A
1830 * A slab that has been split into mbufs has at most NMBPBG
1831 * references. Since we have decremented one reference
1832 * above, it must now be between 0 and NMBPBG-1.
2d21ac55 1833 */
6d2010ae
A
1834 VERIFY(class == MC_MBUF);
1835 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1836 sp->sl_chunks == NMBPBG &&
1837 sp->sl_len == m_maxsize(MC_BIGCL));
1838 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
2d21ac55
A
1839 (slab_is_detached(sp) && sp->sl_head == NULL));
1840 }
1841
1842 /*
1843 * When auditing is enabled, ensure that the buffer still
1844 * contains the free pattern. Otherwise it got corrupted
1845 * while at the CPU cache layer.
1846 */
1847 if (mclaudit != NULL) {
1848 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
6d2010ae
A
1849 if (mclverify) {
1850 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1851 }
2d21ac55
A
1852 mca->mca_uflags &= ~MB_SCVALID;
1853 }
1854
1855 if (class == MC_CL) {
1856 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
6d2010ae 1857 buf->obj_next = sp->sl_head;
2d21ac55
A
1858 } else if (class == MC_BIGCL) {
1859 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1860 m_infree(MC_MBUF_BIGCL);
1861 } else if (class == MC_16KCL) {
1862 ++m_infree(MC_16KCL);
1863 } else {
1864 ++m_infree(MC_MBUF);
1865 buf->obj_next = sp->sl_head;
1866 }
1867 sp->sl_head = buf;
1868
6d2010ae
A
1869 /*
1870 * If a slab has been split into either one which holds 2KB clusters
1871 * or one which holds mbufs, turn it back into one which holds a 4KB
1872 * cluster.
1873 */
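 /*
 * Note that NMBPBG mbufs or NCLPBG 2 KB clusters together span exactly
 * one 4 KB cluster, which is why a split slab whose reference count has
 * dropped to zero can be coalesced back into a single MC_BIGCL element
 * below.
 */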
1874 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1875 m_total(class) > m_minlimit(class) &&
1876 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1877 int i = NMBPBG;
1878
1879 m_total(MC_BIGCL)++;
1880 mbstat.m_bigclusters = m_total(MC_BIGCL);
1881 m_total(MC_MBUF) -= NMBPBG;
2d21ac55 1882 mbstat.m_mbufs = m_total(MC_MBUF);
6d2010ae
A
1883 m_infree(MC_MBUF) -= NMBPBG;
1884 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1885
1886 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1887 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
2d21ac55
A
1888
1889 while (i--) {
1890 struct mbuf *m = sp->sl_head;
1891 VERIFY(m != NULL);
1892 sp->sl_head = m->m_next;
1893 m->m_next = NULL;
1894 }
1895 VERIFY(sp->sl_head == NULL);
1896
1897 /* Remove the slab from the mbuf class's slab list */
1898 slab_remove(sp, class);
1899
6d2010ae
A
1900 /* Reinitialize it as a 4KB cluster slab */
1901 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
2d21ac55
A
1902 sp->sl_len, 0, 1);
1903
6d2010ae 1904 if (mclverify) {
2d21ac55 1905 mcache_set_pattern(MCACHE_FREE_PATTERN,
6d2010ae
A
1906 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1907 }
1908 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1909 m_infree(MC_MBUF_BIGCL);
2d21ac55 1910
6d2010ae
A
1911 VERIFY(slab_is_detached(sp));
1912 /* And finally switch class */
1913 class = MC_BIGCL;
1914 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1915 m_total(class) > m_minlimit(class) &&
1916 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1917 int i = NCLPBG;
1918
1919 m_total(MC_BIGCL)++;
1920 mbstat.m_bigclusters = m_total(MC_BIGCL);
1921 m_total(MC_CL) -= NCLPBG;
1922 mbstat.m_clusters = m_total(MC_CL);
1923 m_infree(MC_CL) -= NCLPBG;
1924 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1925 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1926
1927 while (i--) {
1928 union mcluster *c = sp->sl_head;
1929 VERIFY(c != NULL);
1930 sp->sl_head = c->mcl_next;
1931 c->mcl_next = NULL;
1932 }
1933 VERIFY(sp->sl_head == NULL);
1934
1935 /* Remove the slab from the 2KB cluster class's slab list */
1936 slab_remove(sp, class);
1937
1938 /* Reinitialize it as a 4KB cluster slab */
1939 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1940 sp->sl_len, 0, 1);
1941
1942 if (mclverify) {
1943 mcache_set_pattern(MCACHE_FREE_PATTERN,
1944 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1945 }
1946 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1947 m_infree(MC_MBUF_BIGCL);
2d21ac55
A
1948
1949 VERIFY(slab_is_detached(sp));
1950 /* And finally switch class */
6d2010ae 1951 class = MC_BIGCL;
2d21ac55
A
1952 }
1953
1954 /* Reinsert the slab to the class's slab list */
1955 if (slab_is_detached(sp))
1956 slab_insert(sp, class);
1957}
1958
1959/*
1960 * Common allocator for rudimentary objects called by the CPU cache layer
1961 * during an allocation request whenever there is no available element in the
1962 * bucket layer. It returns one or more elements from the appropriate global
1963 * freelist. If the freelist is empty, it will attempt to populate it and
1964 * retry the allocation.
1965 */
1966static unsigned int
1967mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1968{
1969 mbuf_class_t class = (mbuf_class_t)arg;
1970 unsigned int need = num;
1971 mcache_obj_t **list = *plist;
1972
1973 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1974 ASSERT(need > 0);
1975
1976 lck_mtx_lock(mbuf_mlock);
1977
1978 for (;;) {
1979 if ((*list = slab_alloc(class, wait)) != NULL) {
1980 (*list)->obj_next = NULL;
1981 list = *plist = &(*list)->obj_next;
1982
1983 if (--need == 0) {
1984 /*
1985 * If the number of elements in the freelist has
1986 * dropped below the low watermark, asynchronously
1987 * populate the freelist now rather than doing
1988 * it later when we run out of elements.
1989 */
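 /*
 * The low watermark used here is 1/32 of the class
 * total, i.e. m_total(class) >> 5.
 */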
1990 if (!mbuf_cached_above(class, wait) &&
1991 m_infree(class) < m_total(class) >> 5) {
1992 (void) freelist_populate(class, 1,
1993 M_DONTWAIT);
1994 }
1995 break;
1996 }
1997 } else {
1998 VERIFY(m_infree(class) == 0 || class == MC_CL);
1999
2000 (void) freelist_populate(class, 1,
2001 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2002
2003 if (m_infree(class) > 0)
2004 continue;
2005
2006 /* Check if there's anything at the cache layer */
2007 if (mbuf_cached_above(class, wait))
2008 break;
2009
6d2010ae
A
2010 /* watchdog checkpoint */
2011 mbuf_watchdog();
2012
2d21ac55
A
2013 /* We have nothing and cannot block; give up */
2014 if (wait & MCR_NOSLEEP) {
2015 if (!(wait & MCR_TRYHARD)) {
2016 m_fail_cnt(class)++;
2017 mbstat.m_drops++;
2018 break;
2019 }
2020 }
2021
2022 /*
2023 * If the freelist is still empty and the caller is
2024 * willing to be blocked, sleep on the wait channel
2025 * until an element is available. Otherwise, if
2026 * MCR_TRYHARD is set, do our best to satisfy the
2027 * request without having to go to sleep.
2028 */
2029 if (mbuf_worker_ready &&
2030 mbuf_sleep(class, need, wait))
2031 break;
2032
2033 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2034 }
2035 }
2036
2037 m_alloc_cnt(class) += num - need;
2038 lck_mtx_unlock(mbuf_mlock);
2039
2040 return (num - need);
2041}
2042
2043/*
2044 * Common de-allocator for rudimentary objects called by the CPU cache
2045 * layer when one or more elements need to be returned to the appropriate
2046 * global freelist.
2047 */
2048static void
2049mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2050{
2051 mbuf_class_t class = (mbuf_class_t)arg;
2052 mcache_obj_t *nlist;
2053 unsigned int num = 0;
2054 int w;
2055
2056 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2057
2058 lck_mtx_lock(mbuf_mlock);
2059
2060 for (;;) {
2061 nlist = list->obj_next;
2062 list->obj_next = NULL;
2063 slab_free(class, list);
2064 ++num;
2065 if ((list = nlist) == NULL)
2066 break;
2067 }
2068 m_free_cnt(class) += num;
2069
2070 if ((w = mb_waiters) > 0)
2071 mb_waiters = 0;
2072
2073 lck_mtx_unlock(mbuf_mlock);
2074
2075 if (w != 0)
2076 wakeup(mb_waitchan);
2077}
2078
2079/*
2080 * Common auditor for rudimentary objects called by the CPU cache layer
2081 * during an allocation or free request. For the former, this is called
2082 * after the objects are obtained from either the bucket or slab layer
2083 * and before they are returned to the caller. For the latter, this is
2084 * called immediately during free and before placing the objects into
2085 * the bucket or slab layer.
2086 */
2087static void
2088mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2089{
2090 mbuf_class_t class = (mbuf_class_t)arg;
2091 mcache_audit_t *mca;
2092
2093 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2094
2095 while (list != NULL) {
2096 lck_mtx_lock(mbuf_mlock);
2097 mca = mcl_audit_buf2mca(class, list);
2098
2099 /* Do the sanity checks */
2100 if (class == MC_MBUF) {
2101 mcl_audit_mbuf(mca, list, FALSE, alloc);
2102 ASSERT(mca->mca_uflags & MB_SCVALID);
2103 } else {
2104 mcl_audit_cluster(mca, list, m_maxsize(class),
2105 alloc, TRUE);
2106 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2107 }
2108 /* Record this transaction */
6d2010ae 2109 if (mcltrace)
39236c6e 2110 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
6d2010ae 2111
2d21ac55
A
2112 if (alloc)
2113 mca->mca_uflags |= MB_INUSE;
2114 else
2115 mca->mca_uflags &= ~MB_INUSE;
2116 /* Unpair the object (unconditionally) */
2117 mca->mca_uptr = NULL;
2118 lck_mtx_unlock(mbuf_mlock);
2119
2120 list = list->obj_next;
2121 }
2122}
2123
2124/*
2125 * Common notify routine for all caches. It is called by mcache when
2126 * one or more objects get freed. We use this indication to trigger
2127 * the wakeup of any sleeping threads so that they can retry their
2128 * allocation requests.
2129 */
2130static void
2131mbuf_slab_notify(void *arg, u_int32_t reason)
2132{
2133 mbuf_class_t class = (mbuf_class_t)arg;
2134 int w;
2135
2136 ASSERT(MBUF_CLASS_VALID(class));
2137
2138 if (reason != MCN_RETRYALLOC)
2139 return;
2140
2141 lck_mtx_lock(mbuf_mlock);
2142 if ((w = mb_waiters) > 0) {
2143 m_notified(class)++;
2144 mb_waiters = 0;
2145 }
2146 lck_mtx_unlock(mbuf_mlock);
2147
2148 if (w != 0)
2149 wakeup(mb_waitchan);
2150}
2151
2152/*
2153 * Obtain object(s) from the composite class's freelist.
2154 */
2155static unsigned int
2156cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2157{
2158 unsigned int need = num;
2159 mcl_slab_t *sp, *clsp, *nsp;
2160 struct mbuf *m;
2161 mcache_obj_t **list = *plist;
2162 void *cl;
2163
2164 VERIFY(need > 0);
2165 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2166 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2167
2168 /* Get what we can from the freelist */
2169 while ((*list = m_cobjlist(class)) != NULL) {
2170 MRANGE(*list);
2171
2172 m = (struct mbuf *)*list;
2173 sp = slab_get(m);
2174 cl = m->m_ext.ext_buf;
2175 clsp = slab_get(cl);
2176 VERIFY(m->m_flags == M_EXT && cl != NULL);
2177 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
6d2010ae
A
2178
2179 if (class == MC_MBUF_CL) {
2180 VERIFY(clsp->sl_refcnt >= 1 &&
2181 clsp->sl_refcnt <= NCLPBG);
2182 } else {
2183 VERIFY(clsp->sl_refcnt == 1);
2184 }
2185
2186 if (class == MC_MBUF_16KCL) {
2d21ac55 2187 int k;
6d2010ae 2188 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
2189 nsp = nsp->sl_next;
2190 /* Next slab must already be present */
2191 VERIFY(nsp != NULL);
2192 VERIFY(nsp->sl_refcnt == 1);
2193 }
2194 }
2195
2196 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2197 !MBUF_IN_MAP(m_cobjlist(class))) {
2198 slab_nextptr_panic(sp, m_cobjlist(class));
2199 /* NOTREACHED */
2200 }
2201 (*list)->obj_next = NULL;
2202 list = *plist = &(*list)->obj_next;
2203
2204 if (--need == 0)
2205 break;
2206 }
2207 m_infree(class) -= (num - need);
2208
2209 return (num - need);
2210}
2211
2212/*
2213 * Place object(s) back into a composite class's freelist.
2214 */
2215static unsigned int
2216cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2217{
2218 mcache_obj_t *o, *tail;
2219 unsigned int num = 0;
2220 struct mbuf *m, *ms;
2221 mcache_audit_t *mca = NULL;
2222 mcache_obj_t *ref_list = NULL;
2223 mcl_slab_t *clsp, *nsp;
2224 void *cl;
6d2010ae 2225 mbuf_class_t cl_class;
2d21ac55
A
2226
2227 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2228 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2229 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2230
6d2010ae
A
2231 if (class == MC_MBUF_CL) {
2232 cl_class = MC_CL;
2233 } else if (class == MC_MBUF_BIGCL) {
2234 cl_class = MC_BIGCL;
2235 } else {
2236 VERIFY(class == MC_MBUF_16KCL);
2237 cl_class = MC_16KCL;
2238 }
2239
2d21ac55
A
2240 o = tail = list;
2241
2242 while ((m = ms = (struct mbuf *)o) != NULL) {
2243 mcache_obj_t *rfa, *nexto = o->obj_next;
2244
2245 /* Do the mbuf sanity checks */
2246 if (mclaudit != NULL) {
2247 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6d2010ae
A
2248 if (mclverify) {
2249 mcache_audit_free_verify(mca, m, 0,
2250 m_maxsize(MC_MBUF));
2251 }
39236c6e 2252 ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
2253 }
2254
2255 /* Do the cluster sanity checks */
2256 cl = ms->m_ext.ext_buf;
2257 clsp = slab_get(cl);
6d2010ae
A
2258 if (mclverify) {
2259 size_t size = m_maxsize(cl_class);
2260 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2d21ac55
A
2261 (mcache_obj_t *)cl), cl, 0, size);
2262 }
2263 VERIFY(ms->m_type == MT_FREE);
2264 VERIFY(ms->m_flags == M_EXT);
2265 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
6d2010ae
A
2266 if (cl_class == MC_CL) {
2267 VERIFY(clsp->sl_refcnt >= 1 &&
2268 clsp->sl_refcnt <= NCLPBG);
2269 } else {
2270 VERIFY(clsp->sl_refcnt == 1);
2271 }
2272 if (cl_class == MC_16KCL) {
2d21ac55 2273 int k;
6d2010ae 2274 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
2275 nsp = nsp->sl_next;
2276 /* Next slab must already be present */
2277 VERIFY(nsp != NULL);
2278 VERIFY(nsp->sl_refcnt == 1);
2279 }
2280 }
2281
2282 /*
2283 * If we're asked to purge, restore the actual mbuf using the
2284 * contents of the shadow structure (if auditing is enabled)
2285 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2286 * about to free it and the attached cluster into their caches.
2287 */
2288 if (purged) {
2289 /* Restore constructed mbuf fields */
2290 if (mclaudit != NULL)
2291 mcl_audit_restore_mbuf(m, mca, TRUE);
2292
2293 MEXT_REF(m) = 0;
2294 MEXT_FLAGS(m) = 0;
2295
316670eb 2296 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2d21ac55
A
2297 rfa->obj_next = ref_list;
2298 ref_list = rfa;
2299 MEXT_RFA(m) = NULL;
2300
2301 m->m_type = MT_FREE;
2302 m->m_flags = m->m_len = 0;
2303 m->m_next = m->m_nextpkt = NULL;
2304
2305 /* Save mbuf fields and make auditing happy */
2306 if (mclaudit != NULL)
2307 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2308
2309 VERIFY(m_total(class) > 0);
2310 m_total(class)--;
2311
2312 /* Free the mbuf */
2313 o->obj_next = NULL;
2314 slab_free(MC_MBUF, o);
2315
2316 /* And free the cluster */
2317 ((mcache_obj_t *)cl)->obj_next = NULL;
2318 if (class == MC_MBUF_CL)
2319 slab_free(MC_CL, cl);
2320 else if (class == MC_MBUF_BIGCL)
2321 slab_free(MC_BIGCL, cl);
2322 else
2323 slab_free(MC_16KCL, cl);
2324 }
2325
2326 ++num;
2327 tail = o;
2328 o = nexto;
2329 }
2330
2331 if (!purged) {
2332 tail->obj_next = m_cobjlist(class);
2333 m_cobjlist(class) = list;
2334 m_infree(class) += num;
2335 } else if (ref_list != NULL) {
2336 mcache_free_ext(ref_cache, ref_list);
2337 }
2338
2339 return (num);
2340}
2341
2342/*
2343 * Common allocator for composite objects called by the CPU cache layer
2344 * during an allocation request whenever there is no available element in
2345 * the bucket layer. It returns one or more composite elements from the
2346 * appropriate global freelist. If the freelist is empty, it will attempt
2347 * to obtain the rudimentary objects from their caches and construct them
2348 * into composite mbuf + cluster objects.
2349 */
2350static unsigned int
2351mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2352 int wait)
2353{
2354 mbuf_class_t class = (mbuf_class_t)arg;
6d2010ae 2355 mbuf_class_t cl_class = 0;
2d21ac55
A
2356 unsigned int num = 0, cnum = 0, want = needed;
2357 mcache_obj_t *ref_list = NULL;
2358 mcache_obj_t *mp_list = NULL;
2359 mcache_obj_t *clp_list = NULL;
2360 mcache_obj_t **list;
2361 struct ext_ref *rfa;
2362 struct mbuf *m;
2363 void *cl;
2364
2365 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2366 ASSERT(needed > 0);
2367
2368 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2369
2370 /* There should not be any slab for this class */
2371 VERIFY(m_slab_cnt(class) == 0 &&
2372 m_slablist(class).tqh_first == NULL &&
2373 m_slablist(class).tqh_last == NULL);
2374
2375 lck_mtx_lock(mbuf_mlock);
2376
2377 /* Try using the freelist first */
2378 num = cslab_alloc(class, plist, needed);
2379 list = *plist;
2380 if (num == needed) {
2381 m_alloc_cnt(class) += num;
2382 lck_mtx_unlock(mbuf_mlock);
2383 return (needed);
2384 }
2385
2386 lck_mtx_unlock(mbuf_mlock);
2387
2388 /*
2389 * We could not satisfy the request using the freelist alone;
2390 * allocate from the appropriate rudimentary caches and use
2391 * whatever we can get to construct the composite objects.
2392 */
2393 needed -= num;
2394
2395 /*
2396 * Mark these allocation requests as coming from a composite cache.
2397 * Also, if the caller is willing to be blocked, mark the request
2398 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2399 * slab layer waiting for the individual object when one or more
2400 * of the already-constructed composite objects are available.
2401 */
2402 wait |= MCR_COMP;
2403 if (!(wait & MCR_NOSLEEP))
2404 wait |= MCR_FAILOK;
2405
6d2010ae 2406 /* allocate mbufs */
2d21ac55
A
2407 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2408 if (needed == 0) {
2409 ASSERT(mp_list == NULL);
2410 goto fail;
2411 }
6d2010ae
A
2412
2413 /* allocate clusters */
2414 if (class == MC_MBUF_CL) {
2415 cl_class = MC_CL;
2416 } else if (class == MC_MBUF_BIGCL) {
2417 cl_class = MC_BIGCL;
2418 } else {
2419 VERIFY(class == MC_MBUF_16KCL);
2420 cl_class = MC_16KCL;
2421 }
2422 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2d21ac55
A
2423 if (needed == 0) {
2424 ASSERT(clp_list == NULL);
2425 goto fail;
2426 }
6d2010ae 2427
2d21ac55
A
2428 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2429 if (needed == 0) {
2430 ASSERT(ref_list == NULL);
2431 goto fail;
2432 }
2433
2434 /*
2435 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2436 * leftovers will get freed accordingly before we return to the caller.
2437 */
2438 for (cnum = 0; cnum < needed; cnum++) {
2439 struct mbuf *ms;
2440
2441 m = ms = (struct mbuf *)mp_list;
2442 mp_list = mp_list->obj_next;
2443
2444 cl = clp_list;
2445 clp_list = clp_list->obj_next;
2446 ((mcache_obj_t *)cl)->obj_next = NULL;
2447
2448 rfa = (struct ext_ref *)ref_list;
2449 ref_list = ref_list->obj_next;
316670eb 2450 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2d21ac55
A
2451
2452 /*
2453 * If auditing is enabled, construct the shadow mbuf
2454 * in the audit structure instead of in the actual one.
2455 * mbuf_cslab_audit() will take care of restoring the
2456 * contents after the integrity check.
2457 */
2458 if (mclaudit != NULL) {
2459 mcache_audit_t *mca, *cl_mca;
2d21ac55
A
2460
2461 lck_mtx_lock(mbuf_mlock);
2462 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
39236c6e 2463 ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
2464 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2465
2466 /*
2467 * Pair them up. Note that this is done at the time
2468 * the mbuf+cluster objects are constructed. This
2469 * information should be treated as a "best effort"
2470 * debugging hint, since more than one mbuf can refer
2471 * to a cluster. In that case, the cluster might not
2472 * be freed along with the mbuf it was paired with.
2473 */
2474 mca->mca_uptr = cl_mca;
2475 cl_mca->mca_uptr = mca;
2476
2477 ASSERT(mca->mca_uflags & MB_SCVALID);
2478 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2479 lck_mtx_unlock(mbuf_mlock);
2480
2481 /* Technically, they are in the freelist */
6d2010ae
A
2482 if (mclverify) {
2483 size_t size;
2484
2485 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2486 m_maxsize(MC_MBUF));
2487
2488 if (class == MC_MBUF_CL)
2489 size = m_maxsize(MC_CL);
2490 else if (class == MC_MBUF_BIGCL)
2491 size = m_maxsize(MC_BIGCL);
2492 else
2493 size = m_maxsize(MC_16KCL);
2494
2495 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2496 size);
2497 }
2d21ac55
A
2498 }
2499
2500 MBUF_INIT(ms, 0, MT_FREE);
2501 if (class == MC_MBUF_16KCL) {
2502 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2503 } else if (class == MC_MBUF_BIGCL) {
2504 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2505 } else {
2506 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2507 }
2508 VERIFY(ms->m_flags == M_EXT);
2509 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2510
2511 *list = (mcache_obj_t *)m;
2512 (*list)->obj_next = NULL;
2513 list = *plist = &(*list)->obj_next;
2514 }
2515
2516fail:
2517 /*
2518 * Free up what's left of the above.
2519 */
2520 if (mp_list != NULL)
2521 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2522 if (clp_list != NULL)
6d2010ae 2523 mcache_free_ext(m_cache(cl_class), clp_list);
2d21ac55
A
2524 if (ref_list != NULL)
2525 mcache_free_ext(ref_cache, ref_list);
2526
2527 lck_mtx_lock(mbuf_mlock);
2528 if (num > 0 || cnum > 0) {
2529 m_total(class) += cnum;
2530 VERIFY(m_total(class) <= m_maxlimit(class));
2531 m_alloc_cnt(class) += num + cnum;
2532 }
2533 if ((num + cnum) < want)
2534 m_fail_cnt(class) += (want - (num + cnum));
2535 lck_mtx_unlock(mbuf_mlock);
2536
2537 return (num + cnum);
2538}
2539
2540/*
2541 * Common de-allocator for composite objects called by the CPU cache
2542 * layer when one or more elements need to be returned to the appropriate
2543 * global freelist.
2544 */
2545static void
2546mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2547{
2548 mbuf_class_t class = (mbuf_class_t)arg;
2549 unsigned int num;
2550 int w;
2551
2552 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2553
2554 lck_mtx_lock(mbuf_mlock);
2555
2556 num = cslab_free(class, list, purged);
2557 m_free_cnt(class) += num;
2558
2559 if ((w = mb_waiters) > 0)
2560 mb_waiters = 0;
2561
2562 lck_mtx_unlock(mbuf_mlock);
2563
2564 if (w != 0)
2565 wakeup(mb_waitchan);
2566}
2567
2568/*
2569 * Common auditor for composite objects called by the CPU cache layer
2570 * during an allocation or free request. For the former, this is called
2571 * after the objects are obtained from either the bucket or slab layer
2572 * and before they are returned to the caller. For the latter, this is
2573 * called immediately during free and before placing the objects into
2574 * the bucket or slab layer.
2575 */
2576static void
2577mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2578{
2579 mbuf_class_t class = (mbuf_class_t)arg;
2580 mcache_audit_t *mca;
2581 struct mbuf *m, *ms;
2582 mcl_slab_t *clsp, *nsp;
2583 size_t size;
2584 void *cl;
2585
2586 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2587
2588 while ((m = ms = (struct mbuf *)list) != NULL) {
2589 lck_mtx_lock(mbuf_mlock);
2590 /* Do the mbuf sanity checks and record its transaction */
2591 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2592 mcl_audit_mbuf(mca, m, TRUE, alloc);
6d2010ae 2593 if (mcltrace)
39236c6e 2594 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
6d2010ae 2595
2d21ac55
A
2596 if (alloc)
2597 mca->mca_uflags |= MB_COMP_INUSE;
2598 else
2599 mca->mca_uflags &= ~MB_COMP_INUSE;
2600
2601 /*
2602 * Use the shadow mbuf in the audit structure if we are
2603 * freeing, since the contents of the actual mbuf have been
2604 * pattern-filled by the above call to mcl_audit_mbuf().
2605 */
6d2010ae 2606 if (!alloc && mclverify)
39236c6e 2607 ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
2608
2609 /* Do the cluster sanity checks and record its transaction */
2610 cl = ms->m_ext.ext_buf;
2611 clsp = slab_get(cl);
2612 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2613 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
6d2010ae
A
2614 if (class == MC_MBUF_CL)
2615 VERIFY(clsp->sl_refcnt >= 1 &&
2616 clsp->sl_refcnt <= NCLPBG);
2617 else
2618 VERIFY(clsp->sl_refcnt == 1);
2619
2620 if (class == MC_MBUF_16KCL) {
2d21ac55 2621 int k;
6d2010ae 2622 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
2623 nsp = nsp->sl_next;
2624 /* Next slab must already be present */
2625 VERIFY(nsp != NULL);
2626 VERIFY(nsp->sl_refcnt == 1);
2627 }
2628 }
2629
2630 mca = mcl_audit_buf2mca(MC_CL, cl);
2631 if (class == MC_MBUF_CL)
2632 size = m_maxsize(MC_CL);
2633 else if (class == MC_MBUF_BIGCL)
2634 size = m_maxsize(MC_BIGCL);
2635 else
2636 size = m_maxsize(MC_16KCL);
2637 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
6d2010ae 2638 if (mcltrace)
39236c6e 2639 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
6d2010ae 2640
2d21ac55
A
2641 if (alloc)
2642 mca->mca_uflags |= MB_COMP_INUSE;
2643 else
2644 mca->mca_uflags &= ~MB_COMP_INUSE;
2645 lck_mtx_unlock(mbuf_mlock);
2646
2647 list = list->obj_next;
2648 }
2649}
2650
2651/*
2652 * Allocate some number of mbuf clusters and place on cluster freelist.
2653 */
2654static int
2655m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2656{
2657 int i;
2658 vm_size_t size = 0;
b0d623f7 2659 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2d21ac55
A
2660 vm_offset_t page = 0;
2661 mcache_audit_t *mca_list = NULL;
2662 mcache_obj_t *con_list = NULL;
2663 mcl_slab_t *sp;
2664
6d2010ae
A
2665 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2666 bufsize == m_maxsize(MC_16KCL));
2d21ac55
A
2667
2668 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2669
2670 /*
2671 * Multiple threads may attempt to populate the cluster map one
2672 * after another. Since we drop the lock below prior to acquiring
2673 * the physical page(s), our view of the cluster map may no longer
2674 * be accurate, and we could end up over-committing the pages beyond
2675 * the maximum allowed for each class. To prevent that, this entire
2676 * operation (including the page mapping) is serialized.
2677 */
2678 while (mb_clalloc_busy) {
2679 mb_clalloc_waiters++;
2680 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2681 (PZERO-1), "m_clalloc", NULL);
2682 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2683 }
2684
2685 /* We are busy now; tell everyone else to go away */
2686 mb_clalloc_busy = TRUE;
2687
2688 /*
2689 * Honor the caller's wish to block or not block. We have a way
2690 * to grow the pool asynchronously using the mbuf worker thread.
2691 */
2692 i = m_howmany(num, bufsize);
2693 if (i == 0 || (wait & M_DONTWAIT))
2694 goto out;
2695
2696 lck_mtx_unlock(mbuf_mlock);
2697
b0d623f7
A
2698 size = round_page(i * bufsize);
2699 page = kmem_mb_alloc(mb_map, size, large_buffer);
2700
2701 /*
6d2010ae 2702 * If we did ask for "n" 16KB physically contiguous chunks
b0d623f7
A
2703 * and didn't get them, then please try again without this
2704 * restriction.
2705 */
2706 if (large_buffer && page == 0)
2707 page = kmem_mb_alloc(mb_map, size, 0);
2d21ac55
A
2708
2709 if (page == 0) {
6d2010ae
A
2710 if (bufsize == m_maxsize(MC_BIGCL)) {
2711 /* The allocation failed; retry with a single page (4KB requests only) */
2d21ac55 2712 size = NBPG;
b0d623f7 2713 page = kmem_mb_alloc(mb_map, size, 0);
2d21ac55
A
2714 }
2715
2716 if (page == 0) {
2717 lck_mtx_lock(mbuf_mlock);
2718 goto out;
2719 }
2720 }
2721
2722 VERIFY(IS_P2ALIGNED(page, NBPG));
2723 numpages = size / NBPG;
2724
2725 /* If auditing is enabled, allocate the audit structures now */
2726 if (mclaudit != NULL) {
2727 int needed;
2728
2729 /*
2730 * Yes, I realize this is a waste of memory for clusters
2731 * that never get transformed into mbufs, as we may end
6d2010ae 2732 * up with NMBPBG-1 unused audit structures per cluster.
2d21ac55
A
2733 * But doing so tremendously simplifies the allocation
2734 * strategy, since at this point we are not holding the
6d2010ae 2735 * mbuf lock and the caller is okay to be blocked.
2d21ac55 2736 */
6d2010ae
A
2737 if (bufsize == m_maxsize(MC_BIGCL)) {
2738 needed = numpages * NMBPBG;
2d21ac55
A
2739
2740 i = mcache_alloc_ext(mcl_audit_con_cache,
2741 &con_list, needed, MCR_SLEEP);
2742
2743 VERIFY(con_list != NULL && i == needed);
2d21ac55 2744 } else {
6d2010ae 2745 needed = numpages / NSLABSP16KB;
2d21ac55
A
2746 }
2747
2748 i = mcache_alloc_ext(mcache_audit_cache,
2749 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2750
2751 VERIFY(mca_list != NULL && i == needed);
2752 }
2753
2754 lck_mtx_lock(mbuf_mlock);
2755
2756 for (i = 0; i < numpages; i++, page += NBPG) {
2757 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
99c3a104 2758 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2d21ac55
A
2759
2760 /*
39236c6e
A
2761 * If there is a mapper, the appropriate I/O page is returned;
2762 * zero out the page to discard its past contents to prevent
2763 * exposing leftover kernel memory.
2d21ac55 2764 */
b0d623f7 2765 VERIFY(offset < mcl_pages);
39236c6e
A
2766 if (mcl_paddr_base != 0) {
2767 bzero((void *)(uintptr_t) page, page_size);
2768 new_page = IOMapperInsertPage(mcl_paddr_base,
2769 offset, new_page);
99c3a104 2770 }
39236c6e 2771 mcl_paddr[offset] = new_page;
2d21ac55
A
2772
2773 /* Pattern-fill this fresh page */
6d2010ae 2774 if (mclverify) {
2d21ac55
A
2775 mcache_set_pattern(MCACHE_FREE_PATTERN,
2776 (caddr_t)page, NBPG);
6d2010ae
A
2777 }
2778 if (bufsize == m_maxsize(MC_BIGCL)) {
2d21ac55 2779 union mbigcluster *mbc = (union mbigcluster *)page;
2d21ac55
A
2780
2781 /* One for the entire page */
2782 sp = slab_get(mbc);
6d2010ae
A
2783 if (mclaudit != NULL) {
2784 mcl_audit_init(mbc, &mca_list, &con_list,
2785 AUDIT_CONTENTS_SIZE, NMBPBG);
2786 }
2d21ac55
A
2787 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2788 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2789 mbc, mbc, bufsize, 0, 1);
2790
2d21ac55
A
2791 /* Insert this slab */
2792 slab_insert(sp, MC_BIGCL);
2793
2794 /* Update stats now since slab_get() drops the lock */
2795 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2796 m_infree(MC_MBUF_BIGCL);
2797 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2798 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
6d2010ae 2799 } else if ((i % NSLABSP16KB) == 0) {
2d21ac55
A
2800 union m16kcluster *m16kcl = (union m16kcluster *)page;
2801 mcl_slab_t *nsp;
2802 int k;
2803
2804 VERIFY(njcl > 0);
2805 /* One for the entire 16KB */
2806 sp = slab_get(m16kcl);
2807 if (mclaudit != NULL)
2808 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2809
2810 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2811 slab_init(sp, MC_16KCL, SLF_MAPPED,
2812 m16kcl, m16kcl, bufsize, 0, 1);
2813
6d2010ae
A
2814 /*
2815 * 2nd-Nth page's slab is part of the first one,
2816 * where N is NSLABSP16KB.
2817 */
2818 for (k = 1; k < NSLABSP16KB; k++) {
2819 nsp = slab_get(((union mbigcluster *)page) + k);
2d21ac55
A
2820 VERIFY(nsp->sl_refcnt == 0 &&
2821 nsp->sl_flags == 0);
2822 slab_init(nsp, MC_16KCL,
2823 SLF_MAPPED | SLF_PARTIAL,
2824 m16kcl, NULL, 0, 0, 0);
2825 }
2826
2827 /* Insert this slab */
2828 slab_insert(sp, MC_16KCL);
2829
2830 /* Update stats now since slab_get() drops the lock */
2831 m_infree(MC_16KCL)++;
2832 m_total(MC_16KCL)++;
2833 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2834 }
2835 }
2836 VERIFY(mca_list == NULL && con_list == NULL);
2837
2838 /* We're done; let others enter */
2839 mb_clalloc_busy = FALSE;
2840 if (mb_clalloc_waiters > 0) {
2841 mb_clalloc_waiters = 0;
2842 wakeup(mb_clalloc_waitchan);
2843 }
2844
6d2010ae 2845 if (bufsize == m_maxsize(MC_BIGCL))
2d21ac55
A
2846 return (numpages);
2847
2848 VERIFY(bufsize == m_maxsize(MC_16KCL));
6d2010ae 2849 return (numpages / NSLABSP16KB);
2d21ac55
A
2850
2851out:
2852 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2853
2854 /* We're done; let others enter */
2855 mb_clalloc_busy = FALSE;
2856 if (mb_clalloc_waiters > 0) {
2857 mb_clalloc_waiters = 0;
2858 wakeup(mb_clalloc_waitchan);
2859 }
2860
2861 /*
2862 * When non-blocking, we kick a thread if we have to grow the
2863 * pool or if the number of free clusters is less than requested.
2864 */
6d2010ae 2865 if (bufsize == m_maxsize(MC_BIGCL)) {
2d21ac55
A
2866 if (i > 0) {
2867 /*
2868 * Remember total number of 4KB clusters needed
2869 * at this time.
2870 */
2871 i += m_total(MC_BIGCL);
2872 if (i > mbuf_expand_big) {
2873 mbuf_expand_big = i;
2874 if (mbuf_worker_ready)
2875 wakeup((caddr_t)&mbuf_worker_run);
2876 }
2877 }
2878
2879 if (m_infree(MC_BIGCL) >= num)
2880 return (1);
2881 } else {
2882 if (i > 0) {
2883 /*
2884 * Remember total number of 16KB clusters needed
2885 * at this time.
2886 */
2887 i += m_total(MC_16KCL);
2888 if (i > mbuf_expand_16k) {
2889 mbuf_expand_16k = i;
2890 if (mbuf_worker_ready)
2891 wakeup((caddr_t)&mbuf_worker_run);
2892 }
2893 }
2894
2895 if (m_infree(MC_16KCL) >= num)
2896 return (1);
2897 }
2898 return (0);
2899}
2900
2901/*
2902 * Populate the global freelist of the corresponding buffer class.
2903 */
2904static int
2905freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2906{
2907 mcache_obj_t *o = NULL;
6d2010ae 2908 int i, numpages = 0, count;
2d21ac55
A
2909
2910 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2911 class == MC_16KCL);
2912
2d21ac55
A
2913 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2914
2915 switch (class) {
2916 case MC_MBUF:
2917 case MC_CL:
6d2010ae
A
2918 case MC_BIGCL:
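 /*
 * Rough sizing example, assuming 256-byte mbufs and 4 KB pages:
 * populating 512 mbufs rounds up to 32 pages below.
 */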
2919 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2920 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2d21ac55 2921
6d2010ae
A
2922 /* Respect the 4KB clusters minimum limit */
2923 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2924 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2925 if (class != MC_BIGCL || (wait & MCR_COMP))
2d21ac55
A
2926 return (0);
2927 }
6d2010ae 2928 if (class == MC_BIGCL)
2d21ac55
A
2929 return (i != 0);
2930 break;
2931
2d21ac55
A
2932 case MC_16KCL:
2933 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2934 /* NOTREACHED */
2935
2936 default:
2937 VERIFY(0);
2938 /* NOTREACHED */
2939 }
2940
6d2010ae
A
2941 VERIFY(class == MC_MBUF || class == MC_CL);
2942
2943 /* how many objects will we cut the page into? */
2944 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2945
2946 for (count = 0; count < numpages; count++) {
2947
2948 /* respect totals, minlimit, maxlimit */
2949 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2950 m_total(class) >= m_maxlimit(class))
2951 break;
2952
2953 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2954 break;
2955
2d21ac55 2956 struct mbuf *m = (struct mbuf *)o;
6d2010ae 2957 union mcluster *c = (union mcluster *)o;
2d21ac55 2958 mcl_slab_t *sp = slab_get(o);
6d2010ae 2959 mcache_audit_t *mca = NULL;
2d21ac55
A
2960
2961 VERIFY(slab_is_detached(sp) &&
2962 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2963
6d2010ae
A
2964 /*
2965 * Make sure that the cluster is unmolested
2966 * while it is on the freelist
2967 */
2968 if (mclverify) {
2969 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2970 mcache_audit_free_verify(mca, o, 0,
2971 m_maxsize(MC_BIGCL));
2d21ac55
A
2972 }
2973
6d2010ae
A
2974 /* Reinitialize it as an mbuf or 2K slab */
2975 slab_init(sp, class, sp->sl_flags,
2976 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2d21ac55 2977
6d2010ae 2978 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2d21ac55
A
2979 VERIFY(sp->sl_head == NULL);
2980
6d2010ae
A
2981 VERIFY(m_total(MC_BIGCL) > 0);
2982 m_total(MC_BIGCL)--;
2983 mbstat.m_bigclusters = m_total(MC_BIGCL);
2d21ac55 2984
6d2010ae
A
2985 m_total(class) += numobj;
2986 m_infree(class) += numobj;
2987
2988 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2989 VERIFY(m_total(class) <= m_maxlimit(class));
2990
2991 i = numobj;
2992 if (class == MC_MBUF) {
2993 mbstat.m_mbufs = m_total(MC_MBUF);
2994 mtype_stat_add(MT_FREE, NMBPBG);
2995 while (i--) {
2996 /*
2997 * If auditing is enabled, construct the
2998 * shadow mbuf in the audit structure
2999 * instead of the actual one.
3000 * mbuf_slab_audit() will take care of
3001 * restoring the contents after the
3002 * integrity check.
3003 */
3004 if (mclaudit != NULL) {
3005 struct mbuf *ms;
3006 mca = mcl_audit_buf2mca(MC_MBUF,
3007 (mcache_obj_t *)m);
39236c6e 3008 ms = MCA_SAVED_MBUF_PTR(mca);
6d2010ae
A
3009 ms->m_type = MT_FREE;
3010 } else {
3011 m->m_type = MT_FREE;
3012 }
3013 m->m_next = sp->sl_head;
3014 sp->sl_head = (void *)m++;
3015 }
3016 } else { /* MC_CL */
3017 mbstat.m_clfree =
3018 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3019 mbstat.m_clusters = m_total(MC_CL);
3020 while (i--) {
3021 c->mcl_next = sp->sl_head;
3022 sp->sl_head = (void *)c++;
2d21ac55 3023 }
2d21ac55
A
3024 }
3025
6d2010ae
A
3026 /* Insert into the mbuf or 2k slab list */
3027 slab_insert(sp, class);
2d21ac55
A
3028
3029 if ((i = mb_waiters) > 0)
3030 mb_waiters = 0;
3031 if (i != 0)
3032 wakeup(mb_waitchan);
2d21ac55 3033 }
6d2010ae
A
3034 return (count != 0);
3035}
2d21ac55 3036
6d2010ae
A
3037/*
3038 * For each class, initialize the freelist to hold m_minlimit() objects.
3039 */
3040static void
3041freelist_init(mbuf_class_t class)
3042{
3043 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3044
3045 VERIFY(class == MC_CL || class == MC_BIGCL);
3046 VERIFY(m_total(class) == 0);
3047 VERIFY(m_minlimit(class) > 0);
3048
3049 while (m_total(class) < m_minlimit(class))
3050 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3051
3052 VERIFY(m_total(class) >= m_minlimit(class));
2d21ac55
A
3053}
3054
3055/*
3056 * (Inaccurately) check if it might be worth a trip back to the
3057 * mcache layer due to the availability of objects there. We'll
3058 * end up back here if there's nothing up there.
3059 */
3060static boolean_t
3061mbuf_cached_above(mbuf_class_t class, int wait)
3062{
3063 switch (class) {
3064 case MC_MBUF:
3065 if (wait & MCR_COMP)
3066 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3067 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3068 break;
3069
3070 case MC_CL:
3071 if (wait & MCR_COMP)
3072 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3073 break;
3074
3075 case MC_BIGCL:
3076 if (wait & MCR_COMP)
3077 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3078 break;
3079
3080 case MC_16KCL:
3081 if (wait & MCR_COMP)
3082 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3083 break;
3084
3085 case MC_MBUF_CL:
3086 case MC_MBUF_BIGCL:
3087 case MC_MBUF_16KCL:
3088 break;
3089
3090 default:
3091 VERIFY(0);
3092 /* NOTREACHED */
3093 }
3094
3095 return (!mcache_bkt_isempty(m_cache(class)));
3096}
3097
3098/*
3099 * If possible, convert constructed objects to raw ones.
3100 */
3101static boolean_t
3102mbuf_steal(mbuf_class_t class, unsigned int num)
3103{
3104 mcache_obj_t *top = NULL;
3105 mcache_obj_t **list = &top;
3106 unsigned int tot = 0;
3107
3108 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3109
3110 switch (class) {
3111 case MC_MBUF:
3112 case MC_CL:
3113 case MC_BIGCL:
3114 case MC_16KCL:
3115 return (FALSE);
3116
3117 case MC_MBUF_CL:
3118 case MC_MBUF_BIGCL:
3119 case MC_MBUF_16KCL:
3120 /* Get the required number of constructed objects if possible */
3121 if (m_infree(class) > m_minlimit(class)) {
3122 tot = cslab_alloc(class, &list,
3123 MIN(num, m_infree(class)));
3124 }
3125
3126 /* And destroy them to get back the raw objects */
3127 if (top != NULL)
3128 (void) cslab_free(class, top, 1);
3129 break;
3130
3131 default:
3132 VERIFY(0);
3133 /* NOTREACHED */
3134 }
3135
3136 return (tot == num);
3137}
3138
3139static void
3140m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3141{
3142 int m, bmap = 0;
3143
3144 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3145
3146 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3147 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3148 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3149
3150 /*
3151 * This logic can be made smarter; for now, simply mark
3152 * all other related classes as potential victims.
3153 */
3154 switch (class) {
3155 case MC_MBUF:
3156 m_wantpurge(MC_CL)++;
6d2010ae 3157 m_wantpurge(MC_BIGCL)++;
2d21ac55
A
3158 m_wantpurge(MC_MBUF_CL)++;
3159 m_wantpurge(MC_MBUF_BIGCL)++;
3160 break;
3161
3162 case MC_CL:
3163 m_wantpurge(MC_MBUF)++;
6d2010ae
A
3164 m_wantpurge(MC_BIGCL)++;
3165 m_wantpurge(MC_MBUF_BIGCL)++;
2d21ac55
A
3166 if (!comp)
3167 m_wantpurge(MC_MBUF_CL)++;
3168 break;
3169
3170 case MC_BIGCL:
6d2010ae
A
3171 m_wantpurge(MC_MBUF)++;
3172 m_wantpurge(MC_CL)++;
3173 m_wantpurge(MC_MBUF_CL)++;
2d21ac55
A
3174 if (!comp)
3175 m_wantpurge(MC_MBUF_BIGCL)++;
3176 break;
3177
3178 case MC_16KCL:
3179 if (!comp)
3180 m_wantpurge(MC_MBUF_16KCL)++;
3181 break;
3182
3183 default:
3184 VERIFY(0);
3185 /* NOTREACHED */
3186 }
3187
3188 /*
3189 * Run through each marked class and check if we really need to
3190 * purge (and therefore temporarily disable) the per-CPU caches
3191 * layer used by the class. If so, remember the classes since
3192 * we are going to drop the lock below prior to purging.
3193 */
3194 for (m = 0; m < NELEM(mbuf_table); m++) {
3195 if (m_wantpurge(m) > 0) {
3196 m_wantpurge(m) = 0;
3197 /*
3198 * Try hard to steal the required number of objects
3199 * from the freelist of other mbuf classes. Only
3200 * purge and disable the per-CPU caches layer when
3201 * we don't have enough; it's the last resort.
3202 */
3203 if (!mbuf_steal(m, num))
3204 bmap |= (1 << m);
3205 }
3206 }
3207
3208 lck_mtx_unlock(mbuf_mlock);
3209
3210 if (bmap != 0) {
39236c6e
A
3211 /* signal the domains to drain */
3212 net_drain_domains();
2d21ac55
A
3213
3214 /* Sigh; we have no other choices but to ask mcache to purge */
3215 for (m = 0; m < NELEM(mbuf_table); m++) {
3216 if ((bmap & (1 << m)) &&
3217 mcache_purge_cache(m_cache(m))) {
3218 lck_mtx_lock(mbuf_mlock);
3219 m_purge_cnt(m)++;
3220 mbstat.m_drain++;
3221 lck_mtx_unlock(mbuf_mlock);
3222 }
3223 }
3224 } else {
3225 /*
3226 * Request mcache to reap extra elements from all of its caches;
3227 * note that all reaps are serialized and happen only at a fixed
3228 * interval.
3229 */
3230 mcache_reap();
3231 }
3232 lck_mtx_lock(mbuf_mlock);
3233}
3234
3235static inline struct mbuf *
3236m_get_common(int wait, short type, int hdr)
3237{
3238 struct mbuf *m;
3239 int mcflags = MSLEEPF(wait);
3240
3241 /* Is this due to a non-blocking retry? If so, then try harder */
3242 if (mcflags & MCR_NOSLEEP)
3243 mcflags |= MCR_TRYHARD;
3244
3245 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3246 if (m != NULL) {
3247 MBUF_INIT(m, hdr, type);
3248 mtype_stat_inc(type);
3249 mtype_stat_dec(MT_FREE);
3250#if CONFIG_MACF_NET
3251 if (hdr && mac_init_mbuf(m, wait) != 0) {
3252 m_free(m);
3253 return (NULL);
3254 }
3255#endif /* MAC_NET */
3256 }
3257 return (m);
3258}
3259
3260/*
3261 * Space allocation routines; these are also available as macros
3262 * for critical paths.
3263 */
3264#define _M_GET(wait, type) m_get_common(wait, type, 0)
3265#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3266#define _M_RETRY(wait, type) _M_GET(wait, type)
3267#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3268#define _MGET(m, how, type) ((m) = _M_GET(how, type))
3269#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3270
3271struct mbuf *
3272m_get(int wait, int type)
3273{
3274 return (_M_GET(wait, type));
3275}
3276
3277struct mbuf *
3278m_gethdr(int wait, int type)
3279{
3280 return (_M_GETHDR(wait, type));
3281}
3282
3283struct mbuf *
3284m_retry(int wait, int type)
3285{
3286 return (_M_RETRY(wait, type));
3287}
3288
3289struct mbuf *
3290m_retryhdr(int wait, int type)
3291{
3292 return (_M_RETRYHDR(wait, type));
3293}
3294
3295struct mbuf *
3296m_getclr(int wait, int type)
3297{
3298 struct mbuf *m;
3299
3300 _MGET(m, wait, type);
3301 if (m != NULL)
3302 bzero(MTOD(m, caddr_t), MLEN);
3303 return (m);
3304}
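/*
 * Editor's note: illustrative sketch, not part of the original file.  It
 * shows the typical allocate / fill / free cycle using only the routines
 * defined above and the standard <sys/mbuf.h> constants (M_DONTWAIT,
 * MT_DATA, MLEN); the 64-byte length is an arbitrary example value.
 */
static void
example_single_mbuf(void)
{
	struct mbuf *m;

	m = m_get(M_DONTWAIT, MT_DATA);		/* may return NULL */
	if (m == NULL)
		return;
	m->m_len = MIN(MLEN, 64);		/* claim part of the data area */
	bzero(MTOD(m, caddr_t), m->m_len);	/* fill it (zeros here) */
	/* ... consume or hand off the data ... */
	(void) m_free(m);			/* returns m->m_next (NULL here) */
}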
3305
3306struct mbuf *
3307m_free(struct mbuf *m)
3308{
3309 struct mbuf *n = m->m_next;
3310
3311 if (m->m_type == MT_FREE)
3312 panic("m_free: freeing an already freed mbuf");
3313
2d21ac55 3314 if (m->m_flags & M_PKTHDR) {
39236c6e
A
3315 /* Check for scratch area overflow */
3316 m_redzone_verify(m);
3317 /* Free the aux data and tags if there is any */
2d21ac55
A
3318 m_tag_delete_chain(m, NULL);
3319 }
3320
3321 if (m->m_flags & M_EXT) {
3322 u_int32_t refcnt;
6d2010ae 3323 u_int32_t composite;
2d21ac55
A
3324
3325 refcnt = m_decref(m);
6d2010ae
A
3326 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3327 if (refcnt == 0 && !composite) {
2d21ac55
A
3328 if (m->m_ext.ext_free == NULL) {
3329 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3330 } else if (m->m_ext.ext_free == m_bigfree) {
3331 mcache_free(m_cache(MC_BIGCL),
3332 m->m_ext.ext_buf);
3333 } else if (m->m_ext.ext_free == m_16kfree) {
3334 mcache_free(m_cache(MC_16KCL),
3335 m->m_ext.ext_buf);
3336 } else {
3337 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3338 m->m_ext.ext_size, m->m_ext.ext_arg);
3339 }
3340 mcache_free(ref_cache, MEXT_RFA(m));
3341 MEXT_RFA(m) = NULL;
6d2010ae 3342 } else if (refcnt == 0 && composite) {
2d21ac55
A
3343 VERIFY(m->m_type != MT_FREE);
3344
3345 mtype_stat_dec(m->m_type);
3346 mtype_stat_inc(MT_FREE);
3347
3348 m->m_type = MT_FREE;
3349 m->m_flags = M_EXT;
3350 m->m_len = 0;
3351 m->m_next = m->m_nextpkt = NULL;
3352
6d2010ae
A
3353 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3354
2d21ac55
A
3355 /* "Free" into the intermediate cache */
3356 if (m->m_ext.ext_free == NULL) {
3357 mcache_free(m_cache(MC_MBUF_CL), m);
3358 } else if (m->m_ext.ext_free == m_bigfree) {
3359 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3360 } else {
3361 VERIFY(m->m_ext.ext_free == m_16kfree);
3362 mcache_free(m_cache(MC_MBUF_16KCL), m);
3363 }
3364 return (n);
3365 }
3366 }
3367
3368 if (m->m_type != MT_FREE) {
3369 mtype_stat_dec(m->m_type);
3370 mtype_stat_inc(MT_FREE);
3371 }
3372
3373 m->m_type = MT_FREE;
3374 m->m_flags = m->m_len = 0;
3375 m->m_next = m->m_nextpkt = NULL;
3376
3377 mcache_free(m_cache(MC_MBUF), m);
3378
3379 return (n);
3380}
3381
3382__private_extern__ struct mbuf *
3383m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3384 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3385 int wait)
3386{
3387 struct ext_ref *rfa = NULL;
3388
3389 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3390 return (NULL);
3391
3392 if (m->m_flags & M_EXT) {
3393 u_int32_t refcnt;
6d2010ae 3394 u_int32_t composite;
2d21ac55
A
3395
3396 refcnt = m_decref(m);
6d2010ae
A
3397 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3398 if (refcnt == 0 && !composite) {
2d21ac55
A
3399 if (m->m_ext.ext_free == NULL) {
3400 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3401 } else if (m->m_ext.ext_free == m_bigfree) {
3402 mcache_free(m_cache(MC_BIGCL),
3403 m->m_ext.ext_buf);
3404 } else if (m->m_ext.ext_free == m_16kfree) {
3405 mcache_free(m_cache(MC_16KCL),
3406 m->m_ext.ext_buf);
3407 } else {
3408 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3409 m->m_ext.ext_size, m->m_ext.ext_arg);
3410 }
3411 /* Re-use the reference structure */
3412 rfa = MEXT_RFA(m);
6d2010ae 3413 } else if (refcnt == 0 && composite) {
2d21ac55
A
3414 VERIFY(m->m_type != MT_FREE);
3415
3416 mtype_stat_dec(m->m_type);
3417 mtype_stat_inc(MT_FREE);
3418
3419 m->m_type = MT_FREE;
3420 m->m_flags = M_EXT;
3421 m->m_len = 0;
3422 m->m_next = m->m_nextpkt = NULL;
6d2010ae
A
3423
3424 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3425
2d21ac55
A
3426 /* "Free" into the intermediate cache */
3427 if (m->m_ext.ext_free == NULL) {
3428 mcache_free(m_cache(MC_MBUF_CL), m);
3429 } else if (m->m_ext.ext_free == m_bigfree) {
3430 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3431 } else {
3432 VERIFY(m->m_ext.ext_free == m_16kfree);
3433 mcache_free(m_cache(MC_MBUF_16KCL), m);
3434 }
3435 /*
3436 * Allocate a new mbuf, since we didn't divorce
3437 * the composite mbuf + cluster pair above.
3438 */
3439 if ((m = _M_GETHDR(wait, type)) == NULL)
3440 return (NULL);
3441 }
3442 }
3443
3444 if (rfa == NULL &&
3445 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3446 m_free(m);
3447 return (NULL);
3448 }
3449
3450 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3451
3452 return (m);
3453}
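/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * buffer, its size and the free routine are hypothetical; the point shown
 * is that the callback must match the (caddr_t, u_int, caddr_t) signature
 * taken by m_clattach() above, and that passing a NULL mbuf makes the
 * routine allocate a packet header mbuf itself.
 */
static void
example_extfree(caddr_t buf, __unused u_int size, __unused caddr_t arg)
{
	/* return 'buf' to whatever allocator originally provided it */
}

static struct mbuf *
example_attach(caddr_t buf, u_int size, int wait)
{
	return (m_clattach(NULL, MT_DATA, buf, example_extfree, size,
	    NULL, wait));
}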
3454
b0d623f7
A
3455/*
3456 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3457 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3458 */
3459struct mbuf *
3460m_getcl(int wait, int type, int flags)
3461{
3462 struct mbuf *m;
3463 int mcflags = MSLEEPF(wait);
3464 int hdr = (flags & M_PKTHDR);
3465
3466 /* Is this due to a non-blocking retry? If so, then try harder */
3467 if (mcflags & MCR_NOSLEEP)
3468 mcflags |= MCR_TRYHARD;
3469
6d2010ae
A
3470 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3471 if (m != NULL) {
3472 u_int32_t flag;
3473 struct ext_ref *rfa;
3474 void *cl;
3475
3476 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3477 cl = m->m_ext.ext_buf;
3478 rfa = MEXT_RFA(m);
3479
3480 ASSERT(cl != NULL && rfa != NULL);
3481 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3482
3483 flag = MEXT_FLAGS(m);
3484
b0d623f7 3485 MBUF_INIT(m, hdr, type);
6d2010ae
A
3486 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3487
b0d623f7
A
3488 mtype_stat_inc(type);
3489 mtype_stat_dec(MT_FREE);
3490#if CONFIG_MACF_NET
3491 if (hdr && mac_init_mbuf(m, wait) != 0) {
6d2010ae 3492 m_freem(m);
b0d623f7
A
3493 return (NULL);
3494 }
3495#endif /* MAC_NET */
3496 }
3497 return (m);
3498}
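/*
 * Editor's note: illustrative sketch, not part of the original file.  It
 * shows a one-call allocation of an mbuf + 2KB cluster pair with a packet
 * header, as a driver receive path might use it.
 */
static struct mbuf *
example_rx_buffer(void)
{
	struct mbuf *m;

	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);	/* composite cache exhausted */
	/* M_EXT is set and m->m_data points into a 2KB cluster */
	return (m);
}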
3499
2d21ac55
A
3500/* m_mclget() adds an mbuf cluster to a normal mbuf */
3501struct mbuf *
3502m_mclget(struct mbuf *m, int wait)
3503{
3504 struct ext_ref *rfa;
3505
3506 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3507 return (m);
3508
3509 m->m_ext.ext_buf = m_mclalloc(wait);
3510 if (m->m_ext.ext_buf != NULL) {
3511 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3512 } else {
3513 mcache_free(ref_cache, rfa);
3514 }
3515 return (m);
3516}
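/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * classic two-step pattern: get an mbuf, then try to add a cluster.
 * m_mclget() returns the same mbuf whether or not the attach succeeded,
 * so M_EXT must be checked afterwards.
 */
static struct mbuf *
example_mbuf_with_cluster(int wait)
{
	struct mbuf *m;

	if ((m = m_gethdr(wait, MT_DATA)) == NULL)
		return (NULL);
	m = m_mclget(m, wait);
	if (!(m->m_flags & M_EXT)) {
		m_freem(m);	/* no cluster available */
		return (NULL);
	}
	return (m);
}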
3517
3518/* Allocate an mbuf cluster */
3519caddr_t
3520m_mclalloc(int wait)
3521{
3522 int mcflags = MSLEEPF(wait);
3523
3524 /* Is this due to a non-blocking retry? If so, then try harder */
3525 if (mcflags & MCR_NOSLEEP)
3526 mcflags |= MCR_TRYHARD;
3527
3528 return (mcache_alloc(m_cache(MC_CL), mcflags));
3529}
3530
3531/* Free an mbuf cluster */
3532void
3533m_mclfree(caddr_t p)
3534{
3535 mcache_free(m_cache(MC_CL), p);
3536}
3537
3538/*
3539 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
6d2010ae 3540 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
2d21ac55
A
3541 */
3542int
3543m_mclhasreference(struct mbuf *m)
3544{
3545 if (!(m->m_flags & M_EXT))
3546 return (0);
9bccf70c 3547
2d21ac55
A
3548 ASSERT(MEXT_RFA(m) != NULL);
3549
6d2010ae 3550 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
9bccf70c
A
3551}
3552
2d21ac55
A
3553__private_extern__ caddr_t
3554m_bigalloc(int wait)
9bccf70c 3555{
2d21ac55 3556 int mcflags = MSLEEPF(wait);
91447636 3557
2d21ac55
A
3558 /* Is this due to a non-blocking retry? If so, then try harder */
3559 if (mcflags & MCR_NOSLEEP)
3560 mcflags |= MCR_TRYHARD;
91447636 3561
2d21ac55 3562 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
9bccf70c
A
3563}
3564
2d21ac55
A
3565__private_extern__ void
3566m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
9bccf70c 3567{
2d21ac55 3568 mcache_free(m_cache(MC_BIGCL), p);
9bccf70c
A
3569}
3570
2d21ac55
A
3571/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3572__private_extern__ struct mbuf *
3573m_mbigget(struct mbuf *m, int wait)
3574{
3575 struct ext_ref *rfa;
3576
3577 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3578 return (m);
3579
3580 m->m_ext.ext_buf = m_bigalloc(wait);
3581 if (m->m_ext.ext_buf != NULL) {
3582 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
91447636 3583 } else {
2d21ac55 3584 mcache_free(ref_cache, rfa);
91447636 3585 }
2d21ac55
A
3586 return (m);
3587}
3588
3589__private_extern__ caddr_t
3590m_16kalloc(int wait)
3591{
3592 int mcflags = MSLEEPF(wait);
3593
3594 /* Is this due to a non-blocking retry? If so, then try harder */
3595 if (mcflags & MCR_NOSLEEP)
3596 mcflags |= MCR_TRYHARD;
3597
3598 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
91447636
A
3599}
3600
3601__private_extern__ void
2d21ac55 3602m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
91447636 3603{
2d21ac55 3604 mcache_free(m_cache(MC_16KCL), p);
91447636
A
3605}
3606
2d21ac55 3607/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
91447636 3608__private_extern__ struct mbuf *
2d21ac55 3609m_m16kget(struct mbuf *m, int wait)
91447636 3610{
2d21ac55
A
3611 struct ext_ref *rfa;
3612
3613 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3614 return (m);
3615
3616 m->m_ext.ext_buf = m_16kalloc(wait);
3617 if (m->m_ext.ext_buf != NULL) {
3618 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3619 } else {
3620 mcache_free(ref_cache, rfa);
91447636 3621 }
2d21ac55 3622 return (m);
91447636
A
3623}
3624
b0d623f7
A
3625/*
3626 * "Move" mbuf pkthdr from "from" to "to".
3627 * "from" must have M_PKTHDR set, and "to" must be empty.
3628 */
9bccf70c 3629void
2d21ac55 3630m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
9bccf70c 3631{
39236c6e
A
3632 VERIFY(from->m_flags & M_PKTHDR);
3633
3634 /* Check for scratch area overflow */
3635 m_redzone_verify(from);
3636
3637 if (to->m_flags & M_PKTHDR) {
3638 /* Check for scratch area overflow */
3639 m_redzone_verify(to);
3640 /* We will be taking over the tags of 'to' */
2d21ac55 3641 m_tag_delete_chain(to, NULL);
39236c6e 3642 }
2d21ac55 3643 to->m_pkthdr = from->m_pkthdr; /* especially tags */
39236c6e
A
3644 m_classifier_init(from, 0); /* purge classifier info */
3645 m_tag_init(from, 1); /* purge all tags from src */
3646 m_scratch_init(from); /* clear src scratch area */
935ed37a
A
3647 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3648 if ((to->m_flags & M_EXT) == 0)
3649 to->m_data = to->m_pktdat;
39236c6e 3650 m_redzone_init(to); /* setup red zone on dst */
9bccf70c
A
3651}
3652
91447636
A
3653/*
3654 * Duplicate "from"'s mbuf pkthdr in "to".
3655 * "from" must have M_PKTHDR set, and "to" must be empty.
3656 * In particular, this does a deep copy of the packet tags.
3657 */
3a60a9f5 3658static int
91447636
A
3659m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3660{
39236c6e
A
3661 VERIFY(from->m_flags & M_PKTHDR);
3662
3663 /* Check for scratch area overflow */
3664 m_redzone_verify(from);
3665
3666 if (to->m_flags & M_PKTHDR) {
3667 /* Check for scratch area overflow */
3668 m_redzone_verify(to);
3669 /* We will be taking over the tags of 'to' */
2d21ac55 3670 m_tag_delete_chain(to, NULL);
39236c6e 3671 }
2d21ac55
A
3672 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3673 if ((to->m_flags & M_EXT) == 0)
3674 to->m_data = to->m_pktdat;
3675 to->m_pkthdr = from->m_pkthdr;
39236c6e
A
3676 m_redzone_init(to); /* setup red zone on dst */
3677 m_tag_init(to, 0); /* preserve dst static tags */
2d21ac55 3678 return (m_tag_copy_chain(to, from, how));
91447636 3679}
fa4905b1 3680
316670eb
A
3681void
3682m_copy_pftag(struct mbuf *to, struct mbuf *from)
3683{
3684 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
39236c6e 3685#if PF_ECN
316670eb
A
3686 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3687 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
39236c6e
A
3688#endif /* PF_ECN */
3689}
3690
3691void
3692m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3693{
3694 VERIFY(m->m_flags & M_PKTHDR);
3695
3696 m->m_pkthdr.pkt_proto = 0;
3697 m->m_pkthdr.pkt_flowsrc = 0;
3698 m->m_pkthdr.pkt_flowid = 0;
3699 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
3700 /* preserve service class and interface info for loopback packets */
3701 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3702 (void) m_set_service_class(m, MBUF_SC_BE);
3703 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3704 m->m_pkthdr.pkt_ifainfo = 0;
3705#if MEASURE_BW
3706 m->m_pkthdr.pkt_bwseq = 0;
3707#endif /* MEASURE_BW */
3708}
3709
3710void
3711m_copy_classifier(struct mbuf *to, struct mbuf *from)
3712{
3713 VERIFY(to->m_flags & M_PKTHDR);
3714 VERIFY(from->m_flags & M_PKTHDR);
3715
3716 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3717 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3718 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3719 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3720 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3721 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
3722 to->m_pkthdr.ipsec_policy = from->m_pkthdr.ipsec_policy;
3723#if MEASURE_BW
3724 to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq;
3725#endif /* MEASURE_BW */
316670eb
A
3726}
3727
9bccf70c 3728/*
2d21ac55
A
3729 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3730 * if wantall is not set, return whatever number is available. Set up the
3731 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3732 * are chained on the m_nextpkt field. Any packets requested beyond this
3733 * are chained onto the last packet header's m_next field. The size of
3734 * the cluster is controlled by the parameter bufsize.
9bccf70c 3735 */
91447636 3736__private_extern__ struct mbuf *
2d21ac55
A
3737m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3738 int wait, int wantall, size_t bufsize)
fa4905b1
A
3739{
3740 struct mbuf *m;
3741 struct mbuf **np, *top;
2d21ac55
A
3742 unsigned int pnum, needed = *num_needed;
3743 mcache_obj_t *mp_list = NULL;
3744 int mcflags = MSLEEPF(wait);
3745 u_int32_t flag;
3746 struct ext_ref *rfa;
3747 mcache_t *cp;
3748 void *cl;
3749
3750 ASSERT(bufsize == m_maxsize(MC_CL) ||
3751 bufsize == m_maxsize(MC_BIGCL) ||
3752 bufsize == m_maxsize(MC_16KCL));
3753
3754 /*
3755 * Caller must first check for njcl because this
3756 * routine is internal and not exposed/used via KPI.
3757 */
3758 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3759
fa4905b1
A
3760 top = NULL;
3761 np = &top;
2d21ac55 3762 pnum = 0;
fa4905b1 3763
2d21ac55
A
3764 /*
3765 * The caller doesn't want all the requested buffers; only some.
3766 * Try hard to get what we can, but don't block. This effectively
3767 * overrides MCR_SLEEP, since this thread will not go to sleep
3768 * if we can't get all the buffers.
3769 */
3770 if (!wantall || (mcflags & MCR_NOSLEEP))
3771 mcflags |= MCR_TRYHARD;
3772
3773 /* Allocate the composite mbuf + cluster elements from the cache */
3774 if (bufsize == m_maxsize(MC_CL))
3775 cp = m_cache(MC_MBUF_CL);
3776 else if (bufsize == m_maxsize(MC_BIGCL))
3777 cp = m_cache(MC_MBUF_BIGCL);
3778 else
3779 cp = m_cache(MC_MBUF_16KCL);
3780 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3781
3782 for (pnum = 0; pnum < needed; pnum++) {
3783 m = (struct mbuf *)mp_list;
3784 mp_list = mp_list->obj_next;
3785
3786 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3787 cl = m->m_ext.ext_buf;
3788 rfa = MEXT_RFA(m);
3789
3790 ASSERT(cl != NULL && rfa != NULL);
3791 VERIFY(MBUF_IS_COMPOSITE(m));
3792
3793 flag = MEXT_FLAGS(m);
3794
3795 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3796 if (bufsize == m_maxsize(MC_16KCL)) {
3797 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3798 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3799 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
91447636 3800 } else {
2d21ac55
A
3801 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3802 }
3803
3804 if (num_with_pkthdrs > 0) {
3805 --num_with_pkthdrs;
3806#if CONFIG_MACF_NET
3807 if (mac_mbuf_label_init(m, wait) != 0) {
6d2010ae 3808 m_freem(m);
2d21ac55 3809 break;
91447636 3810 }
2d21ac55 3811#endif /* MAC_NET */
91447636 3812 }
2d21ac55
A
3813
3814 *np = m;
3815 if (num_with_pkthdrs > 0)
91447636
A
3816 np = &m->m_nextpkt;
3817 else
3818 np = &m->m_next;
3819 }
2d21ac55
A
3820 ASSERT(pnum != *num_needed || mp_list == NULL);
3821 if (mp_list != NULL)
3822 mcache_free_ext(cp, mp_list);
3823
3824 if (pnum > 0) {
3825 mtype_stat_add(MT_DATA, pnum);
3826 mtype_stat_sub(MT_FREE, pnum);
3827 }
3828
3829 if (wantall && (pnum != *num_needed)) {
3830 if (top != NULL)
3831 m_freem_list(top);
3832 return (NULL);
91447636 3833 }
fa4905b1 3834
316670eb
A
3835 if (pnum > *num_needed) {
3836 printf("%s: File a radar related to <rdar://10146739>. \
3837 needed = %u, pnum = %u, num_needed = %u \n",
3838 __func__, needed, pnum, *num_needed);
3839 }
3840
2d21ac55
A
3841 *num_needed = pnum;
3842 return (top);
3843}
fa4905b1 3844
91447636 3845/*
2d21ac55
A
3846 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3847 * wantall is not set, return whatever number is available. The size of
3848 * each mbuf in the list is controlled by the parameter packetlen. Each
3849 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3850 * in the chain is called a segment. If maxsegments is not NULL and the
3851 * value pointed to is not zero, it specifies the maximum number of segments
3852 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3853 * is zero, the caller does not have any restriction on the number of
3854 * segments. The actual number of segments of an mbuf chain is returned
3855 * in the value pointed to by maxsegments.
91447636 3856 */
91447636 3857__private_extern__ struct mbuf *
2d21ac55
A
3858m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3859 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
91447636 3860{
2d21ac55
A
3861 struct mbuf **np, *top, *first = NULL;
3862 size_t bufsize, r_bufsize;
3863 unsigned int num = 0;
3864 unsigned int nsegs = 0;
3865 unsigned int needed, resid;
3866 int mcflags = MSLEEPF(wait);
3867 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3868 mcache_t *cp = NULL, *rcp = NULL;
3869
3870 if (*numlist == 0)
3871 return (NULL);
fa4905b1 3872
91447636
A
3873 top = NULL;
3874 np = &top;
2d21ac55 3875
91447636 3876 if (wantsize == 0) {
2d21ac55 3877 if (packetlen <= MINCLSIZE) {
91447636 3878 bufsize = packetlen;
2d21ac55
A
3879 } else if (packetlen > m_maxsize(MC_CL)) {
3880 /* Use 4KB if jumbo cluster pool isn't available */
3881 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3882 bufsize = m_maxsize(MC_BIGCL);
3883 else
3884 bufsize = m_maxsize(MC_16KCL);
3885 } else {
3886 bufsize = m_maxsize(MC_CL);
3887 }
3888 } else if (wantsize == m_maxsize(MC_CL) ||
3889 wantsize == m_maxsize(MC_BIGCL) ||
3890 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
91447636 3891 bufsize = wantsize;
2d21ac55
A
3892 } else {
3893 return (NULL);
3894 }
91447636
A
3895
3896 if (bufsize <= MHLEN) {
2d21ac55 3897 nsegs = 1;
91447636
A
3898 } else if (bufsize <= MINCLSIZE) {
3899 if (maxsegments != NULL && *maxsegments == 1) {
2d21ac55
A
3900 bufsize = m_maxsize(MC_CL);
3901 nsegs = 1;
91447636 3902 } else {
2d21ac55 3903 nsegs = 2;
fa4905b1 3904 }
2d21ac55
A
3905 } else if (bufsize == m_maxsize(MC_16KCL)) {
3906 VERIFY(njcl > 0);
3907 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3908 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3909 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
91447636 3910 } else {
2d21ac55 3911 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
91447636
A
3912 }
3913 if (maxsegments != NULL) {
2d21ac55
A
3914 if (*maxsegments && nsegs > *maxsegments) {
3915 *maxsegments = nsegs;
3916 return (NULL);
91447636 3917 }
2d21ac55 3918 *maxsegments = nsegs;
91447636 3919 }
91447636 3920
2d21ac55
A
3921 /*
3922 * The caller doesn't want all the requested buffers; only some.
3923 * Try hard to get what we can, but don't block. This effectively
3924 * overrides MCR_SLEEP, since this thread will not go to sleep
3925 * if we can't get all the buffers.
3926 */
3927 if (!wantall || (mcflags & MCR_NOSLEEP))
3928 mcflags |= MCR_TRYHARD;
3929
3930 /*
3931 * Simple case where all elements in the lists/chains are mbufs.
3932 * Unless bufsize is greater than MHLEN, each segment chain is made
3933 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3934 * of 2 mbufs; the second one is used for the residual data, i.e.
3935 * the remaining data that cannot fit into the first mbuf.
3936 */
3937 if (bufsize <= MINCLSIZE) {
3938 /* Allocate the elements in one shot from the mbuf cache */
3939 ASSERT(bufsize <= MHLEN || nsegs == 2);
3940 cp = m_cache(MC_MBUF);
3941 needed = mcache_alloc_ext(cp, &mp_list,
3942 (*numlist) * nsegs, mcflags);
3943
3944 /*
3945 * The number of elements must be even if we are to use an
3946 * mbuf (instead of a cluster) to store the residual data.
3947 * If we couldn't allocate the requested number of mbufs,
3948 * trim the number down (if it's odd) in order to avoid
3949 * creating a partial segment chain.
3950 */
3951 if (bufsize > MHLEN && (needed & 0x1))
3952 needed--;
91447636 3953
2d21ac55
A
3954 while (num < needed) {
3955 struct mbuf *m;
91447636 3956
2d21ac55
A
3957 m = (struct mbuf *)mp_list;
3958 mp_list = mp_list->obj_next;
3959 ASSERT(m != NULL);
91447636 3960
2d21ac55
A
3961 MBUF_INIT(m, 1, MT_DATA);
3962#if CONFIG_MACF_NET
3963 if (mac_init_mbuf(m, wait) != 0) {
3964 m_free(m);
3965 break;
91447636 3966 }
2d21ac55
A
3967#endif /* MAC_NET */
3968 num++;
3969 if (bufsize > MHLEN) {
3970 /* A second mbuf for this segment chain */
3971 m->m_next = (struct mbuf *)mp_list;
3972 mp_list = mp_list->obj_next;
3973 ASSERT(m->m_next != NULL);
3974
3975 MBUF_INIT(m->m_next, 0, MT_DATA);
3976 num++;
91447636 3977 }
2d21ac55
A
3978 *np = m;
3979 np = &m->m_nextpkt;
3980 }
3981 ASSERT(num != *numlist || mp_list == NULL);
3982
3983 if (num > 0) {
3984 mtype_stat_add(MT_DATA, num);
3985 mtype_stat_sub(MT_FREE, num);
3986 }
3987 num /= nsegs;
3988
3989 /* We've got them all; return to caller */
3990 if (num == *numlist)
3991 return (top);
3992
3993 goto fail;
3994 }
3995
3996 /*
3997 * Complex cases where elements are made up of one or more composite
3998 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3999 * be illustrated as follows:
4000 *
4001 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4002 *
4003 * Every composite mbuf + cluster element comes from the intermediate
4004 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4005 * the last composite element will come from the MC_MBUF_CL cache,
4006 * unless the residual data is larger than 2KB where we use the
4007 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4008 * data is defined as extra data beyond the first element that cannot
4009 * fit into the previous element, i.e. there is no residual data if
4010 * the chain only has 1 segment.
4011 */
4012 r_bufsize = bufsize;
4013 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4014 if (resid > 0) {
4015 /* There is residual data; figure out the cluster size */
4016 if (wantsize == 0 && packetlen > MINCLSIZE) {
4017 /*
4018 * Caller didn't request that all of the segments
4019 * in the chain use the same cluster size; use the
4020 * smaller of the cluster sizes.
4021 */
4022 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4023 r_bufsize = m_maxsize(MC_16KCL);
4024 else if (resid > m_maxsize(MC_CL))
4025 r_bufsize = m_maxsize(MC_BIGCL);
4026 else
4027 r_bufsize = m_maxsize(MC_CL);
4028 } else {
4029 /* Use the same cluster size as the other segments */
4030 resid = 0;
4031 }
4032 }
4033
4034 needed = *numlist;
4035 if (resid > 0) {
4036 /*
4037 * Attempt to allocate composite mbuf + cluster elements for
4038 * the residual data in each chain; record the number of such
4039 * elements that can be allocated so that we know how many
4040 * segment chains we can afford to create.
4041 */
4042 if (r_bufsize <= m_maxsize(MC_CL))
4043 rcp = m_cache(MC_MBUF_CL);
4044 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4045 rcp = m_cache(MC_MBUF_BIGCL);
4046 else
4047 rcp = m_cache(MC_MBUF_16KCL);
4048 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4049
4050 if (needed == 0)
4051 goto fail;
4052
4053 /* This is temporarily reduced for calculation */
4054 ASSERT(nsegs > 1);
4055 nsegs--;
4056 }
4057
4058 /*
4059 * Attempt to allocate the rest of the composite mbuf + cluster
4060 * elements for the number of segment chains that we need.
4061 */
4062 if (bufsize <= m_maxsize(MC_CL))
4063 cp = m_cache(MC_MBUF_CL);
4064 else if (bufsize <= m_maxsize(MC_BIGCL))
4065 cp = m_cache(MC_MBUF_BIGCL);
4066 else
4067 cp = m_cache(MC_MBUF_16KCL);
4068 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4069
4070 /* Round it down to avoid creating a partial segment chain */
4071 needed = (needed / nsegs) * nsegs;
4072 if (needed == 0)
4073 goto fail;
4074
4075 if (resid > 0) {
4076 /*
4077 * We're about to construct the chain(s); take into account
4078 * the number of segments we have created above to hold the
4079 * residual data for each chain, as well as restore the
4080 * original count of segments per chain.
4081 */
4082 ASSERT(nsegs > 0);
4083 needed += needed / nsegs;
4084 nsegs++;
4085 }
4086
4087 for (;;) {
4088 struct mbuf *m;
4089 u_int32_t flag;
4090 struct ext_ref *rfa;
4091 void *cl;
4092 int pkthdr;
4093
4094 ++num;
4095 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4096 m = (struct mbuf *)mp_list;
4097 mp_list = mp_list->obj_next;
4098 } else {
4099 m = (struct mbuf *)rmp_list;
4100 rmp_list = rmp_list->obj_next;
4101 }
4102 ASSERT(m != NULL);
4103 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4104 VERIFY(m->m_ext.ext_free == NULL ||
4105 m->m_ext.ext_free == m_bigfree ||
4106 m->m_ext.ext_free == m_16kfree);
4107
4108 cl = m->m_ext.ext_buf;
4109 rfa = MEXT_RFA(m);
4110
4111 ASSERT(cl != NULL && rfa != NULL);
4112 VERIFY(MBUF_IS_COMPOSITE(m));
4113
4114 flag = MEXT_FLAGS(m);
4115
4116 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4117 if (pkthdr)
4118 first = m;
4119 MBUF_INIT(m, pkthdr, MT_DATA);
4120 if (m->m_ext.ext_free == m_16kfree) {
4121 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4122 } else if (m->m_ext.ext_free == m_bigfree) {
4123 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4124 } else {
4125 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4126 }
4127#if CONFIG_MACF_NET
4128 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4129 --num;
6d2010ae 4130 m_freem(m);
2d21ac55 4131 break;
91447636 4132 }
2d21ac55
A
4133#endif /* MAC_NET */
4134
4135 *np = m;
4136 if ((num % nsegs) == 0)
4137 np = &first->m_nextpkt;
4138 else
4139 np = &m->m_next;
4140
4141 if (num == needed)
4142 break;
4143 }
4144
4145 if (num > 0) {
4146 mtype_stat_add(MT_DATA, num);
4147 mtype_stat_sub(MT_FREE, num);
91447636 4148 }
2d21ac55
A
4149
4150 num /= nsegs;
4151
4152 /* We've got them all; return to caller */
4153 if (num == *numlist) {
4154 ASSERT(mp_list == NULL && rmp_list == NULL);
4155 return (top);
4156 }
4157
91447636 4158fail:
2d21ac55
A
4159 /* Free up what's left of the above */
4160 if (mp_list != NULL)
4161 mcache_free_ext(cp, mp_list);
4162 if (rmp_list != NULL)
4163 mcache_free_ext(rcp, rmp_list);
4164 if (wantall && top != NULL) {
91447636 4165 m_freem(top);
2d21ac55 4166 return (NULL);
91447636 4167 }
2d21ac55
A
4168 *numlist = num;
4169 return (top);
91447636 4170}
fa4905b1 4171
2d21ac55
A
4172/*
4173 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4174 * packets on the receive ring.
91447636
A
4175 */
4176__private_extern__ struct mbuf *
2d21ac55 4177m_getpacket_how(int wait)
91447636
A
4178{
4179 unsigned int num_needed = 1;
2d21ac55
A
4180
4181 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4182 m_maxsize(MC_CL)));
91447636 4183}
fa4905b1 4184
2d21ac55
A
4185/*
4186 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4187 * packets on the receive ring.
91447636
A
4188 */
4189struct mbuf *
4190m_getpacket(void)
4191{
4192 unsigned int num_needed = 1;
9bccf70c 4193
2d21ac55
A
4194 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4195 m_maxsize(MC_CL)));
91447636 4196}
fa4905b1 4197
91447636 4198/*
2d21ac55
A
4199 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4200 * if this can't be met, return whatever number is available. Set up the
4201 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4202 * are chained on the m_nextpkt field. Any packets requested beyond this are
4203 * chained onto the last packet header's m_next field.
91447636
A
4204 */
4205struct mbuf *
4206m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4207{
4208 unsigned int n = num_needed;
fa4905b1 4209
2d21ac55
A
4210 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4211 m_maxsize(MC_CL)));
4212}
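/*
 * Editor's note: illustrative sketch, not part of the original file.  It
 * requests a small batch of cluster-backed packet headers and accepts
 * fewer than asked for; the batch size of 8 is an arbitrary example.
 */
static struct mbuf *
example_rx_ring_fill(int how)
{
	struct mbuf *list;

	/* packets are chained via m_nextpkt, as described above */
	list = m_getpackets(8, 8, how);
	return (list);		/* may be NULL or shorter than requested */
}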
fa4905b1 4213
9bccf70c 4214/*
2d21ac55
A
4215 * Return a list of mbuf hdrs set up as packet hdrs chained together
4216 * on the m_nextpkt field
9bccf70c 4217 */
fa4905b1
A
4218struct mbuf *
4219m_getpackethdrs(int num_needed, int how)
4220{
4221 struct mbuf *m;
4222 struct mbuf **np, *top;
4223
4224 top = NULL;
4225 np = &top;
4226
fa4905b1 4227 while (num_needed--) {
2d21ac55
A
4228 m = _M_RETRYHDR(how, MT_DATA);
4229 if (m == NULL)
4230 break;
4231
4232 *np = m;
4233 np = &m->m_nextpkt;
4234 }
fa4905b1
A
4235
4236 return (top);
4237}
4238
2d21ac55
A
4239/*
4240 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4241 * of packets freed. Used by the drivers.
1c79356b 4242 */
2d21ac55
A
4243int
4244m_freem_list(struct mbuf *m)
1c79356b
A
4245{
4246 struct mbuf *nextpkt;
2d21ac55
A
4247 mcache_obj_t *mp_list = NULL;
4248 mcache_obj_t *mcl_list = NULL;
4249 mcache_obj_t *mbc_list = NULL;
4250 mcache_obj_t *m16k_list = NULL;
4251 mcache_obj_t *m_mcl_list = NULL;
4252 mcache_obj_t *m_mbc_list = NULL;
4253 mcache_obj_t *m_m16k_list = NULL;
4254 mcache_obj_t *ref_list = NULL;
4255 int pktcount = 0;
4256 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4257
4258 while (m != NULL) {
4259 pktcount++;
4260
4261 nextpkt = m->m_nextpkt;
4262 m->m_nextpkt = NULL;
4263
4264 while (m != NULL) {
4265 struct mbuf *next = m->m_next;
4266 mcache_obj_t *o, *rfa;
6d2010ae 4267 u_int32_t refcnt, composite;
fa4905b1 4268
2d21ac55
A
4269 if (m->m_type == MT_FREE)
4270 panic("m_free: freeing an already freed mbuf");
9bccf70c 4271
2d21ac55
A
4272 if (m->m_type != MT_FREE)
4273 mt_free++;
91447636 4274
2d21ac55 4275 if (m->m_flags & M_PKTHDR) {
39236c6e
A
4276 /* Check for scratch area overflow */
4277 m_redzone_verify(m);
4278 /* Free the aux data and tags if there is any */
91447636 4279 m_tag_delete_chain(m, NULL);
91447636 4280 }
9bccf70c 4281
2d21ac55
A
4282 if (!(m->m_flags & M_EXT))
4283 goto simple_free;
4284
316670eb 4285 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
2d21ac55 4286 refcnt = m_decref(m);
6d2010ae
A
4287 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4288 if (refcnt == 0 && !composite) {
2d21ac55
A
4289 if (m->m_ext.ext_free == NULL) {
4290 o->obj_next = mcl_list;
4291 mcl_list = o;
4292 } else if (m->m_ext.ext_free == m_bigfree) {
4293 o->obj_next = mbc_list;
4294 mbc_list = o;
4295 } else if (m->m_ext.ext_free == m_16kfree) {
4296 o->obj_next = m16k_list;
4297 m16k_list = o;
4298 } else {
4299 (*(m->m_ext.ext_free))((caddr_t)o,
4300 m->m_ext.ext_size,
4301 m->m_ext.ext_arg);
4302 }
316670eb 4303 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2d21ac55
A
4304 rfa->obj_next = ref_list;
4305 ref_list = rfa;
4306 MEXT_RFA(m) = NULL;
6d2010ae 4307 } else if (refcnt == 0 && composite) {
2d21ac55
A
4308 VERIFY(m->m_type != MT_FREE);
4309 /*
4310 * Amortize the costs of atomic operations
4311 * by doing them at the end, if possible.
4312 */
4313 if (m->m_type == MT_DATA)
4314 mt_data++;
4315 else if (m->m_type == MT_HEADER)
4316 mt_header++;
4317 else if (m->m_type == MT_SONAME)
4318 mt_soname++;
4319 else if (m->m_type == MT_TAG)
4320 mt_tag++;
4321 else
4322 mtype_stat_dec(m->m_type);
fa4905b1 4323
2d21ac55
A
4324 m->m_type = MT_FREE;
4325 m->m_flags = M_EXT;
4326 m->m_len = 0;
4327 m->m_next = m->m_nextpkt = NULL;
4328
6d2010ae
A
4329 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4330
2d21ac55
A
4331 /* "Free" into the intermediate cache */
4332 o = (mcache_obj_t *)m;
4333 if (m->m_ext.ext_free == NULL) {
4334 o->obj_next = m_mcl_list;
4335 m_mcl_list = o;
4336 } else if (m->m_ext.ext_free == m_bigfree) {
4337 o->obj_next = m_mbc_list;
4338 m_mbc_list = o;
1c79356b 4339 } else {
2d21ac55
A
4340 VERIFY(m->m_ext.ext_free == m_16kfree);
4341 o->obj_next = m_m16k_list;
4342 m_m16k_list = o;
1c79356b 4343 }
2d21ac55
A
4344 m = next;
4345 continue;
1c79356b 4346 }
2d21ac55
A
4347simple_free:
4348 /*
4349 * Amortize the costs of atomic operations
4350 * by doing them at the end, if possible.
4351 */
4352 if (m->m_type == MT_DATA)
4353 mt_data++;
4354 else if (m->m_type == MT_HEADER)
4355 mt_header++;
4356 else if (m->m_type == MT_SONAME)
4357 mt_soname++;
4358 else if (m->m_type == MT_TAG)
4359 mt_tag++;
4360 else if (m->m_type != MT_FREE)
4361 mtype_stat_dec(m->m_type);
4362
1c79356b 4363 m->m_type = MT_FREE;
2d21ac55
A
4364 m->m_flags = m->m_len = 0;
4365 m->m_next = m->m_nextpkt = NULL;
fa4905b1 4366
2d21ac55
A
4367 ((mcache_obj_t *)m)->obj_next = mp_list;
4368 mp_list = (mcache_obj_t *)m;
4369
4370 m = next;
4371 }
fa4905b1 4372
2d21ac55
A
4373 m = nextpkt;
4374 }
fa4905b1 4375
2d21ac55
A
4376 if (mt_free > 0)
4377 mtype_stat_add(MT_FREE, mt_free);
4378 if (mt_data > 0)
4379 mtype_stat_sub(MT_DATA, mt_data);
4380 if (mt_header > 0)
4381 mtype_stat_sub(MT_HEADER, mt_header);
4382 if (mt_soname > 0)
4383 mtype_stat_sub(MT_SONAME, mt_soname);
4384 if (mt_tag > 0)
4385 mtype_stat_sub(MT_TAG, mt_tag);
4386
4387 if (mp_list != NULL)
4388 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4389 if (mcl_list != NULL)
4390 mcache_free_ext(m_cache(MC_CL), mcl_list);
4391 if (mbc_list != NULL)
4392 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4393 if (m16k_list != NULL)
4394 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4395 if (m_mcl_list != NULL)
4396 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4397 if (m_mbc_list != NULL)
4398 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4399 if (m_m16k_list != NULL)
4400 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4401 if (ref_list != NULL)
4402 mcache_free_ext(ref_cache, ref_list);
4403
4404 return (pktcount);
1c79356b
A
4405}
4406
4407void
2d21ac55 4408m_freem(struct mbuf *m)
1c79356b 4409{
2d21ac55 4410 while (m != NULL)
1c79356b
A
4411 m = m_free(m);
4412}
4413
4414/*
4415 * Mbuffer utility routines.
4416 */
2d21ac55 4417
1c79356b 4418/*
2d21ac55
A
4419 * Compute the amount of space available before the current start
4420 * of data in an mbuf.
1c79356b 4421 */
91447636 4422int
2d21ac55 4423m_leadingspace(struct mbuf *m)
1c79356b
A
4424{
4425 if (m->m_flags & M_EXT) {
4426 if (MCLHASREFERENCE(m))
2d21ac55 4427 return (0);
1c79356b
A
4428 return (m->m_data - m->m_ext.ext_buf);
4429 }
4430 if (m->m_flags & M_PKTHDR)
4431 return (m->m_data - m->m_pktdat);
4432 return (m->m_data - m->m_dat);
4433}
4434
4435/*
2d21ac55 4436 * Compute the amount of space available after the end of data in an mbuf.
1c79356b 4437 */
91447636 4438int
2d21ac55 4439m_trailingspace(struct mbuf *m)
1c79356b
A
4440{
4441 if (m->m_flags & M_EXT) {
4442 if (MCLHASREFERENCE(m))
2d21ac55 4443 return (0);
1c79356b 4444 return (m->m_ext.ext_buf + m->m_ext.ext_size -
2d21ac55 4445 (m->m_data + m->m_len));
1c79356b
A
4446 }
4447 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4448}
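/*
 * Editor's note: illustrative sketch, not part of the original file.  It
 * checks the trailing space before appending data in place; if the mbuf
 * heads a packet, the caller must also adjust m_pkthdr.len.
 */
static int
example_append(struct mbuf *m, const void *data, int len)
{
	if (m_trailingspace(m) < len)
		return (0);	/* caller must grow the chain instead */
	bcopy(data, MTOD(m, caddr_t) + m->m_len, len);
	m->m_len += len;
	return (1);
}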
4449
4450/*
2d21ac55
A
4451 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4452 * copy junk along. Does not adjust packet header length.
1c79356b
A
4453 */
4454struct mbuf *
2d21ac55 4455m_prepend(struct mbuf *m, int len, int how)
1c79356b
A
4456{
4457 struct mbuf *mn;
4458
2d21ac55
A
4459 _MGET(mn, how, m->m_type);
4460 if (mn == NULL) {
1c79356b 4461 m_freem(m);
2d21ac55 4462 return (NULL);
1c79356b
A
4463 }
4464 if (m->m_flags & M_PKTHDR) {
4465 M_COPY_PKTHDR(mn, m);
4466 m->m_flags &= ~M_PKTHDR;
4467 }
4468 mn->m_next = m;
4469 m = mn;
4470 if (len < MHLEN)
4471 MH_ALIGN(m, len);
4472 m->m_len = len;
4473 return (m);
4474}
4475
9bccf70c 4476/*
2d21ac55
A
4477 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4478 * chain, copy junk along, and adjust length.
9bccf70c
A
4479 */
4480struct mbuf *
2d21ac55
A
4481m_prepend_2(struct mbuf *m, int len, int how)
4482{
4483 if (M_LEADINGSPACE(m) >= len) {
4484 m->m_data -= len;
4485 m->m_len += len;
4486 } else {
9bccf70c 4487 m = m_prepend(m, len, how);
2d21ac55
A
4488 }
4489 if ((m) && (m->m_flags & M_PKTHDR))
4490 m->m_pkthdr.len += len;
4491 return (m);
9bccf70c
A
4492}
4493
1c79356b
A
4494/*
4495 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4496 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4497 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4498 */
4499int MCFail;
4500
4501struct mbuf *
39236c6e 4502m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
1c79356b 4503{
2d21ac55 4504 struct mbuf *n, *mhdr = NULL, **np;
91447636 4505 int off = off0;
1c79356b
A
4506 struct mbuf *top;
4507 int copyhdr = 0;
4508
4509 if (off < 0 || len < 0)
2d21ac55
A
4510 panic("m_copym: invalid offset %d or len %d", off, len);
4511
4512 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4513 mhdr = m;
1c79356b 4514 copyhdr = 1;
2d21ac55 4515 }
fa4905b1
A
4516
4517 while (off >= m->m_len) {
2d21ac55
A
4518 if (m->m_next == NULL)
4519 panic("m_copym: invalid mbuf chain");
1c79356b
A
4520 off -= m->m_len;
4521 m = m->m_next;
4522 }
4523 np = &top;
2d21ac55 4524 top = NULL;
fa4905b1 4525
1c79356b 4526 while (len > 0) {
2d21ac55 4527 if (m == NULL) {
1c79356b 4528 if (len != M_COPYALL)
2d21ac55 4529 panic("m_copym: len != M_COPYALL");
1c79356b
A
4530 break;
4531 }
2d21ac55
A
4532
4533 n = _M_RETRY(wait, m->m_type);
1c79356b 4534 *np = n;
fa4905b1 4535
2d21ac55 4536 if (n == NULL)
1c79356b 4537 goto nospace;
2d21ac55
A
4538
4539 if (copyhdr != 0) {
39236c6e
A
4540 if (mode == M_COPYM_MOVE_HDR) {
4541 M_COPY_PKTHDR(n, mhdr);
4542 } else if (mode == M_COPYM_COPY_HDR) {
4543 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4544 goto nospace;
4545 }
1c79356b
A
4546 if (len == M_COPYALL)
4547 n->m_pkthdr.len -= off0;
4548 else
4549 n->m_pkthdr.len = len;
4550 copyhdr = 0;
4551 }
4552 if (len == M_COPYALL) {
2d21ac55 4553 if (MIN(len, (m->m_len - off)) == len) {
b0d623f7 4554 printf("m->m_len %d - off %d = %d, %d\n",
2d21ac55
A
4555 m->m_len, off, m->m_len - off,
4556 MIN(len, (m->m_len - off)));
4557 }
1c79356b 4558 }
2d21ac55 4559 n->m_len = MIN(len, (m->m_len - off));
1c79356b 4560 if (n->m_len == M_COPYALL) {
2d21ac55
A
4561 printf("n->m_len == M_COPYALL, fixing\n");
4562 n->m_len = MHLEN;
1c79356b
A
4563 }
4564 if (m->m_flags & M_EXT) {
1c79356b 4565 n->m_ext = m->m_ext;
2d21ac55 4566 m_incref(m);
1c79356b
A
4567 n->m_data = m->m_data + off;
4568 n->m_flags |= M_EXT;
fa4905b1 4569 } else {
2d21ac55 4570 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
1c79356b 4571 (unsigned)n->m_len);
fa4905b1 4572 }
1c79356b
A
4573 if (len != M_COPYALL)
4574 len -= n->m_len;
4575 off = 0;
4576 m = m->m_next;
4577 np = &n->m_next;
4578 }
fa4905b1 4579
2d21ac55 4580 if (top == NULL)
1c79356b 4581 MCFail++;
fa4905b1 4582
1c79356b
A
4583 return (top);
4584nospace:
fa4905b1 4585
1c79356b
A
4586 m_freem(top);
4587 MCFail++;
2d21ac55 4588 return (NULL);
1c79356b
A
4589}
4590
39236c6e
A
4591
4592struct mbuf *
4593m_copym(struct mbuf *m, int off0, int len, int wait)
4594{
4595 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4596}
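/*
 * Editor's note: illustrative sketch, not part of the original file.  It
 * takes a copy of an entire packet; for cluster-backed mbufs m_copym()
 * only bumps the cluster reference count, so the data is shared (and
 * effectively read-only) until one side is freed.
 */
static struct mbuf *
example_copy_packet(struct mbuf *m, int wait)
{
	return (m_copym(m, 0, M_COPYALL, wait));	/* NULL on failure */
}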
4597
9bccf70c 4598/*
2d21ac55
A
4599 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4600 * within this routine. Also, the last mbuf and offset accessed are passed
4601 * out and can be passed back in to avoid having to rescan the entire mbuf
4602 * list (normally hung off of the socket).
9bccf70c 4603 */
fa4905b1 4604struct mbuf *
2d21ac55 4605m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
39236c6e 4606 struct mbuf **m_lastm, int *m_off, uint32_t mode)
2d21ac55
A
4607{
4608 struct mbuf *n, **np = NULL;
4609 int off = off0, len = len0;
4610 struct mbuf *top = NULL;
4611 int mcflags = MSLEEPF(wait);
fa4905b1 4612 int copyhdr = 0;
2d21ac55
A
4613 int type = 0;
4614 mcache_obj_t *list = NULL;
4615 int needed = 0;
fa4905b1 4616
2d21ac55 4617 if (off == 0 && (m->m_flags & M_PKTHDR))
fa4905b1
A
4618 copyhdr = 1;
4619
6d2010ae
A
4620 if (*m_lastm != NULL) {
4621 m = *m_lastm;
fa4905b1
A
4622 off = *m_off;
4623 } else {
2d21ac55
A
4624 while (off >= m->m_len) {
4625 off -= m->m_len;
fa4905b1
A
4626 m = m->m_next;
4627 }
4628 }
91447636 4629
2d21ac55
A
4630 n = m;
4631 while (len > 0) {
4632 needed++;
4633 ASSERT(n != NULL);
4634 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4635 n = n->m_next;
4636 }
4637 needed++;
4638 len = len0;
4639
4640 /*
4641 * If the caller doesn't want to be put to sleep, mark it with
4642 * MCR_TRYHARD so that we may reclaim buffers from other places
4643 * before giving up.
4644 */
4645 if (mcflags & MCR_NOSLEEP)
4646 mcflags |= MCR_TRYHARD;
4647
4648 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4649 mcflags) != needed)
4650 goto nospace;
fa4905b1 4651
2d21ac55 4652 needed = 0;
fa4905b1 4653 while (len > 0) {
2d21ac55
A
4654 n = (struct mbuf *)list;
4655 list = list->obj_next;
4656 ASSERT(n != NULL && m != NULL);
4657
4658 type = (top == NULL) ? MT_HEADER : m->m_type;
4659 MBUF_INIT(n, (top == NULL), type);
4660#if CONFIG_MACF_NET
4661 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4662 mtype_stat_inc(MT_HEADER);
4663 mtype_stat_dec(MT_FREE);
4664 m_free(n);
fa4905b1 4665 goto nospace;
2d21ac55
A
4666 }
4667#endif /* MAC_NET */
4668
4669 if (top == NULL) {
4670 top = n;
fa4905b1
A
4671 np = &top->m_next;
4672 continue;
2d21ac55
A
4673 } else {
4674 needed++;
4675 *np = n;
4676 }
fa4905b1
A
4677
4678 if (copyhdr) {
39236c6e
A
4679 if (mode == M_COPYM_MOVE_HDR) {
4680 M_COPY_PKTHDR(n, m);
4681 } else if (mode == M_COPYM_COPY_HDR) {
4682 if (m_dup_pkthdr(n, m, wait) == 0)
4683 goto nospace;
4684 }
fa4905b1
A
4685 n->m_pkthdr.len = len;
4686 copyhdr = 0;
4687 }
2d21ac55 4688 n->m_len = MIN(len, (m->m_len - off));
fa4905b1
A
4689
4690 if (m->m_flags & M_EXT) {
4691 n->m_ext = m->m_ext;
2d21ac55 4692 m_incref(m);
fa4905b1
A
4693 n->m_data = m->m_data + off;
4694 n->m_flags |= M_EXT;
4695 } else {
2d21ac55 4696 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
fa4905b1
A
4697 (unsigned)n->m_len);
4698 }
4699 len -= n->m_len;
2d21ac55 4700
fa4905b1 4701 if (len == 0) {
2d21ac55 4702 if ((off + n->m_len) == m->m_len) {
6d2010ae 4703 *m_lastm = m->m_next;
2d21ac55 4704 *m_off = 0;
fa4905b1 4705 } else {
6d2010ae 4706 *m_lastm = m;
2d21ac55 4707 *m_off = off + n->m_len;
fa4905b1 4708 }
2d21ac55 4709 break;
fa4905b1
A
4710 }
4711 off = 0;
4712 m = m->m_next;
4713 np = &n->m_next;
4714 }
fa4905b1 4715
2d21ac55
A
4716 mtype_stat_inc(MT_HEADER);
4717 mtype_stat_add(type, needed);
4718 mtype_stat_sub(MT_FREE, needed + 1);
4719
4720 ASSERT(list == NULL);
fa4905b1 4721 return (top);
fa4905b1 4722
2d21ac55
A
4723nospace:
4724 if (list != NULL)
4725 mcache_free_ext(m_cache(MC_MBUF), list);
4726 if (top != NULL)
4727 m_freem(top);
fa4905b1 4728 MCFail++;
2d21ac55 4729 return (NULL);
fa4905b1
A
4730}
4731
1c79356b
A
4732/*
4733 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4734 * continuing for "len" bytes, into the indicated buffer.
4735 */
2d21ac55 4736void
b0d623f7 4737m_copydata(struct mbuf *m, int off, int len, void *vp)
1c79356b 4738{
91447636 4739 unsigned count;
b0d623f7 4740 char *cp = vp;
1c79356b
A
4741
4742 if (off < 0 || len < 0)
2d21ac55
A
4743 panic("m_copydata: invalid offset %d or len %d", off, len);
4744
1c79356b 4745 while (off > 0) {
2d21ac55
A
4746 if (m == NULL)
4747 panic("m_copydata: invalid mbuf chain");
1c79356b
A
4748 if (off < m->m_len)
4749 break;
4750 off -= m->m_len;
4751 m = m->m_next;
4752 }
4753 while (len > 0) {
2d21ac55
A
4754 if (m == NULL)
4755 panic("m_copydata: invalid mbuf chain");
4756 count = MIN(m->m_len - off, len);
4757 bcopy(MTOD(m, caddr_t) + off, cp, count);
1c79356b
A
4758 len -= count;
4759 cp += count;
4760 off = 0;
4761 m = m->m_next;
4762 }
4763}
4764
4765/*
2d21ac55
A
4766 * Concatenate mbuf chain n to m. Both chains must be of the same type
4767 * (e.g. MT_DATA). Any m_pkthdr is not updated.
1c79356b 4768 */
2d21ac55
A
4769void
4770m_cat(struct mbuf *m, struct mbuf *n)
1c79356b
A
4771{
4772 while (m->m_next)
4773 m = m->m_next;
4774 while (n) {
2d21ac55 4775 if ((m->m_flags & M_EXT) ||
1c79356b
A
4776 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4777 /* just join the two chains */
4778 m->m_next = n;
4779 return;
4780 }
4781 /* splat the data from one into the other */
2d21ac55 4782 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
1c79356b
A
4783 (u_int)n->m_len);
4784 m->m_len += n->m_len;
4785 n = m_free(n);
4786 }
4787}
4788
4789void
2d21ac55 4790m_adj(struct mbuf *mp, int req_len)
1c79356b 4791{
91447636
A
4792 int len = req_len;
4793 struct mbuf *m;
4794 int count;
1c79356b
A
4795
4796 if ((m = mp) == NULL)
4797 return;
4798 if (len >= 0) {
4799 /*
4800 * Trim from head.
4801 */
4802 while (m != NULL && len > 0) {
4803 if (m->m_len <= len) {
4804 len -= m->m_len;
4805 m->m_len = 0;
4806 m = m->m_next;
4807 } else {
4808 m->m_len -= len;
4809 m->m_data += len;
4810 len = 0;
4811 }
4812 }
4813 m = mp;
4814 if (m->m_flags & M_PKTHDR)
4815 m->m_pkthdr.len -= (req_len - len);
4816 } else {
4817 /*
4818 * Trim from tail. Scan the mbuf chain,
4819 * calculating its length and finding the last mbuf.
4820 * If the adjustment only affects this mbuf, then just
4821 * adjust and return. Otherwise, rescan and truncate
4822 * after the remaining size.
4823 */
4824 len = -len;
4825 count = 0;
4826 for (;;) {
4827 count += m->m_len;
4828 if (m->m_next == (struct mbuf *)0)
4829 break;
4830 m = m->m_next;
4831 }
4832 if (m->m_len >= len) {
4833 m->m_len -= len;
4834 m = mp;
4835 if (m->m_flags & M_PKTHDR)
4836 m->m_pkthdr.len -= len;
4837 return;
4838 }
4839 count -= len;
4840 if (count < 0)
4841 count = 0;
4842 /*
4843 * Correct length for chain is "count".
4844 * Find the mbuf with last data, adjust its length,
4845 * and toss data from remaining mbufs on chain.
4846 */
4847 m = mp;
4848 if (m->m_flags & M_PKTHDR)
4849 m->m_pkthdr.len = count;
4850 for (; m; m = m->m_next) {
4851 if (m->m_len >= count) {
4852 m->m_len = count;
4853 break;
4854 }
4855 count -= m->m_len;
4856 }
91447636 4857 while ((m = m->m_next))
1c79356b
A
4858 m->m_len = 0;
4859 }
4860}
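/*
 * Editor's note: illustrative sketch, not part of the original file.
 * m_adj() trims from the head with a positive count and from the tail
 * with a negative one; 'hdrlen' and 'trailerlen' are hypothetical
 * protocol sizes.
 */
static void
example_strip(struct mbuf *m, int hdrlen, int trailerlen)
{
	m_adj(m, hdrlen);	/* drop a leading header */
	m_adj(m, -trailerlen);	/* drop a trailing checksum/trailer */
}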
4861
4862/*
4863 * Rearrange an mbuf chain so that len bytes are contiguous
4864 * and in the data area of an mbuf (so that mtod and dtom
4865 * will work for a structure of size len). Returns the resulting
4866 * mbuf chain on success, frees it and returns null on failure.
4867 * If there is room, it will add up to max_protohdr-len extra bytes to the
4868 * contiguous region in an attempt to avoid being called next time.
4869 */
4870int MPFail;
4871
4872struct mbuf *
2d21ac55 4873m_pullup(struct mbuf *n, int len)
1c79356b 4874{
91447636
A
4875 struct mbuf *m;
4876 int count;
1c79356b
A
4877 int space;
4878
4879 /*
4880 * If first mbuf has no cluster, and has room for len bytes
4881 * without shifting current data, pullup into it,
4882 * otherwise allocate a new mbuf to prepend to the chain.
4883 */
4884 if ((n->m_flags & M_EXT) == 0 &&
4885 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4886 if (n->m_len >= len)
4887 return (n);
4888 m = n;
4889 n = n->m_next;
4890 len -= m->m_len;
4891 } else {
4892 if (len > MHLEN)
4893 goto bad;
2d21ac55 4894 _MGET(m, M_DONTWAIT, n->m_type);
1c79356b
A
4895 if (m == 0)
4896 goto bad;
4897 m->m_len = 0;
4898 if (n->m_flags & M_PKTHDR) {
4899 M_COPY_PKTHDR(m, n);
4900 n->m_flags &= ~M_PKTHDR;
4901 }
4902 }
4903 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4904 do {
2d21ac55
A
4905 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4906 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4907 (unsigned)count);
1c79356b
A
4908 len -= count;
4909 m->m_len += count;
4910 n->m_len -= count;
4911 space -= count;
4912 if (n->m_len)
4913 n->m_data += count;
4914 else
4915 n = m_free(n);
4916 } while (len > 0 && n);
4917 if (len > 0) {
4918 (void) m_free(m);
4919 goto bad;
4920 }
4921 m->m_next = n;
4922 return (m);
4923bad:
4924 m_freem(n);
4925 MPFail++;
4926 return (0);
4927}
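/*
 * Editor's note: illustrative sketch, not part of the original file.  The
 * usual m_pullup() idiom before casting m_data to a header structure;
 * 'hlen' stands in for sizeof() of some protocol header and must not
 * exceed MHLEN, per the check above.
 */
static struct mbuf *
example_pullup(struct mbuf *m, int hlen)
{
	if (m->m_len < hlen && (m = m_pullup(m, hlen)) == NULL)
		return (NULL);	/* chain was freed by m_pullup() */
	/* MTOD(m, ...) now points at hlen contiguous bytes */
	return (m);
}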
4928
6d2010ae
A
4929/*
4930 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4931 * the amount of empty space before the data in the new mbuf to be specified
4932 * (in the event that the caller expects to prepend later).
4933 */
4934__private_extern__ int MSFail = 0;
4935
4936__private_extern__ struct mbuf *
4937m_copyup(struct mbuf *n, int len, int dstoff)
4938{
4939 struct mbuf *m;
4940 int count, space;
4941
4942 if (len > (MHLEN - dstoff))
4943 goto bad;
4944 MGET(m, M_DONTWAIT, n->m_type);
4945 if (m == NULL)
4946 goto bad;
4947 m->m_len = 0;
4948 if (n->m_flags & M_PKTHDR) {
4949 m_copy_pkthdr(m, n);
4950 n->m_flags &= ~M_PKTHDR;
4951 }
4952 m->m_data += dstoff;
4953 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4954 do {
4955 count = min(min(max(len, max_protohdr), space), n->m_len);
4956 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4957 (unsigned)count);
4958 len -= count;
4959 m->m_len += count;
4960 n->m_len -= count;
4961 space -= count;
4962 if (n->m_len)
4963 n->m_data += count;
4964 else
4965 n = m_free(n);
4966 } while (len > 0 && n);
4967 if (len > 0) {
4968 (void) m_free(m);
4969 goto bad;
4970 }
4971 m->m_next = n;
4972 return (m);
4973bad:
4974 m_freem(n);
4975 MSFail++;
4976 return (NULL);
4977}
4978
1c79356b
A
4979/*
4980 * Partition an mbuf chain into two pieces, returning the tail --
4981 * all but the first len0 bytes. In case of failure, it returns NULL and
4982 * attempts to restore the chain to its original state.
4983 */
4984struct mbuf *
2d21ac55 4985m_split(struct mbuf *m0, int len0, int wait)
b0d623f7
A
4986{
4987 return (m_split0(m0, len0, wait, 1));
4988}
4989
4990static struct mbuf *
4991m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
1c79356b 4992{
91447636 4993 struct mbuf *m, *n;
1c79356b
A
4994 unsigned len = len0, remain;
4995
4996 for (m = m0; m && len > m->m_len; m = m->m_next)
4997 len -= m->m_len;
2d21ac55
A
4998 if (m == NULL)
4999 return (NULL);
1c79356b 5000 remain = m->m_len - len;
b0d623f7 5001 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
2d21ac55
A
5002 _MGETHDR(n, wait, m0->m_type);
5003 if (n == NULL)
5004 return (NULL);
1c79356b
A
5005 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5006 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5007 m0->m_pkthdr.len = len0;
5008 if (m->m_flags & M_EXT)
5009 goto extpacket;
5010 if (remain > MHLEN) {
5011 /* m can't be the lead packet */
5012 MH_ALIGN(n, 0);
5013 n->m_next = m_split(m, len, wait);
2d21ac55 5014 if (n->m_next == NULL) {
1c79356b 5015 (void) m_free(n);
2d21ac55 5016 return (NULL);
1c79356b
A
5017 } else
5018 return (n);
5019 } else
5020 MH_ALIGN(n, remain);
5021 } else if (remain == 0) {
5022 n = m->m_next;
2d21ac55 5023 m->m_next = NULL;
1c79356b
A
5024 return (n);
5025 } else {
2d21ac55
A
5026 _MGET(n, wait, m->m_type);
5027 if (n == NULL)
5028 return (NULL);
1c79356b
A
5029 M_ALIGN(n, remain);
5030 }
5031extpacket:
5032 if (m->m_flags & M_EXT) {
5033 n->m_flags |= M_EXT;
0b4e3aa0 5034 n->m_ext = m->m_ext;
2d21ac55 5035 m_incref(m);
1c79356b
A
5036 n->m_data = m->m_data + len;
5037 } else {
2d21ac55 5038 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
1c79356b
A
5039 }
5040 n->m_len = remain;
5041 m->m_len = len;
5042 n->m_next = m->m_next;
2d21ac55 5043 m->m_next = NULL;
1c79356b
A
5044 return (n);
5045}
2d21ac55 5046
1c79356b
A
5047/*
5048 * Routine to copy from device local memory into mbufs.
5049 */
5050struct mbuf *
2d21ac55
A
5051m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5052 void (*copy)(const void *, void *, size_t))
1c79356b 5053{
91447636 5054 struct mbuf *m;
2d21ac55 5055 struct mbuf *top = NULL, **mp = &top;
91447636
A
5056 int off = off0, len;
5057 char *cp;
1c79356b
A
5058 char *epkt;
5059
5060 cp = buf;
5061 epkt = cp + totlen;
5062 if (off) {
5063 /*
5064 * If 'off' is non-zero, packet is trailer-encapsulated,
5065 * so we have to skip the type and length fields.
5066 */
2d21ac55
A
5067 cp += off + 2 * sizeof (u_int16_t);
5068 totlen -= 2 * sizeof (u_int16_t);
1c79356b 5069 }
2d21ac55
A
5070 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5071 if (m == NULL)
5072 return (NULL);
1c79356b
A
5073 m->m_pkthdr.rcvif = ifp;
5074 m->m_pkthdr.len = totlen;
5075 m->m_len = MHLEN;
5076
5077 while (totlen > 0) {
2d21ac55
A
5078 if (top != NULL) {
5079 _MGET(m, M_DONTWAIT, MT_DATA);
5080 if (m == NULL) {
1c79356b 5081 m_freem(top);
2d21ac55 5082 return (NULL);
1c79356b
A
5083 }
5084 m->m_len = MLEN;
5085 }
2d21ac55 5086 len = MIN(totlen, epkt - cp);
1c79356b
A
5087 if (len >= MINCLSIZE) {
5088 MCLGET(m, M_DONTWAIT);
2d21ac55
A
5089 if (m->m_flags & M_EXT) {
5090 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5091 } else {
5092 /* give up when it's out of cluster mbufs */
5093 if (top != NULL)
5094 m_freem(top);
1c79356b 5095 m_freem(m);
2d21ac55 5096 return (NULL);
1c79356b
A
5097 }
5098 } else {
5099 /*
5100 * Place initial small packet/header at end of mbuf.
5101 */
5102 if (len < m->m_len) {
2d21ac55
A
5103 if (top == NULL &&
5104 len + max_linkhdr <= m->m_len)
1c79356b
A
5105 m->m_data += max_linkhdr;
5106 m->m_len = len;
2d21ac55 5107 } else {
1c79356b 5108 len = m->m_len;
2d21ac55 5109 }
1c79356b
A
5110 }
5111 if (copy)
2d21ac55 5112 copy(cp, MTOD(m, caddr_t), (unsigned)len);
1c79356b 5113 else
2d21ac55 5114 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
1c79356b
A
5115 cp += len;
5116 *mp = m;
5117 mp = &m->m_next;
5118 totlen -= len;
5119 if (cp == epkt)
5120 cp = buf;
5121 }
5122 return (top);
5123}
5124
6d2010ae
A
5125#ifndef MBUF_GROWTH_NORMAL_THRESH
5126#define MBUF_GROWTH_NORMAL_THRESH 25
5127#endif
b0d623f7 5128
1c79356b 5129/*
2d21ac55 5130 * Cluster freelist allocation check.
1c79356b
A
5131 */
5132static int
91447636 5133m_howmany(int num, size_t bufsize)
1c79356b 5134{
2d21ac55 5135 int i = 0, j = 0;
6d2010ae
A
5136 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5137 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5138 u_int32_t sumclusters, freeclusters;
5139 u_int32_t percent_pool, percent_kmem;
5140 u_int32_t mb_growth, mb_growth_thresh;
5141
5142 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5143 bufsize == m_maxsize(MC_16KCL));
2d21ac55
A
5144
5145 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5146
6d2010ae
A
5147 /* Numbers in 2K cluster units */
5148 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
2d21ac55 5149 m_clusters = m_total(MC_CL);
6d2010ae 5150 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
2d21ac55 5151 m_16kclusters = m_total(MC_16KCL);
6d2010ae
A
5152 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5153
5154 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
2d21ac55 5155 m_clfree = m_infree(MC_CL);
6d2010ae 5156 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
2d21ac55 5157 m_16kclfree = m_infree(MC_16KCL);
6d2010ae 5158 freeclusters = m_mbfree + m_clfree + m_bigclfree;
2d21ac55 5159
91447636 5160 /* Bail if we've maxed out the mbuf memory map */
6d2010ae 5161 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
2d21ac55 5162 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
6d2010ae 5163 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
2d21ac55
A
5164 return (0);
5165 }
5166
6d2010ae 5167 if (bufsize == m_maxsize(MC_BIGCL)) {
2d21ac55 5168 /* Under minimum */
6d2010ae
A
5169 if (m_bigclusters < m_minlimit(MC_BIGCL))
5170 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5171
5172 percent_pool =
5173 ((sumclusters - freeclusters) * 100) / sumclusters;
5174 percent_kmem = (sumclusters * 100) / nclusters;
5175
5176 /*
5177 * If a light/normal user, grow conservatively (75%)
5178 * If a heavy user, grow aggressively (50%)
5179 */
5180 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5181 mb_growth = MB_GROWTH_NORMAL;
5182 else
5183 mb_growth = MB_GROWTH_AGGRESSIVE;
5184
5185 if (percent_kmem < 5) {
5186 /* For initial allocations */
5187 i = num;
5188 } else {
5189 /* Return if >= MBIGCL_LOWAT clusters available */
5190 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5191 m_total(MC_BIGCL) >=
5192 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
2d21ac55 5193 return (0);
6d2010ae
A
5194
5195 /* Ensure at least num clusters are accessible */
5196 if (num >= m_infree(MC_BIGCL))
5197 i = num - m_infree(MC_BIGCL);
5198 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5199 j = num - (m_total(MC_BIGCL) -
5200 m_minlimit(MC_BIGCL));
5201
2d21ac55 5202 i = MAX(i, j);
6d2010ae
A
5203
5204 /*
5205 * Grow pool if percent_pool > 75 (normal growth)
5206 * or percent_pool > 50 (aggressive growth).
5207 */
5208 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5209 if (percent_pool > mb_growth_thresh)
5210 j = ((sumclusters + num) >> mb_growth) -
5211 freeclusters;
2d21ac55 5212 i = MAX(i, j);
2d21ac55 5213 }
6d2010ae
A
5214
5215 /* Check to ensure we didn't go over limits */
5216 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5217 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5218 if ((i << 1) + sumclusters >= nclusters)
5219 i = (nclusters - sumclusters) >> 1;
2d21ac55 5220 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
6d2010ae
A
5221 VERIFY(sumclusters + (i << 1) <= nclusters);
5222
5223 } else { /* 16K CL */
2d21ac55
A
5224 VERIFY(njcl > 0);
5225 /* Under minimum */
5226 if (m_16kclusters < MIN16KCL)
5227 return (MIN16KCL - m_16kclusters);
6d2010ae
A
5228 if (m_16kclfree >= M16KCL_LOWAT)
5229 return (0);
5230
5231 /* Ensure at least num clusters are available */
5232 if (num >= m_16kclfree)
5233 i = num - m_16kclfree;
5234
5235 /* Always grow 16KCL pool aggressively */
5236 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5237 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5238 i = MAX(i, j);
5239
5240 /* Check to ensure we don't go over limit */
5241 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5242 i = m_maxlimit(MC_16KCL) - m_16kclusters;
2d21ac55 5243 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
91447636 5244 }
2d21ac55 5245 return (i);
1c79356b 5246}
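/*
 * Worked example for the growth threshold above (illustrative note, not
 * part of the original file).  It assumes MB_GROWTH_NORMAL == 2 and
 * MB_GROWTH_AGGRESSIVE == 1, which is what the 75%/50% comments imply
 * but is not shown in this excerpt:
 *
 *	mb_growth_thresh = 100 - (100 / (1 << mb_growth))
 *
 *	normal:     100 - (100 / (1 << 2)) = 100 - 25 = 75
 *	aggressive: 100 - (100 / (1 << 1)) = 100 - 50 = 50
 *
 * So under normal growth the pool is only expanded once more than 75%
 * of the 2K-cluster-equivalents are in use; under aggressive growth the
 * threshold drops to 50% utilization.
 */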
b0d623f7
A
5247/*
5248 * Return the number of bytes in the mbuf chain, m.
6d2010ae
A
5249 */
5250unsigned int
b0d623f7
A
5251m_length(struct mbuf *m)
5252{
5253 struct mbuf *m0;
5254 unsigned int pktlen;
5255
5256 if (m->m_flags & M_PKTHDR)
5257 return (m->m_pkthdr.len);
5258
5259 pktlen = 0;
5260 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5261 pktlen += m0->m_len;
5262 return (pktlen);
5263}
5264
1c79356b
A
5265/*
5266 * Copy data from a buffer back into the indicated mbuf chain,
5267 * starting "off" bytes from the beginning, extending the mbuf
5268 * chain if necessary.
5269 */
5270void
b0d623f7 5271m_copyback(struct mbuf *m0, int off, int len, const void *cp)
1c79356b 5272{
b0d623f7
A
5273#if DEBUG
5274 struct mbuf *origm = m0;
5275 int error;
5276#endif /* DEBUG */
1c79356b 5277
2d21ac55 5278 if (m0 == NULL)
1c79356b 5279 return;
b0d623f7
A
5280
5281#if DEBUG
5282 error =
5283#endif /* DEBUG */
5284 m_copyback0(&m0, off, len, cp,
5285 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5286
5287#if DEBUG
5288 if (error != 0 || (m0 != NULL && origm != m0))
5289 panic("m_copyback");
5290#endif /* DEBUG */
5291}
5292
5293struct mbuf *
5294m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5295{
5296 int error;
5297
5298 /* don't support chain expansion */
5299 VERIFY(off + len <= m_length(m0));
5300
5301 error = m_copyback0(&m0, off, len, cp,
5302 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5303 if (error) {
5304 /*
5305 * no way to recover from partial success.
5306 * just free the chain.
5307 */
5308 m_freem(m0);
5309 return (NULL);
5310 }
5311 return (m0);
5312}
5313
5314/*
5315 * m_makewritable: ensure the specified range is writable.
5316 */
5317int
5318m_makewritable(struct mbuf **mp, int off, int len, int how)
5319{
5320 int error;
5321#if DEBUG
5322 struct mbuf *n;
5323 int origlen, reslen;
5324
5325 origlen = m_length(*mp);
5326#endif /* DEBUG */
5327
5328#if 0 /* M_COPYALL is large enough */
5329 if (len == M_COPYALL)
5330 len = m_length(*mp) - off; /* XXX */
5331#endif
5332
5333 error = m_copyback0(mp, off, len, NULL,
5334 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5335
5336#if DEBUG
5337 reslen = 0;
5338 for (n = *mp; n; n = n->m_next)
5339 reslen += n->m_len;
5340 if (origlen != reslen)
5341 panic("m_makewritable: length changed");
5342 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5343 panic("m_makewritable: inconsist");
5344#endif /* DEBUG */
5345
5346 return (error);
5347}
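/*
 * Usage sketch (illustrative only, not part of the original file; the
 * helper name is hypothetical): make the first "len" bytes of a possibly
 * shared chain safe to modify in place, then overwrite them.
 * m_makewritable() may replace mbufs in the chain, which is why the head
 * pointer is passed by reference.
 */
static int
example_overwrite_head(struct mbuf **mp, int len, const void *src)
{
	int error;

	/* Copy-on-write any read-only (shared-cluster) mbufs in [0, len) */
	error = m_makewritable(mp, 0, len, M_DONTWAIT);
	if (error != 0)
		return (error);

	/* The range is now backed by writable storage; fill it in */
	m_copyback(*mp, 0, len, src);
	return (0);
}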
5348
5349static int
5350m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5351 int how)
5352{
5353 int mlen;
5354 struct mbuf *m, *n;
5355 struct mbuf **mp;
5356 int totlen = 0;
5357 const char *cp = vp;
5358
5359 VERIFY(mp0 != NULL);
5360 VERIFY(*mp0 != NULL);
5361 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5362 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5363
5364 /*
5365 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5366 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5367 */
5368
5369 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5370
5371 mp = mp0;
5372 m = *mp;
1c79356b
A
5373 while (off > (mlen = m->m_len)) {
5374 off -= mlen;
5375 totlen += mlen;
2d21ac55 5376 if (m->m_next == NULL) {
b0d623f7
A
5377 int tspace;
5378extend:
5379 if (!(flags & M_COPYBACK0_EXTEND))
1c79356b 5380 goto out;
b0d623f7
A
5381
5382 /*
5383 * try to make some space at the end of "m".
5384 */
5385
5386 mlen = m->m_len;
5387 if (off + len >= MINCLSIZE &&
5388 !(m->m_flags & M_EXT) && m->m_len == 0) {
5389 MCLGET(m, how);
5390 }
5391 tspace = M_TRAILINGSPACE(m);
5392 if (tspace > 0) {
5393 tspace = MIN(tspace, off + len);
5394 VERIFY(tspace > 0);
5395 bzero(mtod(m, char *) + m->m_len,
5396 MIN(off, tspace));
5397 m->m_len += tspace;
5398 off += mlen;
5399 totlen -= mlen;
5400 continue;
5401 }
5402
5403 /*
5404 * need to allocate an mbuf.
5405 */
5406
5407 if (off + len >= MINCLSIZE) {
5408 n = m_getcl(how, m->m_type, 0);
5409 } else {
5410 n = _M_GET(how, m->m_type);
5411 }
5412 if (n == NULL) {
5413 goto out;
5414 }
5415 n->m_len = 0;
5416 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5417 bzero(mtod(n, char *), MIN(n->m_len, off));
1c79356b
A
5418 m->m_next = n;
5419 }
b0d623f7 5420 mp = &m->m_next;
1c79356b
A
5421 m = m->m_next;
5422 }
5423 while (len > 0) {
b0d623f7
A
5424 mlen = m->m_len - off;
5425 if (mlen != 0 && m_mclhasreference(m)) {
5426 char *datap;
5427 int eatlen;
5428
5429 /*
5430 * this mbuf is read-only.
5431 * allocate a new writable mbuf and try again.
5432 */
5433
39236c6e 5434#if DIAGNOSTIC
b0d623f7
A
5435 if (!(flags & M_COPYBACK0_COW))
5436 panic("m_copyback0: read-only");
39236c6e 5437#endif /* DIAGNOSTIC */
b0d623f7
A
5438
5439 /*
5440 * if we're going to write into the middle of
5441 * a mbuf, split it first.
5442 */
5443 if (off > 0 && len < mlen) {
5444 n = m_split0(m, off, how, 0);
5445 if (n == NULL)
5446 goto enobufs;
5447 m->m_next = n;
5448 mp = &m->m_next;
5449 m = n;
5450 off = 0;
5451 continue;
5452 }
5453
5454 /*
5455 * XXX TODO coalesce into the trailingspace of
5456 * the previous mbuf when possible.
5457 */
5458
5459 /*
5460 * allocate a new mbuf. copy packet header if needed.
5461 */
5462 n = _M_GET(how, m->m_type);
5463 if (n == NULL)
5464 goto enobufs;
5465 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5466 M_COPY_PKTHDR(n, m);
5467 n->m_len = MHLEN;
5468 } else {
5469 if (len >= MINCLSIZE)
5470 MCLGET(n, M_DONTWAIT);
5471 n->m_len =
5472 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5473 }
5474 if (n->m_len > len)
5475 n->m_len = len;
5476
5477 /*
5478			 * free the region which has been overwritten,
5479 * copying data from old mbufs if requested.
5480 */
5481 if (flags & M_COPYBACK0_PRESERVE)
5482 datap = mtod(n, char *);
5483 else
5484 datap = NULL;
5485 eatlen = n->m_len;
5486 VERIFY(off == 0 || eatlen >= mlen);
5487 if (off > 0) {
5488 VERIFY(len >= mlen);
5489 m->m_len = off;
5490 m->m_next = n;
5491 if (datap) {
5492 m_copydata(m, off, mlen, datap);
5493 datap += mlen;
5494 }
5495 eatlen -= mlen;
5496 mp = &m->m_next;
5497 m = m->m_next;
5498 }
5499 while (m != NULL && m_mclhasreference(m) &&
5500 n->m_type == m->m_type && eatlen > 0) {
5501 mlen = MIN(eatlen, m->m_len);
5502 if (datap) {
5503 m_copydata(m, 0, mlen, datap);
5504 datap += mlen;
5505 }
5506 m->m_data += mlen;
5507 m->m_len -= mlen;
5508 eatlen -= mlen;
5509 if (m->m_len == 0)
5510 *mp = m = m_free(m);
5511 }
5512 if (eatlen > 0)
5513 n->m_len -= eatlen;
5514 n->m_next = m;
5515 *mp = m = n;
5516 continue;
5517 }
5518 mlen = MIN(mlen, len);
5519 if (flags & M_COPYBACK0_COPYBACK) {
5520 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5521 cp += mlen;
5522 }
1c79356b
A
5523 len -= mlen;
5524 mlen += off;
5525 off = 0;
5526 totlen += mlen;
5527 if (len == 0)
5528 break;
2d21ac55 5529 if (m->m_next == NULL) {
b0d623f7 5530 goto extend;
1c79356b 5531 }
b0d623f7 5532 mp = &m->m_next;
1c79356b
A
5533 m = m->m_next;
5534 }
2d21ac55 5535out:
b0d623f7
A
5536 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5537 VERIFY(flags & M_COPYBACK0_EXTEND);
1c79356b 5538 m->m_pkthdr.len = totlen;
b0d623f7
A
5539 }
5540
5541 return (0);
5542
5543enobufs:
5544 return (ENOBUFS);
1c79356b
A
5545}
5546
39236c6e 5547uint64_t
2d21ac55
A
5548mcl_to_paddr(char *addr)
5549{
b0d623f7 5550 vm_offset_t base_phys;
1c79356b 5551
2d21ac55 5552 if (!MBUF_IN_MAP(addr))
39236c6e
A
5553 return (0);
5554 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
1c79356b
A
5555
5556 if (base_phys == 0)
39236c6e
A
5557 return (0);
5558 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
1c79356b
A
5559}
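/*
 * Worked example for mcl_to_paddr() (illustrative note; it assumes a
 * 4 KB page size, which is not stated in this excerpt).  For
 * addr == (char *)mbutl + 0x1234, atop_64(addr - (char *)mbutl) is 1,
 * so base_phys is the entry recorded for the second page of the cluster
 * map, and the result is ptoa_64(base_phys) | 0x234, i.e. that page's
 * physical address plus the offset of addr within the page.
 */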
5560
5561/*
5562 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5563 * And really copy the thing. That way, we don't "precompute" checksums
2d21ac55
A
5564 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5565 * small packets, don't dup into a cluster. That way received packets
5566 * don't take up too much room in the sockbuf (cf. sbspace()).
1c79356b
A
5567 */
5568int MDFail;
5569
5570struct mbuf *
91447636 5571m_dup(struct mbuf *m, int how)
2d21ac55 5572{
91447636 5573 struct mbuf *n, **np;
1c79356b
A
5574 struct mbuf *top;
5575 int copyhdr = 0;
5576
5577 np = &top;
2d21ac55 5578 top = NULL;
1c79356b
A
5579 if (m->m_flags & M_PKTHDR)
5580 copyhdr = 1;
5581
5582 /*
5583 * Quick check: if we have one mbuf and its data fits in an
5584 * mbuf with packet header, just copy and go.
5585 */
2d21ac55
A
5586 if (m->m_next == NULL) {
5587 /* Then just move the data into an mbuf and be done... */
5588 if (copyhdr) {
5589 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5590 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5591 return (NULL);
1c79356b 5592 n->m_len = m->m_len;
3a60a9f5
A
5593 m_dup_pkthdr(n, m, how);
5594 bcopy(m->m_data, n->m_data, m->m_len);
2d21ac55 5595 return (n);
1c79356b 5596 }
2d21ac55
A
5597 } else if (m->m_len <= MLEN) {
5598 if ((n = _M_GET(how, m->m_type)) == NULL)
5599 return (NULL);
1c79356b
A
5600 bcopy(m->m_data, n->m_data, m->m_len);
5601 n->m_len = m->m_len;
2d21ac55 5602 return (n);
1c79356b
A
5603 }
5604 }
2d21ac55 5605 while (m != NULL) {
1c79356b
A
5606#if BLUE_DEBUG
5607 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
2d21ac55 5608 m->m_data);
1c79356b
A
5609#endif
5610 if (copyhdr)
2d21ac55 5611 n = _M_GETHDR(how, m->m_type);
1c79356b 5612 else
2d21ac55
A
5613 n = _M_GET(how, m->m_type);
5614 if (n == NULL)
1c79356b 5615 goto nospace;
2d21ac55
A
5616 if (m->m_flags & M_EXT) {
5617 if (m->m_len <= m_maxsize(MC_CL))
5618 MCLGET(n, how);
5619 else if (m->m_len <= m_maxsize(MC_BIGCL))
5620 n = m_mbigget(n, how);
5621 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5622 n = m_m16kget(n, how);
5623 if (!(n->m_flags & M_EXT)) {
5624 (void) m_free(n);
1c79356b 5625 goto nospace;
2d21ac55 5626 }
1c79356b
A
5627 }
5628 *np = n;
2d21ac55
A
5629 if (copyhdr) {
5630 /* Don't use M_COPY_PKTHDR: preserve m_data */
3a60a9f5 5631 m_dup_pkthdr(n, m, how);
1c79356b 5632 copyhdr = 0;
2d21ac55 5633 if (!(n->m_flags & M_EXT))
1c79356b
A
5634 n->m_data = n->m_pktdat;
5635 }
5636 n->m_len = m->m_len;
5637 /*
5638		 * Get the dup on the same boundary as the original.
5639		 * Assume that the two mbufs have the same offset to the data area
2d21ac55 5640 * (up to word boundaries)
1c79356b 5641 */
2d21ac55 5642 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
1c79356b
A
5643 m = m->m_next;
5644 np = &n->m_next;
5645#if BLUE_DEBUG
5646 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
2d21ac55 5647 n->m_data);
1c79356b
A
5648#endif
5649 }
5650
2d21ac55 5651 if (top == NULL)
1c79356b
A
5652 MDFail++;
5653 return (top);
2d21ac55
A
5654
5655nospace:
1c79356b
A
5656 m_freem(top);
5657 MDFail++;
2d21ac55 5658 return (NULL);
1c79356b
A
5659}
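/*
 * Usage sketch (illustrative only, not part of the original file; the
 * helper name is hypothetical): deep-copy a chain with m_dup() so a
 * second consumer can modify its copy without touching the original,
 * and sanity-check the length.
 */
static struct mbuf *
example_dup_chain(struct mbuf *m)
{
	struct mbuf *copy;

	copy = m_dup(m, M_DONTWAIT);
	if (copy == NULL)
		return (NULL);		/* allocation failed; "m" is intact */

	/* The duplicate must carry exactly as many bytes as the original */
	VERIFY(m_length(copy) == m_length(m));
	return (copy);
}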
5660
2d21ac55
A
5661#define MBUF_MULTIPAGES(m) \
5662 (((m)->m_flags & M_EXT) && \
5663 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5664 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5665 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
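/*
 * Worked example for MBUF_MULTIPAGES() (illustrative note; it assumes
 * NBPG == 4096, which is not stated in this excerpt):
 *
 *   - m_data page-aligned, m_len == 4096: not multi-page (one full page).
 *   - m_data page-aligned, m_len == 6000: multi-page via the first branch.
 *   - m_data at page offset 0xf00, m_len == 512: multi-page via the
 *     second branch, since only 0x100 bytes remain before the next page
 *     boundary and the data runs 0x200 bytes past m_data.
 */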
5666
5667static struct mbuf *
5668m_expand(struct mbuf *m, struct mbuf **last)
9bccf70c 5669{
2d21ac55
A
5670 struct mbuf *top = NULL;
5671 struct mbuf **nm = &top;
5672 uintptr_t data0, data;
5673 unsigned int len0, len;
5674
5675 VERIFY(MBUF_MULTIPAGES(m));
5676 VERIFY(m->m_next == NULL);
5677 data0 = (uintptr_t)m->m_data;
5678 len0 = m->m_len;
5679 *last = top;
5680
5681 for (;;) {
5682 struct mbuf *n;
5683
5684 data = data0;
5685 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5686 len = NBPG;
5687 else if (!IS_P2ALIGNED(data, NBPG) &&
5688 P2ROUNDUP(data, NBPG) < (data + len0))
5689 len = P2ROUNDUP(data, NBPG) - data;
5690 else
5691 len = len0;
5692
5693 VERIFY(len > 0);
5694 VERIFY(m->m_flags & M_EXT);
5695 m->m_data = (void *)data;
5696 m->m_len = len;
5697
5698 *nm = *last = m;
5699 nm = &m->m_next;
5700 m->m_next = NULL;
5701
5702 data0 += len;
5703 len0 -= len;
5704 if (len0 == 0)
5705 break;
5706
5707 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5708 if (n == NULL) {
5709 m_freem(top);
5710 top = *last = NULL;
5711 break;
5712 }
5713
5714 n->m_ext = m->m_ext;
5715 m_incref(m);
5716 n->m_flags |= M_EXT;
5717 m = n;
5718 }
5719 return (top);
9bccf70c
A
5720}
5721
2d21ac55
A
5722struct mbuf *
5723m_normalize(struct mbuf *m)
9bccf70c 5724{
2d21ac55
A
5725 struct mbuf *top = NULL;
5726 struct mbuf **nm = &top;
5727 boolean_t expanded = FALSE;
5728
5729 while (m != NULL) {
5730 struct mbuf *n;
5731
5732 n = m->m_next;
5733 m->m_next = NULL;
5734
5735 /* Does the data cross one or more page boundaries? */
5736 if (MBUF_MULTIPAGES(m)) {
5737 struct mbuf *last;
5738 if ((m = m_expand(m, &last)) == NULL) {
5739 m_freem(n);
5740 m_freem(top);
5741 top = NULL;
5742 break;
5743 }
5744 *nm = m;
5745 nm = &last->m_next;
5746 expanded = TRUE;
5747 } else {
5748 *nm = m;
5749 nm = &m->m_next;
5750 }
5751 m = n;
5752 }
5753 if (expanded)
5754 atomic_add_32(&mb_normalized, 1);
5755 return (top);
9bccf70c
A
5756}
5757
6d2010ae
A
5758/*
5759 * Append the specified data to the indicated mbuf chain.
5760 * Extend the mbuf chain if the new data does not fit in
5761 * existing space.
5762 *
5763 * Return 1 if able to complete the job; otherwise 0.
5764 */
5765int
5766m_append(struct mbuf *m0, int len, caddr_t cp)
5767{
5768 struct mbuf *m, *n;
5769 int remainder, space;
5770
5771 for (m = m0; m->m_next != NULL; m = m->m_next)
5772 ;
5773 remainder = len;
5774 space = M_TRAILINGSPACE(m);
5775 if (space > 0) {
5776 /*
5777 * Copy into available space.
5778 */
5779 if (space > remainder)
5780 space = remainder;
5781 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5782 m->m_len += space;
5783 cp += space, remainder -= space;
5784 }
5785 while (remainder > 0) {
5786 /*
5787 * Allocate a new mbuf; could check space
5788 * and allocate a cluster instead.
5789 */
5790 n = m_get(M_WAITOK, m->m_type);
5791 if (n == NULL)
5792 break;
5793 n->m_len = min(MLEN, remainder);
5794 bcopy(cp, mtod(n, caddr_t), n->m_len);
5795 cp += n->m_len;
5796 remainder -= n->m_len;
5797 m->m_next = n;
5798 m = n;
5799 }
5800 if (m0->m_flags & M_PKTHDR)
5801 m0->m_pkthdr.len += len - remainder;
5802 return (remainder == 0);
5803}
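/*
 * Usage sketch (illustrative only, not part of the original file; the
 * helper name is hypothetical): append a small trailer to a packet,
 * mapping m_append()'s 1/0 result onto an errno-style return.
 */
static int
example_append_trailer(struct mbuf *m0, caddr_t trailer, int len)
{
	/* 0 means the chain could not be extended (allocation failure) */
	if (m_append(m0, len, trailer) == 0)
		return (ENOBUFS);
	return (0);
}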
5804
5805struct mbuf *
5806m_last(struct mbuf *m)
5807{
5808 while (m->m_next != NULL)
5809 m = m->m_next;
5810 return (m);
5811}
5812
316670eb
A
5813unsigned int
5814m_fixhdr(struct mbuf *m0)
5815{
5816 u_int len;
5817
39236c6e
A
5818 VERIFY(m0->m_flags & M_PKTHDR);
5819
316670eb
A
5820 len = m_length2(m0, NULL);
5821 m0->m_pkthdr.len = len;
5822 return (len);
5823}
5824
5825unsigned int
5826m_length2(struct mbuf *m0, struct mbuf **last)
5827{
5828 struct mbuf *m;
5829 u_int len;
5830
5831 len = 0;
5832 for (m = m0; m != NULL; m = m->m_next) {
5833 len += m->m_len;
5834 if (m->m_next == NULL)
5835 break;
5836 }
5837 if (last != NULL)
5838 *last = m;
5839 return (len);
5840}
5841
5842/*
5843 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5844 * and clusters. If allocation fails and this cannot be completed, NULL will
5845 * be returned, but the passed in chain will be unchanged. Upon success,
5846 * the original chain will be freed, and the new chain will be returned.
5847 *
5848 * If a non-packet-header mbuf is passed in, the original mbuf chain
5849 * will be returned unharmed.
5850 *
5851 * If offset is specified, the first mbuf in the chain will have a leading
5852 * space of the amount stated by the "off" parameter.
5853 *
5854 * This routine requires that the m_pkthdr.pkt_hdr field of the original
5855 * mbuf chain is cleared by the caller.
5856 */
5857struct mbuf *
5858m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5859{
5860 struct mbuf *m_new = NULL, *m_final = NULL;
5861 int progress = 0, length, pktlen;
5862
5863 if (!(m0->m_flags & M_PKTHDR))
5864 return (m0);
5865
5866 VERIFY(off < MHLEN);
5867 m_fixhdr(m0); /* Needed sanity check */
5868
5869 pktlen = m0->m_pkthdr.len + off;
5870 if (pktlen > MHLEN)
5871 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5872 else
5873 m_final = m_gethdr(how, MT_DATA);
5874
5875 if (m_final == NULL)
5876 goto nospace;
5877
5878 if (off > 0) {
5879 pktlen -= off;
316670eb
A
5880 m_final->m_data += off;
5881 }
5882
5883 /*
5884 * Caller must have handled the contents pointed to by this
5885 * pointer before coming here, as otherwise it will point to
5886 * the original mbuf which will get freed upon success.
5887 */
39236c6e 5888 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
316670eb
A
5889
5890 if (m_dup_pkthdr(m_final, m0, how) == 0)
5891 goto nospace;
5892
5893 m_new = m_final;
5894
5895 while (progress < pktlen) {
5896 length = pktlen - progress;
5897 if (length > MCLBYTES)
5898 length = MCLBYTES;
39236c6e 5899 length -= ((m_new == m_final) ? off : 0);
316670eb
A
5900
5901 if (m_new == NULL) {
5902 if (length > MLEN)
5903 m_new = m_getcl(how, MT_DATA, 0);
5904 else
5905 m_new = m_get(how, MT_DATA);
5906 if (m_new == NULL)
5907 goto nospace;
5908 }
5909
5910 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5911 progress += length;
5912 m_new->m_len = length;
5913 if (m_new != m_final)
5914 m_cat(m_final, m_new);
5915 m_new = NULL;
5916 }
5917 m_freem(m0);
5918 m0 = m_final;
5919 return (m0);
5920nospace:
5921 if (m_final)
5922 m_freem(m_final);
5923 return (NULL);
5924}
5925
5926struct mbuf *
5927m_defrag(struct mbuf *m0, int how)
5928{
5929 return (m_defrag_offset(m0, 0, how));
5930}
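/*
 * Usage sketch (illustrative only, not part of the original file; the
 * helper name is hypothetical): compact a long chain into the fewest
 * mbufs/clusters before queueing.  Per the contract above, the caller
 * must already have cleared m_pkthdr.pkt_hdr, and on failure the
 * original chain is unchanged.
 */
static struct mbuf *
example_compact(struct mbuf *m0)
{
	struct mbuf *m;

	m = m_defrag(m0, M_DONTWAIT);
	if (m == NULL)
		return (m0);	/* allocation failed; keep using m0 */
	return (m);		/* m0 has been freed; use the new chain */
}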
5931
9bccf70c
A
5932void
5933m_mchtype(struct mbuf *m, int t)
5934{
2d21ac55
A
5935 mtype_stat_inc(t);
5936 mtype_stat_dec(m->m_type);
5937 (m)->m_type = t;
9bccf70c
A
5938}
5939
2d21ac55
A
5940void *
5941m_mtod(struct mbuf *m)
9bccf70c 5942{
2d21ac55 5943 return (MTOD(m, void *));
9bccf70c
A
5944}
5945
2d21ac55
A
5946struct mbuf *
5947m_dtom(void *x)
9bccf70c 5948{
b0d623f7 5949 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
9bccf70c
A
5950}
5951
2d21ac55
A
5952void
5953m_mcheck(struct mbuf *m)
9bccf70c 5954{
2d21ac55 5955 _MCHECK(m);
9bccf70c
A
5956}
5957
6d2010ae
A
5958/*
5959 * Return a pointer to mbuf/offset of location in mbuf chain.
5960 */
5961struct mbuf *
5962m_getptr(struct mbuf *m, int loc, int *off)
5963{
5964
5965 while (loc >= 0) {
5966 /* Normal end of search. */
5967 if (m->m_len > loc) {
5968 *off = loc;
5969 return (m);
5970 } else {
5971 loc -= m->m_len;
5972 if (m->m_next == NULL) {
5973 if (loc == 0) {
5974 /* Point at the end of valid data. */
5975 *off = m->m_len;
5976 return (m);
5977 }
5978 return (NULL);
5979 }
5980 m = m->m_next;
5981 }
5982 }
5983 return (NULL);
5984}
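/*
 * Usage sketch (illustrative only, not part of the original file; the
 * helper name is hypothetical): read the byte at absolute offset "loc"
 * of a chain by first locating the mbuf that holds it with m_getptr().
 */
static int
example_peek_byte(struct mbuf *m, int loc, u_int8_t *val)
{
	struct mbuf *n;
	int off;

	n = m_getptr(m, loc, &off);
	if (n == NULL || off >= n->m_len)
		return (ERANGE);	/* "loc" is beyond the chain's data */

	*val = *(mtod(n, u_int8_t *) + off);
	return (0);
}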
5985
2d21ac55
A
5986/*
5987 * Inform the corresponding mcache(s) that there's a waiter below.
5988 */
5989static void
5990mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
9bccf70c 5991{
2d21ac55
A
5992 mcache_waiter_inc(m_cache(class));
5993 if (comp) {
5994 if (class == MC_CL) {
5995 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5996 } else if (class == MC_BIGCL) {
5997 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5998 } else if (class == MC_16KCL) {
5999 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6000 } else {
6001 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6002 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6003 }
6004 }
9bccf70c
A
6005}
6006
2d21ac55
A
6007/*
6008 * Inform the corresponding mcache(s) that there's no more waiter below.
6009 */
6010static void
6011mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6012{
6013 mcache_waiter_dec(m_cache(class));
6014 if (comp) {
6015 if (class == MC_CL) {
6016 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6017 } else if (class == MC_BIGCL) {
6018 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6019 } else if (class == MC_16KCL) {
6020 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6021 } else {
6022 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6023 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6024 }
6025 }
6026}
9bccf70c 6027
6d2010ae
A
6028/*
6029 * Called during slab (blocking and non-blocking) allocation. If there
6030 * is at least one waiter and the first waiter has been blocked for
6031 * longer than the watchdog timeout, panic the system.
6032 */
6033static void
6034mbuf_watchdog(void)
6035{
6036 struct timeval now;
6037 unsigned int since;
6038
6039 if (mb_waiters == 0 || !mb_watchdog)
6040 return;
6041
6042 microuptime(&now);
6043 since = now.tv_sec - mb_wdtstart.tv_sec;
6044 if (since >= MB_WDT_MAXTIME) {
6045 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6046 mb_waiters, since, mbuf_dump());
6047 /* NOTREACHED */
6048 }
6049}
6050
2d21ac55
A
6051/*
6052 * Called during blocking allocation. Returns TRUE if one or more objects
6053 * are available at the per-CPU cache layer and the allocation should be
6054 * retried at that level.
6055 */
6056static boolean_t
6057mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
9bccf70c 6058{
2d21ac55
A
6059 boolean_t mcache_retry = FALSE;
6060
6061 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6062
6063 /* Check if there's anything at the cache layer */
6064 if (mbuf_cached_above(class, wait)) {
6065 mcache_retry = TRUE;
6066 goto done;
6067 }
6068
6069 /* Nothing? Then try hard to get it from somewhere */
6070 m_reclaim(class, num, (wait & MCR_COMP));
6071
6072 /* We tried hard and got something? */
6073 if (m_infree(class) > 0) {
6074 mbstat.m_wait++;
6075 goto done;
6076 } else if (mbuf_cached_above(class, wait)) {
6077 mbstat.m_wait++;
6078 mcache_retry = TRUE;
6079 goto done;
6080 } else if (wait & MCR_TRYHARD) {
6081 mcache_retry = TRUE;
6082 goto done;
6083 }
6084
6085 /*
6086 * There's really nothing for us right now; inform the
6087 * cache(s) that there is a waiter below and go to sleep.
6088 */
6089 mbuf_waiter_inc(class, (wait & MCR_COMP));
6090
6091 VERIFY(!(wait & MCR_NOSLEEP));
6d2010ae
A
6092
6093 /*
6094 * If this is the first waiter, arm the watchdog timer. Otherwise
6095 * check if we need to panic the system due to watchdog timeout.
6096 */
6097 if (mb_waiters == 0)
6098 microuptime(&mb_wdtstart);
6099 else
6100 mbuf_watchdog();
6101
2d21ac55
A
6102 mb_waiters++;
6103 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6104
6105 /* We are now up; stop getting notified until next round */
6106 mbuf_waiter_dec(class, (wait & MCR_COMP));
6107
6108 /* We waited and got something */
6109 if (m_infree(class) > 0) {
6110 mbstat.m_wait++;
6111 goto done;
6112 } else if (mbuf_cached_above(class, wait)) {
6113 mbstat.m_wait++;
6114 mcache_retry = TRUE;
6115 }
6116done:
6117 return (mcache_retry);
9bccf70c
A
6118}
6119
91447636 6120static void
2d21ac55 6121mbuf_worker_thread(void)
1c79356b 6122{
2d21ac55
A
6123 int mbuf_expand;
6124
91447636 6125 while (1) {
2d21ac55
A
6126 lck_mtx_lock(mbuf_mlock);
6127
6128 mbuf_expand = 0;
91447636
A
6129 if (mbuf_expand_mcl) {
6130 int n;
2d21ac55
A
6131
6132			/* Adjust to current number of clusters in use */
6133 n = mbuf_expand_mcl -
6134 (m_total(MC_CL) - m_infree(MC_CL));
6135 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6136 n = m_maxlimit(MC_CL) - m_total(MC_CL);
91447636 6137 mbuf_expand_mcl = 0;
2d21ac55
A
6138
6139 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6140 mbuf_expand++;
91447636
A
6141 }
6142 if (mbuf_expand_big) {
6143 int n;
2d21ac55
A
6144
6145			/* Adjust to current number of 4 KB clusters in use */
6146 n = mbuf_expand_big -
6147 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6148 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6149 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
91447636 6150 mbuf_expand_big = 0;
2d21ac55
A
6151
6152 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6153 mbuf_expand++;
6154 }
6155 if (mbuf_expand_16k) {
6156 int n;
6157
6158			/* Adjust to current number of 16 KB clusters in use */
6159 n = mbuf_expand_16k -
6160 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6161 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6162 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6163 mbuf_expand_16k = 0;
6164
6165 if (n > 0)
6166 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6167 }
6168
6169 /*
6170 * Because we can run out of memory before filling the mbuf
6171		 * map, we should not allocate more clusters than there are
6172 * mbufs -- otherwise we could have a large number of useless
6173 * clusters allocated.
91447636 6174 */
2d21ac55
A
6175 if (mbuf_expand) {
6176 while (m_total(MC_MBUF) <
6177 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6178 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6179 break;
6180 }
91447636 6181 }
2d21ac55
A
6182
6183 lck_mtx_unlock(mbuf_mlock);
6184
6185 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6186 (void) thread_block((thread_continue_t)mbuf_worker_thread);
91447636 6187 }
1c79356b
A
6188}
6189
91447636 6190static void
2d21ac55 6191mbuf_worker_thread_init(void)
55e303ae 6192{
2d21ac55
A
6193 mbuf_worker_ready++;
6194 mbuf_worker_thread();
55e303ae 6195}
1c79356b 6196
2d21ac55
A
6197static mcl_slab_t *
6198slab_get(void *buf)
6199{
6200 mcl_slabg_t *slg;
6201 unsigned int ix, k;
6202
6203 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6204
6205 VERIFY(MBUF_IN_MAP(buf));
6206 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6207 VERIFY(ix < maxslabgrp);
6208
6209 if ((slg = slabstbl[ix]) == NULL) {
6210 /*
6211 * In the current implementation, we never shrink the memory
6212 * pool (hence the cluster map); if we attempt to reallocate
6213 * a cluster group when it's already allocated, panic since
6214 * this is a sign of a memory corruption (slabstbl[ix] got
6215 * nullified). This also means that there shouldn't be any
6216 * hole in the kernel sub-map for the mbuf pool.
6217 */
6218 ++slabgrp;
6219 VERIFY(ix < slabgrp);
6220 /*
6221 * Slabs expansion can only be done single threaded; when
6222 * we get here, it must be as a result of m_clalloc() which
6223 * is serialized and therefore mb_clalloc_busy must be set.
6224 */
6225 VERIFY(mb_clalloc_busy);
6226 lck_mtx_unlock(mbuf_mlock);
6227
6228 /* This is a new buffer; create the slabs group for it */
6229 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6230 M_WAITOK | M_ZERO);
6231 VERIFY(slg != NULL);
6232
6233 lck_mtx_lock(mbuf_mlock);
6234 /*
6235 * No other thread could have gone into m_clalloc() after
6236 * we dropped the lock above, so verify that it's true.
6237 */
6238 VERIFY(mb_clalloc_busy);
6239
6240 slabstbl[ix] = slg;
6241
6242 /* Chain each slab in the group to its forward neighbor */
6243 for (k = 1; k < NSLABSPMB; k++)
6244 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6245 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6246
6247 /* And chain the last slab in the previous group to this */
6248 if (ix > 0) {
6249 VERIFY(slabstbl[ix - 1]->
6250 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6251 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6252 &slg->slg_slab[0];
6253 }
6254 }
6255
6d2010ae 6256 ix = MTOBG(buf) % NSLABSPMB;
2d21ac55
A
6257 VERIFY(ix < NSLABSPMB);
6258
6259 return (&slg->slg_slab[ix]);
6260}
6261
6262static void
6263slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6264 void *base, void *head, unsigned int len, int refcnt, int chunks)
6265{
6266 sp->sl_class = class;
6267 sp->sl_flags = flags;
6268 sp->sl_base = base;
6269 sp->sl_head = head;
6270 sp->sl_len = len;
6271 sp->sl_refcnt = refcnt;
6272 sp->sl_chunks = chunks;
6273 slab_detach(sp);
6274}
6275
6276static void
6277slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6278{
6279 VERIFY(slab_is_detached(sp));
6280 m_slab_cnt(class)++;
6281 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6282 sp->sl_flags &= ~SLF_DETACHED;
6d2010ae 6283 if (class == MC_16KCL) {
2d21ac55 6284 int k;
6d2010ae 6285 for (k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
6286 sp = sp->sl_next;
6287 /* Next slab must already be present */
6288 VERIFY(sp != NULL);
6289 VERIFY(slab_is_detached(sp));
6290 sp->sl_flags &= ~SLF_DETACHED;
6291 }
6292 }
6293}
6294
6295static void
6296slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6297{
6298 VERIFY(!slab_is_detached(sp));
6299 VERIFY(m_slab_cnt(class) > 0);
6300 m_slab_cnt(class)--;
6301 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6302 slab_detach(sp);
6d2010ae 6303 if (class == MC_16KCL) {
2d21ac55 6304 int k;
6d2010ae 6305 for (k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
6306 sp = sp->sl_next;
6307 /* Next slab must already be present */
6308 VERIFY(sp != NULL);
6309 VERIFY(!slab_is_detached(sp));
6310 slab_detach(sp);
6311 }
6312 }
6313}
6314
6315static boolean_t
6316slab_inrange(mcl_slab_t *sp, void *buf)
6317{
6318 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6319 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6320}
6321
b0d623f7 6322#undef panic
2d21ac55
A
6323
6324static void
6325slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6326{
6327 int i;
6328 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6329 uintptr_t buf = (uintptr_t)sp->sl_base;
6330
6331 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6332 void *next = ((mcache_obj_t *)buf)->obj_next;
6333 if (next != addr)
6334 continue;
6d2010ae 6335 if (!mclverify) {
2d21ac55
A
6336 if (next != NULL && !MBUF_IN_MAP(next)) {
6337 mcache_t *cp = m_cache(sp->sl_class);
6338 panic("%s: %s buffer %p in slab %p modified "
6339 "after free at offset 0: %p out of range "
6340 "[%p-%p)\n", __func__, cp->mc_name,
6341 (void *)buf, sp, next, mbutl, embutl);
6342 /* NOTREACHED */
6343 }
6344 } else {
6345 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6346 (mcache_obj_t *)buf);
6347 mcl_audit_verify_nextptr(next, mca);
6348 }
6349 }
6350}
6351
6352static void
6353slab_detach(mcl_slab_t *sp)
6354{
6355 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6356 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6357 sp->sl_flags |= SLF_DETACHED;
6358}
6359
6360static boolean_t
6361slab_is_detached(mcl_slab_t *sp)
6362{
6363 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6364 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6365 (sp->sl_flags & SLF_DETACHED));
6366}
6367
6368static void
6369mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6370 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6371{
6372 mcache_audit_t *mca, *mca_tail;
6373 mcache_obj_t *con = NULL;
6374 boolean_t save_contents = (con_list != NULL);
6375 unsigned int i, ix;
6376
6d2010ae 6377 ASSERT(num <= NMBPBG);
2d21ac55
A
6378 ASSERT(con_list == NULL || con_size != 0);
6379
6d2010ae
A
6380 ix = MTOBG(buf);
6381 VERIFY(ix < maxclaudit);
6382
2d21ac55 6383 /* Make sure we haven't been here before */
6d2010ae 6384 for (i = 0; i < NMBPBG; i++)
2d21ac55
A
6385 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6386
6387 mca = mca_tail = *mca_list;
6388 if (save_contents)
6389 con = *con_list;
6390
6391 for (i = 0; i < num; i++) {
6392 mcache_audit_t *next;
6393
6394 next = mca->mca_next;
6395 bzero(mca, sizeof (*mca));
6396 mca->mca_next = next;
6397 mclaudit[ix].cl_audit[i] = mca;
6398
6399 /* Attach the contents buffer if requested */
6400 if (save_contents) {
39236c6e
A
6401 mcl_saved_contents_t *msc =
6402 (mcl_saved_contents_t *)(void *)con;
6403
6404 VERIFY(msc != NULL);
6405 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6406 VERIFY(con_size == sizeof (*msc));
2d21ac55 6407 mca->mca_contents_size = con_size;
39236c6e 6408 mca->mca_contents = msc;
2d21ac55
A
6409 con = con->obj_next;
6410 bzero(mca->mca_contents, mca->mca_contents_size);
6411 }
6412
6413 mca_tail = mca;
6414 mca = mca->mca_next;
6415 }
91447636 6416
2d21ac55
A
6417 if (save_contents)
6418 *con_list = con;
6419
6420 *mca_list = mca_tail->mca_next;
6421 mca_tail->mca_next = NULL;
6422}
6423
6424/*
6d2010ae 6425 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
2d21ac55
A
6426 * the corresponding audit structure for that buffer.
6427 */
6428static mcache_audit_t *
6429mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6430{
6431 mcache_audit_t *mca = NULL;
6d2010ae 6432 int ix = MTOBG(o);
2d21ac55 6433
6d2010ae 6434 VERIFY(ix < maxclaudit);
2d21ac55
A
6435 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6436
6437 switch (class) {
6438 case MC_MBUF:
6439 /*
6d2010ae 6440 * For the mbuf case, find the index of the page
2d21ac55 6441 * used by the mbuf and use that index to locate the
6d2010ae
A
6442 * base address of the page. Then find out the
6443 * mbuf index relative to the page base and use
2d21ac55
A
6444 * it to locate the audit structure.
6445 */
6d2010ae
A
6446 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6447 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
2d21ac55
A
6448 break;
6449
6450 case MC_CL:
6d2010ae
A
6451 /*
6452 * Same thing as above, but for 2KB clusters in a page.
6453 */
6454 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6455 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6456 break;
6457
2d21ac55
A
6458 case MC_BIGCL:
6459 case MC_16KCL:
6460 /*
6461 * Same as above, but only return the first element.
6462 */
6463 mca = mclaudit[ix].cl_audit[0];
6464 break;
6465
6466 default:
6467 VERIFY(0);
6468 /* NOTREACHED */
6469 }
6470
6471 return (mca);
6472}
6473
6474static void
6475mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6476 boolean_t alloc)
6477{
6478 struct mbuf *m = addr;
6479 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6480
6481 VERIFY(mca->mca_contents != NULL &&
6482 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6483
6d2010ae
A
6484 if (mclverify)
6485 mcl_audit_verify_nextptr(next, mca);
2d21ac55
A
6486
6487 if (!alloc) {
6488 /* Save constructed mbuf fields */
6489 mcl_audit_save_mbuf(m, mca);
6d2010ae
A
6490 if (mclverify) {
6491 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6492 m_maxsize(MC_MBUF));
6493 }
2d21ac55
A
6494 ((mcache_obj_t *)m)->obj_next = next;
6495 return;
6496 }
6497
6498 /* Check if the buffer has been corrupted while in freelist */
6d2010ae
A
6499 if (mclverify) {
6500 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6501 }
2d21ac55
A
6502 /* Restore constructed mbuf fields */
6503 mcl_audit_restore_mbuf(m, mca, composite);
6504}
6505
6506static void
6507mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6508{
39236c6e 6509 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
6510
6511 if (composite) {
6512 struct mbuf *next = m->m_next;
6513 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6514 MBUF_IS_COMPOSITE(ms));
39236c6e 6515 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
2d21ac55
A
6516 /*
6517		 * We could have hand-picked the mbuf fields and restored
6518 * them individually, but that will be a maintenance
6519 * headache. Instead, restore everything that was saved;
6520 * the mbuf layer will recheck and reinitialize anyway.
6521 */
39236c6e 6522 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
2d21ac55
A
6523 m->m_next = next;
6524 } else {
6525 /*
6526 * For a regular mbuf (no cluster attached) there's nothing
6527 * to restore other than the type field, which is expected
6528 * to be MT_FREE.
6529 */
6530 m->m_type = ms->m_type;
6531 }
6532 _MCHECK(m);
6533}
6534
6535static void
6536mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6537{
39236c6e 6538 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
2d21ac55 6539 _MCHECK(m);
39236c6e 6540 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
2d21ac55
A
6541}
6542
6543static void
6544mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6545 boolean_t save_next)
6546{
6547 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6548
6549 if (!alloc) {
6d2010ae
A
6550 if (mclverify) {
6551 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6552 }
2d21ac55
A
6553 if (save_next) {
6554 mcl_audit_verify_nextptr(next, mca);
6555 ((mcache_obj_t *)addr)->obj_next = next;
6556 }
6d2010ae 6557 } else if (mclverify) {
2d21ac55
A
6558 /* Check if the buffer has been corrupted while in freelist */
6559 mcl_audit_verify_nextptr(next, mca);
6560 mcache_audit_free_verify_set(mca, addr, 0, size);
6561 }
6562}
6563
39236c6e
A
6564static void
6565mcl_audit_scratch(mcache_audit_t *mca)
6566{
6567 void *stack[MCACHE_STACK_DEPTH + 1];
6568 mcl_scratch_audit_t *msa;
6569 struct timeval now;
6570
6571 VERIFY(mca->mca_contents != NULL);
6572 msa = MCA_SAVED_SCRATCH_PTR(mca);
6573
6574 msa->msa_pthread = msa->msa_thread;
6575 msa->msa_thread = current_thread();
6576 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6577 msa->msa_pdepth = msa->msa_depth;
6578 bzero(stack, sizeof (stack));
6579 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
6580 bcopy(&stack[1], msa->msa_stack, sizeof (mca->mca_pstack));
6581
6582 msa->msa_ptstamp = msa->msa_tstamp;
6583 microuptime(&now);
6584 /* tstamp is in ms relative to base_ts */
6585 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6586 if ((now.tv_sec - mb_start.tv_sec) > 0)
6587 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6588}
6589
2d21ac55
A
6590static void
6591mcl_audit_mcheck_panic(struct mbuf *m)
6592{
6593 mcache_audit_t *mca;
6594
6595 MRANGE(m);
6596 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6597
6598 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6599 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6600 /* NOTREACHED */
6601}
6602
6603static void
6604mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6605{
6d2010ae
A
6606 if (next != NULL && !MBUF_IN_MAP(next) &&
6607 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
2d21ac55
A
6608 panic("mcl_audit: buffer %p modified after free at offset 0: "
6609 "%p out of range [%p-%p)\n%s\n",
6610 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6611 /* NOTREACHED */
6612 }
6613}
6614
6d2010ae
A
6615/* This function turns on mbuf leak detection */
6616static void
6617mleak_activate(void)
6618{
6619 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6620 PE_parse_boot_argn("mleak_sample_factor",
6621 &mleak_table.mleak_sample_factor,
6622 sizeof (mleak_table.mleak_sample_factor));
6623
6624 if (mleak_table.mleak_sample_factor == 0)
6625 mclfindleak = 0;
6626
6627 if (mclfindleak == 0)
6628 return;
6629
6630 vm_size_t alloc_size =
6631 mleak_alloc_buckets * sizeof (struct mallocation);
6632 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6633
6634 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6635 M_TEMP, M_WAITOK | M_ZERO);
6636 VERIFY(mleak_allocations != NULL);
6637
6638 MALLOC(mleak_traces, struct mtrace *, trace_size,
6639 M_TEMP, M_WAITOK | M_ZERO);
6640 VERIFY(mleak_traces != NULL);
6641
6642 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6643 M_TEMP, M_WAITOK | M_ZERO);
6644 VERIFY(mleak_stat != NULL);
6645 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6646#ifdef __LP64__
6647 mleak_stat->ml_isaddr64 = 1;
6648#endif /* __LP64__ */
6649}
6650
6651static void
6652mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6653{
6654 int temp;
6655
6656 if (mclfindleak == 0)
6657 return;
6658
6659 if (!alloc)
6660 return (mleak_free(addr));
6661
6662 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6663
6664 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6665 uintptr_t bt[MLEAK_STACK_DEPTH];
6666 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6667 mleak_log(bt, addr, logged, num);
6668 }
6669}
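/*
 * Illustrative note: mleak_capture is a single global counter, so with a
 * sample factor of N roughly one in every N allocation events (across all
 * callers) gets its backtrace recorded.  The factor defaults to
 * MLEAK_SAMPLE_FACTOR and can be overridden with the mleak_sample_factor
 * boot-arg or sysctl declared elsewhere in this file.
 */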
6670
6671/*
6672 * This function records the allocation in the mleak_allocations table
6673 * and the backtrace in the mleak_traces table; if the allocation slot is
6674 * in use, the old allocation is replaced with the new one; if the trace
6675 * slot is in use, return (or increment its refcount if it is the same trace).
6676 */
6677static boolean_t
6678mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6679{
6680 struct mallocation *allocation;
6681 struct mtrace *trace;
6682 uint32_t trace_index;
6d2010ae
A
6683
6684	/* Quit if someone else is modifying the tables */
6685 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6686 mleak_table.total_conflicts++;
6687 return (FALSE);
6688 }
6689
6690 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6691 mleak_alloc_buckets)];
6692 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6693 trace = &mleak_traces[trace_index];
6694
6695 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6696 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6697
6698 allocation->hitcount++;
6699 trace->hitcount++;
6700
6701 /*
6702 * If the allocation bucket we want is occupied
6703 * and the occupier has the same trace, just bail.
6704 */
6705 if (allocation->element != NULL &&
6706 trace_index == allocation->trace_index) {
6707 mleak_table.alloc_collisions++;
6708 lck_mtx_unlock(mleak_lock);
6709 return (TRUE);
6710 }
6711
6712 /*
6713 * Store the backtrace in the traces array;
6714 * Size of zero = trace bucket is free.
6715 */
6716 if (trace->allocs > 0 &&
6717 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6718 /* Different, unique trace, but the same hash! Bail out. */
6719 trace->collisions++;
6720 mleak_table.trace_collisions++;
6721 lck_mtx_unlock(mleak_lock);
6722 return (TRUE);
6723 } else if (trace->allocs > 0) {
6724 /* Same trace, already added, so increment refcount */
6725 trace->allocs++;
6726 } else {
6727 /* Found an unused trace bucket, so record the trace here */
6728 if (trace->depth != 0) {
6729 /* this slot previously used but not currently in use */
6730 mleak_table.trace_overwrites++;
6731 }
6732 mleak_table.trace_recorded++;
6733 trace->allocs = 1;
6734 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6735 trace->depth = depth;
6736 trace->collisions = 0;
6737 }
6738
6739 /* Step 2: Store the allocation record in the allocations array */
6740 if (allocation->element != NULL) {
6741 /*
6742 * Replace an existing allocation. No need to preserve
6743 * because only a subset of the allocations are being
6744 * recorded anyway.
6745 */
6746 mleak_table.alloc_collisions++;
6747 } else if (allocation->trace_index != 0) {
6748 mleak_table.alloc_overwrites++;
6749 }
6750 allocation->element = addr;
6751 allocation->trace_index = trace_index;
6752 allocation->count = num;
6753 mleak_table.alloc_recorded++;
6754 mleak_table.outstanding_allocs++;
6755
6d2010ae
A
6756 lck_mtx_unlock(mleak_lock);
6757 return (TRUE);
6758}
6759
6760static void
6761mleak_free(mcache_obj_t *addr)
6762{
6763 while (addr != NULL) {
6764 struct mallocation *allocation = &mleak_allocations
6765 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6766
6767 if (allocation->element == addr &&
6768 allocation->trace_index < mleak_trace_buckets) {
6769 lck_mtx_lock_spin(mleak_lock);
6770 if (allocation->element == addr &&
6771 allocation->trace_index < mleak_trace_buckets) {
6772 struct mtrace *trace;
6773 trace = &mleak_traces[allocation->trace_index];
6774 /* allocs = 0 means trace bucket is unused */
6775 if (trace->allocs > 0)
6776 trace->allocs--;
6777 if (trace->allocs == 0)
6778 trace->depth = 0;
6779 /* NULL element means alloc bucket is unused */
6780 allocation->element = NULL;
6781 mleak_table.outstanding_allocs--;
6782 }
6783 lck_mtx_unlock(mleak_lock);
6784 }
6785 addr = addr->obj_next;
6786 }
6787}
6788
316670eb
A
6789static void
6790mleak_sort_traces()
6791{
6792 int i, j, k;
6793 struct mtrace *swap;
6794
6795 for(i = 0; i < MLEAK_NUM_TRACES; i++)
6796 mleak_top_trace[i] = NULL;
6797
6798 for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6799 {
6800 if (mleak_traces[i].allocs <= 0)
6801 continue;
6802
6803 mleak_top_trace[j] = &mleak_traces[i];
6804 for (k = j; k > 0; k--) {
6805 if (mleak_top_trace[k]->allocs <=
6806 mleak_top_trace[k-1]->allocs)
6807 break;
6808
6809 swap = mleak_top_trace[k-1];
6810 mleak_top_trace[k-1] = mleak_top_trace[k];
6811 mleak_top_trace[k] = swap;
6812 }
6813 j++;
6814 }
6815
6816 j--;
6817 for(; i < mleak_trace_buckets; i++) {
6818 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6819 continue;
6820
6821 mleak_top_trace[j] = &mleak_traces[i];
6822
6823 for (k = j; k > 0; k--) {
6824 if (mleak_top_trace[k]->allocs <=
6825 mleak_top_trace[k-1]->allocs)
6826 break;
6827
6828 swap = mleak_top_trace[k-1];
6829 mleak_top_trace[k-1] = mleak_top_trace[k];
6830 mleak_top_trace[k] = swap;
6831 }
6832 }
6833}
6834
6835static void
6836mleak_update_stats()
6837{
6838 mleak_trace_stat_t *mltr;
6839 int i;
6840
6841 VERIFY(mleak_stat != NULL);
6842#ifdef __LP64__
6843 VERIFY(mleak_stat->ml_isaddr64);
6844#else
6845 VERIFY(!mleak_stat->ml_isaddr64);
6846#endif /* !__LP64__ */
6847 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6848
6849 mleak_sort_traces();
6850
6851 mltr = &mleak_stat->ml_trace[0];
6852 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6853 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6854 int j;
6855
6856 if (mleak_top_trace[i] == NULL ||
6857 mleak_top_trace[i]->allocs == 0)
6858 continue;
6859
6860 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6861 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6862 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6863 mltr->mltr_depth = mleak_top_trace[i]->depth;
6864
6865 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6866 for (j = 0; j < mltr->mltr_depth; j++)
6867 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6868
6869 mltr++;
6870 }
6871}
6872
6d2010ae
A
6873static struct mbtypes {
6874 int mt_type;
6875 const char *mt_name;
6876} mbtypes[] = {
6877 { MT_DATA, "data" },
6878 { MT_OOBDATA, "oob data" },
6879 { MT_CONTROL, "ancillary data" },
6880 { MT_HEADER, "packet headers" },
6881 { MT_SOCKET, "socket structures" },
6882 { MT_PCB, "protocol control blocks" },
6883 { MT_RTABLE, "routing table entries" },
6884 { MT_HTABLE, "IMP host table entries" },
6885 { MT_ATABLE, "address resolution tables" },
6886 { MT_FTABLE, "fragment reassembly queue headers" },
6887 { MT_SONAME, "socket names and addresses" },
6888 { MT_SOOPTS, "socket options" },
6889 { MT_RIGHTS, "access rights" },
6890 { MT_IFADDR, "interface addresses" },
6891 { MT_TAG, "packet tags" },
6892 { 0, NULL }
6893};
6894
6895#define MBUF_DUMP_BUF_CHK() { \
6896 clen -= k; \
6897 if (clen < 1) \
6898 goto done; \
6899 c += k; \
6900}
6901
6902static char *
6903mbuf_dump(void)
6904{
6905 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6906 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6907 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6908 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6909 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6910 uint8_t seen[256];
6911 struct mbtypes *mp;
6912 mb_class_stat_t *sp;
316670eb 6913 mleak_trace_stat_t *mltr;
6d2010ae 6914 char *c = mbuf_dump_buf;
316670eb 6915 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6d2010ae
A
6916
6917 mbuf_dump_buf[0] = '\0';
6918
6919 /* synchronize all statistics in the mbuf table */
6920 mbuf_stat_sync();
6921 mbuf_mtypes_sync(TRUE);
6922
6923 sp = &mb_stat->mbs_class[0];
6924 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6925 u_int32_t mem;
6926
6927 if (m_class(i) == MC_MBUF) {
6928 m_mbufs = sp->mbcl_active;
6929 } else if (m_class(i) == MC_CL) {
6930 m_clfree = sp->mbcl_total - sp->mbcl_active;
6931 } else if (m_class(i) == MC_BIGCL) {
6932 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6933 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6934 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6935 m_16kclusters = sp->mbcl_total;
6936 } else if (m_class(i) == MC_MBUF_CL) {
6937 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6938 } else if (m_class(i) == MC_MBUF_BIGCL) {
6939 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6940 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6941 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6942 }
6943
6944 mem = sp->mbcl_ctotal * sp->mbcl_size;
6945 totmem += mem;
6946 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6947 sp->mbcl_size;
6948
6949 }
6950
6951 /* adjust free counts to include composite caches */
6952 m_clfree += m_mbufclfree;
6953 m_bigclfree += m_mbufbigclfree;
6954 m_16kclfree += m_mbuf16kclfree;
6955
6956 totmbufs = 0;
6957 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6958 totmbufs += mbstat.m_mtypes[mp->mt_type];
6959 if (totmbufs > m_mbufs)
6960 totmbufs = m_mbufs;
6961 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6962 MBUF_DUMP_BUF_CHK();
6963
6964 bzero(&seen, sizeof (seen));
6965 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6966 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6967 seen[mp->mt_type] = 1;
6968 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6969 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6970 MBUF_DUMP_BUF_CHK();
6971 }
6972 }
6973 seen[MT_FREE] = 1;
6974 for (i = 0; i < nmbtypes; i++)
6975 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6976 k = snprintf(c, clen, "\t%u mbufs allocated to "
6977 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6978 MBUF_DUMP_BUF_CHK();
6979 }
6980 if ((m_mbufs - totmbufs) > 0) {
6981 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6982 m_mbufs - totmbufs);
6983 MBUF_DUMP_BUF_CHK();
6984 }
6985 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6986 "%u/%u mbuf 4KB clusters in use\n",
6987 (unsigned int)(mbstat.m_clusters - m_clfree),
6988 (unsigned int)mbstat.m_clusters,
6989 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6990 (unsigned int)mbstat.m_bigclusters);
6991 MBUF_DUMP_BUF_CHK();
6992
6993 if (njcl > 0) {
6994 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6995 m_16kclusters - m_16kclfree, m_16kclusters,
6996 njclbytes / 1024);
6997 MBUF_DUMP_BUF_CHK();
6998 }
6999 totused = totmem - totfree;
7000 if (totmem == 0) {
7001 totpct = 0;
7002 } else if (totused < (ULONG_MAX / 100)) {
7003 totpct = (totused * 100) / totmem;
7004 } else {
7005 u_long totmem1 = totmem / 100;
7006 u_long totused1 = totused / 100;
7007 totpct = (totused1 * 100) / totmem1;
7008 }
7009 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7010 "in use)\n", totmem / 1024, totpct);
7011 MBUF_DUMP_BUF_CHK();
7012
316670eb
A
7013 /* mbuf leak detection statistics */
7014 mleak_update_stats();
7015
7016 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7017 MBUF_DUMP_BUF_CHK();
7018 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7019 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7020 mleak_table.mleak_sample_factor);
7021 MBUF_DUMP_BUF_CHK();
7022 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7023 mleak_table.outstanding_allocs);
7024 MBUF_DUMP_BUF_CHK();
7025 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7026 mleak_table.alloc_recorded, mleak_table.trace_recorded);
7027 MBUF_DUMP_BUF_CHK();
7028 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7029 mleak_table.alloc_collisions, mleak_table.trace_collisions);
7030 MBUF_DUMP_BUF_CHK();
7031 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7032 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7033 MBUF_DUMP_BUF_CHK();
7034 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7035 mleak_table.total_conflicts);
7036 MBUF_DUMP_BUF_CHK();
7037
7038 k = snprintf(c, clen, "top %d outstanding traces:\n",
7039 mleak_stat->ml_cnt);
7040 MBUF_DUMP_BUF_CHK();
7041 for (i = 0; i < mleak_stat->ml_cnt; i++) {
7042 mltr = &mleak_stat->ml_trace[i];
7043 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7044 "%llu hit(s), %llu collision(s)\n", (i + 1),
7045 mltr->mltr_allocs, mltr->mltr_hitcount,
7046 mltr->mltr_collisions);
7047 MBUF_DUMP_BUF_CHK();
7048 }
7049
7050 if (mleak_stat->ml_isaddr64)
7051 k = snprintf(c, clen, MB_LEAK_HDR_64);
7052 else
7053 k = snprintf(c, clen, MB_LEAK_HDR_32);
7054 MBUF_DUMP_BUF_CHK();
7055
7056 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7057 int j;
7058 k = snprintf(c, clen, "%2d: ", (i + 1));
7059 MBUF_DUMP_BUF_CHK();
7060 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7061 mltr = &mleak_stat->ml_trace[j];
7062 if (i < mltr->mltr_depth) {
7063 if (mleak_stat->ml_isaddr64) {
7064 k = snprintf(c, clen, "0x%0llx ",
7065 mltr->mltr_addr[i]);
7066 } else {
7067 k = snprintf(c, clen,
7068 "0x%08x ",
7069 (u_int32_t)mltr->mltr_addr[i]);
7070 }
7071 } else {
7072 if (mleak_stat->ml_isaddr64)
7073 k = snprintf(c, clen,
7074 MB_LEAK_SPACING_64);
7075 else
7076 k = snprintf(c, clen,
7077 MB_LEAK_SPACING_32);
7078 }
7079 MBUF_DUMP_BUF_CHK();
7080 }
7081 k = snprintf(c, clen, "\n");
7082 MBUF_DUMP_BUF_CHK();
7083 }
6d2010ae
A
7084done:
7085 return (mbuf_dump_buf);
7086}
7087
7088#undef MBUF_DUMP_BUF_CHK
7089
39236c6e
A
7090/*
7091 * Convert between a regular and a packet header mbuf. Caller is responsible
7092 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
7093 */
7094int
7095m_reinit(struct mbuf *m, int hdr)
7096{
7097 int ret = 0;
7098
7099 if (hdr) {
7100 VERIFY(!(m->m_flags & M_PKTHDR));
7101 if (!(m->m_flags & M_EXT) &&
7102 (m->m_data != m->m_dat || m->m_len > 0)) {
7103 /*
7104 * If there's no external cluster attached and the
7105 * mbuf appears to contain user data, we cannot
7106 * safely convert this to a packet header mbuf,
7107 * as the packet header structure might overlap
7108 * with the data.
7109 */
7110 printf("%s: cannot set M_PKTHDR on altered mbuf %p, "
7111 "m_data %p (expected %p), m_len %d (expected 0)\n",
7112 __func__, m, m->m_data, m->m_dat, m->m_len);
7113 ret = EBUSY;
7114 } else {
7115 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7116 m->m_flags |= M_PKTHDR;
7117 MBUF_INIT_PKTHDR(m);
7118 }
7119 } else {
7120 /* Check for scratch area overflow */
7121 m_redzone_verify(m);
7122		/* Free the aux data and tags if there are any */
7123 m_tag_delete_chain(m, NULL);
7124 m->m_flags &= ~M_PKTHDR;
7125 }
7126
7127 return (ret);
7128}
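/*
 * Usage sketch (illustrative only, not part of the original file; the
 * helper name is hypothetical): promote a freshly allocated plain mbuf
 * to a packet-header mbuf.  m_reinit() returns EBUSY if the mbuf already
 * holds data that the packet header structure would overlap.
 */
static struct mbuf *
example_make_pkthdr(struct mbuf *m)
{
	if (m->m_flags & M_PKTHDR)
		return (m);		/* nothing to do */

	if (m_reinit(m, 1) != 0) {
		m_freem(m);		/* cannot convert safely; drop it */
		return (NULL);
	}
	return (m);
}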
7129
7130void
7131m_scratch_init(struct mbuf *m)
7132{
7133 VERIFY(m->m_flags & M_PKTHDR);
7134
7135 bzero(&m->m_pkthdr.pkt_mpriv, sizeof (m->m_pkthdr.pkt_mpriv));
7136}
7137
7138u_int32_t
7139m_scratch_get(struct mbuf *m, u_int8_t **p)
7140{
7141 VERIFY(m->m_flags & M_PKTHDR);
7142
7143 if (mcltrace) {
7144 mcache_audit_t *mca;
7145
7146 lck_mtx_lock(mbuf_mlock);
7147 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7148 if (mca->mca_uflags & MB_SCVALID)
7149 mcl_audit_scratch(mca);
7150 lck_mtx_unlock(mbuf_mlock);
7151 }
7152
7153 *p = (u_int8_t *)&m->m_pkthdr.pkt_mpriv;
7154 return (sizeof (m->m_pkthdr.pkt_mpriv));
7155}
7156
7157static void
7158m_redzone_init(struct mbuf *m)
7159{
7160 VERIFY(m->m_flags & M_PKTHDR);
7161 /*
7162	 * Each mbuf has a unique red zone pattern, which is an XOR
7163 * of the red zone cookie and the address of the mbuf.
7164 */
7165 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7166}
7167
7168static void
7169m_redzone_verify(struct mbuf *m)
7170{
7171 u_int32_t mb_redzone;
7172
7173 VERIFY(m->m_flags & M_PKTHDR);
7174
7175 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7176 if (m->m_pkthdr.redzone != mb_redzone) {
7177 panic("mbuf %p redzone violation with value 0x%x "
7178 "(instead of 0x%x, using cookie 0x%x)\n",
7179 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7180 /* NOTREACHED */
7181 }
7182}
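/*
 * Illustrative note: since the red zone is just (mbuf address XOR
 * mb_redzone_cookie), m_redzone_verify() recomputes the same XOR and
 * must reproduce m_pkthdr.redzone exactly.  For a (hypothetical) cookie
 * of 0x5a5a5a5a and an mbuf whose address has low 32 bits 0xffff1000,
 * the stored pattern is 0xa5a54a5a; any write that runs past the
 * scratch area and disturbs it trips the panic above.
 */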
7183
2d21ac55 7184SYSCTL_DECL(_kern_ipc);
6d2010ae
A
7185SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7186 CTLFLAG_RD | CTLFLAG_LOCKED,
2d21ac55 7187 0, 0, mbstat_sysctl, "S,mbstat", "");
6d2010ae
A
7188SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7189 CTLFLAG_RD | CTLFLAG_LOCKED,
2d21ac55 7190 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6d2010ae
A
7191SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7192 CTLFLAG_RD | CTLFLAG_LOCKED,
7193 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7194SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7195 CTLFLAG_RD | CTLFLAG_LOCKED,
7196 0, 0, mleak_table_sysctl, "S,mleak_table", "");
7197SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7198 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7199SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7200 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7201SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7202 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");