/*
 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>
#include <sys/proc.h>

#include <dev/random/randomdev.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>
#include <kern/zalloc.h>

#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <sys/mcache.h>
#include <net/ntstat.h>

/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents an mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 *	object represents an mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *		|	^
 *		|	|
 *		|	+-----------------------+
 *		v				|
 *	   mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
 *		|				^
 *		v				|
 *	   [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	   mbuf_slab_alloc()			|
 *		|				|
 *		v				|
 *	+---------> [freelist] -------> (found?) -------+
 *	|				|
 *	|				v
 *	|			    m_clalloc()
 *	|				|
 *	|				v
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *		|	^
 *		|	|
 *		|	+------ (done) ---------+
 *		v				|
 *	   mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
 *		|				^
 *		v				|
 *	   [CPU cache] -------> (found?) -------+
 *		|				|
 *		v				|
 *	   mbuf_cslab_alloc()			|
 *		|				|
 *		v				|
 *	   [freelist] -------> (found?) -------+
 *		|				|
 *		v				|
 *	    (rudimentary object)		|
 *	   mcache_alloc/mcache_alloc_ext() ------>>-----+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against DEADBEEF (free) pattern before returning them to caller.
 * As part of this step, the routine will also record the transaction and
 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
 * also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *		|	^
 *		|	|
 *		|	+------ (done) ---------+
 *		v				|
 *	   mcache_free/mcache_free_ext()	|
 *		|				|
 *		v				|
 *	   mbuf_slab_audit()			|
 *		|				|
 *		v				|
 *	   [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	   mbuf_slab_free()			|
 *		|				|
 *		v				|
 *	   [freelist] ----------->>------------+
 *	   (objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *		|	^
 *		|	|
 *		|	+------ (done) ---------+
 *		v				|
 *	   mcache_free/mcache_free_ext()	|
 *		|				|
 *		v				|
 *	   mbuf_cslab_audit()			|
 *		|				|
 *		v				|
 *	   [CPU cache] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	   mbuf_cslab_free()			|
 *		|				|
 *		v				|
 *	   [freelist] ---> (not purging?) -----+
 *		|				|
 *		v				|
 *	    (rudimentary object)		|
 *	   mcache_free/mcache_free_ext() ------->>------+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with DEADBEEF (free) pattern and recording the
 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
 * expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.  Given
 * an address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	     |				| cl_audit[0] |
 *	i = MTOBG(addr)			+-------------+
 *	     |			+----->	| cl_audit[1] | -----> mcache_audit_t
 *	b = BGTOM(i)		|	+-------------+
 *	     |			|	|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	     |			|	| cl_audit[7] |
 *	+-----------------+	|	+-------------+
 *		 (e.g. x == 1)
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a page
 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
 * gets (or has not yet been) turned into mbufs will use only cl_audit[0] with
 * the remaining entries unused.  For a 16KB cluster, only one entry from the
 * first page is allocated and used for the entire object.
 */

/* TODO: should be in header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

/* Global lock */
decl_lck_mtx_data(static, mbuf_mlock_data);
static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;

/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized; /* number of packets "normalized" */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */

typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4KB) cluster */
	MC_16KCL,	/* Jumbo (16KB) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)


/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */

/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted back into the class's slab list,
 * if it's not already done.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 *
 * Each slab controls a page of memory.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */

/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;

/*
 * Number of slabs needed to control a 16KB cluster object.
 */
#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)

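/*
 * For example, with the 4KB page size assumed above (PGSHIFT == 12),
 * NSLABSP16KB evaluates to 4: only the first of those slabs carries the
 * real state, and the remaining three are marked SLF_PARTIAL as described
 * in the per-cluster slab comment.
 */
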
/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
} mcl_audit_t;

typedef struct {
	struct thread	*msa_thread;	/* thread doing transaction */
	struct thread	*msa_pthread;	/* previous transaction thread */
	uint32_t	msa_tstamp;	/* transaction timestamp (ms) */
	uint32_t	msa_ptstamp;	/* prev transaction timestamp (ms) */
	uint16_t	msa_depth;	/* pc stack depth */
	uint16_t	msa_pdepth;	/* previous transaction pc stack */
	void		*msa_stack[MCACHE_STACK_DEPTH];
	void		*msa_pstack[MCACHE_STACK_DEPTH];
} mcl_scratch_audit_t;

typedef struct {
	/*
	 * Size of data from the beginning of an mbuf that covers m_hdr,
	 * pkthdr and m_ext structures.  If auditing is enabled, we allocate
	 * a shadow mbuf structure of this size inside each audit structure,
	 * and the contents of the real mbuf get copied into it when the mbuf
	 * is freed.  This allows us to pattern-fill the mbuf for integrity
	 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
	 * cluster cache case).  Note that we don't save the contents of
	 * clusters when they are freed; we simply pattern-fill them.
	 */
	u_int8_t	sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
	mcl_scratch_audit_t	sc_scratch __attribute__((aligned(8)));
} mcl_saved_contents_t;

#define	AUDIT_CONTENTS_SIZE	(sizeof (mcl_saved_contents_t))

#define	MCA_SAVED_MBUF_PTR(_mca)					\
	((struct mbuf *)(void *)((mcl_saved_contents_t *)		\
	(_mca)->mca_contents)->sc_mbuf)
#define	MCA_SAVED_MBUF_SIZE						\
	(sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
#define	MCA_SAVED_SCRATCH_PTR(_mca)					\
	(&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */

/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static unsigned int maxclaudit;	/* max # of entries in audit table */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

/* Globals */
int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mbigcluster *mbutl;	/* first mapped cluster address */
union mbigcluster *embutl;	/* ending virtual address of mclusters */
int _max_linkhdr;		/* largest link-level header */
int _max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

static boolean_t mclverify;	/* debug: pattern-checking */
static boolean_t mcltrace;	/* debug: stack tracing */
static boolean_t mclfindleak;	/* debug: leak detection */
static boolean_t mclexpleak;	/* debug: expose leak info to user space */

static struct timeval mb_start;	/* beginning of time */

/* mbuf leak detection variables */
static struct mleak_table mleak_table;
static mleak_stat_t *mleak_stat;

#define	MLEAK_STAT_SIZE(n) \
	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))

struct mallocation {
	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
	u_int32_t count;	/* How many objects were requested */
	u_int64_t hitcount;	/* for determining hash effectiveness */
};

struct mtrace {
	u_int64_t	collisions;
	u_int64_t	hitcount;
	u_int64_t	allocs;
	u_int64_t	depth;
	uintptr_t	addr[MLEAK_STACK_DEPTH];
};

/* Size must be a power of two for the zhash to be able to just mask off bits */
#define	MLEAK_ALLOCATION_MAP_NUM	512
#define	MLEAK_TRACE_MAP_NUM		256

/*
 * Sample factor for how often to record a trace.  This can be overridden
 * via the boot-arg mleak_sample_factor.
 */
#define	MLEAK_SAMPLE_FACTOR		500

/*
 * Number of top leakers recorded.
 */
#define	MLEAK_NUM_TRACES		5

#define	MB_LEAK_SPACING_64	"                    "
#define	MB_LEAK_SPACING_32	"            "


#define	MB_LEAK_HDR_32	"\n\
    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
    ----------  ----------  ----------  ----------  ---------- \n\
"

#define	MB_LEAK_HDR_64	"\n\
    trace [1]           trace [2]           trace [3]       \
        trace [4]           trace [5]      \n\
    ------------------  ------------------  ------------------  \
    ------------------  ------------------ \n\
"

static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;

/* Hashmaps of allocations and their corresponding traces */
static struct mallocation *mleak_allocations;
static struct mtrace *mleak_traces;
static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];

/* Lock to protect mleak tables from concurrent modification */
decl_lck_mtx_data(static, mleak_lock_data);
static lck_mtx_t *mleak_lock = &mleak_lock_data;
static lck_attr_t *mleak_lock_attr;
static lck_grp_t *mleak_lock_grp;
static lck_grp_attr_t *mleak_lock_grp_attr;

extern u_int32_t high_sb_max;

/* The minimum number of objects that are allocated, to start. */
#define	MINCL		32
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL

typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
	uint32_t	mtbl_avgtotal;	/* average total on iOS */
} mbuf_table_t;

#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_avgtotal(c)	mbuf_table[c].mtbl_avgtotal
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
#define	m_peak(c)	mbuf_table[c].mtbl_stats->mbcl_peak_reported
#define	m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt

static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 * The average total values were based on data gathered from
	 * actual usage patterns on iOS.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0, 3000 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0, 2000 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0, 1000 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0, 1000 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))

static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of waiters */

boolean_t mb_peak_newreport = FALSE;
boolean_t mb_peak_firstreport = FALSE;

/* generate a report by default after 1 week of uptime */
#define	MBUF_PEAK_FIRST_REPORT_THRESHOLD	604800

#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
static struct timeval mb_wdtstart;	/* watchdog start timestamp */
static char *mbuf_dump_buf;

#define	MBUF_DUMP_BUF_SIZE	2048

/*
 * The mbuf watchdog is enabled by default on embedded platforms.  It is
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
 * Garbage collection is also enabled by default on embedded platforms.
 * mb_drain_maxint controls the amount of time to wait (in seconds) between
 * consecutive calls to m_drain().
 */
static unsigned int mb_watchdog = 0;
static unsigned int mb_drain_maxint = 0;

/* Red zone */
static u_int32_t mb_redzone_cookie;
static void m_redzone_init(struct mbuf *);
static void m_redzone_verify(struct mbuf *m);

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;

static void mbuf_mtypes_sync(boolean_t);
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_stat_sync(void);
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
static char *mbuf_dump(void);
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static void mcl_audit_free(void *, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_scratch(mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);
__private_extern__ void mbuf_report_peak_usage(void);
static boolean_t mbuf_report_usage(mbuf_class_t);

/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf.  Once set, this flag is never
 * cleared.
 */
#define	EXTF_READONLY	0x2
#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)

#define	MEXT_RFA(m)		((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)		(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)		(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)

/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain (4KB) cluster index and base cluster address.
 */

#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)

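/*
 * Sketch of how the above macros combine for the audit lookup pictured in
 * the header comment (the authoritative version lives in
 * mcl_audit_buf2mca()); `addr' is assumed to point into the cluster map:
 *
 *	int i = MTOBG(addr);			// 4KB cluster index
 *	union mbigcluster *b = BGTOM(i);	// base address of that cluster
 *	int x = MCLIDX(b, addr);		// mbuf slot within the cluster
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
 */
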
/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT_PKTHDR(m) {						\
	(m)->m_pkthdr.rcvif = NULL;					\
	(m)->m_pkthdr.pkt_hdr = NULL;					\
	(m)->m_pkthdr.len = 0;						\
	(m)->m_pkthdr.csum_flags = 0;					\
	(m)->m_pkthdr.csum_data = 0;					\
	(m)->m_pkthdr.vlan_tag = 0;					\
	m_classifier_init(m, 0);					\
	m_tag_init(m, 1);						\
	m_scratch_init(m);						\
	m_redzone_init(m);						\
}

#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		MBUF_INIT_PKTHDR(m);					\
	}								\
}

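/*
 * For illustration: a freshly allocated packet-header mbuf of type MT_DATA
 * would typically be set up with `MBUF_INIT(m, 1, MT_DATA)', which resets
 * the chain pointers, points m_data at the built-in m_pktdat area and
 * reinitializes the packet header via MBUF_INIT_PKTHDR().
 */
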
#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)

/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to the outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))

/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int	cpu_mtypes[MT_MAX];
} __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)

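/*
 * For example, code that changes the type of an in-use mbuf is expected to
 * keep these counters balanced with something along the lines of:
 *
 *	mtype_stat_dec(m->m_type);
 *	mtype_stat_inc(MT_DATA);
 *	m->m_type = MT_DATA;
 *
 * The per-CPU deltas are folded into mbstat.m_mtypes[] by
 * mbuf_mtypes_sync() below.
 */
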
static void
mbuf_mtypes_sync(boolean_t locked)
{
	int m, n;
	mtypes_cpu_t mtc;

	if (locked)
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	if (!locked)
		lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	if (!locked)
		lck_mtx_unlock(mbuf_mlock);
}

static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mbuf_mtypes_sync(FALSE);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}

static void
mbuf_stat_sync(void)
{
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
}

static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_release_cnt = c->mbcl_release_cnt;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}

static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
		ASSERT(new != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to 1
	 * to simplify code calling m_mclhasreference().
	 */
	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
		ASSERT(old != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}
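
/*
 * Callers use the returned (post-decrement) count to decide whether the
 * external cluster can actually be reclaimed; for instance, the free path
 * in m_free() only tears down the cluster once m_decref() reports that the
 * last reference has gone away.
 */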

static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters, where
	 * it is configured to 1/3 of the pool size.  On these platforms,
	 * the remainder is used for 2KB and 4KB clusters.  On platforms
	 * without 16KB jumbo clusters, the entire pool is used for both
	 * 2KB and 4KB clusters.  A 4KB cluster can either be split into
	 * 16 mbufs, or into 2 2KB clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b | s                          |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */
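
	/*
	 * A quick worked example, assuming a 64MB pool of 2KB clusters
	 * (nmbclusters == 32768) with no jumbo pool: c = 32768/64 = 512
	 * 2KB clusters, b = 32768/128 = 256 4KB clusters, and the remaining
	 * s = 32768 - 1024 = 31744 2KB units form the general-purpose region.
	 */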

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}

#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

/* Non-server */
static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */,	(64 << MBSHIFT)  /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

/* Server */
static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)       /*  1 GB */,	(96 << MBSHIFT)  /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */,	(128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */,	(256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */,	(384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */

__private_extern__ unsigned int
mbuf_default_ncl(int server, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(server)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}
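
/*
 * As an example of the table lookup above: on a 64-bit server
 * configuration with 16 GB of memory, the loop settles on the 192 MB pool
 * entry, so the function returns 192 MB >> MCLSHIFT, i.e. 98304 clusters
 * (assuming the usual 2KB MCLBYTES).
 */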

__private_extern__ void
mbinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	void *buf;
	thread_t thread = THREAD_NULL;

	microuptime(&mb_start);

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	/* Module specific scratch space (32-bit alignment requirement) */
	_CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
	    sizeof (uint32_t)));

	/* Initialize random red zone cookie value */
	_CASSERT(sizeof (mb_redzone_cookie) ==
	    sizeof (((struct pkthdr *)0)->redzone));
	read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));

	/* Make sure we don't save more than we should */
	_CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));

2d21ac55
A
1447 if (nmbclusters == 0)
1448 nmbclusters = NMBCLUSTERS;
1449
6d2010ae
A
1450 /* This should be a sane (at least even) value by now */
1451 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1452
2d21ac55
A
1453 /* Set up the mbuf table */
1454 mbuf_table_init();
1455
1456 /* Global lock for common layer */
1457 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1458 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1459 mbuf_mlock_attr = lck_attr_alloc_init();
316670eb 1460 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
2d21ac55 1461
6d2010ae
A
1462 /*
1463 * Allocate cluster slabs table:
1464 *
1465 * maxslabgrp = (N * 2048) / (1024 * 1024)
1466 *
1467 * Where N is nmbclusters rounded up to the nearest 512. This yields
1468 * mcl_slab_g_t units, each one representing 1 MB of memory.
1469 */
1470 maxslabgrp =
1471 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
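	/*
	 * Editor's note (illustrative, not part of the original source): a
	 * worked example of the formula above.  Assuming 2KB clusters
	 * (MCLSHIFT of 11) and 1 MB slab groups (MBSHIFT of 20), an
	 * nmbclusters of 32768 is already a multiple of 512, so:
	 *
	 *	maxslabgrp = (32768 * 2048) / (1024 * 1024) = 64
	 *
	 * i.e. 64 slab groups, one per MB of cluster memory.
	 */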
2d21ac55
A
1472 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1473 M_TEMP, M_WAITOK | M_ZERO);
1474 VERIFY(slabstbl != NULL);
1475
6d2010ae
A
1476 /*
1477 * Allocate audit structures, if needed:
1478 *
1479 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1480 *
1481 * This yields mcl_audit_t units, each one representing a page.
1482 */
593a1d5f 1483 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
2d21ac55 1484 mbuf_debug |= mcache_getflags();
6d2010ae
A
1485 if (mbuf_debug & MCF_DEBUG) {
1486 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1487 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1488 M_TEMP, M_WAITOK | M_ZERO);
2d21ac55
A
1489 VERIFY(mclaudit != NULL);
1490
1491 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
39236c6e 1492 AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
2d21ac55
A
1493 VERIFY(mcl_audit_con_cache != NULL);
1494 }
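	/*
	 * Editor's note (illustrative, not part of the original source):
	 * continuing the example above with maxslabgrp = 64 and an assumed
	 * 4KB page size:
	 *
	 *	maxclaudit = (64 * 1024 * 1024) / 4096 = 16384
	 *
	 * i.e. one mcl_audit_t per page, allocated only when MCF_DEBUG is set.
	 */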
6d2010ae
A
1495 mclverify = (mbuf_debug & MCF_VERIFY);
1496 mcltrace = (mbuf_debug & MCF_TRACE);
1497 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
316670eb 1498 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
6d2010ae
A
1499
1500 /* Enable mbuf leak logging, with a lock to protect the tables */
1501
1502 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1503 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1504 mleak_lock_attr = lck_attr_alloc_init();
316670eb 1505 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
6d2010ae
A
1506
1507 mleak_activate();
2d21ac55
A
1508
1509 /* Calculate the number of pages assigned to the cluster pool */
b0d623f7
A
1510 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1511 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1512 M_TEMP, M_WAITOK);
2d21ac55
A
1513 VERIFY(mcl_paddr != NULL);
1514
1515 /* Register with the I/O Bus mapper */
1516 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
b0d623f7 1517 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
2d21ac55 1518
6d2010ae 1519 embutl = (union mbigcluster *)
316670eb 1520 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
6d2010ae 1521 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
2d21ac55 1522
6d2010ae 1523 /* Prime up the freelist */
593a1d5f 1524 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
6d2010ae
A
1525 if (initmcl != 0) {
1526 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1527 if (initmcl > m_maxlimit(MC_BIGCL))
1528 initmcl = m_maxlimit(MC_BIGCL);
1529 }
1530 if (initmcl < m_minlimit(MC_BIGCL))
1531 initmcl = m_minlimit(MC_BIGCL);
2d21ac55
A
1532
1533 lck_mtx_lock(mbuf_mlock);
1534
6d2010ae
A
1535 /*
1536 * For classes with non-zero minimum limits, populate their freelists
1537 * so that m_total(class) is at least m_minlimit(class).
1538 */
1539 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1540 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1541 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1542 freelist_init(m_class(MC_CL));
1543
1544 for (m = 0; m < NELEM(mbuf_table); m++) {
1545 /* Make sure we didn't miss any */
1546 VERIFY(m_minlimit(m_class(m)) == 0 ||
1547 m_total(m_class(m)) >= m_minlimit(m_class(m)));
fe8ab488
A
1548
1549 /* populate the initial sizes and report from there on */
1550 m_peak(m_class(m)) = m_total(m_class(m));
6d2010ae 1551 }
fe8ab488 1552 mb_peak_newreport = FALSE;
2d21ac55
A
1553
1554 lck_mtx_unlock(mbuf_mlock);
1555
6d2010ae
A
1556 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1557 NULL, &thread);
b0d623f7 1558 thread_deallocate(thread);
2d21ac55
A
1559
1560 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1561 0, 0, MCR_SLEEP);
1562
1563 /* Create the cache for each class */
1564 for (m = 0; m < NELEM(mbuf_table); m++) {
6d2010ae 1565 void *allocfunc, *freefunc, *auditfunc, *logfunc;
2d21ac55
A
1566 u_int32_t flags;
1567
1568 flags = mbuf_debug;
1569 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1570 m_class(m) == MC_MBUF_16KCL) {
1571 allocfunc = mbuf_cslab_alloc;
1572 freefunc = mbuf_cslab_free;
1573 auditfunc = mbuf_cslab_audit;
6d2010ae 1574 logfunc = mleak_logger;
2d21ac55
A
1575 } else {
1576 allocfunc = mbuf_slab_alloc;
1577 freefunc = mbuf_slab_free;
1578 auditfunc = mbuf_slab_audit;
6d2010ae 1579 logfunc = mleak_logger;
2d21ac55
A
1580 }
1581
1582 /*
1583 * Disable per-CPU caches for jumbo classes if there
1584 * is no jumbo cluster pool available in the system.
1585 * The cache itself is still created (but will never
1586 * be populated) since it simplifies the code.
1587 */
1588 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1589 njcl == 0)
1590 flags |= MCF_NOCPUCACHE;
1591
6d2010ae
A
1592 if (!mclfindleak)
1593 flags |= MCF_NOLEAKLOG;
1594
2d21ac55 1595 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
6d2010ae 1596 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
b0d623f7 1597 (void *)(uintptr_t)m, flags, MCR_SLEEP);
2d21ac55
A
1598 }
1599
1600 /*
1601 * Allocate the structure for per-CPU statistics that is aligned
1602 * on a CPU cache-line boundary; this code assumes that we never
1603 * uninitialize this framework, since the original address
1604 * before alignment is not saved.
1605 */
1606 ncpu = ml_get_max_cpus();
39236c6e 1607 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
2d21ac55
A
1608 M_TEMP, M_WAITOK);
1609 VERIFY(buf != NULL);
1610
39236c6e
A
1611 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1612 CPU_CACHE_LINE_SIZE);
2d21ac55
A
1613 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
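	/*
	 * Editor's note (illustrative, not part of the original source): the
	 * extra CPU_CACHE_LINE_SIZE bytes requested above leave room for
	 * P2ROUNDUP() to slide the start of the array forward.  Assuming a
	 * 64-byte cache line, a raw pointer of 0x1008 would be rounded up to
	 * 0x1040, so mbuf_mtypes always begins on a cache-line boundary.
	 */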
1614
6d2010ae
A
1615 /*
1616 * Set the max limit on sb_max to be 1/16th of the size of
b0d623f7
A
1617 * memory allocated for mbuf clusters.
1618 */
6d2010ae 1619 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
b0d623f7
A
1620 if (high_sb_max < sb_max) {
1621 /* sb_max is too large for this configuration, scale it down */
6d2010ae 1622 if (high_sb_max > (1 << MBSHIFT)) {
b0d623f7
A
1623 /* We have at least 16 MB of mbuf pool */
1624 sb_max = high_sb_max;
1625 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
6d2010ae
A
1626 /*
1627 * If we have more than 1 MB of mbuf pool, cap the size of
b0d623f7 1628 * the max socket buffer at 1 MB
6d2010ae 1629 */
b0d623f7
A
1630 sb_max = high_sb_max = (1 << MBSHIFT);
1631 } else {
1632 sb_max = high_sb_max;
1633 }
1634 }
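	/*
	 * Editor's note (illustrative, not part of the original source): with
	 * an assumed nmbclusters of 32768 (a 64 MB cluster pool), high_sb_max
	 * is 1/16th of that, i.e. 4 MB.  Since 4 MB exceeds 1 MB, a larger
	 * configured sb_max would simply be scaled down to 4 MB by the logic
	 * above; the 1 MB cap only applies to pools between 1 MB and 16 MB.
	 */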
1635
316670eb
A
1636 /* allocate space for mbuf_dump_buf */
1637 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1638 VERIFY(mbuf_dump_buf != NULL);
1639
39236c6e
A
1640 if (mbuf_debug & MCF_DEBUG) {
1641 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1642 (int)_MLEN, (int)_MHLEN);
1643 }
1644
1645 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
6d2010ae
A
1646 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1647 (nclusters << MCLSHIFT) >> MBSHIFT,
1648 (njcl << MCLSHIFT) >> MBSHIFT);
2d21ac55
A
1649}
1650
1651/*
1652 * Obtain a slab of object(s) from the class's freelist.
1653 */
1654static mcache_obj_t *
1655slab_alloc(mbuf_class_t class, int wait)
1656{
1657 mcl_slab_t *sp;
1658 mcache_obj_t *buf;
1659
1660 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1661
1662 VERIFY(class != MC_16KCL || njcl > 0);
1663
1664 /* This should always be NULL for us */
1665 VERIFY(m_cobjlist(class) == NULL);
1666
1667 /*
1668 * Treat composite objects as having a longer lifespan by using
1669 * a slab from the reverse direction, in the hope that this could
1670 * reduce the probability of fragmentation for slabs that hold
1671 * more than one buffer chunk (e.g. mbuf slabs). For other
1672 * slabs, this probably doesn't make much of a difference.
1673 */
6d2010ae 1674 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
2d21ac55
A
1675 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1676 else
1677 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1678
1679 if (sp == NULL) {
1680 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1681 /* The slab list for this class is empty */
1682 return (NULL);
1683 }
1684
1685 VERIFY(m_infree(class) > 0);
1686 VERIFY(!slab_is_detached(sp));
1687 VERIFY(sp->sl_class == class &&
1688 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1689 buf = sp->sl_head;
1690 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1691
1692 if (class == MC_MBUF) {
1693 sp->sl_head = buf->obj_next;
6d2010ae
A
1694 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1695 } else if (class == MC_CL) {
1696 sp->sl_head = buf->obj_next;
1697 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
2d21ac55
A
1698 } else {
1699 sp->sl_head = NULL;
1700 }
1701 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1702 slab_nextptr_panic(sp, sp->sl_head);
1703 /* In case sl_head is in the map but not in the slab */
1704 VERIFY(slab_inrange(sp, sp->sl_head));
1705 /* NOTREACHED */
1706 }
1707
1708 /* Increment slab reference */
1709 sp->sl_refcnt++;
1710
1711 if (mclaudit != NULL) {
1712 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1713 mca->mca_uflags = 0;
1714 /* Save contents on mbuf objects only */
1715 if (class == MC_MBUF)
1716 mca->mca_uflags |= MB_SCVALID;
1717 }
1718
1719 if (class == MC_CL) {
1720 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1721 /*
6d2010ae 1722 * A 2K cluster slab can have at most NCLPBG references.
2d21ac55 1723 */
6d2010ae
A
1724 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1725 sp->sl_chunks == NCLPBG &&
1726 sp->sl_len == m_maxsize(MC_BIGCL));
1727 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
2d21ac55 1728 } else if (class == MC_BIGCL) {
2d21ac55
A
1729 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1730 m_infree(MC_MBUF_BIGCL);
1731 /*
6d2010ae 1732 * A 4K cluster slab can have at most 1 reference.
2d21ac55
A
1733 */
1734 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
6d2010ae 1735 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2d21ac55
A
1736 } else if (class == MC_16KCL) {
1737 mcl_slab_t *nsp;
1738 int k;
1739
1740 --m_infree(MC_16KCL);
1741 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
6d2010ae 1742 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2d21ac55 1743 /*
6d2010ae
A
1744 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1745 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1746 * most 1 reference.
2d21ac55 1747 */
6d2010ae 1748 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
1749 nsp = nsp->sl_next;
1750 /* Next slab must already be present */
1751 VERIFY(nsp != NULL);
1752 nsp->sl_refcnt++;
1753 VERIFY(!slab_is_detached(nsp));
1754 VERIFY(nsp->sl_class == MC_16KCL &&
1755 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1756 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1757 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1758 nsp->sl_head == NULL);
1759 }
1760 } else {
6d2010ae 1761 VERIFY(class == MC_MBUF);
2d21ac55
A
1762 --m_infree(MC_MBUF);
1763 /*
1764 * If auditing is turned on, this check is
1765 * deferred until later in mbuf_slab_audit().
1766 */
1767 if (mclaudit == NULL)
1768 _MCHECK((struct mbuf *)buf);
1769 /*
1770 * Since we have incremented the reference count above,
6d2010ae 1771 * an mbuf slab (formerly a 4KB cluster slab that was cut
2d21ac55 1772 * up into mbufs) must have a reference count between 1
6d2010ae 1773 * and NMBPBG at this point.
2d21ac55 1774 */
6d2010ae
A
1775 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1776 sp->sl_chunks == NMBPBG &&
1777 sp->sl_len == m_maxsize(MC_BIGCL));
1778 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
2d21ac55
A
1779 }
1780
1781 /* If empty, remove this slab from the class's freelist */
1782 if (sp->sl_head == NULL) {
6d2010ae
A
1783 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1784 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
2d21ac55
A
1785 slab_remove(sp, class);
1786 }
1787
1788 return (buf);
1789}
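/*
 * Editor's note (illustrative, not part of the original source): the
 * reference-count bounds verified above follow from how a 4KB slab is
 * carved up.  Assuming the usual 4KB page, 2KB cluster, 256-byte mbuf and
 * 16KB jumbo cluster sizes, NMBPBG is 16 (mbufs per slab), NCLPBG is 2
 * (2KB clusters per slab), a 4KB cluster occupies exactly one slab, and
 * NSLABSP16KB is 4 (slabs spanned by one 16KB cluster).
 */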
1790
1791/*
1792 * Place a slab of object(s) back into a class's slab list.
1793 */
1794static void
1795slab_free(mbuf_class_t class, mcache_obj_t *buf)
1796{
1797 mcl_slab_t *sp;
1798
1799 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1800
1801 VERIFY(class != MC_16KCL || njcl > 0);
1802 VERIFY(buf->obj_next == NULL);
1803 sp = slab_get(buf);
1804 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1805 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1806
1807 /* Decrement slab reference */
1808 sp->sl_refcnt--;
1809
6d2010ae 1810 if (class == MC_CL) {
2d21ac55
A
1811 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1812 /*
6d2010ae
A
1813 * A slab that has been split into 2KB clusters can have
1814 * at most 1 outstanding reference at this point.
1815 */
1816 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1817 sp->sl_chunks == NCLPBG &&
1818 sp->sl_len == m_maxsize(MC_BIGCL));
1819 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1820 (slab_is_detached(sp) && sp->sl_head == NULL));
1821 } else if (class == MC_BIGCL) {
1822 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1823 /*
1824 * A 4KB cluster slab can have at most 1 reference
2d21ac55
A
1825 * which must be 0 at this point.
1826 */
1827 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1828 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1829 VERIFY(slab_is_detached(sp));
2d21ac55
A
1830 } else if (class == MC_16KCL) {
1831 mcl_slab_t *nsp;
1832 int k;
1833 /*
6d2010ae 1834 * A 16KB cluster takes NSLABSP16KB slabs, all of which must
2d21ac55
A
1835 * now have a reference count of 0.
1836 */
6d2010ae 1837 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
2d21ac55 1838 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
6d2010ae 1839 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2d21ac55 1840 VERIFY(slab_is_detached(sp));
6d2010ae 1841 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
1842 nsp = nsp->sl_next;
1843 /* Next slab must already be present */
1844 VERIFY(nsp != NULL);
1845 nsp->sl_refcnt--;
1846 VERIFY(slab_is_detached(nsp));
1847 VERIFY(nsp->sl_class == MC_16KCL &&
1848 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1849 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1850 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1851 nsp->sl_head == NULL);
1852 }
1853 } else {
1854 /*
6d2010ae
A
1855 * A slab that has been split into mbufs has at most NMBPBG
1856 * references. Since we have decremented one reference
1857 * above, it must now be between 0 and NMBPBG-1.
2d21ac55 1858 */
6d2010ae
A
1859 VERIFY(class == MC_MBUF);
1860 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1861 sp->sl_chunks == NMBPBG &&
1862 sp->sl_len == m_maxsize(MC_BIGCL));
1863 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
2d21ac55
A
1864 (slab_is_detached(sp) && sp->sl_head == NULL));
1865 }
1866
1867 /*
1868 * When auditing is enabled, ensure that the buffer still
1869 * contains the free pattern. Otherwise it got corrupted
1870 * while at the CPU cache layer.
1871 */
1872 if (mclaudit != NULL) {
1873 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
6d2010ae
A
1874 if (mclverify) {
1875 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1876 }
2d21ac55
A
1877 mca->mca_uflags &= ~MB_SCVALID;
1878 }
1879
1880 if (class == MC_CL) {
1881 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
6d2010ae 1882 buf->obj_next = sp->sl_head;
2d21ac55
A
1883 } else if (class == MC_BIGCL) {
1884 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1885 m_infree(MC_MBUF_BIGCL);
1886 } else if (class == MC_16KCL) {
1887 ++m_infree(MC_16KCL);
1888 } else {
1889 ++m_infree(MC_MBUF);
1890 buf->obj_next = sp->sl_head;
1891 }
1892 sp->sl_head = buf;
1893
6d2010ae
A
1894 /*
1895 * If a slab has been split into either one which holds 2KB clusters,
1896 * or one which holds mbufs, turn it back into one which holds a 4KB
1897 * cluster.
1898 */
1899 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1900 m_total(class) > m_minlimit(class) &&
1901 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1902 int i = NMBPBG;
1903
1904 m_total(MC_BIGCL)++;
1905 mbstat.m_bigclusters = m_total(MC_BIGCL);
1906 m_total(MC_MBUF) -= NMBPBG;
2d21ac55 1907 mbstat.m_mbufs = m_total(MC_MBUF);
6d2010ae
A
1908 m_infree(MC_MBUF) -= NMBPBG;
1909 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1910
1911 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1912 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
2d21ac55
A
1913
1914 while (i--) {
1915 struct mbuf *m = sp->sl_head;
1916 VERIFY(m != NULL);
1917 sp->sl_head = m->m_next;
1918 m->m_next = NULL;
1919 }
1920 VERIFY(sp->sl_head == NULL);
1921
1922 /* Remove the slab from the mbuf class's slab list */
1923 slab_remove(sp, class);
1924
6d2010ae
A
1925 /* Reinitialize it as a 4KB cluster slab */
1926 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
2d21ac55
A
1927 sp->sl_len, 0, 1);
1928
6d2010ae 1929 if (mclverify) {
2d21ac55 1930 mcache_set_pattern(MCACHE_FREE_PATTERN,
6d2010ae
A
1931 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1932 }
1933 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1934 m_infree(MC_MBUF_BIGCL);
2d21ac55 1935
6d2010ae
A
1936 VERIFY(slab_is_detached(sp));
1937 /* And finally switch class */
1938 class = MC_BIGCL;
1939 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1940 m_total(class) > m_minlimit(class) &&
1941 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1942 int i = NCLPBG;
1943
1944 m_total(MC_BIGCL)++;
1945 mbstat.m_bigclusters = m_total(MC_BIGCL);
1946 m_total(MC_CL) -= NCLPBG;
1947 mbstat.m_clusters = m_total(MC_CL);
1948 m_infree(MC_CL) -= NCLPBG;
1949 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1950 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1951
1952 while (i--) {
1953 union mcluster *c = sp->sl_head;
1954 VERIFY(c != NULL);
1955 sp->sl_head = c->mcl_next;
1956 c->mcl_next = NULL;
1957 }
1958 VERIFY(sp->sl_head == NULL);
1959
1960 /* Remove the slab from the 2KB cluster class's slab list */
1961 slab_remove(sp, class);
1962
1963 /* Reinitialize it as a 4KB cluster slab */
1964 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1965 sp->sl_len, 0, 1);
1966
1967 if (mclverify) {
1968 mcache_set_pattern(MCACHE_FREE_PATTERN,
1969 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1970 }
1971 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1972 m_infree(MC_MBUF_BIGCL);
2d21ac55
A
1973
1974 VERIFY(slab_is_detached(sp));
1975 /* And finally switch class */
6d2010ae 1976 class = MC_BIGCL;
2d21ac55
A
1977 }
1978
1979 /* Reinsert the slab to the class's slab list */
1980 if (slab_is_detached(sp))
1981 slab_insert(sp, class);
1982}
1983
1984/*
1985 * Common allocator for rudimentary objects called by the CPU cache layer
1986 * during an allocation request whenever there is no available element in the
1987 * bucket layer. It returns one or more elements from the appropriate global
1988 * freelist. If the freelist is empty, it will attempt to populate it and
1989 * retry the allocation.
1990 */
1991static unsigned int
1992mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1993{
1994 mbuf_class_t class = (mbuf_class_t)arg;
1995 unsigned int need = num;
1996 mcache_obj_t **list = *plist;
1997
1998 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1999 ASSERT(need > 0);
2000
2001 lck_mtx_lock(mbuf_mlock);
2002
2003 for (;;) {
2004 if ((*list = slab_alloc(class, wait)) != NULL) {
2005 (*list)->obj_next = NULL;
2006 list = *plist = &(*list)->obj_next;
2007
2008 if (--need == 0) {
2009 /*
2010 * If the number of elements in the freelist has
2011 * dropped below the low watermark, asynchronously
2012 * populate the freelist now rather than doing
2013 * it later when we run out of elements.
2014 */
2015 if (!mbuf_cached_above(class, wait) &&
2016 m_infree(class) < m_total(class) >> 5) {
2017 (void) freelist_populate(class, 1,
2018 M_DONTWAIT);
2019 }
2020 break;
2021 }
2022 } else {
2023 VERIFY(m_infree(class) == 0 || class == MC_CL);
2024
2025 (void) freelist_populate(class, 1,
2026 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2027
2028 if (m_infree(class) > 0)
2029 continue;
2030
2031 /* Check if there's anything at the cache layer */
2032 if (mbuf_cached_above(class, wait))
2033 break;
2034
6d2010ae
A
2035 /* watchdog checkpoint */
2036 mbuf_watchdog();
2037
2d21ac55
A
2038 /* We have nothing and cannot block; give up */
2039 if (wait & MCR_NOSLEEP) {
2040 if (!(wait & MCR_TRYHARD)) {
2041 m_fail_cnt(class)++;
2042 mbstat.m_drops++;
2043 break;
2044 }
2045 }
2046
2047 /*
2048 * If the freelist is still empty and the caller is
2049 * willing to be blocked, sleep on the wait channel
2050 * until an element is available. Otherwise, if
2051 * MCR_TRYHARD is set, do our best to satisfy the
2052 * request without having to go to sleep.
2053 */
2054 if (mbuf_worker_ready &&
2055 mbuf_sleep(class, need, wait))
2056 break;
2057
2058 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2059 }
2060 }
2061
2062 m_alloc_cnt(class) += num - need;
2063 lck_mtx_unlock(mbuf_mlock);
2064
2065 return (num - need);
2066}
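/*
 * Editor's note (illustrative, not part of the original source): the
 * "m_infree(class) < m_total(class) >> 5" test above tops up the freelist
 * with a non-blocking freelist_populate() call once fewer than 1/32 of a
 * class's objects remain free.  For example, with 4096 mbufs in total,
 * that threshold is a free count of 128.
 */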
2067
2068/*
2069 * Common de-allocator for rudimentary objects called by the CPU cache
2070 * layer when one or more elements need to be returned to the appropriate
2071 * global freelist.
2072 */
2073static void
2074mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2075{
2076 mbuf_class_t class = (mbuf_class_t)arg;
2077 mcache_obj_t *nlist;
2078 unsigned int num = 0;
2079 int w;
2080
2081 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2082
2083 lck_mtx_lock(mbuf_mlock);
2084
2085 for (;;) {
2086 nlist = list->obj_next;
2087 list->obj_next = NULL;
2088 slab_free(class, list);
2089 ++num;
2090 if ((list = nlist) == NULL)
2091 break;
2092 }
2093 m_free_cnt(class) += num;
2094
2095 if ((w = mb_waiters) > 0)
2096 mb_waiters = 0;
2097
2098 lck_mtx_unlock(mbuf_mlock);
2099
2100 if (w != 0)
2101 wakeup(mb_waitchan);
2102}
2103
2104/*
2105 * Common auditor for rudimentary objects called by the CPU cache layer
2106 * during an allocation or free request. For the former, this is called
2107 * after the objects are obtained from either the bucket or slab layer
2108 * and before they are returned to the caller. For the latter, this is
2109 * called immediately during free and before placing the objects into
2110 * the bucket or slab layer.
2111 */
2112static void
2113mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2114{
2115 mbuf_class_t class = (mbuf_class_t)arg;
2116 mcache_audit_t *mca;
2117
2118 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2119
2120 while (list != NULL) {
2121 lck_mtx_lock(mbuf_mlock);
2122 mca = mcl_audit_buf2mca(class, list);
2123
2124 /* Do the sanity checks */
2125 if (class == MC_MBUF) {
2126 mcl_audit_mbuf(mca, list, FALSE, alloc);
2127 ASSERT(mca->mca_uflags & MB_SCVALID);
2128 } else {
2129 mcl_audit_cluster(mca, list, m_maxsize(class),
2130 alloc, TRUE);
2131 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2132 }
2133 /* Record this transaction */
6d2010ae 2134 if (mcltrace)
39236c6e 2135 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
6d2010ae 2136
2d21ac55
A
2137 if (alloc)
2138 mca->mca_uflags |= MB_INUSE;
2139 else
2140 mca->mca_uflags &= ~MB_INUSE;
2141 /* Unpair the object (unconditionally) */
2142 mca->mca_uptr = NULL;
2143 lck_mtx_unlock(mbuf_mlock);
2144
2145 list = list->obj_next;
2146 }
2147}
2148
2149/*
2150 * Common notify routine for all caches. It is called by mcache when
2151 * one or more objects get freed. We use this indication to trigger
2152 * the wakeup of any sleeping threads so that they can retry their
2153 * allocation requests.
2154 */
2155static void
2156mbuf_slab_notify(void *arg, u_int32_t reason)
2157{
2158 mbuf_class_t class = (mbuf_class_t)arg;
2159 int w;
2160
2161 ASSERT(MBUF_CLASS_VALID(class));
2162
2163 if (reason != MCN_RETRYALLOC)
2164 return;
2165
2166 lck_mtx_lock(mbuf_mlock);
2167 if ((w = mb_waiters) > 0) {
2168 m_notified(class)++;
2169 mb_waiters = 0;
2170 }
2171 lck_mtx_unlock(mbuf_mlock);
2172
2173 if (w != 0)
2174 wakeup(mb_waitchan);
2175}
2176
2177/*
2178 * Obtain object(s) from the composite class's freelist.
2179 */
2180static unsigned int
2181cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2182{
2183 unsigned int need = num;
2184 mcl_slab_t *sp, *clsp, *nsp;
2185 struct mbuf *m;
2186 mcache_obj_t **list = *plist;
2187 void *cl;
2188
2189 VERIFY(need > 0);
2190 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2191 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2192
2193 /* Get what we can from the freelist */
2194 while ((*list = m_cobjlist(class)) != NULL) {
2195 MRANGE(*list);
2196
2197 m = (struct mbuf *)*list;
2198 sp = slab_get(m);
2199 cl = m->m_ext.ext_buf;
2200 clsp = slab_get(cl);
2201 VERIFY(m->m_flags == M_EXT && cl != NULL);
2202 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
6d2010ae
A
2203
2204 if (class == MC_MBUF_CL) {
2205 VERIFY(clsp->sl_refcnt >= 1 &&
2206 clsp->sl_refcnt <= NCLPBG);
2207 } else {
2208 VERIFY(clsp->sl_refcnt == 1);
2209 }
2210
2211 if (class == MC_MBUF_16KCL) {
2d21ac55 2212 int k;
6d2010ae 2213 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
2214 nsp = nsp->sl_next;
2215 /* Next slab must already be present */
2216 VERIFY(nsp != NULL);
2217 VERIFY(nsp->sl_refcnt == 1);
2218 }
2219 }
2220
2221 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2222 !MBUF_IN_MAP(m_cobjlist(class))) {
2223 slab_nextptr_panic(sp, m_cobjlist(class));
2224 /* NOTREACHED */
2225 }
2226 (*list)->obj_next = NULL;
2227 list = *plist = &(*list)->obj_next;
2228
2229 if (--need == 0)
2230 break;
2231 }
2232 m_infree(class) -= (num - need);
2233
2234 return (num - need);
2235}
2236
2237/*
2238 * Place object(s) back into a composite class's freelist.
2239 */
2240static unsigned int
2241cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2242{
2243 mcache_obj_t *o, *tail;
2244 unsigned int num = 0;
2245 struct mbuf *m, *ms;
2246 mcache_audit_t *mca = NULL;
2247 mcache_obj_t *ref_list = NULL;
2248 mcl_slab_t *clsp, *nsp;
2249 void *cl;
6d2010ae 2250 mbuf_class_t cl_class;
2d21ac55
A
2251
2252 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2253 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2254 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2255
6d2010ae
A
2256 if (class == MC_MBUF_CL) {
2257 cl_class = MC_CL;
2258 } else if (class == MC_MBUF_BIGCL) {
2259 cl_class = MC_BIGCL;
2260 } else {
2261 VERIFY(class == MC_MBUF_16KCL);
2262 cl_class = MC_16KCL;
2263 }
2264
2d21ac55
A
2265 o = tail = list;
2266
2267 while ((m = ms = (struct mbuf *)o) != NULL) {
2268 mcache_obj_t *rfa, *nexto = o->obj_next;
2269
2270 /* Do the mbuf sanity checks */
2271 if (mclaudit != NULL) {
2272 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6d2010ae
A
2273 if (mclverify) {
2274 mcache_audit_free_verify(mca, m, 0,
2275 m_maxsize(MC_MBUF));
2276 }
39236c6e 2277 ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
2278 }
2279
2280 /* Do the cluster sanity checks */
2281 cl = ms->m_ext.ext_buf;
2282 clsp = slab_get(cl);
6d2010ae
A
2283 if (mclverify) {
2284 size_t size = m_maxsize(cl_class);
2285 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2d21ac55
A
2286 (mcache_obj_t *)cl), cl, 0, size);
2287 }
2288 VERIFY(ms->m_type == MT_FREE);
2289 VERIFY(ms->m_flags == M_EXT);
2290 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
6d2010ae
A
2291 if (cl_class == MC_CL) {
2292 VERIFY(clsp->sl_refcnt >= 1 &&
2293 clsp->sl_refcnt <= NCLPBG);
2294 } else {
2295 VERIFY(clsp->sl_refcnt == 1);
2296 }
2297 if (cl_class == MC_16KCL) {
2d21ac55 2298 int k;
6d2010ae 2299 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
2300 nsp = nsp->sl_next;
2301 /* Next slab must already be present */
2302 VERIFY(nsp != NULL);
2303 VERIFY(nsp->sl_refcnt == 1);
2304 }
2305 }
2306
2307 /*
2308 * If we're asked to purge, restore the actual mbuf using
2309 * contents of the shadow structure (if auditing is enabled)
2310 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2311 * about to free it and the attached cluster into their caches.
2312 */
2313 if (purged) {
2314 /* Restore constructed mbuf fields */
2315 if (mclaudit != NULL)
2316 mcl_audit_restore_mbuf(m, mca, TRUE);
2317
2318 MEXT_REF(m) = 0;
2319 MEXT_FLAGS(m) = 0;
2320
316670eb 2321 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2d21ac55
A
2322 rfa->obj_next = ref_list;
2323 ref_list = rfa;
2324 MEXT_RFA(m) = NULL;
2325
2326 m->m_type = MT_FREE;
2327 m->m_flags = m->m_len = 0;
2328 m->m_next = m->m_nextpkt = NULL;
2329
2330 /* Save mbuf fields and make auditing happy */
2331 if (mclaudit != NULL)
2332 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2333
2334 VERIFY(m_total(class) > 0);
2335 m_total(class)--;
2336
2337 /* Free the mbuf */
2338 o->obj_next = NULL;
2339 slab_free(MC_MBUF, o);
2340
2341 /* And free the cluster */
2342 ((mcache_obj_t *)cl)->obj_next = NULL;
2343 if (class == MC_MBUF_CL)
2344 slab_free(MC_CL, cl);
2345 else if (class == MC_MBUF_BIGCL)
2346 slab_free(MC_BIGCL, cl);
2347 else
2348 slab_free(MC_16KCL, cl);
2349 }
2350
2351 ++num;
2352 tail = o;
2353 o = nexto;
2354 }
2355
2356 if (!purged) {
2357 tail->obj_next = m_cobjlist(class);
2358 m_cobjlist(class) = list;
2359 m_infree(class) += num;
2360 } else if (ref_list != NULL) {
2361 mcache_free_ext(ref_cache, ref_list);
2362 }
2363
2364 return (num);
2365}
2366
2367/*
2368 * Common allocator for composite objects called by the CPU cache layer
2369 * during an allocation request whenever there is no available element in
2370 * the bucket layer. It returns one or more composite elements from the
2371 * appropriate global freelist. If the freelist is empty, it will attempt
2372 * to obtain the rudimentary objects from their caches and construct them
2373 * into composite mbuf + cluster objects.
2374 */
2375static unsigned int
2376mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2377 int wait)
2378{
2379 mbuf_class_t class = (mbuf_class_t)arg;
6d2010ae 2380 mbuf_class_t cl_class = 0;
2d21ac55
A
2381 unsigned int num = 0, cnum = 0, want = needed;
2382 mcache_obj_t *ref_list = NULL;
2383 mcache_obj_t *mp_list = NULL;
2384 mcache_obj_t *clp_list = NULL;
2385 mcache_obj_t **list;
2386 struct ext_ref *rfa;
2387 struct mbuf *m;
2388 void *cl;
2389
2390 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2391 ASSERT(needed > 0);
2392
2393 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2394
2395 /* There should not be any slab for this class */
2396 VERIFY(m_slab_cnt(class) == 0 &&
2397 m_slablist(class).tqh_first == NULL &&
2398 m_slablist(class).tqh_last == NULL);
2399
2400 lck_mtx_lock(mbuf_mlock);
2401
2402 /* Try using the freelist first */
2403 num = cslab_alloc(class, plist, needed);
2404 list = *plist;
2405 if (num == needed) {
2406 m_alloc_cnt(class) += num;
2407 lck_mtx_unlock(mbuf_mlock);
2408 return (needed);
2409 }
2410
2411 lck_mtx_unlock(mbuf_mlock);
2412
2413 /*
2414 * We could not satisfy the request using the freelist alone;
2415 * allocate from the appropriate rudimentary caches and use
2416 * whatever we can get to construct the composite objects.
2417 */
2418 needed -= num;
2419
2420 /*
2421 * Mark these allocation requests as coming from a composite cache.
2422 * Also, if the caller is willing to be blocked, mark the request
2423 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2424 * slab layer waiting for the individual object when one or more
2425 * of the already-constructed composite objects are available.
2426 */
2427 wait |= MCR_COMP;
2428 if (!(wait & MCR_NOSLEEP))
2429 wait |= MCR_FAILOK;
2430
6d2010ae 2431 /* allocate mbufs */
2d21ac55
A
2432 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2433 if (needed == 0) {
2434 ASSERT(mp_list == NULL);
2435 goto fail;
2436 }
6d2010ae
A
2437
2438 /* allocate clusters */
2439 if (class == MC_MBUF_CL) {
2440 cl_class = MC_CL;
2441 } else if (class == MC_MBUF_BIGCL) {
2442 cl_class = MC_BIGCL;
2443 } else {
2444 VERIFY(class == MC_MBUF_16KCL);
2445 cl_class = MC_16KCL;
2446 }
2447 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2d21ac55
A
2448 if (needed == 0) {
2449 ASSERT(clp_list == NULL);
2450 goto fail;
2451 }
6d2010ae 2452
2d21ac55
A
2453 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2454 if (needed == 0) {
2455 ASSERT(ref_list == NULL);
2456 goto fail;
2457 }
2458
2459 /*
2460 * By this time "needed" is MIN(mbuf, cluster, ref). Any leftovers
2461 * will be freed accordingly before we return to the caller.
2462 */
2463 for (cnum = 0; cnum < needed; cnum++) {
2464 struct mbuf *ms;
2465
2466 m = ms = (struct mbuf *)mp_list;
2467 mp_list = mp_list->obj_next;
2468
2469 cl = clp_list;
2470 clp_list = clp_list->obj_next;
2471 ((mcache_obj_t *)cl)->obj_next = NULL;
2472
2473 rfa = (struct ext_ref *)ref_list;
2474 ref_list = ref_list->obj_next;
316670eb 2475 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2d21ac55
A
2476
2477 /*
2478 * If auditing is enabled, construct the shadow mbuf
2479 * in the audit structure instead of in the actual one.
2480 * mbuf_cslab_audit() will take care of restoring the
2481 * contents after the integrity check.
2482 */
2483 if (mclaudit != NULL) {
2484 mcache_audit_t *mca, *cl_mca;
2d21ac55
A
2485
2486 lck_mtx_lock(mbuf_mlock);
2487 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
39236c6e 2488 ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
2489 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2490
2491 /*
2492 * Pair them up. Note that this is done at the time
2493 * the mbuf+cluster objects are constructed. This
2494 * information should be treated as a "best effort"
2495 * debugging hint, since more than one mbuf can refer
2496 * to a cluster. In that case, the cluster might not
2497 * be freed along with the mbuf it was paired with.
2498 */
2499 mca->mca_uptr = cl_mca;
2500 cl_mca->mca_uptr = mca;
2501
2502 ASSERT(mca->mca_uflags & MB_SCVALID);
2503 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2504 lck_mtx_unlock(mbuf_mlock);
2505
2506 /* Technically, they are in the freelist */
6d2010ae
A
2507 if (mclverify) {
2508 size_t size;
2509
2510 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2511 m_maxsize(MC_MBUF));
2512
2513 if (class == MC_MBUF_CL)
2514 size = m_maxsize(MC_CL);
2515 else if (class == MC_MBUF_BIGCL)
2516 size = m_maxsize(MC_BIGCL);
2517 else
2518 size = m_maxsize(MC_16KCL);
2519
2520 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2521 size);
2522 }
2d21ac55
A
2523 }
2524
2525 MBUF_INIT(ms, 0, MT_FREE);
2526 if (class == MC_MBUF_16KCL) {
2527 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2528 } else if (class == MC_MBUF_BIGCL) {
2529 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2530 } else {
2531 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2532 }
2533 VERIFY(ms->m_flags == M_EXT);
2534 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2535
2536 *list = (mcache_obj_t *)m;
2537 (*list)->obj_next = NULL;
2538 list = *plist = &(*list)->obj_next;
2539 }
2540
2541fail:
2542 /*
2543 * Free up what's left of the above.
2544 */
2545 if (mp_list != NULL)
2546 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2547 if (clp_list != NULL)
6d2010ae 2548 mcache_free_ext(m_cache(cl_class), clp_list);
2d21ac55
A
2549 if (ref_list != NULL)
2550 mcache_free_ext(ref_cache, ref_list);
2551
2552 lck_mtx_lock(mbuf_mlock);
2553 if (num > 0 || cnum > 0) {
2554 m_total(class) += cnum;
2555 VERIFY(m_total(class) <= m_maxlimit(class));
2556 m_alloc_cnt(class) += num + cnum;
2557 }
2558 if ((num + cnum) < want)
2559 m_fail_cnt(class) += (want - (num + cnum));
2560 lck_mtx_unlock(mbuf_mlock);
2561
2562 return (num + cnum);
2563}
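/*
 * Editor's note (illustrative, not part of the original source): because
 * "needed" is clamped to the smallest of the three rudimentary
 * allocations, a request for 8 composite objects that yields 8 mbufs,
 * 6 clusters and 6 ext_ref structures constructs only 6 composites; the
 * 2 surplus mbufs are handed back through mcache_free_ext() at "fail:".
 */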
2564
2565/*
2566 * Common de-allocator for composite objects called by the CPU cache
2567 * layer when one or more elements need to be returned to the appropriate
2568 * global freelist.
2569 */
2570static void
2571mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2572{
2573 mbuf_class_t class = (mbuf_class_t)arg;
2574 unsigned int num;
2575 int w;
2576
2577 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2578
2579 lck_mtx_lock(mbuf_mlock);
2580
2581 num = cslab_free(class, list, purged);
2582 m_free_cnt(class) += num;
2583
2584 if ((w = mb_waiters) > 0)
2585 mb_waiters = 0;
2586
2587 lck_mtx_unlock(mbuf_mlock);
2588
2589 if (w != 0)
2590 wakeup(mb_waitchan);
2591}
2592
2593/*
2594 * Common auditor for composite objects called by the CPU cache layer
2595 * during an allocation or free request. For the former, this is called
2596 * after the objects are obtained from either the bucket or slab layer
2597 * and before they are returned to the caller. For the latter, this is
2598 * called immediately during free and before placing the objects into
2599 * the bucket or slab layer.
2600 */
2601static void
2602mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2603{
2604 mbuf_class_t class = (mbuf_class_t)arg;
2605 mcache_audit_t *mca;
2606 struct mbuf *m, *ms;
2607 mcl_slab_t *clsp, *nsp;
2608 size_t size;
2609 void *cl;
2610
2611 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2612
2613 while ((m = ms = (struct mbuf *)list) != NULL) {
2614 lck_mtx_lock(mbuf_mlock);
2615 /* Do the mbuf sanity checks and record its transaction */
2616 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2617 mcl_audit_mbuf(mca, m, TRUE, alloc);
6d2010ae 2618 if (mcltrace)
39236c6e 2619 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
6d2010ae 2620
2d21ac55
A
2621 if (alloc)
2622 mca->mca_uflags |= MB_COMP_INUSE;
2623 else
2624 mca->mca_uflags &= ~MB_COMP_INUSE;
2625
2626 /*
2627 * Use the shadow mbuf in the audit structure if we are
2628 * freeing, since the contents of the actual mbuf have been
2629 * pattern-filled by the above call to mcl_audit_mbuf().
2630 */
6d2010ae 2631 if (!alloc && mclverify)
39236c6e 2632 ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
2633
2634 /* Do the cluster sanity checks and record its transaction */
2635 cl = ms->m_ext.ext_buf;
2636 clsp = slab_get(cl);
2637 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2638 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
6d2010ae
A
2639 if (class == MC_MBUF_CL)
2640 VERIFY(clsp->sl_refcnt >= 1 &&
2641 clsp->sl_refcnt <= NCLPBG);
2642 else
2643 VERIFY(clsp->sl_refcnt == 1);
2644
2645 if (class == MC_MBUF_16KCL) {
2d21ac55 2646 int k;
6d2010ae 2647 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
2648 nsp = nsp->sl_next;
2649 /* Next slab must already be present */
2650 VERIFY(nsp != NULL);
2651 VERIFY(nsp->sl_refcnt == 1);
2652 }
2653 }
2654
2655 mca = mcl_audit_buf2mca(MC_CL, cl);
2656 if (class == MC_MBUF_CL)
2657 size = m_maxsize(MC_CL);
2658 else if (class == MC_MBUF_BIGCL)
2659 size = m_maxsize(MC_BIGCL);
2660 else
2661 size = m_maxsize(MC_16KCL);
2662 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
6d2010ae 2663 if (mcltrace)
39236c6e 2664 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
6d2010ae 2665
2d21ac55
A
2666 if (alloc)
2667 mca->mca_uflags |= MB_COMP_INUSE;
2668 else
2669 mca->mca_uflags &= ~MB_COMP_INUSE;
2670 lck_mtx_unlock(mbuf_mlock);
2671
2672 list = list->obj_next;
2673 }
2674}
2675
2676/*
2677 * Allocate some number of mbuf clusters and place them on the cluster freelist.
2678 */
2679static int
2680m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2681{
2682 int i;
2683 vm_size_t size = 0;
b0d623f7 2684 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2d21ac55
A
2685 vm_offset_t page = 0;
2686 mcache_audit_t *mca_list = NULL;
2687 mcache_obj_t *con_list = NULL;
2688 mcl_slab_t *sp;
2689
6d2010ae
A
2690 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2691 bufsize == m_maxsize(MC_16KCL));
2d21ac55
A
2692
2693 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2694
2695 /*
2696 * Multiple threads may attempt to populate the cluster map one
2697 * after another. Since we drop the lock below prior to acquiring
2698 * the physical page(s), our view of the cluster map may no longer
2699 * be accurate, and we could end up over-committing the pages beyond
2700 * the maximum allowed for each class. To prevent it, this entire
2701 * operation (including the page mapping) is serialized.
2702 */
2703 while (mb_clalloc_busy) {
2704 mb_clalloc_waiters++;
2705 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2706 (PZERO-1), "m_clalloc", NULL);
2707 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2708 }
2709
2710 /* We are busy now; tell everyone else to go away */
2711 mb_clalloc_busy = TRUE;
2712
2713 /*
2714 * Honor the caller's wish to block or not block. We have a way
2715 * to grow the pool asynchronously using the mbuf worker thread.
2716 */
2717 i = m_howmany(num, bufsize);
2718 if (i == 0 || (wait & M_DONTWAIT))
2719 goto out;
2720
2721 lck_mtx_unlock(mbuf_mlock);
2722
b0d623f7
A
2723 size = round_page(i * bufsize);
2724 page = kmem_mb_alloc(mb_map, size, large_buffer);
2725
2726 /*
6d2010ae 2727 * If we did ask for "n" 16KB physically contiguous chunks
b0d623f7
A
2728 * and didn't get them, try again without this
2729 * restriction.
2730 */
2731 if (large_buffer && page == 0)
2732 page = kmem_mb_alloc(mb_map, size, 0);
2d21ac55
A
2733
2734 if (page == 0) {
6d2010ae
A
2735 if (bufsize == m_maxsize(MC_BIGCL)) {
2736 /* If that failed, try for a single page (4KB request only) */
2d21ac55 2737 size = NBPG;
b0d623f7 2738 page = kmem_mb_alloc(mb_map, size, 0);
2d21ac55
A
2739 }
2740
2741 if (page == 0) {
2742 lck_mtx_lock(mbuf_mlock);
2743 goto out;
2744 }
2745 }
2746
2747 VERIFY(IS_P2ALIGNED(page, NBPG));
2748 numpages = size / NBPG;
2749
2750 /* If auditing is enabled, allocate the audit structures now */
2751 if (mclaudit != NULL) {
2752 int needed;
2753
2754 /*
2755 * Yes, I realize this is a waste of memory for clusters
2756 * that never get transformed into mbufs, as we may end
6d2010ae 2757 * up with NMBPBG-1 unused audit structures per cluster.
2d21ac55
A
2758 * But doing so tremendously simplifies the allocation
2759 * strategy, since at this point we are not holding the
6d2010ae 2760 * mbuf lock and the caller is okay to be blocked.
2d21ac55 2761 */
6d2010ae
A
2762 if (bufsize == m_maxsize(MC_BIGCL)) {
2763 needed = numpages * NMBPBG;
2d21ac55
A
2764
2765 i = mcache_alloc_ext(mcl_audit_con_cache,
2766 &con_list, needed, MCR_SLEEP);
2767
2768 VERIFY(con_list != NULL && i == needed);
2d21ac55 2769 } else {
6d2010ae 2770 needed = numpages / NSLABSP16KB;
2d21ac55
A
2771 }
2772
2773 i = mcache_alloc_ext(mcache_audit_cache,
2774 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2775
2776 VERIFY(mca_list != NULL && i == needed);
2777 }
2778
2779 lck_mtx_lock(mbuf_mlock);
2780
2781 for (i = 0; i < numpages; i++, page += NBPG) {
2782 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
99c3a104 2783 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
fe8ab488 2784 mbuf_class_t class = MC_BIGCL;
2d21ac55
A
2785
2786 /*
39236c6e
A
2787 * If there is a mapper, the appropriate I/O page is returned;
2788 * zero out the page to discard its past contents to prevent
2789 * exposing leftover kernel memory.
2d21ac55 2790 */
b0d623f7 2791 VERIFY(offset < mcl_pages);
39236c6e
A
2792 if (mcl_paddr_base != 0) {
2793 bzero((void *)(uintptr_t) page, page_size);
2794 new_page = IOMapperInsertPage(mcl_paddr_base,
2795 offset, new_page);
99c3a104 2796 }
39236c6e 2797 mcl_paddr[offset] = new_page;
2d21ac55
A
2798
2799 /* Pattern-fill this fresh page */
6d2010ae 2800 if (mclverify) {
2d21ac55
A
2801 mcache_set_pattern(MCACHE_FREE_PATTERN,
2802 (caddr_t)page, NBPG);
6d2010ae
A
2803 }
2804 if (bufsize == m_maxsize(MC_BIGCL)) {
2d21ac55 2805 union mbigcluster *mbc = (union mbigcluster *)page;
2d21ac55
A
2806
2807 /* One for the entire page */
2808 sp = slab_get(mbc);
6d2010ae
A
2809 if (mclaudit != NULL) {
2810 mcl_audit_init(mbc, &mca_list, &con_list,
2811 AUDIT_CONTENTS_SIZE, NMBPBG);
2812 }
2d21ac55
A
2813 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2814 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2815 mbc, mbc, bufsize, 0, 1);
2816
2d21ac55
A
2817 /* Insert this slab */
2818 slab_insert(sp, MC_BIGCL);
2819
2820 /* Update stats now since slab_get() drops the lock */
2821 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2822 m_infree(MC_MBUF_BIGCL);
2823 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2824 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
fe8ab488 2825 class = MC_BIGCL;
6d2010ae 2826 } else if ((i % NSLABSP16KB) == 0) {
2d21ac55
A
2827 union m16kcluster *m16kcl = (union m16kcluster *)page;
2828 mcl_slab_t *nsp;
2829 int k;
2830
2831 VERIFY(njcl > 0);
2832 /* One for the entire 16KB */
2833 sp = slab_get(m16kcl);
2834 if (mclaudit != NULL)
2835 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2836
2837 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2838 slab_init(sp, MC_16KCL, SLF_MAPPED,
2839 m16kcl, m16kcl, bufsize, 0, 1);
2840
6d2010ae
A
2841 /*
2842 * 2nd-Nth page's slab is part of the first one,
2843 * where N is NSLABSP16KB.
2844 */
2845 for (k = 1; k < NSLABSP16KB; k++) {
2846 nsp = slab_get(((union mbigcluster *)page) + k);
2d21ac55
A
2847 VERIFY(nsp->sl_refcnt == 0 &&
2848 nsp->sl_flags == 0);
2849 slab_init(nsp, MC_16KCL,
2850 SLF_MAPPED | SLF_PARTIAL,
2851 m16kcl, NULL, 0, 0, 0);
2852 }
2853
2854 /* Insert this slab */
2855 slab_insert(sp, MC_16KCL);
2856
2857 /* Update stats now since slab_get() drops the lock */
2858 m_infree(MC_16KCL)++;
2859 m_total(MC_16KCL)++;
2860 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
fe8ab488 2861 class = MC_16KCL;
2d21ac55 2862 }
fe8ab488
A
2863 if (!mb_peak_newreport && mbuf_report_usage(class))
2864 mb_peak_newreport = TRUE;
2d21ac55
A
2865 }
2866 VERIFY(mca_list == NULL && con_list == NULL);
2867
2868 /* We're done; let others enter */
2869 mb_clalloc_busy = FALSE;
2870 if (mb_clalloc_waiters > 0) {
2871 mb_clalloc_waiters = 0;
2872 wakeup(mb_clalloc_waitchan);
2873 }
2874
6d2010ae 2875 if (bufsize == m_maxsize(MC_BIGCL))
2d21ac55
A
2876 return (numpages);
2877
2878 VERIFY(bufsize == m_maxsize(MC_16KCL));
6d2010ae 2879 return (numpages / NSLABSP16KB);
2d21ac55
A
2880
2881out:
2882 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2883
2884 /* We're done; let others enter */
2885 mb_clalloc_busy = FALSE;
2886 if (mb_clalloc_waiters > 0) {
2887 mb_clalloc_waiters = 0;
2888 wakeup(mb_clalloc_waitchan);
2889 }
2890
2891 /*
2892 * When non-blocking, we kick a thread if we have to grow the
2893 * pool or if the number of free clusters is less than requested.
2894 */
6d2010ae 2895 if (bufsize == m_maxsize(MC_BIGCL)) {
2d21ac55
A
2896 if (i > 0) {
2897 /*
2898 * Remember total number of 4KB clusters needed
2899 * at this time.
2900 */
2901 i += m_total(MC_BIGCL);
2902 if (i > mbuf_expand_big) {
2903 mbuf_expand_big = i;
2904 if (mbuf_worker_ready)
2905 wakeup((caddr_t)&mbuf_worker_run);
2906 }
2907 }
2908
2909 if (m_infree(MC_BIGCL) >= num)
2910 return (1);
2911 } else {
2912 if (i > 0) {
2913 /*
2914 * Remember total number of 16KB clusters needed
2915 * at this time.
2916 */
2917 i += m_total(MC_16KCL);
2918 if (i > mbuf_expand_16k) {
2919 mbuf_expand_16k = i;
2920 if (mbuf_worker_ready)
2921 wakeup((caddr_t)&mbuf_worker_run);
2922 }
2923 }
2924
2925 if (m_infree(MC_16KCL) >= num)
2926 return (1);
2927 }
2928 return (0);
2929}
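/*
 * Editor's note (illustrative, not part of the original source): the
 * successful return value of m_clalloc() counts buffers, not pages.
 * Assuming 4KB pages, a 16KB request that maps 8 pages reports
 * 8 / NSLABSP16KB = 2 jumbo clusters, while a 4KB request reports one
 * big cluster per page mapped.
 */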
2930
2931/*
2932 * Populate the global freelist of the corresponding buffer class.
2933 */
2934static int
2935freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2936{
2937 mcache_obj_t *o = NULL;
6d2010ae 2938 int i, numpages = 0, count;
2d21ac55
A
2939
2940 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2941 class == MC_16KCL);
2942
2d21ac55
A
2943 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2944
2945 switch (class) {
2946 case MC_MBUF:
2947 case MC_CL:
6d2010ae
A
2948 case MC_BIGCL:
2949 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2950 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2d21ac55 2951
6d2010ae
A
2952 /* Respect the 4KB clusters minimum limit */
2953 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2954 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2955 if (class != MC_BIGCL || (wait & MCR_COMP))
2d21ac55
A
2956 return (0);
2957 }
6d2010ae 2958 if (class == MC_BIGCL)
2d21ac55
A
2959 return (i != 0);
2960 break;
2961
2d21ac55
A
2962 case MC_16KCL:
2963 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2964 /* NOTREACHED */
2965
2966 default:
2967 VERIFY(0);
2968 /* NOTREACHED */
2969 }
2970
6d2010ae
A
2971 VERIFY(class == MC_MBUF || class == MC_CL);
2972
2973 /* how many objects will we cut the page into? */
2974 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2975
2976 for (count = 0; count < numpages; count++) {
2977
2978 /* respect totals, minlimit, maxlimit */
2979 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2980 m_total(class) >= m_maxlimit(class))
2981 break;
2982
2983 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2984 break;
2985
2d21ac55 2986 struct mbuf *m = (struct mbuf *)o;
6d2010ae 2987 union mcluster *c = (union mcluster *)o;
2d21ac55 2988 mcl_slab_t *sp = slab_get(o);
6d2010ae 2989 mcache_audit_t *mca = NULL;
2d21ac55
A
2990
2991 VERIFY(slab_is_detached(sp) &&
2992 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2993
6d2010ae
A
2994 /*
2995 * Make sure that the cluster is unmolested
2996 * while on the freelist
2997 */
2998 if (mclverify) {
2999 mca = mcl_audit_buf2mca(MC_BIGCL, o);
3000 mcache_audit_free_verify(mca, o, 0,
3001 m_maxsize(MC_BIGCL));
2d21ac55
A
3002 }
3003
6d2010ae
A
3004 /* Reinitialize it as an mbuf or 2K slab */
3005 slab_init(sp, class, sp->sl_flags,
3006 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2d21ac55 3007
6d2010ae 3008 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2d21ac55
A
3009 VERIFY(sp->sl_head == NULL);
3010
6d2010ae
A
3011 VERIFY(m_total(MC_BIGCL) > 0);
3012 m_total(MC_BIGCL)--;
3013 mbstat.m_bigclusters = m_total(MC_BIGCL);
2d21ac55 3014
6d2010ae
A
3015 m_total(class) += numobj;
3016 m_infree(class) += numobj;
3017
3018 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
3019 VERIFY(m_total(class) <= m_maxlimit(class));
fe8ab488
A
3020 if (!mb_peak_newreport && mbuf_report_usage(class))
3021 mb_peak_newreport = TRUE;
6d2010ae
A
3022
3023 i = numobj;
3024 if (class == MC_MBUF) {
3025 mbstat.m_mbufs = m_total(MC_MBUF);
3026 mtype_stat_add(MT_FREE, NMBPBG);
3027 while (i--) {
3028 /*
3029 * If auditing is enabled, construct the
3030 * shadow mbuf in the audit structure
3031 * instead of the actual one.
3032 * mbuf_slab_audit() will take care of
3033 * restoring the contents after the
3034 * integrity check.
3035 */
3036 if (mclaudit != NULL) {
3037 struct mbuf *ms;
3038 mca = mcl_audit_buf2mca(MC_MBUF,
3039 (mcache_obj_t *)m);
39236c6e 3040 ms = MCA_SAVED_MBUF_PTR(mca);
6d2010ae
A
3041 ms->m_type = MT_FREE;
3042 } else {
3043 m->m_type = MT_FREE;
3044 }
3045 m->m_next = sp->sl_head;
3046 sp->sl_head = (void *)m++;
3047 }
3048 } else { /* MC_CL */
3049 mbstat.m_clfree =
3050 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3051 mbstat.m_clusters = m_total(MC_CL);
3052 while (i--) {
3053 c->mcl_next = sp->sl_head;
3054 sp->sl_head = (void *)c++;
2d21ac55 3055 }
2d21ac55
A
3056 }
3057
6d2010ae
A
3058 /* Insert into the mbuf or 2k slab list */
3059 slab_insert(sp, class);
2d21ac55
A
3060
3061 if ((i = mb_waiters) > 0)
3062 mb_waiters = 0;
3063 if (i != 0)
3064 wakeup(mb_waitchan);
2d21ac55 3065 }
6d2010ae
A
3066 return (count != 0);
3067}
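/*
 * Editor's note (illustrative, not part of the original source): each
 * trip through the loop above consumes one 4KB big cluster and, assuming
 * 256-byte mbufs and 2KB clusters, turns it into either NMBPBG (16) raw
 * mbufs or NCLPBG (2) 2KB clusters; m_total(MC_BIGCL) drops by one while
 * m_total(class) and m_infree(class) grow by numobj.
 */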
2d21ac55 3068
6d2010ae
A
3069/*
3070 * For each class, initialize the freelist to hold m_minlimit() objects.
3071 */
3072static void
3073freelist_init(mbuf_class_t class)
3074{
3075 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3076
3077 VERIFY(class == MC_CL || class == MC_BIGCL);
3078 VERIFY(m_total(class) == 0);
3079 VERIFY(m_minlimit(class) > 0);
3080
3081 while (m_total(class) < m_minlimit(class))
3082 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3083
3084 VERIFY(m_total(class) >= m_minlimit(class));
2d21ac55
A
3085}
3086
3087/*
3088 * (Inaccurately) check if it might be worth a trip back to the
3089 * mcache layer due to the availability of objects there. We'll
3090 * end up back here if there's nothing up there.
3091 */
3092static boolean_t
3093mbuf_cached_above(mbuf_class_t class, int wait)
3094{
3095 switch (class) {
3096 case MC_MBUF:
3097 if (wait & MCR_COMP)
3098 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3099 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3100 break;
3101
3102 case MC_CL:
3103 if (wait & MCR_COMP)
3104 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3105 break;
3106
3107 case MC_BIGCL:
3108 if (wait & MCR_COMP)
3109 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3110 break;
3111
3112 case MC_16KCL:
3113 if (wait & MCR_COMP)
3114 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3115 break;
3116
3117 case MC_MBUF_CL:
3118 case MC_MBUF_BIGCL:
3119 case MC_MBUF_16KCL:
3120 break;
3121
3122 default:
3123 VERIFY(0);
3124 /* NOTREACHED */
3125 }
3126
3127 return (!mcache_bkt_isempty(m_cache(class)));
3128}
3129
3130/*
3131 * If possible, convert constructed objects to raw ones.
3132 */
3133static boolean_t
3134mbuf_steal(mbuf_class_t class, unsigned int num)
3135{
3136 mcache_obj_t *top = NULL;
3137 mcache_obj_t **list = &top;
3138 unsigned int tot = 0;
3139
3140 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3141
3142 switch (class) {
3143 case MC_MBUF:
3144 case MC_CL:
3145 case MC_BIGCL:
3146 case MC_16KCL:
3147 return (FALSE);
3148
3149 case MC_MBUF_CL:
3150 case MC_MBUF_BIGCL:
3151 case MC_MBUF_16KCL:
3152 /* Get the required number of constructed objects if possible */
3153 if (m_infree(class) > m_minlimit(class)) {
3154 tot = cslab_alloc(class, &list,
3155 MIN(num, m_infree(class)));
3156 }
3157
3158 /* And destroy them to get back the raw objects */
3159 if (top != NULL)
3160 (void) cslab_free(class, top, 1);
3161 break;
3162
3163 default:
3164 VERIFY(0);
3165 /* NOTREACHED */
3166 }
3167
3168 return (tot == num);
3169}
3170
3171static void
3172m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3173{
3174 int m, bmap = 0;
3175
3176 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3177
3178 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3179 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3180 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3181
3182 /*
3183 * This logic can be made smarter; for now, simply mark
3184 * all other related classes as potential victims.
3185 */
3186 switch (class) {
3187 case MC_MBUF:
3188 m_wantpurge(MC_CL)++;
6d2010ae 3189 m_wantpurge(MC_BIGCL)++;
2d21ac55
A
3190 m_wantpurge(MC_MBUF_CL)++;
3191 m_wantpurge(MC_MBUF_BIGCL)++;
3192 break;
3193
3194 case MC_CL:
3195 m_wantpurge(MC_MBUF)++;
6d2010ae
A
3196 m_wantpurge(MC_BIGCL)++;
3197 m_wantpurge(MC_MBUF_BIGCL)++;
2d21ac55
A
3198 if (!comp)
3199 m_wantpurge(MC_MBUF_CL)++;
3200 break;
3201
3202 case MC_BIGCL:
6d2010ae
A
3203 m_wantpurge(MC_MBUF)++;
3204 m_wantpurge(MC_CL)++;
3205 m_wantpurge(MC_MBUF_CL)++;
2d21ac55
A
3206 if (!comp)
3207 m_wantpurge(MC_MBUF_BIGCL)++;
3208 break;
3209
3210 case MC_16KCL:
3211 if (!comp)
3212 m_wantpurge(MC_MBUF_16KCL)++;
3213 break;
3214
3215 default:
3216 VERIFY(0);
3217 /* NOTREACHED */
3218 }
3219
3220 /*
3221 * Run through each marked class and check if we really need to
3222 * purge (and therefore temporarily disable) the per-CPU caches
3223 * layer used by the class. If so, remember the classes since
3224 * we are going to drop the lock below prior to purging.
3225 */
3226 for (m = 0; m < NELEM(mbuf_table); m++) {
3227 if (m_wantpurge(m) > 0) {
3228 m_wantpurge(m) = 0;
3229 /*
3230 * Try hard to steal the required number of objects
3231 * from the freelists of other mbuf classes. Only
3232 * purge and disable the per-CPU cache layer when
3233 * we don't have enough; it's the last resort.
3234 */
3235 if (!mbuf_steal(m, num))
3236 bmap |= (1 << m);
3237 }
3238 }
3239
3240 lck_mtx_unlock(mbuf_mlock);
3241
3242 if (bmap != 0) {
39236c6e
A
3243 /* signal the domains to drain */
3244 net_drain_domains();
2d21ac55
A
3245
3246 /* Sigh; we have no other choices but to ask mcache to purge */
3247 for (m = 0; m < NELEM(mbuf_table); m++) {
3248 if ((bmap & (1 << m)) &&
fe8ab488 3249 mcache_purge_cache(m_cache(m), TRUE)) {
2d21ac55
A
3250 lck_mtx_lock(mbuf_mlock);
3251 m_purge_cnt(m)++;
3252 mbstat.m_drain++;
3253 lck_mtx_unlock(mbuf_mlock);
3254 }
3255 }
3256 } else {
3257 /*
3258 * Request mcache to reap extra elements from all of its caches;
3259 * note that all reaps are serialized and happen only at a fixed
3260 * interval.
3261 */
3262 mcache_reap();
3263 }
3264 lck_mtx_lock(mbuf_mlock);
3265}
3266
3267static inline struct mbuf *
3268m_get_common(int wait, short type, int hdr)
3269{
3270 struct mbuf *m;
3271 int mcflags = MSLEEPF(wait);
3272
3273 /* Is this due to a non-blocking retry? If so, then try harder */
3274 if (mcflags & MCR_NOSLEEP)
3275 mcflags |= MCR_TRYHARD;
3276
3277 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3278 if (m != NULL) {
3279 MBUF_INIT(m, hdr, type);
3280 mtype_stat_inc(type);
3281 mtype_stat_dec(MT_FREE);
3282#if CONFIG_MACF_NET
3283 if (hdr && mac_init_mbuf(m, wait) != 0) {
3284 m_free(m);
3285 return (NULL);
3286 }
3287#endif /* MAC_NET */
3288 }
3289 return (m);
3290}
3291
3292/*
3293 * Space allocation routines; these are also available as macros
3294 * for critical paths.
3295 */
3296#define _M_GET(wait, type) m_get_common(wait, type, 0)
3297#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3298#define _M_RETRY(wait, type) _M_GET(wait, type)
3299#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3300#define _MGET(m, how, type) ((m) = _M_GET(how, type))
3301#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3302
3303struct mbuf *
3304m_get(int wait, int type)
3305{
3306 return (_M_GET(wait, type));
3307}
3308
3309struct mbuf *
3310m_gethdr(int wait, int type)
3311{
3312 return (_M_GETHDR(wait, type));
3313}
3314
3315struct mbuf *
3316m_retry(int wait, int type)
3317{
3318 return (_M_RETRY(wait, type));
3319}
3320
3321struct mbuf *
3322m_retryhdr(int wait, int type)
3323{
3324 return (_M_RETRYHDR(wait, type));
3325}
3326
3327struct mbuf *
3328m_getclr(int wait, int type)
3329{
3330 struct mbuf *m;
3331
3332 _MGET(m, wait, type);
3333 if (m != NULL)
3334 bzero(MTOD(m, caddr_t), MLEN);
3335 return (m);
3336}
3337
3338struct mbuf *
3339m_free(struct mbuf *m)
3340{
3341 struct mbuf *n = m->m_next;
3342
3343 if (m->m_type == MT_FREE)
3344 panic("m_free: freeing an already freed mbuf");
3345
2d21ac55 3346 if (m->m_flags & M_PKTHDR) {
39236c6e
A
3347 /* Check for scratch area overflow */
3348 m_redzone_verify(m);
3349 /* Free the aux data and tags if there is any */
2d21ac55
A
3350 m_tag_delete_chain(m, NULL);
3351 }
3352
3353 if (m->m_flags & M_EXT) {
3354 u_int32_t refcnt;
6d2010ae 3355 u_int32_t composite;
2d21ac55
A
3356
3357 refcnt = m_decref(m);
6d2010ae
A
3358 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3359 if (refcnt == 0 && !composite) {
2d21ac55
A
3360 if (m->m_ext.ext_free == NULL) {
3361 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3362 } else if (m->m_ext.ext_free == m_bigfree) {
3363 mcache_free(m_cache(MC_BIGCL),
3364 m->m_ext.ext_buf);
3365 } else if (m->m_ext.ext_free == m_16kfree) {
3366 mcache_free(m_cache(MC_16KCL),
3367 m->m_ext.ext_buf);
3368 } else {
3369 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3370 m->m_ext.ext_size, m->m_ext.ext_arg);
3371 }
3372 mcache_free(ref_cache, MEXT_RFA(m));
3373 MEXT_RFA(m) = NULL;
6d2010ae 3374 } else if (refcnt == 0 && composite) {
2d21ac55
A
3375 VERIFY(m->m_type != MT_FREE);
3376
3377 mtype_stat_dec(m->m_type);
3378 mtype_stat_inc(MT_FREE);
3379
3380 m->m_type = MT_FREE;
3381 m->m_flags = M_EXT;
3382 m->m_len = 0;
3383 m->m_next = m->m_nextpkt = NULL;
3384
6d2010ae
A
3385 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3386
2d21ac55
A
3387 /* "Free" into the intermediate cache */
3388 if (m->m_ext.ext_free == NULL) {
3389 mcache_free(m_cache(MC_MBUF_CL), m);
3390 } else if (m->m_ext.ext_free == m_bigfree) {
3391 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3392 } else {
3393 VERIFY(m->m_ext.ext_free == m_16kfree);
3394 mcache_free(m_cache(MC_MBUF_16KCL), m);
3395 }
3396 return (n);
3397 }
3398 }
3399
3400 if (m->m_type != MT_FREE) {
3401 mtype_stat_dec(m->m_type);
3402 mtype_stat_inc(MT_FREE);
3403 }
3404
3405 m->m_type = MT_FREE;
3406 m->m_flags = m->m_len = 0;
3407 m->m_next = m->m_nextpkt = NULL;
3408
3409 mcache_free(m_cache(MC_MBUF), m);
3410
3411 return (n);
3412}
3413
3414__private_extern__ struct mbuf *
3415m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3416 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3417 int wait)
3418{
3419 struct ext_ref *rfa = NULL;
3420
3421 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3422 return (NULL);
3423
3424 if (m->m_flags & M_EXT) {
3425 u_int32_t refcnt;
6d2010ae 3426 u_int32_t composite;
2d21ac55
A
3427
3428 refcnt = m_decref(m);
6d2010ae
A
3429 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3430 if (refcnt == 0 && !composite) {
2d21ac55
A
3431 if (m->m_ext.ext_free == NULL) {
3432 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3433 } else if (m->m_ext.ext_free == m_bigfree) {
3434 mcache_free(m_cache(MC_BIGCL),
3435 m->m_ext.ext_buf);
3436 } else if (m->m_ext.ext_free == m_16kfree) {
3437 mcache_free(m_cache(MC_16KCL),
3438 m->m_ext.ext_buf);
3439 } else {
3440 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3441 m->m_ext.ext_size, m->m_ext.ext_arg);
3442 }
3443 /* Re-use the reference structure */
3444 rfa = MEXT_RFA(m);
6d2010ae 3445 } else if (refcnt == 0 && composite) {
2d21ac55
A
3446 VERIFY(m->m_type != MT_FREE);
3447
3448 mtype_stat_dec(m->m_type);
3449 mtype_stat_inc(MT_FREE);
3450
3451 m->m_type = MT_FREE;
3452 m->m_flags = M_EXT;
3453 m->m_len = 0;
3454 m->m_next = m->m_nextpkt = NULL;
6d2010ae
A
3455
3456 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3457
2d21ac55
A
3458 /* "Free" into the intermediate cache */
3459 if (m->m_ext.ext_free == NULL) {
3460 mcache_free(m_cache(MC_MBUF_CL), m);
3461 } else if (m->m_ext.ext_free == m_bigfree) {
3462 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3463 } else {
3464 VERIFY(m->m_ext.ext_free == m_16kfree);
3465 mcache_free(m_cache(MC_MBUF_16KCL), m);
3466 }
3467 /*
3468 * Allocate a new mbuf, since we didn't divorce
3469 * the composite mbuf + cluster pair above.
3470 */
3471 if ((m = _M_GETHDR(wait, type)) == NULL)
3472 return (NULL);
3473 }
3474 }
3475
3476 if (rfa == NULL &&
3477 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3478 m_free(m);
3479 return (NULL);
3480 }
3481
3482 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3483
3484 return (m);
3485}
3486
b0d623f7
A
3487/*
3488 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3489 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3490 */
3491struct mbuf *
3492m_getcl(int wait, int type, int flags)
3493{
3494 struct mbuf *m;
3495 int mcflags = MSLEEPF(wait);
3496 int hdr = (flags & M_PKTHDR);
3497
3498 /* Is this due to a non-blocking retry? If so, then try harder */
3499 if (mcflags & MCR_NOSLEEP)
3500 mcflags |= MCR_TRYHARD;
3501
6d2010ae
A
3502 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3503 if (m != NULL) {
3504 u_int32_t flag;
3505 struct ext_ref *rfa;
3506 void *cl;
3507
3508 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3509 cl = m->m_ext.ext_buf;
3510 rfa = MEXT_RFA(m);
3511
3512 ASSERT(cl != NULL && rfa != NULL);
3513 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3514
3515 flag = MEXT_FLAGS(m);
3516
b0d623f7 3517 MBUF_INIT(m, hdr, type);
6d2010ae
A
3518 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3519
b0d623f7
A
3520 mtype_stat_inc(type);
3521 mtype_stat_dec(MT_FREE);
3522#if CONFIG_MACF_NET
3523 if (hdr && mac_init_mbuf(m, wait) != 0) {
6d2010ae 3524 m_freem(m);
b0d623f7
A
3525 return (NULL);
3526 }
3527#endif /* MAC_NET */
3528 }
3529 return (m);
3530}
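/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * caller (example_rx_alloc is a made-up name) grabbing an mbuf + 2KB
 * cluster pair from the composite cache via m_getcl().  With M_DONTWAIT
 * the allocation may fail under memory pressure, so NULL must be handled.
 */
#if 0
static struct mbuf *
example_rx_alloc(void)
{
	struct mbuf *m;

	/* non-blocking packet-header mbuf backed by a 2KB cluster */
	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);		/* caller retries later */
	return (m);
}
#endif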
3531
2d21ac55
A
3532/* m_mclget() adds an mbuf cluster to a normal mbuf */
3533struct mbuf *
3534m_mclget(struct mbuf *m, int wait)
3535{
3536 struct ext_ref *rfa;
3537
3538 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3539 return (m);
3540
3541 m->m_ext.ext_buf = m_mclalloc(wait);
3542 if (m->m_ext.ext_buf != NULL) {
3543 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3544 } else {
3545 mcache_free(ref_cache, rfa);
3546 }
3547 return (m);
3548}
3549
3550/* Allocate an mbuf cluster */
3551caddr_t
3552m_mclalloc(int wait)
3553{
3554 int mcflags = MSLEEPF(wait);
3555
3556 /* Is this due to a non-blocking retry? If so, then try harder */
3557 if (mcflags & MCR_NOSLEEP)
3558 mcflags |= MCR_TRYHARD;
3559
3560 return (mcache_alloc(m_cache(MC_CL), mcflags));
3561}
3562
3563/* Free an mbuf cluster */
3564void
3565m_mclfree(caddr_t p)
3566{
3567 mcache_free(m_cache(MC_CL), p);
3568}
3569
3570/*
3571 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
6d2010ae 3572 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
2d21ac55
A
3573 */
3574int
3575m_mclhasreference(struct mbuf *m)
3576{
3577 if (!(m->m_flags & M_EXT))
3578 return (0);
9bccf70c 3579
2d21ac55
A
3580 ASSERT(MEXT_RFA(m) != NULL);
3581
6d2010ae 3582 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
9bccf70c
A
3583}
3584
2d21ac55
A
3585__private_extern__ caddr_t
3586m_bigalloc(int wait)
9bccf70c 3587{
2d21ac55 3588 int mcflags = MSLEEPF(wait);
91447636 3589
2d21ac55
A
3590 /* Is this due to a non-blocking retry? If so, then try harder */
3591 if (mcflags & MCR_NOSLEEP)
3592 mcflags |= MCR_TRYHARD;
91447636 3593
2d21ac55 3594 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
9bccf70c
A
3595}
3596
2d21ac55
A
3597__private_extern__ void
3598m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
9bccf70c 3599{
2d21ac55 3600 mcache_free(m_cache(MC_BIGCL), p);
9bccf70c
A
3601}
3602
2d21ac55
A
3603/* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3604__private_extern__ struct mbuf *
3605m_mbigget(struct mbuf *m, int wait)
3606{
3607 struct ext_ref *rfa;
3608
3609 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3610 return (m);
3611
3612 m->m_ext.ext_buf = m_bigalloc(wait);
3613 if (m->m_ext.ext_buf != NULL) {
3614 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
91447636 3615 } else {
2d21ac55 3616 mcache_free(ref_cache, rfa);
91447636 3617 }
2d21ac55
A
3618 return (m);
3619}
3620
3621__private_extern__ caddr_t
3622m_16kalloc(int wait)
3623{
3624 int mcflags = MSLEEPF(wait);
3625
3626 /* Is this due to a non-blocking retry? If so, then try harder */
3627 if (mcflags & MCR_NOSLEEP)
3628 mcflags |= MCR_TRYHARD;
3629
3630 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
91447636
A
3631}
3632
3633__private_extern__ void
2d21ac55 3634m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
91447636 3635{
2d21ac55 3636 mcache_free(m_cache(MC_16KCL), p);
91447636
A
3637}
3638
2d21ac55 3639/* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
91447636 3640__private_extern__ struct mbuf *
2d21ac55 3641m_m16kget(struct mbuf *m, int wait)
91447636 3642{
2d21ac55
A
3643 struct ext_ref *rfa;
3644
3645 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3646 return (m);
3647
3648 m->m_ext.ext_buf = m_16kalloc(wait);
3649 if (m->m_ext.ext_buf != NULL) {
3650 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3651 } else {
3652 mcache_free(ref_cache, rfa);
91447636 3653 }
2d21ac55 3654 return (m);
91447636
A
3655}
3656
b0d623f7
A
3657/*
3658 * "Move" mbuf pkthdr from "from" to "to".
3659 * "from" must have M_PKTHDR set, and "to" must be empty.
3660 */
9bccf70c 3661void
2d21ac55 3662m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
9bccf70c 3663{
39236c6e
A
3664 VERIFY(from->m_flags & M_PKTHDR);
3665
3666 /* Check for scratch area overflow */
3667 m_redzone_verify(from);
3668
3669 if (to->m_flags & M_PKTHDR) {
3670 /* Check for scratch area overflow */
3671 m_redzone_verify(to);
3672 /* We will be taking over the tags of 'to' */
2d21ac55 3673 m_tag_delete_chain(to, NULL);
39236c6e 3674 }
2d21ac55 3675 to->m_pkthdr = from->m_pkthdr; /* especially tags */
39236c6e
A
3676 m_classifier_init(from, 0); /* purge classifier info */
3677 m_tag_init(from, 1); /* purge all tags from src */
3678 m_scratch_init(from); /* clear src scratch area */
935ed37a
A
3679 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3680 if ((to->m_flags & M_EXT) == 0)
3681 to->m_data = to->m_pktdat;
39236c6e 3682 m_redzone_init(to); /* setup red zone on dst */
9bccf70c
A
3683}
3684
91447636
A
3685/*
3686 * Duplicate "from"'s mbuf pkthdr in "to".
3687 * "from" must have M_PKTHDR set, and "to" must be empty.
3688 * In particular, this does a deep copy of the packet tags.
3689 */
3a60a9f5 3690static int
91447636
A
3691m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3692{
39236c6e
A
3693 VERIFY(from->m_flags & M_PKTHDR);
3694
3695 /* Check for scratch area overflow */
3696 m_redzone_verify(from);
3697
3698 if (to->m_flags & M_PKTHDR) {
3699 /* Check for scratch area overflow */
3700 m_redzone_verify(to);
3701 /* We will be taking over the tags of 'to' */
2d21ac55 3702 m_tag_delete_chain(to, NULL);
39236c6e 3703 }
2d21ac55
A
3704 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3705 if ((to->m_flags & M_EXT) == 0)
3706 to->m_data = to->m_pktdat;
3707 to->m_pkthdr = from->m_pkthdr;
39236c6e
A
3708 m_redzone_init(to); /* setup red zone on dst */
3709 m_tag_init(to, 0); /* preserve dst static tags */
2d21ac55 3710 return (m_tag_copy_chain(to, from, how));
91447636 3711}
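/*
 * Illustrative sketch (not part of the original source): duplicating a
 * packet header, including a deep copy of its tags, into a freshly
 * allocated mbuf (example_dup_hdr is a made-up name).  m_dup_pkthdr()
 * returns 0 when the tag copy fails, as checked by m_copym_mode() below.
 */
#if 0
static struct mbuf *
example_dup_hdr(struct mbuf *src, int how)
{
	struct mbuf *n;

	if ((n = m_gethdr(how, MT_DATA)) == NULL)
		return (NULL);
	if (m_dup_pkthdr(n, src, how) == 0) {
		m_free(n);
		return (NULL);
	}
	return (n);
}
#endif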
fa4905b1 3712
316670eb
A
3713void
3714m_copy_pftag(struct mbuf *to, struct mbuf *from)
3715{
3716 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
39236c6e 3717#if PF_ECN
316670eb
A
3718 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3719 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
39236c6e
A
3720#endif /* PF_ECN */
3721}
3722
3723void
3724m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3725{
3726 VERIFY(m->m_flags & M_PKTHDR);
3727
3728 m->m_pkthdr.pkt_proto = 0;
3729 m->m_pkthdr.pkt_flowsrc = 0;
3730 m->m_pkthdr.pkt_flowid = 0;
3731 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
3732 /* preserve service class and interface info for loopback packets */
3733 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3734 (void) m_set_service_class(m, MBUF_SC_BE);
3735 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3736 m->m_pkthdr.pkt_ifainfo = 0;
3737#if MEASURE_BW
3738 m->m_pkthdr.pkt_bwseq = 0;
3739#endif /* MEASURE_BW */
3740}
3741
3742void
3743m_copy_classifier(struct mbuf *to, struct mbuf *from)
3744{
3745 VERIFY(to->m_flags & M_PKTHDR);
3746 VERIFY(from->m_flags & M_PKTHDR);
3747
3748 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3749 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3750 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3751 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3752 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3753 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
39236c6e
A
3754#if MEASURE_BW
3755 to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq;
3756#endif /* MEASURE_BW */
316670eb
A
3757}
3758
9bccf70c 3759/*
2d21ac55
A
3760 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3761 * if wantall is not set, return whatever number was available. Set up the
3762 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3763 * are chained on the m_nextpkt field. Any packets requested beyond this
3764 * are chained onto the last packet header's m_next field. The size of
3765 * the cluster is controlled by the parameter bufsize.
9bccf70c 3766 */
91447636 3767__private_extern__ struct mbuf *
2d21ac55
A
3768m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3769 int wait, int wantall, size_t bufsize)
fa4905b1
A
3770{
3771 struct mbuf *m;
3772 struct mbuf **np, *top;
2d21ac55
A
3773 unsigned int pnum, needed = *num_needed;
3774 mcache_obj_t *mp_list = NULL;
3775 int mcflags = MSLEEPF(wait);
3776 u_int32_t flag;
3777 struct ext_ref *rfa;
3778 mcache_t *cp;
3779 void *cl;
3780
3781 ASSERT(bufsize == m_maxsize(MC_CL) ||
3782 bufsize == m_maxsize(MC_BIGCL) ||
3783 bufsize == m_maxsize(MC_16KCL));
3784
3785 /*
3786 * Caller must first check for njcl because this
3787 * routine is internal and not exposed/used via KPI.
3788 */
3789 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3790
fa4905b1
A
3791 top = NULL;
3792 np = &top;
2d21ac55 3793 pnum = 0;
fa4905b1 3794
2d21ac55
A
3795 /*
3796 * The caller doesn't want all the requested buffers; only some.
3797 * Try hard to get what we can, but don't block. This effectively
3798 * overrides MCR_SLEEP, since this thread will not go to sleep
3799 * if we can't get all the buffers.
3800 */
3801 if (!wantall || (mcflags & MCR_NOSLEEP))
3802 mcflags |= MCR_TRYHARD;
3803
3804 /* Allocate the composite mbuf + cluster elements from the cache */
3805 if (bufsize == m_maxsize(MC_CL))
3806 cp = m_cache(MC_MBUF_CL);
3807 else if (bufsize == m_maxsize(MC_BIGCL))
3808 cp = m_cache(MC_MBUF_BIGCL);
3809 else
3810 cp = m_cache(MC_MBUF_16KCL);
3811 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3812
3813 for (pnum = 0; pnum < needed; pnum++) {
3814 m = (struct mbuf *)mp_list;
3815 mp_list = mp_list->obj_next;
3816
3817 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3818 cl = m->m_ext.ext_buf;
3819 rfa = MEXT_RFA(m);
3820
3821 ASSERT(cl != NULL && rfa != NULL);
3822 VERIFY(MBUF_IS_COMPOSITE(m));
3823
3824 flag = MEXT_FLAGS(m);
3825
3826 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3827 if (bufsize == m_maxsize(MC_16KCL)) {
3828 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3829 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3830 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
91447636 3831 } else {
2d21ac55
A
3832 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3833 }
3834
3835 if (num_with_pkthdrs > 0) {
3836 --num_with_pkthdrs;
3837#if CONFIG_MACF_NET
3838 if (mac_mbuf_label_init(m, wait) != 0) {
6d2010ae 3839 m_freem(m);
2d21ac55 3840 break;
91447636 3841 }
2d21ac55 3842#endif /* MAC_NET */
91447636 3843 }
2d21ac55
A
3844
3845 *np = m;
3846 if (num_with_pkthdrs > 0)
91447636
A
3847 np = &m->m_nextpkt;
3848 else
3849 np = &m->m_next;
3850 }
2d21ac55
A
3851 ASSERT(pnum != *num_needed || mp_list == NULL);
3852 if (mp_list != NULL)
3853 mcache_free_ext(cp, mp_list);
3854
3855 if (pnum > 0) {
3856 mtype_stat_add(MT_DATA, pnum);
3857 mtype_stat_sub(MT_FREE, pnum);
3858 }
3859
3860 if (wantall && (pnum != *num_needed)) {
3861 if (top != NULL)
3862 m_freem_list(top);
3863 return (NULL);
91447636 3864 }
fa4905b1 3865
316670eb
A
3866 if (pnum > *num_needed) {
3867 printf("%s: File a radar related to <rdar://10146739>. \
3868 needed = %u, pnum = %u, num_needed = %u \n",
3869 __func__, needed, pnum, *num_needed);
3870 }
3871
2d21ac55
A
3872 *num_needed = pnum;
3873 return (top);
3874}
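/*
 * Illustrative sketch (not part of the original source): requesting a
 * batch of 2KB-cluster packets for a hypothetical receive ring
 * (example_fill_ring is a made-up name).  With wantall == 0 the routine
 * may return fewer packets than asked for; *cnt is updated with the
 * number actually obtained.
 */
#if 0
static struct mbuf *
example_fill_ring(unsigned int *cnt, int how)
{
	/* every packet gets a pkthdr; packets are chained via m_nextpkt */
	return (m_getpackets_internal(cnt, (int)*cnt, how, 0,
	    m_maxsize(MC_CL)));
}
#endif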
fa4905b1 3875
91447636 3876/*
2d21ac55
A
3877 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3878 * wantall is not set, return whatever number was available. The size of
3879 * each mbuf in the list is controlled by the parameter packetlen. Each
3880 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3881 * in the chain is called a segment. If maxsegments is not null and the
3882 * value pointed to is not null, it specifies the maximum number of segments
3883 * for a chain of mbufs. If maxsegments is null or the value pointed to
3884 * is zero, the caller does not have any restriction on the number of segments.
3885 * The actual number of segments of a mbuf chain is returned in the value
3886 * pointed to by maxsegments.
91447636 3887 */
91447636 3888__private_extern__ struct mbuf *
2d21ac55
A
3889m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3890 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
91447636 3891{
2d21ac55
A
3892 struct mbuf **np, *top, *first = NULL;
3893 size_t bufsize, r_bufsize;
3894 unsigned int num = 0;
3895 unsigned int nsegs = 0;
3896 unsigned int needed, resid;
3897 int mcflags = MSLEEPF(wait);
3898 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3899 mcache_t *cp = NULL, *rcp = NULL;
3900
3901 if (*numlist == 0)
3902 return (NULL);
fa4905b1 3903
91447636
A
3904 top = NULL;
3905 np = &top;
2d21ac55 3906
91447636 3907 if (wantsize == 0) {
2d21ac55 3908 if (packetlen <= MINCLSIZE) {
91447636 3909 bufsize = packetlen;
2d21ac55
A
3910 } else if (packetlen > m_maxsize(MC_CL)) {
3911 /* Use 4KB if jumbo cluster pool isn't available */
3912 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3913 bufsize = m_maxsize(MC_BIGCL);
3914 else
3915 bufsize = m_maxsize(MC_16KCL);
3916 } else {
3917 bufsize = m_maxsize(MC_CL);
3918 }
3919 } else if (wantsize == m_maxsize(MC_CL) ||
3920 wantsize == m_maxsize(MC_BIGCL) ||
3921 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
91447636 3922 bufsize = wantsize;
2d21ac55
A
3923 } else {
3924 return (NULL);
3925 }
91447636
A
3926
3927 if (bufsize <= MHLEN) {
2d21ac55 3928 nsegs = 1;
91447636
A
3929 } else if (bufsize <= MINCLSIZE) {
3930 if (maxsegments != NULL && *maxsegments == 1) {
2d21ac55
A
3931 bufsize = m_maxsize(MC_CL);
3932 nsegs = 1;
91447636 3933 } else {
2d21ac55 3934 nsegs = 2;
fa4905b1 3935 }
2d21ac55
A
3936 } else if (bufsize == m_maxsize(MC_16KCL)) {
3937 VERIFY(njcl > 0);
3938 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3939 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3940 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
91447636 3941 } else {
2d21ac55 3942 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
91447636
A
3943 }
3944 if (maxsegments != NULL) {
2d21ac55
A
3945 if (*maxsegments && nsegs > *maxsegments) {
3946 *maxsegments = nsegs;
3947 return (NULL);
91447636 3948 }
2d21ac55 3949 *maxsegments = nsegs;
91447636 3950 }
91447636 3951
2d21ac55
A
3952 /*
3953 * The caller doesn't want all the requested buffers; only some.
3954 * Try hard to get what we can, but don't block. This effectively
3955 * overrides MCR_SLEEP, since this thread will not go to sleep
3956 * if we can't get all the buffers.
3957 */
3958 if (!wantall || (mcflags & MCR_NOSLEEP))
3959 mcflags |= MCR_TRYHARD;
3960
3961 /*
3962 * Simple case where all elements in the lists/chains are mbufs.
3963 * Unless bufsize is greater than MHLEN, each segment chain is made
3964 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3965 * of 2 mbufs; the second one is used for the residual data, i.e.
3966 * the remaining data that cannot fit into the first mbuf.
3967 */
3968 if (bufsize <= MINCLSIZE) {
3969 /* Allocate the elements in one shot from the mbuf cache */
3970 ASSERT(bufsize <= MHLEN || nsegs == 2);
3971 cp = m_cache(MC_MBUF);
3972 needed = mcache_alloc_ext(cp, &mp_list,
3973 (*numlist) * nsegs, mcflags);
3974
3975 /*
3976 * The number of elements must be even if we are to use an
3977 * mbuf (instead of a cluster) to store the residual data.
3978 * If we couldn't allocate the requested number of mbufs,
3979 * trim the number down (if it's odd) in order to avoid
3980 * creating a partial segment chain.
3981 */
3982 if (bufsize > MHLEN && (needed & 0x1))
3983 needed--;
91447636 3984
2d21ac55
A
3985 while (num < needed) {
3986 struct mbuf *m;
91447636 3987
2d21ac55
A
3988 m = (struct mbuf *)mp_list;
3989 mp_list = mp_list->obj_next;
3990 ASSERT(m != NULL);
91447636 3991
2d21ac55
A
3992 MBUF_INIT(m, 1, MT_DATA);
3993#if CONFIG_MACF_NET
3994 if (mac_init_mbuf(m, wait) != 0) {
3995 m_free(m);
3996 break;
91447636 3997 }
2d21ac55
A
3998#endif /* MAC_NET */
3999 num++;
4000 if (bufsize > MHLEN) {
4001 /* A second mbuf for this segment chain */
4002 m->m_next = (struct mbuf *)mp_list;
4003 mp_list = mp_list->obj_next;
4004 ASSERT(m->m_next != NULL);
4005
4006 MBUF_INIT(m->m_next, 0, MT_DATA);
4007 num++;
91447636 4008 }
2d21ac55
A
4009 *np = m;
4010 np = &m->m_nextpkt;
4011 }
4012 ASSERT(num != *numlist || mp_list == NULL);
4013
4014 if (num > 0) {
4015 mtype_stat_add(MT_DATA, num);
4016 mtype_stat_sub(MT_FREE, num);
4017 }
4018 num /= nsegs;
4019
4020 /* We've got them all; return to caller */
4021 if (num == *numlist)
4022 return (top);
4023
4024 goto fail;
4025 }
4026
4027 /*
4028 * Complex cases where elements are made up of one or more composite
4029 * mbufs + cluster, depending on packetlen. Each N-segment chain can
4030 * be illustrated as follows:
4031 *
4032 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4033 *
4034 * Every composite mbuf + cluster element comes from the intermediate
4035 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4036 * the last composite element will come from the MC_MBUF_CL cache,
4037 * unless the residual data is larger than 2KB, in which case we use the
4038 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4039 * data is defined as extra data beyond the first element that cannot
4040 * fit into the previous element, i.e. there is no residual data if
4041 * the chain only has 1 segment.
4042 */
4043 r_bufsize = bufsize;
4044 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4045 if (resid > 0) {
4046 /* There is residual data; figure out the cluster size */
4047 if (wantsize == 0 && packetlen > MINCLSIZE) {
4048 /*
4049 * Caller didn't request that all of the segments
4050 * in the chain use the same cluster size; use the
4051 * smaller of the cluster sizes.
4052 */
4053 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4054 r_bufsize = m_maxsize(MC_16KCL);
4055 else if (resid > m_maxsize(MC_CL))
4056 r_bufsize = m_maxsize(MC_BIGCL);
4057 else
4058 r_bufsize = m_maxsize(MC_CL);
4059 } else {
4060 /* Use the same cluster size as the other segments */
4061 resid = 0;
4062 }
4063 }
4064
4065 needed = *numlist;
4066 if (resid > 0) {
4067 /*
4068 * Attempt to allocate composite mbuf + cluster elements for
4069 * the residual data in each chain; record the number of such
4070 * elements that can be allocated so that we know how many
4071 * segment chains we can afford to create.
4072 */
4073 if (r_bufsize <= m_maxsize(MC_CL))
4074 rcp = m_cache(MC_MBUF_CL);
4075 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4076 rcp = m_cache(MC_MBUF_BIGCL);
4077 else
4078 rcp = m_cache(MC_MBUF_16KCL);
4079 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4080
4081 if (needed == 0)
4082 goto fail;
4083
4084 /* This is temporarily reduced for calculation */
4085 ASSERT(nsegs > 1);
4086 nsegs--;
4087 }
4088
4089 /*
4090 * Attempt to allocate the rest of the composite mbuf + cluster
4091 * elements for the number of segment chains that we need.
4092 */
4093 if (bufsize <= m_maxsize(MC_CL))
4094 cp = m_cache(MC_MBUF_CL);
4095 else if (bufsize <= m_maxsize(MC_BIGCL))
4096 cp = m_cache(MC_MBUF_BIGCL);
4097 else
4098 cp = m_cache(MC_MBUF_16KCL);
4099 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4100
4101 /* Round it down to avoid creating a partial segment chain */
4102 needed = (needed / nsegs) * nsegs;
4103 if (needed == 0)
4104 goto fail;
4105
4106 if (resid > 0) {
4107 /*
4108 * We're about to construct the chain(s); take into account
4109 * the number of segments we have created above to hold the
4110 * residual data for each chain, as well as restore the
4111 * original count of segments per chain.
4112 */
4113 ASSERT(nsegs > 0);
4114 needed += needed / nsegs;
4115 nsegs++;
4116 }
4117
4118 for (;;) {
4119 struct mbuf *m;
4120 u_int32_t flag;
4121 struct ext_ref *rfa;
4122 void *cl;
4123 int pkthdr;
4124
4125 ++num;
4126 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4127 m = (struct mbuf *)mp_list;
4128 mp_list = mp_list->obj_next;
4129 } else {
4130 m = (struct mbuf *)rmp_list;
4131 rmp_list = rmp_list->obj_next;
4132 }
4133 ASSERT(m != NULL);
4134 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4135 VERIFY(m->m_ext.ext_free == NULL ||
4136 m->m_ext.ext_free == m_bigfree ||
4137 m->m_ext.ext_free == m_16kfree);
4138
4139 cl = m->m_ext.ext_buf;
4140 rfa = MEXT_RFA(m);
4141
4142 ASSERT(cl != NULL && rfa != NULL);
4143 VERIFY(MBUF_IS_COMPOSITE(m));
4144
4145 flag = MEXT_FLAGS(m);
4146
4147 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4148 if (pkthdr)
4149 first = m;
4150 MBUF_INIT(m, pkthdr, MT_DATA);
4151 if (m->m_ext.ext_free == m_16kfree) {
4152 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4153 } else if (m->m_ext.ext_free == m_bigfree) {
4154 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4155 } else {
4156 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4157 }
4158#if CONFIG_MACF_NET
4159 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4160 --num;
6d2010ae 4161 m_freem(m);
2d21ac55 4162 break;
91447636 4163 }
2d21ac55
A
4164#endif /* MAC_NET */
4165
4166 *np = m;
4167 if ((num % nsegs) == 0)
4168 np = &first->m_nextpkt;
4169 else
4170 np = &m->m_next;
4171
4172 if (num == needed)
4173 break;
4174 }
4175
4176 if (num > 0) {
4177 mtype_stat_add(MT_DATA, num);
4178 mtype_stat_sub(MT_FREE, num);
91447636 4179 }
2d21ac55
A
4180
4181 num /= nsegs;
4182
4183 /* We've got them all; return to caller */
4184 if (num == *numlist) {
4185 ASSERT(mp_list == NULL && rmp_list == NULL);
4186 return (top);
4187 }
4188
91447636 4189fail:
2d21ac55
A
4190 /* Free up what's left of the above */
4191 if (mp_list != NULL)
4192 mcache_free_ext(cp, mp_list);
4193 if (rmp_list != NULL)
4194 mcache_free_ext(rcp, rmp_list);
4195 if (wantall && top != NULL) {
91447636 4196 m_freem(top);
2d21ac55 4197 return (NULL);
91447636 4198 }
2d21ac55
A
4199 *numlist = num;
4200 return (top);
91447636 4201}
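/*
 * Illustrative sketch (not part of the original source): allocating a
 * single packet chain large enough for 'len' bytes, letting the routine
 * pick the cluster size (wantsize == 0) and reporting back how many
 * segments it used (example_alloc_packet is a made-up name).
 */
#if 0
static struct mbuf *
example_alloc_packet(size_t len, int how)
{
	unsigned int one = 1, nsegs = 0;	/* 0: no segment limit */

	/* wantall == 1: fail outright rather than return a partial list */
	return (m_allocpacket_internal(&one, len, &nsegs, how, 1, 0));
}
#endif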
fa4905b1 4202
2d21ac55
A
4203/*
4204 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate
4205 * packets on the receive ring.
91447636
A
4206 */
4207__private_extern__ struct mbuf *
2d21ac55 4208m_getpacket_how(int wait)
91447636
A
4209{
4210 unsigned int num_needed = 1;
2d21ac55
A
4211
4212 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4213 m_maxsize(MC_CL)));
91447636 4214}
fa4905b1 4215
2d21ac55
A
4216/*
4217 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate
4218 * packets on the receive ring.
91447636
A
4219 */
4220struct mbuf *
4221m_getpacket(void)
4222{
4223 unsigned int num_needed = 1;
9bccf70c 4224
2d21ac55
A
4225 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4226 m_maxsize(MC_CL)));
91447636 4227}
fa4905b1 4228
91447636 4229/*
2d21ac55
A
4230 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4231 * if this can't be met, return whatever number was available. Set up the
4232 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4233 * are chained on the m_nextpkt field. Any packets requested beyond this are
4234 * chained onto the last packet header's m_next field.
91447636
A
4235 */
4236struct mbuf *
4237m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4238{
4239 unsigned int n = num_needed;
fa4905b1 4240
2d21ac55
A
4241 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4242 m_maxsize(MC_CL)));
4243}
fa4905b1 4244
9bccf70c 4245/*
2d21ac55
A
4246 * Return a list of mbuf hdrs set up as packet hdrs chained together
4247 * on the m_nextpkt field
9bccf70c 4248 */
fa4905b1
A
4249struct mbuf *
4250m_getpackethdrs(int num_needed, int how)
4251{
4252 struct mbuf *m;
4253 struct mbuf **np, *top;
4254
4255 top = NULL;
4256 np = &top;
4257
fa4905b1 4258 while (num_needed--) {
2d21ac55
A
4259 m = _M_RETRYHDR(how, MT_DATA);
4260 if (m == NULL)
4261 break;
4262
4263 *np = m;
4264 np = &m->m_nextpkt;
4265 }
fa4905b1
A
4266
4267 return (top);
4268}
4269
2d21ac55
A
4270/*
4271 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4272 * of packets freed. Used by the drivers.
1c79356b 4273 */
2d21ac55
A
4274int
4275m_freem_list(struct mbuf *m)
1c79356b
A
4276{
4277 struct mbuf *nextpkt;
2d21ac55
A
4278 mcache_obj_t *mp_list = NULL;
4279 mcache_obj_t *mcl_list = NULL;
4280 mcache_obj_t *mbc_list = NULL;
4281 mcache_obj_t *m16k_list = NULL;
4282 mcache_obj_t *m_mcl_list = NULL;
4283 mcache_obj_t *m_mbc_list = NULL;
4284 mcache_obj_t *m_m16k_list = NULL;
4285 mcache_obj_t *ref_list = NULL;
4286 int pktcount = 0;
4287 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4288
4289 while (m != NULL) {
4290 pktcount++;
4291
4292 nextpkt = m->m_nextpkt;
4293 m->m_nextpkt = NULL;
4294
4295 while (m != NULL) {
4296 struct mbuf *next = m->m_next;
4297 mcache_obj_t *o, *rfa;
6d2010ae 4298 u_int32_t refcnt, composite;
fa4905b1 4299
2d21ac55
A
4300 if (m->m_type == MT_FREE)
4301 panic("m_free: freeing an already freed mbuf");
9bccf70c 4302
2d21ac55
A
4303 if (m->m_type != MT_FREE)
4304 mt_free++;
91447636 4305
2d21ac55 4306 if (m->m_flags & M_PKTHDR) {
39236c6e
A
4307 /* Check for scratch area overflow */
4308 m_redzone_verify(m);
4309 /* Free the aux data and tags if there is any */
91447636 4310 m_tag_delete_chain(m, NULL);
91447636 4311 }
9bccf70c 4312
2d21ac55
A
4313 if (!(m->m_flags & M_EXT))
4314 goto simple_free;
4315
316670eb 4316 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
2d21ac55 4317 refcnt = m_decref(m);
6d2010ae
A
4318 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4319 if (refcnt == 0 && !composite) {
2d21ac55
A
4320 if (m->m_ext.ext_free == NULL) {
4321 o->obj_next = mcl_list;
4322 mcl_list = o;
4323 } else if (m->m_ext.ext_free == m_bigfree) {
4324 o->obj_next = mbc_list;
4325 mbc_list = o;
4326 } else if (m->m_ext.ext_free == m_16kfree) {
4327 o->obj_next = m16k_list;
4328 m16k_list = o;
4329 } else {
4330 (*(m->m_ext.ext_free))((caddr_t)o,
4331 m->m_ext.ext_size,
4332 m->m_ext.ext_arg);
4333 }
316670eb 4334 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2d21ac55
A
4335 rfa->obj_next = ref_list;
4336 ref_list = rfa;
4337 MEXT_RFA(m) = NULL;
6d2010ae 4338 } else if (refcnt == 0 && composite) {
2d21ac55
A
4339 VERIFY(m->m_type != MT_FREE);
4340 /*
4341 * Amortize the costs of atomic operations
4342 * by doing them at the end, if possible.
4343 */
4344 if (m->m_type == MT_DATA)
4345 mt_data++;
4346 else if (m->m_type == MT_HEADER)
4347 mt_header++;
4348 else if (m->m_type == MT_SONAME)
4349 mt_soname++;
4350 else if (m->m_type == MT_TAG)
4351 mt_tag++;
4352 else
4353 mtype_stat_dec(m->m_type);
fa4905b1 4354
2d21ac55
A
4355 m->m_type = MT_FREE;
4356 m->m_flags = M_EXT;
4357 m->m_len = 0;
4358 m->m_next = m->m_nextpkt = NULL;
4359
6d2010ae
A
4360 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4361
2d21ac55
A
4362 /* "Free" into the intermediate cache */
4363 o = (mcache_obj_t *)m;
4364 if (m->m_ext.ext_free == NULL) {
4365 o->obj_next = m_mcl_list;
4366 m_mcl_list = o;
4367 } else if (m->m_ext.ext_free == m_bigfree) {
4368 o->obj_next = m_mbc_list;
4369 m_mbc_list = o;
1c79356b 4370 } else {
2d21ac55
A
4371 VERIFY(m->m_ext.ext_free == m_16kfree);
4372 o->obj_next = m_m16k_list;
4373 m_m16k_list = o;
1c79356b 4374 }
2d21ac55
A
4375 m = next;
4376 continue;
1c79356b 4377 }
2d21ac55
A
4378simple_free:
4379 /*
4380 * Amortize the costs of atomic operations
4381 * by doing them at the end, if possible.
4382 */
4383 if (m->m_type == MT_DATA)
4384 mt_data++;
4385 else if (m->m_type == MT_HEADER)
4386 mt_header++;
4387 else if (m->m_type == MT_SONAME)
4388 mt_soname++;
4389 else if (m->m_type == MT_TAG)
4390 mt_tag++;
4391 else if (m->m_type != MT_FREE)
4392 mtype_stat_dec(m->m_type);
4393
1c79356b 4394 m->m_type = MT_FREE;
2d21ac55
A
4395 m->m_flags = m->m_len = 0;
4396 m->m_next = m->m_nextpkt = NULL;
fa4905b1 4397
2d21ac55
A
4398 ((mcache_obj_t *)m)->obj_next = mp_list;
4399 mp_list = (mcache_obj_t *)m;
4400
4401 m = next;
4402 }
fa4905b1 4403
2d21ac55
A
4404 m = nextpkt;
4405 }
fa4905b1 4406
2d21ac55
A
4407 if (mt_free > 0)
4408 mtype_stat_add(MT_FREE, mt_free);
4409 if (mt_data > 0)
4410 mtype_stat_sub(MT_DATA, mt_data);
4411 if (mt_header > 0)
4412 mtype_stat_sub(MT_HEADER, mt_header);
4413 if (mt_soname > 0)
4414 mtype_stat_sub(MT_SONAME, mt_soname);
4415 if (mt_tag > 0)
4416 mtype_stat_sub(MT_TAG, mt_tag);
4417
4418 if (mp_list != NULL)
4419 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4420 if (mcl_list != NULL)
4421 mcache_free_ext(m_cache(MC_CL), mcl_list);
4422 if (mbc_list != NULL)
4423 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4424 if (m16k_list != NULL)
4425 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4426 if (m_mcl_list != NULL)
4427 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4428 if (m_mbc_list != NULL)
4429 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4430 if (m_m16k_list != NULL)
4431 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4432 if (ref_list != NULL)
4433 mcache_free_ext(ref_cache, ref_list);
4434
4435 return (pktcount);
1c79356b
A
4436}
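/*
 * Illustrative sketch (not part of the original source): a hypothetical
 * driver completion path (example_tx_complete is a made-up name) handing
 * an entire packet list back in one call and using the returned count
 * for its statistics.
 */
#if 0
static void
example_tx_complete(struct mbuf *done_list, u_int64_t *pkts_freed)
{
	if (done_list != NULL)
		*pkts_freed += (u_int64_t)m_freem_list(done_list);
}
#endif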
4437
4438void
2d21ac55 4439m_freem(struct mbuf *m)
1c79356b 4440{
2d21ac55 4441 while (m != NULL)
1c79356b
A
4442 m = m_free(m);
4443}
4444
4445/*
4446 * Mbuffer utility routines.
4447 */
2d21ac55 4448
1c79356b 4449/*
2d21ac55
A
4450 * Compute the amount of space available before the current start
4451 * of data in an mbuf.
1c79356b 4452 */
91447636 4453int
2d21ac55 4454m_leadingspace(struct mbuf *m)
1c79356b
A
4455{
4456 if (m->m_flags & M_EXT) {
4457 if (MCLHASREFERENCE(m))
2d21ac55 4458 return (0);
1c79356b
A
4459 return (m->m_data - m->m_ext.ext_buf);
4460 }
4461 if (m->m_flags & M_PKTHDR)
4462 return (m->m_data - m->m_pktdat);
4463 return (m->m_data - m->m_dat);
4464}
4465
4466/*
2d21ac55 4467 * Compute the amount of space available after the end of data in an mbuf.
1c79356b 4468 */
91447636 4469int
2d21ac55 4470m_trailingspace(struct mbuf *m)
1c79356b
A
4471{
4472 if (m->m_flags & M_EXT) {
4473 if (MCLHASREFERENCE(m))
2d21ac55 4474 return (0);
1c79356b 4475 return (m->m_ext.ext_buf + m->m_ext.ext_size -
2d21ac55 4476 (m->m_data + m->m_len));
1c79356b
A
4477 }
4478 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4479}
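/*
 * Illustrative sketch (not part of the original source): using
 * m_leadingspace() to decide whether a header of 'hlen' bytes can be
 * prepended in place, mirroring what m_prepend_2() does below with the
 * M_LEADINGSPACE() macro (example_can_prepend is a made-up name).
 */
#if 0
static int
example_can_prepend(struct mbuf *m, int hlen)
{
	/* non-zero only if the area before m_data is large enough */
	return (m_leadingspace(m) >= hlen);
}
#endif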
4480
4481/*
2d21ac55
A
4482 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4483 * copy junk along. Does not adjust packet header length.
1c79356b
A
4484 */
4485struct mbuf *
2d21ac55 4486m_prepend(struct mbuf *m, int len, int how)
1c79356b
A
4487{
4488 struct mbuf *mn;
4489
2d21ac55
A
4490 _MGET(mn, how, m->m_type);
4491 if (mn == NULL) {
1c79356b 4492 m_freem(m);
2d21ac55 4493 return (NULL);
1c79356b
A
4494 }
4495 if (m->m_flags & M_PKTHDR) {
4496 M_COPY_PKTHDR(mn, m);
4497 m->m_flags &= ~M_PKTHDR;
4498 }
4499 mn->m_next = m;
4500 m = mn;
4501 if (len < MHLEN)
4502 MH_ALIGN(m, len);
4503 m->m_len = len;
4504 return (m);
4505}
4506
9bccf70c 4507/*
2d21ac55
A
4508 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4509 * chain, copy junk along, and adjust length.
9bccf70c
A
4510 */
4511struct mbuf *
2d21ac55
A
4512m_prepend_2(struct mbuf *m, int len, int how)
4513{
4514 if (M_LEADINGSPACE(m) >= len) {
4515 m->m_data -= len;
4516 m->m_len += len;
4517 } else {
9bccf70c 4518 m = m_prepend(m, len, how);
2d21ac55
A
4519 }
4520 if ((m) && (m->m_flags & M_PKTHDR))
4521 m->m_pkthdr.len += len;
4522 return (m);
9bccf70c
A
4523}
4524
1c79356b
A
4525/*
4526 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4527 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4528 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4529 */
4530int MCFail;
4531
4532struct mbuf *
39236c6e 4533m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
1c79356b 4534{
2d21ac55 4535 struct mbuf *n, *mhdr = NULL, **np;
91447636 4536 int off = off0;
1c79356b
A
4537 struct mbuf *top;
4538 int copyhdr = 0;
4539
4540 if (off < 0 || len < 0)
2d21ac55
A
4541 panic("m_copym: invalid offset %d or len %d", off, len);
4542
fe8ab488
A
4543 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4544 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4545
4546 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4547 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
2d21ac55 4548 mhdr = m;
1c79356b 4549 copyhdr = 1;
2d21ac55 4550 }
fa4905b1
A
4551
4552 while (off >= m->m_len) {
2d21ac55
A
4553 if (m->m_next == NULL)
4554 panic("m_copym: invalid mbuf chain");
1c79356b
A
4555 off -= m->m_len;
4556 m = m->m_next;
4557 }
4558 np = &top;
2d21ac55 4559 top = NULL;
fa4905b1 4560
1c79356b 4561 while (len > 0) {
2d21ac55 4562 if (m == NULL) {
1c79356b 4563 if (len != M_COPYALL)
2d21ac55 4564 panic("m_copym: len != M_COPYALL");
1c79356b
A
4565 break;
4566 }
2d21ac55 4567
fe8ab488
A
4568 if (copyhdr)
4569 n = _M_RETRYHDR(wait, m->m_type);
4570 else
4571 n = _M_RETRY(wait, m->m_type);
1c79356b 4572 *np = n;
fa4905b1 4573
2d21ac55 4574 if (n == NULL)
1c79356b 4575 goto nospace;
2d21ac55
A
4576
4577 if (copyhdr != 0) {
fe8ab488
A
4578 if ((mode == M_COPYM_MOVE_HDR) ||
4579 (mode == M_COPYM_MUST_MOVE_HDR)) {
39236c6e 4580 M_COPY_PKTHDR(n, mhdr);
fe8ab488
A
4581 } else if ((mode == M_COPYM_COPY_HDR) ||
4582 (mode == M_COPYM_MUST_COPY_HDR)) {
39236c6e
A
4583 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4584 goto nospace;
4585 }
1c79356b
A
4586 if (len == M_COPYALL)
4587 n->m_pkthdr.len -= off0;
4588 else
4589 n->m_pkthdr.len = len;
4590 copyhdr = 0;
fe8ab488
A
4591 /*
4592 * There is data to copy from the packet header mbuf
4593 * if it is empty or it is before the starting offset
4594 */
4595 if (mhdr != m) {
4596 np = &n->m_next;
4597 continue;
2d21ac55 4598 }
1c79356b 4599 }
2d21ac55 4600 n->m_len = MIN(len, (m->m_len - off));
1c79356b 4601 if (m->m_flags & M_EXT) {
1c79356b 4602 n->m_ext = m->m_ext;
2d21ac55 4603 m_incref(m);
1c79356b
A
4604 n->m_data = m->m_data + off;
4605 n->m_flags |= M_EXT;
fa4905b1 4606 } else {
fe8ab488
A
4607 /*
4608 * Limit to the capacity of the destination
4609 */
4610 if (n->m_flags & M_PKTHDR)
4611 n->m_len = MIN(n->m_len, MHLEN);
4612 else
4613 n->m_len = MIN(n->m_len, MLEN);
4614
4615 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4616 panic("%s n %p copy overflow",
4617 __func__, n);
4618
2d21ac55 4619 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
1c79356b 4620 (unsigned)n->m_len);
fa4905b1 4621 }
1c79356b
A
4622 if (len != M_COPYALL)
4623 len -= n->m_len;
4624 off = 0;
4625 m = m->m_next;
4626 np = &n->m_next;
4627 }
fa4905b1 4628
2d21ac55 4629 if (top == NULL)
1c79356b 4630 MCFail++;
fa4905b1 4631
1c79356b
A
4632 return (top);
4633nospace:
fa4905b1 4634
1c79356b
A
4635 m_freem(top);
4636 MCFail++;
2d21ac55 4637 return (NULL);
1c79356b
A
4638}
4639
39236c6e
A
4640
4641struct mbuf *
4642m_copym(struct mbuf *m, int off0, int len, int wait)
4643{
4644 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4645}
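/*
 * Illustrative sketch (not part of the original source): taking a copy
 * of the first 'len' bytes of a packet, e.g. for a hypothetical
 * retransmission path (example_copy_head is a made-up name).  Cluster
 * data is shared by reference (see m_incref() above) rather than copied.
 */
#if 0
static struct mbuf *
example_copy_head(struct mbuf *m, int len, int how)
{
	return (m_copym(m, 0, len, how));
}
#endif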
4646
9bccf70c 4647/*
2d21ac55
A
4648 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4649 * within this routine. Also, the last mbuf and offset accessed are passed
4650 * out and can be passed back in to avoid having to rescan the entire mbuf
4651 * list (normally hung off of the socket).
9bccf70c 4652 */
fa4905b1 4653struct mbuf *
fe8ab488 4654m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
39236c6e 4655 struct mbuf **m_lastm, int *m_off, uint32_t mode)
2d21ac55 4656{
fe8ab488 4657 struct mbuf *m = m0, *n, **np = NULL;
2d21ac55
A
4658 int off = off0, len = len0;
4659 struct mbuf *top = NULL;
4660 int mcflags = MSLEEPF(wait);
fa4905b1 4661 int copyhdr = 0;
2d21ac55
A
4662 int type = 0;
4663 mcache_obj_t *list = NULL;
4664 int needed = 0;
fa4905b1 4665
2d21ac55 4666 if (off == 0 && (m->m_flags & M_PKTHDR))
fa4905b1 4667 copyhdr = 1;
fe8ab488
A
4668
4669 if (m_lastm != NULL && *m_lastm != NULL) {
6d2010ae 4670 m = *m_lastm;
fa4905b1
A
4671 off = *m_off;
4672 } else {
2d21ac55
A
4673 while (off >= m->m_len) {
4674 off -= m->m_len;
fa4905b1
A
4675 m = m->m_next;
4676 }
4677 }
91447636 4678
2d21ac55
A
4679 n = m;
4680 while (len > 0) {
4681 needed++;
4682 ASSERT(n != NULL);
4683 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4684 n = n->m_next;
4685 }
4686 needed++;
4687 len = len0;
4688
4689 /*
4690 * If the caller doesn't want to be put to sleep, mark it with
4691 * MCR_TRYHARD so that we may reclaim buffers from other places
4692 * before giving up.
4693 */
4694 if (mcflags & MCR_NOSLEEP)
4695 mcflags |= MCR_TRYHARD;
4696
4697 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4698 mcflags) != needed)
4699 goto nospace;
fa4905b1 4700
2d21ac55 4701 needed = 0;
fa4905b1 4702 while (len > 0) {
2d21ac55
A
4703 n = (struct mbuf *)list;
4704 list = list->obj_next;
4705 ASSERT(n != NULL && m != NULL);
4706
4707 type = (top == NULL) ? MT_HEADER : m->m_type;
4708 MBUF_INIT(n, (top == NULL), type);
4709#if CONFIG_MACF_NET
4710 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4711 mtype_stat_inc(MT_HEADER);
4712 mtype_stat_dec(MT_FREE);
4713 m_free(n);
fa4905b1 4714 goto nospace;
2d21ac55
A
4715 }
4716#endif /* MAC_NET */
4717
4718 if (top == NULL) {
4719 top = n;
fa4905b1
A
4720 np = &top->m_next;
4721 continue;
2d21ac55
A
4722 } else {
4723 needed++;
4724 *np = n;
4725 }
fa4905b1
A
4726
4727 if (copyhdr) {
fe8ab488
A
4728 if ((mode == M_COPYM_MOVE_HDR) ||
4729 (mode == M_COPYM_MUST_MOVE_HDR)) {
39236c6e 4730 M_COPY_PKTHDR(n, m);
fe8ab488
A
4731 } else if ((mode == M_COPYM_COPY_HDR) ||
4732 (mode == M_COPYM_MUST_COPY_HDR)) {
39236c6e
A
4733 if (m_dup_pkthdr(n, m, wait) == 0)
4734 goto nospace;
4735 }
fa4905b1
A
4736 n->m_pkthdr.len = len;
4737 copyhdr = 0;
4738 }
2d21ac55 4739 n->m_len = MIN(len, (m->m_len - off));
fa4905b1
A
4740
4741 if (m->m_flags & M_EXT) {
4742 n->m_ext = m->m_ext;
2d21ac55 4743 m_incref(m);
fa4905b1
A
4744 n->m_data = m->m_data + off;
4745 n->m_flags |= M_EXT;
4746 } else {
fe8ab488
A
4747 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4748 panic("%s n %p copy overflow",
4749 __func__, n);
4750
2d21ac55 4751 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
fa4905b1
A
4752 (unsigned)n->m_len);
4753 }
4754 len -= n->m_len;
2d21ac55 4755
fa4905b1 4756 if (len == 0) {
fe8ab488
A
4757 if (m_lastm != NULL && m_off != NULL) {
4758 if ((off + n->m_len) == m->m_len) {
4759 *m_lastm = m->m_next;
4760 *m_off = 0;
4761 } else {
4762 *m_lastm = m;
4763 *m_off = off + n->m_len;
4764 }
fa4905b1 4765 }
2d21ac55 4766 break;
fa4905b1
A
4767 }
4768 off = 0;
4769 m = m->m_next;
4770 np = &n->m_next;
4771 }
fa4905b1 4772
2d21ac55
A
4773 mtype_stat_inc(MT_HEADER);
4774 mtype_stat_add(type, needed);
4775 mtype_stat_sub(MT_FREE, needed + 1);
4776
4777 ASSERT(list == NULL);
fa4905b1 4778 return (top);
fa4905b1 4779
2d21ac55
A
4780nospace:
4781 if (list != NULL)
4782 mcache_free_ext(m_cache(MC_MBUF), list);
4783 if (top != NULL)
4784 m_freem(top);
fa4905b1 4785 MCFail++;
2d21ac55 4786 return (NULL);
fa4905b1
A
4787}
4788
1c79356b
A
4789/*
4790 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4791 * continuing for "len" bytes, into the indicated buffer.
4792 */
2d21ac55 4793void
b0d623f7 4794m_copydata(struct mbuf *m, int off, int len, void *vp)
1c79356b 4795{
91447636 4796 unsigned count;
b0d623f7 4797 char *cp = vp;
1c79356b
A
4798
4799 if (off < 0 || len < 0)
2d21ac55
A
4800 panic("m_copydata: invalid offset %d or len %d", off, len);
4801
1c79356b 4802 while (off > 0) {
2d21ac55
A
4803 if (m == NULL)
4804 panic("m_copydata: invalid mbuf chain");
1c79356b
A
4805 if (off < m->m_len)
4806 break;
4807 off -= m->m_len;
4808 m = m->m_next;
4809 }
4810 while (len > 0) {
2d21ac55
A
4811 if (m == NULL)
4812 panic("m_copydata: invalid mbuf chain");
4813 count = MIN(m->m_len - off, len);
4814 bcopy(MTOD(m, caddr_t) + off, cp, count);
1c79356b
A
4815 len -= count;
4816 cp += count;
4817 off = 0;
4818 m = m->m_next;
4819 }
4820}
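/*
 * Illustrative sketch (not part of the original source): linearizing a
 * fixed-size header out of a possibly fragmented chain into a local
 * buffer (example_peek_header is a made-up name).  The caller must make
 * sure the chain holds at least off + size bytes, since m_copydata()
 * panics on a short chain.
 */
#if 0
static void
example_peek_header(struct mbuf *m, int off, void *hdr, int size)
{
	m_copydata(m, off, size, hdr);
}
#endif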
4821
4822/*
2d21ac55
A
4823 * Concatenate mbuf chain n to m. Both chains must be of the same type
4824 * (e.g. MT_DATA). Any m_pkthdr is not updated.
1c79356b 4825 */
2d21ac55
A
4826void
4827m_cat(struct mbuf *m, struct mbuf *n)
1c79356b
A
4828{
4829 while (m->m_next)
4830 m = m->m_next;
4831 while (n) {
2d21ac55 4832 if ((m->m_flags & M_EXT) ||
1c79356b
A
4833 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4834 /* just join the two chains */
4835 m->m_next = n;
4836 return;
4837 }
4838 /* splat the data from one into the other */
2d21ac55 4839 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
1c79356b
A
4840 (u_int)n->m_len);
4841 m->m_len += n->m_len;
4842 n = m_free(n);
4843 }
4844}
4845
4846void
2d21ac55 4847m_adj(struct mbuf *mp, int req_len)
1c79356b 4848{
91447636
A
4849 int len = req_len;
4850 struct mbuf *m;
4851 int count;
1c79356b
A
4852
4853 if ((m = mp) == NULL)
4854 return;
4855 if (len >= 0) {
4856 /*
4857 * Trim from head.
4858 */
4859 while (m != NULL && len > 0) {
4860 if (m->m_len <= len) {
4861 len -= m->m_len;
4862 m->m_len = 0;
4863 m = m->m_next;
4864 } else {
4865 m->m_len -= len;
4866 m->m_data += len;
4867 len = 0;
4868 }
4869 }
4870 m = mp;
4871 if (m->m_flags & M_PKTHDR)
4872 m->m_pkthdr.len -= (req_len - len);
4873 } else {
4874 /*
4875 * Trim from tail. Scan the mbuf chain,
4876 * calculating its length and finding the last mbuf.
4877 * If the adjustment only affects this mbuf, then just
4878 * adjust and return. Otherwise, rescan and truncate
4879 * after the remaining size.
4880 */
4881 len = -len;
4882 count = 0;
4883 for (;;) {
4884 count += m->m_len;
4885 if (m->m_next == (struct mbuf *)0)
4886 break;
4887 m = m->m_next;
4888 }
4889 if (m->m_len >= len) {
4890 m->m_len -= len;
4891 m = mp;
4892 if (m->m_flags & M_PKTHDR)
4893 m->m_pkthdr.len -= len;
4894 return;
4895 }
4896 count -= len;
4897 if (count < 0)
4898 count = 0;
4899 /*
4900 * Correct length for chain is "count".
4901 * Find the mbuf with last data, adjust its length,
4902 * and toss data from remaining mbufs on chain.
4903 */
4904 m = mp;
4905 if (m->m_flags & M_PKTHDR)
4906 m->m_pkthdr.len = count;
4907 for (; m; m = m->m_next) {
4908 if (m->m_len >= count) {
4909 m->m_len = count;
4910 break;
4911 }
4912 count -= m->m_len;
4913 }
91447636 4914 while ((m = m->m_next))
1c79356b
A
4915 m->m_len = 0;
4916 }
4917}
4918
4919/*
4920 * Rearrange an mbuf chain so that len bytes are contiguous
4921 * and in the data area of an mbuf (so that mtod and dtom
4922 * will work for a structure of size len). Returns the resulting
4923 * mbuf chain on success, frees it and returns null on failure.
4924 * If there is room, it will add up to max_protohdr-len extra bytes to the
4925 * contiguous region in an attempt to avoid being called next time.
4926 */
4927int MPFail;
4928
4929struct mbuf *
2d21ac55 4930m_pullup(struct mbuf *n, int len)
1c79356b 4931{
91447636
A
4932 struct mbuf *m;
4933 int count;
1c79356b
A
4934 int space;
4935
4936 /*
4937 * If first mbuf has no cluster, and has room for len bytes
4938 * without shifting current data, pullup into it,
4939 * otherwise allocate a new mbuf to prepend to the chain.
4940 */
4941 if ((n->m_flags & M_EXT) == 0 &&
4942 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4943 if (n->m_len >= len)
4944 return (n);
4945 m = n;
4946 n = n->m_next;
4947 len -= m->m_len;
4948 } else {
4949 if (len > MHLEN)
4950 goto bad;
2d21ac55 4951 _MGET(m, M_DONTWAIT, n->m_type);
1c79356b
A
4952 if (m == 0)
4953 goto bad;
4954 m->m_len = 0;
4955 if (n->m_flags & M_PKTHDR) {
4956 M_COPY_PKTHDR(m, n);
4957 n->m_flags &= ~M_PKTHDR;
4958 }
4959 }
4960 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4961 do {
2d21ac55
A
4962 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4963 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4964 (unsigned)count);
1c79356b
A
4965 len -= count;
4966 m->m_len += count;
4967 n->m_len -= count;
4968 space -= count;
4969 if (n->m_len)
4970 n->m_data += count;
4971 else
4972 n = m_free(n);
4973 } while (len > 0 && n);
4974 if (len > 0) {
4975 (void) m_free(m);
4976 goto bad;
4977 }
4978 m->m_next = n;
4979 return (m);
4980bad:
4981 m_freem(n);
4982 MPFail++;
4983 return (0);
4984}
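/*
 * Illustrative sketch (not part of the original source): making sure the
 * first 'hlen' bytes of a chain are contiguous before treating them as a
 * structure (example_ensure_contig is a made-up name).  On failure
 * m_pullup() has already freed the chain, so the old pointer must not be
 * touched.
 */
#if 0
static struct mbuf *
example_ensure_contig(struct mbuf *m, int hlen)
{
	if (m->m_len < hlen && (m = m_pullup(m, hlen)) == NULL)
		return (NULL);		/* chain was freed by m_pullup() */
	return (m);
}
#endif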
4985
6d2010ae
A
4986/*
4987 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4988 * the amount of empty space before the data in the new mbuf to be specified
4989 * (in the event that the caller expects to prepend later).
4990 */
4991__private_extern__ int MSFail = 0;
4992
4993__private_extern__ struct mbuf *
4994m_copyup(struct mbuf *n, int len, int dstoff)
4995{
4996 struct mbuf *m;
4997 int count, space;
4998
4999 if (len > (MHLEN - dstoff))
5000 goto bad;
5001 MGET(m, M_DONTWAIT, n->m_type);
5002 if (m == NULL)
5003 goto bad;
5004 m->m_len = 0;
5005 if (n->m_flags & M_PKTHDR) {
5006 m_copy_pkthdr(m, n);
5007 n->m_flags &= ~M_PKTHDR;
5008 }
5009 m->m_data += dstoff;
5010 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5011 do {
5012 count = min(min(max(len, max_protohdr), space), n->m_len);
5013 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5014 (unsigned)count);
5015 len -= count;
5016 m->m_len += count;
5017 n->m_len -= count;
5018 space -= count;
5019 if (n->m_len)
5020 n->m_data += count;
5021 else
5022 n = m_free(n);
5023 } while (len > 0 && n);
5024 if (len > 0) {
5025 (void) m_free(m);
5026 goto bad;
5027 }
5028 m->m_next = n;
5029 return (m);
5030bad:
5031 m_freem(n);
5032 MSFail++;
5033 return (NULL);
5034}
5035
1c79356b
A
5036/*
5037 * Partition an mbuf chain in two pieces, returning the tail --
5038 * all but the first len0 bytes. In case of failure, it returns NULL and
5039 * attempts to restore the chain to its original state.
5040 */
5041struct mbuf *
2d21ac55 5042m_split(struct mbuf *m0, int len0, int wait)
b0d623f7
A
5043{
5044 return (m_split0(m0, len0, wait, 1));
5045}
5046
5047static struct mbuf *
5048m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
1c79356b 5049{
91447636 5050 struct mbuf *m, *n;
1c79356b
A
5051 unsigned len = len0, remain;
5052
5053 for (m = m0; m && len > m->m_len; m = m->m_next)
5054 len -= m->m_len;
2d21ac55
A
5055 if (m == NULL)
5056 return (NULL);
1c79356b 5057 remain = m->m_len - len;
b0d623f7 5058 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
2d21ac55
A
5059 _MGETHDR(n, wait, m0->m_type);
5060 if (n == NULL)
5061 return (NULL);
1c79356b
A
5062 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5063 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5064 m0->m_pkthdr.len = len0;
5065 if (m->m_flags & M_EXT)
5066 goto extpacket;
5067 if (remain > MHLEN) {
5068 /* m can't be the lead packet */
5069 MH_ALIGN(n, 0);
5070 n->m_next = m_split(m, len, wait);
2d21ac55 5071 if (n->m_next == NULL) {
1c79356b 5072 (void) m_free(n);
2d21ac55 5073 return (NULL);
1c79356b
A
5074 } else
5075 return (n);
5076 } else
5077 MH_ALIGN(n, remain);
5078 } else if (remain == 0) {
5079 n = m->m_next;
2d21ac55 5080 m->m_next = NULL;
1c79356b
A
5081 return (n);
5082 } else {
2d21ac55
A
5083 _MGET(n, wait, m->m_type);
5084 if (n == NULL)
5085 return (NULL);
1c79356b
A
5086 M_ALIGN(n, remain);
5087 }
5088extpacket:
5089 if (m->m_flags & M_EXT) {
5090 n->m_flags |= M_EXT;
0b4e3aa0 5091 n->m_ext = m->m_ext;
2d21ac55 5092 m_incref(m);
1c79356b
A
5093 n->m_data = m->m_data + len;
5094 } else {
2d21ac55 5095 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
1c79356b
A
5096 }
5097 n->m_len = remain;
5098 m->m_len = len;
5099 n->m_next = m->m_next;
2d21ac55 5100 m->m_next = NULL;
1c79356b
A
5101 return (n);
5102}
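/*
 * Illustrative sketch (not part of the original source): splitting a
 * packet into a header part and a payload part at byte offset 'hlen'
 * (example_split_payload is a made-up name).  On failure m_split()
 * returns NULL and attempts to leave the original chain intact.
 */
#if 0
static struct mbuf *
example_split_payload(struct mbuf *pkt, int hlen, int how)
{
	/* 'pkt' keeps the first hlen bytes; the tail is returned */
	return (m_split(pkt, hlen, how));
}
#endif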
2d21ac55 5103
1c79356b
A
5104/*
5105 * Routine to copy from device local memory into mbufs.
5106 */
5107struct mbuf *
2d21ac55
A
5108m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5109 void (*copy)(const void *, void *, size_t))
1c79356b 5110{
91447636 5111 struct mbuf *m;
2d21ac55 5112 struct mbuf *top = NULL, **mp = &top;
91447636
A
5113 int off = off0, len;
5114 char *cp;
1c79356b
A
5115 char *epkt;
5116
5117 cp = buf;
5118 epkt = cp + totlen;
5119 if (off) {
5120 /*
5121 * If 'off' is non-zero, packet is trailer-encapsulated,
5122 * so we have to skip the type and length fields.
5123 */
2d21ac55
A
5124 cp += off + 2 * sizeof (u_int16_t);
5125 totlen -= 2 * sizeof (u_int16_t);
1c79356b 5126 }
2d21ac55
A
5127 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5128 if (m == NULL)
5129 return (NULL);
1c79356b
A
5130 m->m_pkthdr.rcvif = ifp;
5131 m->m_pkthdr.len = totlen;
5132 m->m_len = MHLEN;
5133
5134 while (totlen > 0) {
2d21ac55
A
5135 if (top != NULL) {
5136 _MGET(m, M_DONTWAIT, MT_DATA);
5137 if (m == NULL) {
1c79356b 5138 m_freem(top);
2d21ac55 5139 return (NULL);
1c79356b
A
5140 }
5141 m->m_len = MLEN;
5142 }
2d21ac55 5143 len = MIN(totlen, epkt - cp);
1c79356b
A
5144 if (len >= MINCLSIZE) {
5145 MCLGET(m, M_DONTWAIT);
2d21ac55
A
5146 if (m->m_flags & M_EXT) {
5147 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5148 } else {
5149 /* give up when it's out of cluster mbufs */
5150 if (top != NULL)
5151 m_freem(top);
1c79356b 5152 m_freem(m);
2d21ac55 5153 return (NULL);
1c79356b
A
5154 }
5155 } else {
5156 /*
5157 * Place initial small packet/header at end of mbuf.
5158 */
5159 if (len < m->m_len) {
2d21ac55
A
5160 if (top == NULL &&
5161 len + max_linkhdr <= m->m_len)
1c79356b
A
5162 m->m_data += max_linkhdr;
5163 m->m_len = len;
2d21ac55 5164 } else {
1c79356b 5165 len = m->m_len;
2d21ac55 5166 }
1c79356b
A
5167 }
5168 if (copy)
2d21ac55 5169 copy(cp, MTOD(m, caddr_t), (unsigned)len);
1c79356b 5170 else
2d21ac55 5171 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
1c79356b
A
5172 cp += len;
5173 *mp = m;
5174 mp = &m->m_next;
5175 totlen -= len;
5176 if (cp == epkt)
5177 cp = buf;
5178 }
5179 return (top);
5180}
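/*
 * Editor's sketch (not part of the original source): a minimal example of
 * how a network driver might use m_devget() above to copy a received frame
 * out of device-local memory.  The names example_rx_to_mbuf, rxbuf, framelen
 * and ifp are hypothetical; passing NULL as the copy routine falls back to
 * the bcopy() path shown in the function body.
 */
#if 0
static struct mbuf *
example_rx_to_mbuf(struct ifnet *ifp, char *rxbuf, int framelen)
{
	/* off0 == 0: the frame is not trailer-encapsulated */
	return (m_devget(rxbuf, framelen, 0, ifp, NULL));
}
#endif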
5181
6d2010ae
A
5182#ifndef MBUF_GROWTH_NORMAL_THRESH
5183#define MBUF_GROWTH_NORMAL_THRESH 25
5184#endif
b0d623f7 5185
1c79356b 5186/*
2d21ac55 5187 * Cluster freelist allocation check.
1c79356b
A
5188 */
5189static int
91447636 5190m_howmany(int num, size_t bufsize)
1c79356b 5191{
2d21ac55 5192 int i = 0, j = 0;
6d2010ae
A
5193 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5194 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5195 u_int32_t sumclusters, freeclusters;
5196 u_int32_t percent_pool, percent_kmem;
5197 u_int32_t mb_growth, mb_growth_thresh;
5198
5199 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5200 bufsize == m_maxsize(MC_16KCL));
2d21ac55
A
5201
5202 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5203
6d2010ae
A
5204 /* Numbers in 2K cluster units */
5205 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
2d21ac55 5206 m_clusters = m_total(MC_CL);
6d2010ae 5207 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
2d21ac55 5208 m_16kclusters = m_total(MC_16KCL);
6d2010ae
A
5209 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5210
5211 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
2d21ac55 5212 m_clfree = m_infree(MC_CL);
6d2010ae 5213 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
2d21ac55 5214 m_16kclfree = m_infree(MC_16KCL);
6d2010ae 5215 freeclusters = m_mbfree + m_clfree + m_bigclfree;
2d21ac55 5216
91447636 5217 /* Bail if we've maxed out the mbuf memory map */
6d2010ae 5218 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
2d21ac55 5219 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
6d2010ae 5220 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
2d21ac55
A
5221 return (0);
5222 }
5223
6d2010ae 5224 if (bufsize == m_maxsize(MC_BIGCL)) {
2d21ac55 5225 /* Under minimum */
6d2010ae
A
5226 if (m_bigclusters < m_minlimit(MC_BIGCL))
5227 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5228
5229 percent_pool =
5230 ((sumclusters - freeclusters) * 100) / sumclusters;
5231 percent_kmem = (sumclusters * 100) / nclusters;
5232
5233 /*
5234 * If a light/normal user, grow conservatively (75%)
5235 * If a heavy user, grow aggressively (50%)
5236 */
5237 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5238 mb_growth = MB_GROWTH_NORMAL;
5239 else
5240 mb_growth = MB_GROWTH_AGGRESSIVE;
5241
5242 if (percent_kmem < 5) {
5243 /* For initial allocations */
5244 i = num;
5245 } else {
5246 /* Return if >= MBIGCL_LOWAT clusters available */
5247 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5248 m_total(MC_BIGCL) >=
5249 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
2d21ac55 5250 return (0);
6d2010ae
A
5251
5252 /* Ensure at least num clusters are accessible */
5253 if (num >= m_infree(MC_BIGCL))
5254 i = num - m_infree(MC_BIGCL);
5255 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5256 j = num - (m_total(MC_BIGCL) -
5257 m_minlimit(MC_BIGCL));
5258
2d21ac55 5259 i = MAX(i, j);
6d2010ae
A
5260
5261 /*
5262 * Grow pool if percent_pool > 75 (normal growth)
5263 * or percent_pool > 50 (aggressive growth).
5264 */
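			/*
			 * Editor's note: with the formula below,
			 * mb_growth_thresh = 100 - (100 / (1 << mb_growth)),
			 * the 75% figure corresponds to a shift of 2
			 * (100 - 100/4 = 75) and the 50% figure to a shift
			 * of 1 (100 - 100/2 = 50), which is what
			 * MB_GROWTH_NORMAL and MB_GROWTH_AGGRESSIVE imply
			 * here.
			 */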
5265 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5266 if (percent_pool > mb_growth_thresh)
5267 j = ((sumclusters + num) >> mb_growth) -
5268 freeclusters;
2d21ac55 5269 i = MAX(i, j);
2d21ac55 5270 }
6d2010ae
A
5271
5272 /* Check to ensure we didn't go over limits */
5273 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5274 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5275 if ((i << 1) + sumclusters >= nclusters)
5276 i = (nclusters - sumclusters) >> 1;
2d21ac55 5277 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
6d2010ae
A
5278 VERIFY(sumclusters + (i << 1) <= nclusters);
5279
5280 } else { /* 16K CL */
2d21ac55
A
5281 VERIFY(njcl > 0);
5282 /* Under minimum */
5283 if (m_16kclusters < MIN16KCL)
5284 return (MIN16KCL - m_16kclusters);
6d2010ae
A
5285 if (m_16kclfree >= M16KCL_LOWAT)
5286 return (0);
5287
5288 /* Ensure at least num clusters are available */
5289 if (num >= m_16kclfree)
5290 i = num - m_16kclfree;
5291
5292 /* Always grow 16KCL pool aggressively */
5293 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5294 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5295 i = MAX(i, j);
5296
5297 /* Check to ensure we don't go over limit */
5298 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5299 i = m_maxlimit(MC_16KCL) - m_16kclusters;
2d21ac55 5300 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
91447636 5301 }
2d21ac55 5302 return (i);
1c79356b 5303}
b0d623f7
A
5304/*
5305 * Return the number of bytes in the mbuf chain, m.
6d2010ae
A
5306 */
5307unsigned int
b0d623f7
A
5308m_length(struct mbuf *m)
5309{
5310 struct mbuf *m0;
5311 unsigned int pktlen;
5312
5313 if (m->m_flags & M_PKTHDR)
5314 return (m->m_pkthdr.len);
5315
5316 pktlen = 0;
5317 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5318 pktlen += m0->m_len;
5319 return (pktlen);
5320}
5321
1c79356b
A
5322/*
5323 * Copy data from a buffer back into the indicated mbuf chain,
5324 * starting "off" bytes from the beginning, extending the mbuf
5325 * chain if necessary.
5326 */
5327void
b0d623f7 5328m_copyback(struct mbuf *m0, int off, int len, const void *cp)
1c79356b 5329{
b0d623f7
A
5330#if DEBUG
5331 struct mbuf *origm = m0;
5332 int error;
5333#endif /* DEBUG */
1c79356b 5334
2d21ac55 5335 if (m0 == NULL)
1c79356b 5336 return;
b0d623f7
A
5337
5338#if DEBUG
5339 error =
5340#endif /* DEBUG */
5341 m_copyback0(&m0, off, len, cp,
5342 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5343
5344#if DEBUG
5345 if (error != 0 || (m0 != NULL && origm != m0))
5346 panic("m_copyback");
5347#endif /* DEBUG */
5348}
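/*
 * Editor's sketch (assumption, not original source): typical use of
 * m_copyback() to overwrite a small field inside a packet chain, e.g.
 * patching two bytes at a known offset.  "pkt" and "csum_off" are
 * hypothetical names.
 */
#if 0
static void
example_patch_checksum(struct mbuf *pkt, int csum_off, uint16_t csum)
{
	/* Extends the chain with M_DONTWAIT mbufs if off + len runs past it */
	m_copyback(pkt, csum_off, sizeof (csum), &csum);
}
#endif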
5349
5350struct mbuf *
5351m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5352{
5353 int error;
5354
5355 /* don't support chain expansion */
5356 VERIFY(off + len <= m_length(m0));
5357
5358 error = m_copyback0(&m0, off, len, cp,
5359 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5360 if (error) {
5361 /*
5362 * no way to recover from partial success.
5363 * just free the chain.
5364 */
5365 m_freem(m0);
5366 return (NULL);
5367 }
5368 return (m0);
5369}
5370
5371/*
5372 * m_makewritable: ensure the specified range writable.
5373 */
5374int
5375m_makewritable(struct mbuf **mp, int off, int len, int how)
5376{
5377 int error;
5378#if DEBUG
5379 struct mbuf *n;
5380 int origlen, reslen;
5381
5382 origlen = m_length(*mp);
5383#endif /* DEBUG */
5384
5385#if 0 /* M_COPYALL is large enough */
5386 if (len == M_COPYALL)
5387 len = m_length(*mp) - off; /* XXX */
5388#endif
5389
5390 error = m_copyback0(mp, off, len, NULL,
5391 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5392
5393#if DEBUG
5394 reslen = 0;
5395 for (n = *mp; n; n = n->m_next)
5396 reslen += n->m_len;
5397 if (origlen != reslen)
5398 panic("m_makewritable: length changed");
5399 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5400 panic("m_makewritable: inconsist");
5401#endif /* DEBUG */
5402
5403 return (error);
5404}
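/*
 * Editor's sketch (assumption): m_makewritable() is typically called before
 * modifying bytes in place, since the range may live in a cluster that is
 * shared with another chain (see the m_mclhasreference() check in
 * m_copyback0() below).  The wrapper and its names are hypothetical.
 */
#if 0
static int
example_make_hdr_writable(struct mbuf **mp, int hdr_len, int how)
{
	/* On success, *mp may point at a partially replaced, writable chain */
	return (m_makewritable(mp, 0, hdr_len, how));
}
#endif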
5405
5406static int
5407m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5408 int how)
5409{
5410 int mlen;
5411 struct mbuf *m, *n;
5412 struct mbuf **mp;
5413 int totlen = 0;
5414 const char *cp = vp;
5415
5416 VERIFY(mp0 != NULL);
5417 VERIFY(*mp0 != NULL);
5418 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5419 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5420
5421 /*
5422 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5423 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5424 */
5425
5426 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5427
5428 mp = mp0;
5429 m = *mp;
1c79356b
A
5430 while (off > (mlen = m->m_len)) {
5431 off -= mlen;
5432 totlen += mlen;
2d21ac55 5433 if (m->m_next == NULL) {
b0d623f7
A
5434 int tspace;
5435extend:
5436 if (!(flags & M_COPYBACK0_EXTEND))
1c79356b 5437 goto out;
b0d623f7
A
5438
5439 /*
5440 * try to make some space at the end of "m".
5441 */
5442
5443 mlen = m->m_len;
5444 if (off + len >= MINCLSIZE &&
5445 !(m->m_flags & M_EXT) && m->m_len == 0) {
5446 MCLGET(m, how);
5447 }
5448 tspace = M_TRAILINGSPACE(m);
5449 if (tspace > 0) {
5450 tspace = MIN(tspace, off + len);
5451 VERIFY(tspace > 0);
5452 bzero(mtod(m, char *) + m->m_len,
5453 MIN(off, tspace));
5454 m->m_len += tspace;
5455 off += mlen;
5456 totlen -= mlen;
5457 continue;
5458 }
5459
5460 /*
5461 * need to allocate an mbuf.
5462 */
5463
5464 if (off + len >= MINCLSIZE) {
5465 n = m_getcl(how, m->m_type, 0);
5466 } else {
5467 n = _M_GET(how, m->m_type);
5468 }
5469 if (n == NULL) {
5470 goto out;
5471 }
5472 n->m_len = 0;
5473 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5474 bzero(mtod(n, char *), MIN(n->m_len, off));
1c79356b
A
5475 m->m_next = n;
5476 }
b0d623f7 5477 mp = &m->m_next;
1c79356b
A
5478 m = m->m_next;
5479 }
5480 while (len > 0) {
b0d623f7
A
5481 mlen = m->m_len - off;
5482 if (mlen != 0 && m_mclhasreference(m)) {
5483 char *datap;
5484 int eatlen;
5485
5486 /*
5487 * this mbuf is read-only.
5488 * allocate a new writable mbuf and try again.
5489 */
5490
39236c6e 5491#if DIAGNOSTIC
b0d623f7
A
5492 if (!(flags & M_COPYBACK0_COW))
5493 panic("m_copyback0: read-only");
39236c6e 5494#endif /* DIAGNOSTIC */
b0d623f7
A
5495
5496 /*
5497 * if we're going to write into the middle of
5498 * a mbuf, split it first.
5499 */
5500 if (off > 0 && len < mlen) {
5501 n = m_split0(m, off, how, 0);
5502 if (n == NULL)
5503 goto enobufs;
5504 m->m_next = n;
5505 mp = &m->m_next;
5506 m = n;
5507 off = 0;
5508 continue;
5509 }
5510
5511 /*
5512 * XXX TODO coalesce into the trailingspace of
5513 * the previous mbuf when possible.
5514 */
5515
5516 /*
5517 * allocate a new mbuf. copy packet header if needed.
5518 */
5519 n = _M_GET(how, m->m_type);
5520 if (n == NULL)
5521 goto enobufs;
5522 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5523 M_COPY_PKTHDR(n, m);
5524 n->m_len = MHLEN;
5525 } else {
5526 if (len >= MINCLSIZE)
5527 MCLGET(n, M_DONTWAIT);
5528 n->m_len =
5529 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5530 }
5531 if (n->m_len > len)
5532 n->m_len = len;
5533
5534 /*
5535 * free the region which has been overwritten.
5536 * copying data from old mbufs if requested.
5537 */
5538 if (flags & M_COPYBACK0_PRESERVE)
5539 datap = mtod(n, char *);
5540 else
5541 datap = NULL;
5542 eatlen = n->m_len;
5543 VERIFY(off == 0 || eatlen >= mlen);
5544 if (off > 0) {
5545 VERIFY(len >= mlen);
5546 m->m_len = off;
5547 m->m_next = n;
5548 if (datap) {
5549 m_copydata(m, off, mlen, datap);
5550 datap += mlen;
5551 }
5552 eatlen -= mlen;
5553 mp = &m->m_next;
5554 m = m->m_next;
5555 }
5556 while (m != NULL && m_mclhasreference(m) &&
5557 n->m_type == m->m_type && eatlen > 0) {
5558 mlen = MIN(eatlen, m->m_len);
5559 if (datap) {
5560 m_copydata(m, 0, mlen, datap);
5561 datap += mlen;
5562 }
5563 m->m_data += mlen;
5564 m->m_len -= mlen;
5565 eatlen -= mlen;
5566 if (m->m_len == 0)
5567 *mp = m = m_free(m);
5568 }
5569 if (eatlen > 0)
5570 n->m_len -= eatlen;
5571 n->m_next = m;
5572 *mp = m = n;
5573 continue;
5574 }
5575 mlen = MIN(mlen, len);
5576 if (flags & M_COPYBACK0_COPYBACK) {
5577 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5578 cp += mlen;
5579 }
1c79356b
A
5580 len -= mlen;
5581 mlen += off;
5582 off = 0;
5583 totlen += mlen;
5584 if (len == 0)
5585 break;
2d21ac55 5586 if (m->m_next == NULL) {
b0d623f7 5587 goto extend;
1c79356b 5588 }
b0d623f7 5589 mp = &m->m_next;
1c79356b
A
5590 m = m->m_next;
5591 }
2d21ac55 5592out:
b0d623f7
A
5593 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5594 VERIFY(flags & M_COPYBACK0_EXTEND);
1c79356b 5595 m->m_pkthdr.len = totlen;
b0d623f7
A
5596 }
5597
5598 return (0);
5599
5600enobufs:
5601 return (ENOBUFS);
1c79356b
A
5602}
5603
39236c6e 5604uint64_t
2d21ac55
A
5605mcl_to_paddr(char *addr)
5606{
b0d623f7 5607 vm_offset_t base_phys;
1c79356b 5608
2d21ac55 5609 if (!MBUF_IN_MAP(addr))
39236c6e
A
5610 return (0);
5611 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
1c79356b
A
5612
5613 if (base_phys == 0)
39236c6e
A
5614 return (0);
5615 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
1c79356b
A
5616}
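/*
 * Editor's note: mcl_to_paddr() splits the virtual address into a page
 * index and a page offset; mcl_paddr[] holds the physical page frame for
 * each page of the mbuf map, so the result is
 * ptoa_64(physical page) | (addr & PAGE_MASK), or 0 when the address is
 * not backed by the map.
 */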
5617
5618/*
5619 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5620 * And really copy the thing. That way, we don't "precompute" checksums
2d21ac55
A
5621 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5622 * small packets, don't dup into a cluster. That way received packets
5623 * don't take up too much room in the sockbuf (cf. sbspace()).
1c79356b
A
5624 */
5625int MDFail;
5626
5627struct mbuf *
91447636 5628m_dup(struct mbuf *m, int how)
2d21ac55 5629{
91447636 5630 struct mbuf *n, **np;
1c79356b
A
5631 struct mbuf *top;
5632 int copyhdr = 0;
5633
5634 np = &top;
2d21ac55 5635 top = NULL;
1c79356b
A
5636 if (m->m_flags & M_PKTHDR)
5637 copyhdr = 1;
5638
5639 /*
5640 * Quick check: if we have one mbuf and its data fits in an
5641 * mbuf with packet header, just copy and go.
5642 */
2d21ac55
A
5643 if (m->m_next == NULL) {
5644 /* Then just move the data into an mbuf and be done... */
5645 if (copyhdr) {
5646 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5647 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5648 return (NULL);
1c79356b 5649 n->m_len = m->m_len;
3a60a9f5
A
5650 m_dup_pkthdr(n, m, how);
5651 bcopy(m->m_data, n->m_data, m->m_len);
2d21ac55 5652 return (n);
1c79356b 5653 }
2d21ac55
A
5654 } else if (m->m_len <= MLEN) {
5655 if ((n = _M_GET(how, m->m_type)) == NULL)
5656 return (NULL);
1c79356b
A
5657 bcopy(m->m_data, n->m_data, m->m_len);
5658 n->m_len = m->m_len;
2d21ac55 5659 return (n);
1c79356b
A
5660 }
5661 }
2d21ac55 5662 while (m != NULL) {
1c79356b
A
5663#if BLUE_DEBUG
5664 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
2d21ac55 5665 m->m_data);
1c79356b
A
5666#endif
5667 if (copyhdr)
2d21ac55 5668 n = _M_GETHDR(how, m->m_type);
1c79356b 5669 else
2d21ac55
A
5670 n = _M_GET(how, m->m_type);
5671 if (n == NULL)
1c79356b 5672 goto nospace;
2d21ac55
A
5673 if (m->m_flags & M_EXT) {
5674 if (m->m_len <= m_maxsize(MC_CL))
5675 MCLGET(n, how);
5676 else if (m->m_len <= m_maxsize(MC_BIGCL))
5677 n = m_mbigget(n, how);
5678 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5679 n = m_m16kget(n, how);
5680 if (!(n->m_flags & M_EXT)) {
5681 (void) m_free(n);
1c79356b 5682 goto nospace;
2d21ac55 5683 }
1c79356b
A
5684 }
5685 *np = n;
2d21ac55
A
5686 if (copyhdr) {
5687 /* Don't use M_COPY_PKTHDR: preserve m_data */
3a60a9f5 5688 m_dup_pkthdr(n, m, how);
1c79356b 5689 copyhdr = 0;
2d21ac55 5690 if (!(n->m_flags & M_EXT))
1c79356b
A
5691 n->m_data = n->m_pktdat;
5692 }
5693 n->m_len = m->m_len;
5694 /*
5695 * Get the dup on the same boundary as the original.
5696 * Assume that the two mbufs have the same offset to data area
2d21ac55 5697 * (up to word boundaries)
1c79356b 5698 */
2d21ac55 5699 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
1c79356b
A
5700 m = m->m_next;
5701 np = &n->m_next;
5702#if BLUE_DEBUG
5703 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
2d21ac55 5704 n->m_data);
1c79356b
A
5705#endif
5706 }
5707
2d21ac55 5708 if (top == NULL)
1c79356b
A
5709 MDFail++;
5710 return (top);
2d21ac55
A
5711
5712nospace:
1c79356b
A
5713 m_freem(top);
5714 MDFail++;
2d21ac55 5715 return (NULL);
1c79356b
A
5716}
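/*
 * Editor's sketch (assumption): m_dup() deep-copies an entire chain rather
 * than sharing clusters, per the comment above the function.  The wrapper
 * name below is hypothetical.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *m)
{
	/* Returns NULL (and bumps MDFail) if any allocation fails */
	return (m_dup(m, M_DONTWAIT));
}
#endif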
5717
2d21ac55
A
5718#define MBUF_MULTIPAGES(m) \
5719 (((m)->m_flags & M_EXT) && \
5720 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5721 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5722 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
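/*
 * Editor's note: MBUF_MULTIPAGES() is true when an external buffer's data
 * spans more than one page: either a page-aligned m_data with m_len > NBPG,
 * or an unaligned m_data whose next page boundary falls before
 * m_data + m_len.  m_expand() below carves such an mbuf into one mbuf per
 * page-sized (or smaller) piece, all sharing the same m_ext.
 */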
5723
5724static struct mbuf *
5725m_expand(struct mbuf *m, struct mbuf **last)
9bccf70c 5726{
2d21ac55
A
5727 struct mbuf *top = NULL;
5728 struct mbuf **nm = &top;
5729 uintptr_t data0, data;
5730 unsigned int len0, len;
5731
5732 VERIFY(MBUF_MULTIPAGES(m));
5733 VERIFY(m->m_next == NULL);
5734 data0 = (uintptr_t)m->m_data;
5735 len0 = m->m_len;
5736 *last = top;
5737
5738 for (;;) {
5739 struct mbuf *n;
5740
5741 data = data0;
5742 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5743 len = NBPG;
5744 else if (!IS_P2ALIGNED(data, NBPG) &&
5745 P2ROUNDUP(data, NBPG) < (data + len0))
5746 len = P2ROUNDUP(data, NBPG) - data;
5747 else
5748 len = len0;
5749
5750 VERIFY(len > 0);
5751 VERIFY(m->m_flags & M_EXT);
5752 m->m_data = (void *)data;
5753 m->m_len = len;
5754
5755 *nm = *last = m;
5756 nm = &m->m_next;
5757 m->m_next = NULL;
5758
5759 data0 += len;
5760 len0 -= len;
5761 if (len0 == 0)
5762 break;
5763
5764 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5765 if (n == NULL) {
5766 m_freem(top);
5767 top = *last = NULL;
5768 break;
5769 }
5770
5771 n->m_ext = m->m_ext;
5772 m_incref(m);
5773 n->m_flags |= M_EXT;
5774 m = n;
5775 }
5776 return (top);
9bccf70c
A
5777}
5778
2d21ac55
A
5779struct mbuf *
5780m_normalize(struct mbuf *m)
9bccf70c 5781{
2d21ac55
A
5782 struct mbuf *top = NULL;
5783 struct mbuf **nm = &top;
5784 boolean_t expanded = FALSE;
5785
5786 while (m != NULL) {
5787 struct mbuf *n;
5788
5789 n = m->m_next;
5790 m->m_next = NULL;
5791
5792 /* Does the data cross one or more page boundaries? */
5793 if (MBUF_MULTIPAGES(m)) {
5794 struct mbuf *last;
5795 if ((m = m_expand(m, &last)) == NULL) {
5796 m_freem(n);
5797 m_freem(top);
5798 top = NULL;
5799 break;
5800 }
5801 *nm = m;
5802 nm = &last->m_next;
5803 expanded = TRUE;
5804 } else {
5805 *nm = m;
5806 nm = &m->m_next;
5807 }
5808 m = n;
5809 }
5810 if (expanded)
5811 atomic_add_32(&mb_normalized, 1);
5812 return (top);
9bccf70c
A
5813}
5814
6d2010ae
A
5815/*
5816 * Append the specified data to the indicated mbuf chain.
5817 * Extend the mbuf chain if the new data does not fit in
5818 * existing space.
5819 *
5820 * Return 1 if able to complete the job; otherwise 0.
5821 */
5822int
5823m_append(struct mbuf *m0, int len, caddr_t cp)
5824{
5825 struct mbuf *m, *n;
5826 int remainder, space;
5827
5828 for (m = m0; m->m_next != NULL; m = m->m_next)
5829 ;
5830 remainder = len;
5831 space = M_TRAILINGSPACE(m);
5832 if (space > 0) {
5833 /*
5834 * Copy into available space.
5835 */
5836 if (space > remainder)
5837 space = remainder;
5838 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5839 m->m_len += space;
5840 cp += space, remainder -= space;
5841 }
5842 while (remainder > 0) {
5843 /*
5844 * Allocate a new mbuf; could check space
5845 * and allocate a cluster instead.
5846 */
5847 n = m_get(M_WAITOK, m->m_type);
5848 if (n == NULL)
5849 break;
5850 n->m_len = min(MLEN, remainder);
5851 bcopy(cp, mtod(n, caddr_t), n->m_len);
5852 cp += n->m_len;
5853 remainder -= n->m_len;
5854 m->m_next = n;
5855 m = n;
5856 }
5857 if (m0->m_flags & M_PKTHDR)
5858 m0->m_pkthdr.len += len - remainder;
5859 return (remainder == 0);
5860}
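/*
 * Editor's sketch (assumption): appending a small trailer with m_append().
 * Note that the routine above only allocates plain mbufs (not clusters) for
 * the overflow, as its own comment points out.  Names are hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *pkt, caddr_t trailer, int trailer_len)
{
	/* Returns 1 on success, 0 if an mbuf allocation failed part-way */
	return (m_append(pkt, trailer_len, trailer));
}
#endif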
5861
5862struct mbuf *
5863m_last(struct mbuf *m)
5864{
5865 while (m->m_next != NULL)
5866 m = m->m_next;
5867 return (m);
5868}
5869
316670eb
A
5870unsigned int
5871m_fixhdr(struct mbuf *m0)
5872{
5873 u_int len;
5874
39236c6e
A
5875 VERIFY(m0->m_flags & M_PKTHDR);
5876
316670eb
A
5877 len = m_length2(m0, NULL);
5878 m0->m_pkthdr.len = len;
5879 return (len);
5880}
5881
5882unsigned int
5883m_length2(struct mbuf *m0, struct mbuf **last)
5884{
5885 struct mbuf *m;
5886 u_int len;
5887
5888 len = 0;
5889 for (m = m0; m != NULL; m = m->m_next) {
5890 len += m->m_len;
5891 if (m->m_next == NULL)
5892 break;
5893 }
5894 if (last != NULL)
5895 *last = m;
5896 return (len);
5897}
5898
5899/*
5900 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5901 * and clusters. If allocation fails and this cannot be completed, NULL will
5902 * be returned, but the passed in chain will be unchanged. Upon success,
5903 * the original chain will be freed, and the new chain will be returned.
5904 *
5905 * If a non-packet header is passed in, the original mbuf (chain?) will
5906 * be returned unharmed.
5907 *
5908 * If offset is specified, the first mbuf in the chain will have a leading
5909 * space of the amount stated by the "off" parameter.
5910 *
5911 * This routine requires that the m_pkthdr.pkt_hdr field of the original
5912 * mbuf chain is cleared by the caller.
5913 */
5914struct mbuf *
5915m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5916{
5917 struct mbuf *m_new = NULL, *m_final = NULL;
5918 int progress = 0, length, pktlen;
5919
5920 if (!(m0->m_flags & M_PKTHDR))
5921 return (m0);
5922
5923 VERIFY(off < MHLEN);
5924 m_fixhdr(m0); /* Needed sanity check */
5925
5926 pktlen = m0->m_pkthdr.len + off;
5927 if (pktlen > MHLEN)
5928 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5929 else
5930 m_final = m_gethdr(how, MT_DATA);
5931
5932 if (m_final == NULL)
5933 goto nospace;
5934
5935 if (off > 0) {
5936 pktlen -= off;
316670eb
A
5937 m_final->m_data += off;
5938 }
5939
5940 /*
5941 * Caller must have handled the contents pointed to by this
5942 * pointer before coming here, as otherwise it will point to
5943 * the original mbuf which will get freed upon success.
5944 */
39236c6e 5945 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
316670eb
A
5946
5947 if (m_dup_pkthdr(m_final, m0, how) == 0)
5948 goto nospace;
5949
5950 m_new = m_final;
5951
5952 while (progress < pktlen) {
5953 length = pktlen - progress;
5954 if (length > MCLBYTES)
5955 length = MCLBYTES;
39236c6e 5956 length -= ((m_new == m_final) ? off : 0);
316670eb
A
5957
5958 if (m_new == NULL) {
5959 if (length > MLEN)
5960 m_new = m_getcl(how, MT_DATA, 0);
5961 else
5962 m_new = m_get(how, MT_DATA);
5963 if (m_new == NULL)
5964 goto nospace;
5965 }
5966
5967 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5968 progress += length;
5969 m_new->m_len = length;
5970 if (m_new != m_final)
5971 m_cat(m_final, m_new);
5972 m_new = NULL;
5973 }
5974 m_freem(m0);
5975 m0 = m_final;
5976 return (m0);
5977nospace:
5978 if (m_final)
5979 m_freem(m_final);
5980 return (NULL);
5981}
5982
5983struct mbuf *
5984m_defrag(struct mbuf *m0, int how)
5985{
5986 return (m_defrag_offset(m0, 0, how));
5987}
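/*
 * Editor's sketch (assumption): condensing a long chain before handing it to
 * hardware with a limited scatter/gather list.  On failure the original
 * chain is untouched, so the caller keeps ownership of "pkt" either way.
 */
#if 0
static struct mbuf *
example_condense(struct mbuf *pkt)
{
	struct mbuf *m;

	if ((m = m_defrag(pkt, M_DONTWAIT)) == NULL)
		return (pkt);	/* could not defragment; keep the original */
	return (m);		/* original was freed by m_defrag_offset() */
}
#endif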
5988
9bccf70c
A
5989void
5990m_mchtype(struct mbuf *m, int t)
5991{
2d21ac55
A
5992 mtype_stat_inc(t);
5993 mtype_stat_dec(m->m_type);
5994 (m)->m_type = t;
9bccf70c
A
5995}
5996
2d21ac55
A
5997void *
5998m_mtod(struct mbuf *m)
9bccf70c 5999{
2d21ac55 6000 return (MTOD(m, void *));
9bccf70c
A
6001}
6002
2d21ac55
A
6003struct mbuf *
6004m_dtom(void *x)
9bccf70c 6005{
b0d623f7 6006 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
9bccf70c
A
6007}
6008
2d21ac55
A
6009void
6010m_mcheck(struct mbuf *m)
9bccf70c 6011{
2d21ac55 6012 _MCHECK(m);
9bccf70c
A
6013}
6014
6d2010ae
A
6015/*
6016 * Return a pointer to mbuf/offset of location in mbuf chain.
6017 */
6018struct mbuf *
6019m_getptr(struct mbuf *m, int loc, int *off)
6020{
6021
6022 while (loc >= 0) {
6023 /* Normal end of search. */
6024 if (m->m_len > loc) {
6025 *off = loc;
6026 return (m);
6027 } else {
6028 loc -= m->m_len;
6029 if (m->m_next == NULL) {
6030 if (loc == 0) {
6031 /* Point at the end of valid data. */
6032 *off = m->m_len;
6033 return (m);
6034 }
6035 return (NULL);
6036 }
6037 m = m->m_next;
6038 }
6039 }
6040 return (NULL);
6041}
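/*
 * Editor's note: m_getptr() walks the chain so that, on success,
 * mtod(returned mbuf, caddr_t) + *off addresses byte "loc" of the chain
 * (or the first byte past the valid data when loc lands exactly at the
 * end of the last mbuf).
 */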
6042
2d21ac55
A
6043/*
6044 * Inform the corresponding mcache(s) that there's a waiter below.
6045 */
6046static void
6047mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
9bccf70c 6048{
2d21ac55
A
6049 mcache_waiter_inc(m_cache(class));
6050 if (comp) {
6051 if (class == MC_CL) {
6052 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6053 } else if (class == MC_BIGCL) {
6054 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6055 } else if (class == MC_16KCL) {
6056 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6057 } else {
6058 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6059 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6060 }
6061 }
9bccf70c
A
6062}
6063
2d21ac55
A
6064/*
6065 * Inform the corresponding mcache(s) that there's no more waiter below.
6066 */
6067static void
6068mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6069{
6070 mcache_waiter_dec(m_cache(class));
6071 if (comp) {
6072 if (class == MC_CL) {
6073 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6074 } else if (class == MC_BIGCL) {
6075 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6076 } else if (class == MC_16KCL) {
6077 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6078 } else {
6079 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6080 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6081 }
6082 }
6083}
9bccf70c 6084
6d2010ae
A
6085/*
6086 * Called during slab (blocking and non-blocking) allocation. If there
6087 * is at least one waiter, and the time since the first waiter blocked
6088 * exceeds the watchdog timeout, panic the system.
6089 */
6090static void
6091mbuf_watchdog(void)
6092{
6093 struct timeval now;
6094 unsigned int since;
6095
6096 if (mb_waiters == 0 || !mb_watchdog)
6097 return;
6098
6099 microuptime(&now);
6100 since = now.tv_sec - mb_wdtstart.tv_sec;
6101 if (since >= MB_WDT_MAXTIME) {
6102 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6103 mb_waiters, since, mbuf_dump());
6104 /* NOTREACHED */
6105 }
6106}
6107
2d21ac55
A
6108/*
6109 * Called during blocking allocation. Returns TRUE if one or more objects
6110 * are available at the per-CPU cache layer and that the allocation should be
6111 * retried at that level.
6112 */
6113static boolean_t
6114mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
9bccf70c 6115{
2d21ac55
A
6116 boolean_t mcache_retry = FALSE;
6117
6118 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6119
6120 /* Check if there's anything at the cache layer */
6121 if (mbuf_cached_above(class, wait)) {
6122 mcache_retry = TRUE;
6123 goto done;
6124 }
6125
6126 /* Nothing? Then try hard to get it from somewhere */
6127 m_reclaim(class, num, (wait & MCR_COMP));
6128
6129 /* We tried hard and got something? */
6130 if (m_infree(class) > 0) {
6131 mbstat.m_wait++;
6132 goto done;
6133 } else if (mbuf_cached_above(class, wait)) {
6134 mbstat.m_wait++;
6135 mcache_retry = TRUE;
6136 goto done;
6137 } else if (wait & MCR_TRYHARD) {
6138 mcache_retry = TRUE;
6139 goto done;
6140 }
6141
6142 /*
6143 * There's really nothing for us right now; inform the
6144 * cache(s) that there is a waiter below and go to sleep.
6145 */
6146 mbuf_waiter_inc(class, (wait & MCR_COMP));
6147
6148 VERIFY(!(wait & MCR_NOSLEEP));
6d2010ae
A
6149
6150 /*
6151 * If this is the first waiter, arm the watchdog timer. Otherwise
6152 * check if we need to panic the system due to watchdog timeout.
6153 */
6154 if (mb_waiters == 0)
6155 microuptime(&mb_wdtstart);
6156 else
6157 mbuf_watchdog();
6158
2d21ac55
A
6159 mb_waiters++;
6160 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6161
6162 /* We are now up; stop getting notified until next round */
6163 mbuf_waiter_dec(class, (wait & MCR_COMP));
6164
6165 /* We waited and got something */
6166 if (m_infree(class) > 0) {
6167 mbstat.m_wait++;
6168 goto done;
6169 } else if (mbuf_cached_above(class, wait)) {
6170 mbstat.m_wait++;
6171 mcache_retry = TRUE;
6172 }
6173done:
6174 return (mcache_retry);
9bccf70c
A
6175}
6176
91447636 6177static void
2d21ac55 6178mbuf_worker_thread(void)
1c79356b 6179{
2d21ac55
A
6180 int mbuf_expand;
6181
91447636 6182 while (1) {
2d21ac55
A
6183 lck_mtx_lock(mbuf_mlock);
6184
6185 mbuf_expand = 0;
91447636
A
6186 if (mbuf_expand_mcl) {
6187 int n;
2d21ac55
A
6188
6189 /* Adjust to the current number of clusters in use */
6190 n = mbuf_expand_mcl -
6191 (m_total(MC_CL) - m_infree(MC_CL));
6192 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6193 n = m_maxlimit(MC_CL) - m_total(MC_CL);
91447636 6194 mbuf_expand_mcl = 0;
2d21ac55
A
6195
6196 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6197 mbuf_expand++;
91447636
A
6198 }
6199 if (mbuf_expand_big) {
6200 int n;
2d21ac55
A
6201
6202 /* Adjust to the current number of 4 KB clusters in use */
6203 n = mbuf_expand_big -
6204 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6205 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6206 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
91447636 6207 mbuf_expand_big = 0;
2d21ac55
A
6208
6209 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6210 mbuf_expand++;
6211 }
6212 if (mbuf_expand_16k) {
6213 int n;
6214
6215 /* Adjust to the current number of 16 KB clusters in use */
6216 n = mbuf_expand_16k -
6217 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6218 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6219 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6220 mbuf_expand_16k = 0;
6221
6222 if (n > 0)
6223 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6224 }
6225
6226 /*
6227 * Because we can run out of memory before filling the mbuf
6228 * map, we should not allocate more clusters than there are
6229 * mbufs -- otherwise we could have a large number of useless
6230 * clusters allocated.
91447636 6231 */
2d21ac55
A
6232 if (mbuf_expand) {
6233 while (m_total(MC_MBUF) <
6234 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6235 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6236 break;
6237 }
91447636 6238 }
2d21ac55
A
6239
6240 lck_mtx_unlock(mbuf_mlock);
6241
6242 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6243 (void) thread_block((thread_continue_t)mbuf_worker_thread);
91447636 6244 }
1c79356b
A
6245}
6246
91447636 6247static void
2d21ac55 6248mbuf_worker_thread_init(void)
55e303ae 6249{
2d21ac55
A
6250 mbuf_worker_ready++;
6251 mbuf_worker_thread();
55e303ae 6252}
1c79356b 6253
2d21ac55
A
6254static mcl_slab_t *
6255slab_get(void *buf)
6256{
6257 mcl_slabg_t *slg;
6258 unsigned int ix, k;
6259
6260 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6261
6262 VERIFY(MBUF_IN_MAP(buf));
6263 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6264 VERIFY(ix < maxslabgrp);
6265
6266 if ((slg = slabstbl[ix]) == NULL) {
6267 /*
fe8ab488
A
6268 * In the current implementation, we never shrink the slabs
6269 * table; if we attempt to reallocate a cluster group when
6270 * it's already allocated, panic since this is a sign of a
6271 * memory corruption (slabstbl[ix] got nullified).
2d21ac55
A
6272 */
6273 ++slabgrp;
6274 VERIFY(ix < slabgrp);
6275 /*
6276 * Slabs expansion can only be done single threaded; when
6277 * we get here, it must be as a result of m_clalloc() which
6278 * is serialized and therefore mb_clalloc_busy must be set.
6279 */
6280 VERIFY(mb_clalloc_busy);
6281 lck_mtx_unlock(mbuf_mlock);
6282
6283 /* This is a new buffer; create the slabs group for it */
6284 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6285 M_WAITOK | M_ZERO);
6286 VERIFY(slg != NULL);
6287
6288 lck_mtx_lock(mbuf_mlock);
6289 /*
6290 * No other thread could have gone into m_clalloc() after
6291 * we dropped the lock above, so verify that it's true.
6292 */
6293 VERIFY(mb_clalloc_busy);
6294
6295 slabstbl[ix] = slg;
6296
6297 /* Chain each slab in the group to its forward neighbor */
6298 for (k = 1; k < NSLABSPMB; k++)
6299 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6300 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6301
6302 /* And chain the last slab in the previous group to this */
6303 if (ix > 0) {
6304 VERIFY(slabstbl[ix - 1]->
6305 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6306 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6307 &slg->slg_slab[0];
6308 }
6309 }
6310
6d2010ae 6311 ix = MTOBG(buf) % NSLABSPMB;
2d21ac55
A
6312 VERIFY(ix < NSLABSPMB);
6313
6314 return (&slg->slg_slab[ix]);
6315}
6316
6317static void
6318slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6319 void *base, void *head, unsigned int len, int refcnt, int chunks)
6320{
6321 sp->sl_class = class;
6322 sp->sl_flags = flags;
6323 sp->sl_base = base;
6324 sp->sl_head = head;
6325 sp->sl_len = len;
6326 sp->sl_refcnt = refcnt;
6327 sp->sl_chunks = chunks;
6328 slab_detach(sp);
6329}
6330
6331static void
6332slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6333{
6334 VERIFY(slab_is_detached(sp));
6335 m_slab_cnt(class)++;
6336 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6337 sp->sl_flags &= ~SLF_DETACHED;
6d2010ae 6338 if (class == MC_16KCL) {
2d21ac55 6339 int k;
6d2010ae 6340 for (k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
6341 sp = sp->sl_next;
6342 /* Next slab must already be present */
6343 VERIFY(sp != NULL);
6344 VERIFY(slab_is_detached(sp));
6345 sp->sl_flags &= ~SLF_DETACHED;
6346 }
6347 }
6348}
6349
6350static void
6351slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6352{
6353 VERIFY(!slab_is_detached(sp));
6354 VERIFY(m_slab_cnt(class) > 0);
6355 m_slab_cnt(class)--;
6356 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6357 slab_detach(sp);
6d2010ae 6358 if (class == MC_16KCL) {
2d21ac55 6359 int k;
6d2010ae 6360 for (k = 1; k < NSLABSP16KB; k++) {
2d21ac55
A
6361 sp = sp->sl_next;
6362 /* Next slab must already be present */
6363 VERIFY(sp != NULL);
6364 VERIFY(!slab_is_detached(sp));
6365 slab_detach(sp);
6366 }
6367 }
6368}
6369
6370static boolean_t
6371slab_inrange(mcl_slab_t *sp, void *buf)
6372{
6373 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6374 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6375}
6376
b0d623f7 6377#undef panic
2d21ac55
A
6378
6379static void
6380slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6381{
6382 int i;
6383 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6384 uintptr_t buf = (uintptr_t)sp->sl_base;
6385
6386 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6387 void *next = ((mcache_obj_t *)buf)->obj_next;
6388 if (next != addr)
6389 continue;
6d2010ae 6390 if (!mclverify) {
2d21ac55
A
6391 if (next != NULL && !MBUF_IN_MAP(next)) {
6392 mcache_t *cp = m_cache(sp->sl_class);
6393 panic("%s: %s buffer %p in slab %p modified "
6394 "after free at offset 0: %p out of range "
6395 "[%p-%p)\n", __func__, cp->mc_name,
6396 (void *)buf, sp, next, mbutl, embutl);
6397 /* NOTREACHED */
6398 }
6399 } else {
6400 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6401 (mcache_obj_t *)buf);
6402 mcl_audit_verify_nextptr(next, mca);
6403 }
6404 }
6405}
6406
6407static void
6408slab_detach(mcl_slab_t *sp)
6409{
6410 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6411 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6412 sp->sl_flags |= SLF_DETACHED;
6413}
6414
6415static boolean_t
6416slab_is_detached(mcl_slab_t *sp)
6417{
6418 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6419 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6420 (sp->sl_flags & SLF_DETACHED));
6421}
6422
6423static void
6424mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6425 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6426{
6427 mcache_audit_t *mca, *mca_tail;
6428 mcache_obj_t *con = NULL;
6429 boolean_t save_contents = (con_list != NULL);
6430 unsigned int i, ix;
6431
6d2010ae 6432 ASSERT(num <= NMBPBG);
2d21ac55
A
6433 ASSERT(con_list == NULL || con_size != 0);
6434
6d2010ae
A
6435 ix = MTOBG(buf);
6436 VERIFY(ix < maxclaudit);
6437
2d21ac55 6438 /* Make sure we haven't been here before */
6d2010ae 6439 for (i = 0; i < NMBPBG; i++)
2d21ac55
A
6440 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6441
6442 mca = mca_tail = *mca_list;
6443 if (save_contents)
6444 con = *con_list;
6445
6446 for (i = 0; i < num; i++) {
6447 mcache_audit_t *next;
6448
6449 next = mca->mca_next;
6450 bzero(mca, sizeof (*mca));
6451 mca->mca_next = next;
6452 mclaudit[ix].cl_audit[i] = mca;
6453
6454 /* Attach the contents buffer if requested */
6455 if (save_contents) {
39236c6e
A
6456 mcl_saved_contents_t *msc =
6457 (mcl_saved_contents_t *)(void *)con;
6458
6459 VERIFY(msc != NULL);
6460 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6461 VERIFY(con_size == sizeof (*msc));
2d21ac55 6462 mca->mca_contents_size = con_size;
39236c6e 6463 mca->mca_contents = msc;
2d21ac55
A
6464 con = con->obj_next;
6465 bzero(mca->mca_contents, mca->mca_contents_size);
6466 }
6467
6468 mca_tail = mca;
6469 mca = mca->mca_next;
6470 }
91447636 6471
2d21ac55
A
6472 if (save_contents)
6473 *con_list = con;
6474
6475 *mca_list = mca_tail->mca_next;
6476 mca_tail->mca_next = NULL;
6477}
6478
fe8ab488
A
6479static void
6480mcl_audit_free(void *buf, unsigned int num)
6481{
6482 unsigned int i, ix;
6483 mcache_audit_t *mca, *mca_list;
6484
6485 ix = MTOBG(buf);
6486 VERIFY(ix < maxclaudit);
6487
6488 if (mclaudit[ix].cl_audit[0] != NULL) {
6489 mca_list = mclaudit[ix].cl_audit[0];
6490 for (i = 0; i < num; i++) {
6491 mca = mclaudit[ix].cl_audit[i];
6492 mclaudit[ix].cl_audit[i] = NULL;
6493 if (mca->mca_contents)
6494 mcache_free(mcl_audit_con_cache,
6495 mca->mca_contents);
6496 }
6497 mcache_free_ext(mcache_audit_cache,
6498 (mcache_obj_t *)mca_list);
6499 }
6500}
6501
2d21ac55 6502/*
6d2010ae 6503 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
2d21ac55
A
6504 * the corresponding audit structure for that buffer.
6505 */
6506static mcache_audit_t *
6507mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6508{
6509 mcache_audit_t *mca = NULL;
6d2010ae 6510 int ix = MTOBG(o);
2d21ac55 6511
6d2010ae 6512 VERIFY(ix < maxclaudit);
2d21ac55
A
6513 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6514
6515 switch (class) {
6516 case MC_MBUF:
6517 /*
6d2010ae 6518 * For the mbuf case, find the index of the page
2d21ac55 6519 * used by the mbuf and use that index to locate the
6d2010ae
A
6520 * base address of the page. Then find out the
6521 * mbuf index relative to the page base and use
2d21ac55
A
6522 * it to locate the audit structure.
6523 */
6d2010ae
A
6524 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6525 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
2d21ac55
A
6526 break;
6527
6528 case MC_CL:
6d2010ae
A
6529 /*
6530 * Same thing as above, but for 2KB clusters in a page.
6531 */
6532 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6533 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6534 break;
6535
2d21ac55
A
6536 case MC_BIGCL:
6537 case MC_16KCL:
6538 /*
6539 * Same as above, but only return the first element.
6540 */
6541 mca = mclaudit[ix].cl_audit[0];
6542 break;
6543
6544 default:
6545 VERIFY(0);
6546 /* NOTREACHED */
6547 }
6548
6549 return (mca);
6550}
6551
6552static void
6553mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6554 boolean_t alloc)
6555{
6556 struct mbuf *m = addr;
6557 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6558
6559 VERIFY(mca->mca_contents != NULL &&
6560 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6561
6d2010ae
A
6562 if (mclverify)
6563 mcl_audit_verify_nextptr(next, mca);
2d21ac55
A
6564
6565 if (!alloc) {
6566 /* Save constructed mbuf fields */
6567 mcl_audit_save_mbuf(m, mca);
6d2010ae
A
6568 if (mclverify) {
6569 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6570 m_maxsize(MC_MBUF));
6571 }
2d21ac55
A
6572 ((mcache_obj_t *)m)->obj_next = next;
6573 return;
6574 }
6575
6576 /* Check if the buffer has been corrupted while in freelist */
6d2010ae
A
6577 if (mclverify) {
6578 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6579 }
2d21ac55
A
6580 /* Restore constructed mbuf fields */
6581 mcl_audit_restore_mbuf(m, mca, composite);
6582}
6583
6584static void
6585mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6586{
39236c6e 6587 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
2d21ac55
A
6588
6589 if (composite) {
6590 struct mbuf *next = m->m_next;
6591 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6592 MBUF_IS_COMPOSITE(ms));
39236c6e 6593 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
2d21ac55
A
6594 /*
6595 * We could have hand-picked the mbuf fields and restored
6596 * them individually, but that would be a maintenance
6597 * headache. Instead, restore everything that was saved;
6598 * the mbuf layer will recheck and reinitialize anyway.
6599 */
39236c6e 6600 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
2d21ac55
A
6601 m->m_next = next;
6602 } else {
6603 /*
6604 * For a regular mbuf (no cluster attached) there's nothing
6605 * to restore other than the type field, which is expected
6606 * to be MT_FREE.
6607 */
6608 m->m_type = ms->m_type;
6609 }
6610 _MCHECK(m);
6611}
6612
6613static void
6614mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6615{
39236c6e 6616 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
2d21ac55 6617 _MCHECK(m);
39236c6e 6618 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
2d21ac55
A
6619}
6620
6621static void
6622mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6623 boolean_t save_next)
6624{
6625 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6626
6627 if (!alloc) {
6d2010ae
A
6628 if (mclverify) {
6629 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6630 }
2d21ac55
A
6631 if (save_next) {
6632 mcl_audit_verify_nextptr(next, mca);
6633 ((mcache_obj_t *)addr)->obj_next = next;
6634 }
6d2010ae 6635 } else if (mclverify) {
2d21ac55
A
6636 /* Check if the buffer has been corrupted while in freelist */
6637 mcl_audit_verify_nextptr(next, mca);
6638 mcache_audit_free_verify_set(mca, addr, 0, size);
6639 }
6640}
6641
39236c6e
A
6642static void
6643mcl_audit_scratch(mcache_audit_t *mca)
6644{
6645 void *stack[MCACHE_STACK_DEPTH + 1];
6646 mcl_scratch_audit_t *msa;
6647 struct timeval now;
6648
6649 VERIFY(mca->mca_contents != NULL);
6650 msa = MCA_SAVED_SCRATCH_PTR(mca);
6651
6652 msa->msa_pthread = msa->msa_thread;
6653 msa->msa_thread = current_thread();
6654 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6655 msa->msa_pdepth = msa->msa_depth;
6656 bzero(stack, sizeof (stack));
6657 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
fe8ab488 6658 bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
39236c6e
A
6659
6660 msa->msa_ptstamp = msa->msa_tstamp;
6661 microuptime(&now);
6662 /* tstamp is in ms relative to base_ts */
6663 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6664 if ((now.tv_sec - mb_start.tv_sec) > 0)
6665 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6666}
6667
2d21ac55
A
6668static void
6669mcl_audit_mcheck_panic(struct mbuf *m)
6670{
6671 mcache_audit_t *mca;
6672
6673 MRANGE(m);
6674 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6675
6676 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6677 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6678 /* NOTREACHED */
6679}
6680
6681static void
6682mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6683{
6d2010ae
A
6684 if (next != NULL && !MBUF_IN_MAP(next) &&
6685 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
2d21ac55
A
6686 panic("mcl_audit: buffer %p modified after free at offset 0: "
6687 "%p out of range [%p-%p)\n%s\n",
6688 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6689 /* NOTREACHED */
6690 }
6691}
6692
6d2010ae
A
6693/* This function turns on mbuf leak detection */
6694static void
6695mleak_activate(void)
6696{
6697 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6698 PE_parse_boot_argn("mleak_sample_factor",
6699 &mleak_table.mleak_sample_factor,
6700 sizeof (mleak_table.mleak_sample_factor));
6701
6702 if (mleak_table.mleak_sample_factor == 0)
6703 mclfindleak = 0;
6704
6705 if (mclfindleak == 0)
6706 return;
6707
6708 vm_size_t alloc_size =
6709 mleak_alloc_buckets * sizeof (struct mallocation);
6710 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6711
6712 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6713 M_TEMP, M_WAITOK | M_ZERO);
6714 VERIFY(mleak_allocations != NULL);
6715
6716 MALLOC(mleak_traces, struct mtrace *, trace_size,
6717 M_TEMP, M_WAITOK | M_ZERO);
6718 VERIFY(mleak_traces != NULL);
6719
6720 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6721 M_TEMP, M_WAITOK | M_ZERO);
6722 VERIFY(mleak_stat != NULL);
6723 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6724#ifdef __LP64__
6725 mleak_stat->ml_isaddr64 = 1;
6726#endif /* __LP64__ */
6727}
6728
6729static void
6730mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6731{
6732 int temp;
6733
6734 if (mclfindleak == 0)
6735 return;
6736
6737 if (!alloc)
6738 return (mleak_free(addr));
6739
6740 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6741
6742 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6743 uintptr_t bt[MLEAK_STACK_DEPTH];
6744 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6745 mleak_log(bt, addr, logged, num);
6746 }
6747}
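/*
 * Editor's note: mleak_logger() samples rather than records every
 * allocation; mleak_capture is a global counter, so only every
 * mleak_sample_factor-th allocation has its backtrace captured and passed
 * to mleak_log().  The factor defaults to MLEAK_SAMPLE_FACTOR and can be
 * overridden with the mleak_sample_factor boot-arg, as mleak_activate()
 * above shows.
 */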
6748
6749/*
6750 * This function records the allocation in the mleak_allocations table and
6751 * the backtrace in the mleak_traces table.  If the allocation slot is in use,
6752 * the old allocation is replaced with the new one; if the trace slot is in
6753 * use, return (or increment the refcount if it is the same trace).
6754 */
6755static boolean_t
6756mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6757{
6758 struct mallocation *allocation;
6759 struct mtrace *trace;
6760 uint32_t trace_index;
6d2010ae
A
6761
6762 /* Quit if someone else modifying the tables */
6763 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6764 mleak_table.total_conflicts++;
6765 return (FALSE);
6766 }
6767
6768 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6769 mleak_alloc_buckets)];
6770 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6771 trace = &mleak_traces[trace_index];
6772
6773 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6774 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6775
6776 allocation->hitcount++;
6777 trace->hitcount++;
6778
6779 /*
6780 * If the allocation bucket we want is occupied
6781 * and the occupier has the same trace, just bail.
6782 */
6783 if (allocation->element != NULL &&
6784 trace_index == allocation->trace_index) {
6785 mleak_table.alloc_collisions++;
6786 lck_mtx_unlock(mleak_lock);
6787 return (TRUE);
6788 }
6789
6790 /*
6791 * Store the backtrace in the traces array;
6792 * Size of zero = trace bucket is free.
6793 */
6794 if (trace->allocs > 0 &&
6795 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6796 /* Different, unique trace, but the same hash! Bail out. */
6797 trace->collisions++;
6798 mleak_table.trace_collisions++;
6799 lck_mtx_unlock(mleak_lock);
6800 return (TRUE);
6801 } else if (trace->allocs > 0) {
6802 /* Same trace, already added, so increment refcount */
6803 trace->allocs++;
6804 } else {
6805 /* Found an unused trace bucket, so record the trace here */
6806 if (trace->depth != 0) {
6807 /* this slot previously used but not currently in use */
6808 mleak_table.trace_overwrites++;
6809 }
6810 mleak_table.trace_recorded++;
6811 trace->allocs = 1;
6812 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6813 trace->depth = depth;
6814 trace->collisions = 0;
6815 }
6816
6817 /* Step 2: Store the allocation record in the allocations array */
6818 if (allocation->element != NULL) {
6819 /*
6820 * Replace an existing allocation. No need to preserve
6821 * because only a subset of the allocations are being
6822 * recorded anyway.
6823 */
6824 mleak_table.alloc_collisions++;
6825 } else if (allocation->trace_index != 0) {
6826 mleak_table.alloc_overwrites++;
6827 }
6828 allocation->element = addr;
6829 allocation->trace_index = trace_index;
6830 allocation->count = num;
6831 mleak_table.alloc_recorded++;
6832 mleak_table.outstanding_allocs++;
6833
6d2010ae
A
6834 lck_mtx_unlock(mleak_lock);
6835 return (TRUE);
6836}
6837
6838static void
6839mleak_free(mcache_obj_t *addr)
6840{
6841 while (addr != NULL) {
6842 struct mallocation *allocation = &mleak_allocations
6843 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6844
6845 if (allocation->element == addr &&
6846 allocation->trace_index < mleak_trace_buckets) {
6847 lck_mtx_lock_spin(mleak_lock);
6848 if (allocation->element == addr &&
6849 allocation->trace_index < mleak_trace_buckets) {
6850 struct mtrace *trace;
6851 trace = &mleak_traces[allocation->trace_index];
6852 /* allocs = 0 means trace bucket is unused */
6853 if (trace->allocs > 0)
6854 trace->allocs--;
6855 if (trace->allocs == 0)
6856 trace->depth = 0;
6857 /* NULL element means alloc bucket is unused */
6858 allocation->element = NULL;
6859 mleak_table.outstanding_allocs--;
6860 }
6861 lck_mtx_unlock(mleak_lock);
6862 }
6863 addr = addr->obj_next;
6864 }
6865}
6866
316670eb
A
6867static void
6868 mleak_sort_traces(void)
6869{
6870 int i, j, k;
6871 struct mtrace *swap;
6872
6873 for (i = 0; i < MLEAK_NUM_TRACES; i++)
6874 mleak_top_trace[i] = NULL;
6875
6876 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6877 {
6878 if (mleak_traces[i].allocs <= 0)
6879 continue;
6880
6881 mleak_top_trace[j] = &mleak_traces[i];
6882 for (k = j; k > 0; k--) {
6883 if (mleak_top_trace[k]->allocs <=
6884 mleak_top_trace[k-1]->allocs)
6885 break;
6886
6887 swap = mleak_top_trace[k-1];
6888 mleak_top_trace[k-1] = mleak_top_trace[k];
6889 mleak_top_trace[k] = swap;
6890 }
6891 j++;
6892 }
6893
6894 j--;
6895 for (; i < mleak_trace_buckets; i++) {
6896 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6897 continue;
6898
6899 mleak_top_trace[j] = &mleak_traces[i];
6900
6901 for (k = j; k > 0; k--) {
6902 if (mleak_top_trace[k]->allocs <=
6903 mleak_top_trace[k-1]->allocs)
6904 break;
6905
6906 swap = mleak_top_trace[k-1];
6907 mleak_top_trace[k-1] = mleak_top_trace[k];
6908 mleak_top_trace[k] = swap;
6909 }
6910 }
6911}
6912
6913static void
6914 mleak_update_stats(void)
6915{
6916 mleak_trace_stat_t *mltr;
6917 int i;
6918
6919 VERIFY(mleak_stat != NULL);
6920#ifdef __LP64__
6921 VERIFY(mleak_stat->ml_isaddr64);
6922#else
6923 VERIFY(!mleak_stat->ml_isaddr64);
6924#endif /* !__LP64__ */
6925 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6926
6927 mleak_sort_traces();
6928
6929 mltr = &mleak_stat->ml_trace[0];
6930 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6931 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6932 int j;
6933
6934 if (mleak_top_trace[i] == NULL ||
6935 mleak_top_trace[i]->allocs == 0)
6936 continue;
6937
6938 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6939 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6940 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6941 mltr->mltr_depth = mleak_top_trace[i]->depth;
6942
6943 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6944 for (j = 0; j < mltr->mltr_depth; j++)
6945 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6946
6947 mltr++;
6948 }
6949}
6950
6d2010ae
A
6951static struct mbtypes {
6952 int mt_type;
6953 const char *mt_name;
6954} mbtypes[] = {
6955 { MT_DATA, "data" },
6956 { MT_OOBDATA, "oob data" },
6957 { MT_CONTROL, "ancillary data" },
6958 { MT_HEADER, "packet headers" },
6959 { MT_SOCKET, "socket structures" },
6960 { MT_PCB, "protocol control blocks" },
6961 { MT_RTABLE, "routing table entries" },
6962 { MT_HTABLE, "IMP host table entries" },
6963 { MT_ATABLE, "address resolution tables" },
6964 { MT_FTABLE, "fragment reassembly queue headers" },
6965 { MT_SONAME, "socket names and addresses" },
6966 { MT_SOOPTS, "socket options" },
6967 { MT_RIGHTS, "access rights" },
6968 { MT_IFADDR, "interface addresses" },
6969 { MT_TAG, "packet tags" },
6970 { 0, NULL }
6971};
6972
6973#define MBUF_DUMP_BUF_CHK() { \
6974 clen -= k; \
6975 if (clen < 1) \
6976 goto done; \
6977 c += k; \
6978}
6979
6980static char *
6981mbuf_dump(void)
6982{
6983 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6984 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6985 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6986 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6987 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6988 uint8_t seen[256];
6989 struct mbtypes *mp;
6990 mb_class_stat_t *sp;
316670eb 6991 mleak_trace_stat_t *mltr;
6d2010ae 6992 char *c = mbuf_dump_buf;
316670eb 6993 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6d2010ae
A
6994
6995 mbuf_dump_buf[0] = '\0';
6996
6997 /* synchronize all statistics in the mbuf table */
6998 mbuf_stat_sync();
6999 mbuf_mtypes_sync(TRUE);
7000
7001 sp = &mb_stat->mbs_class[0];
7002 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7003 u_int32_t mem;
7004
7005 if (m_class(i) == MC_MBUF) {
7006 m_mbufs = sp->mbcl_active;
7007 } else if (m_class(i) == MC_CL) {
7008 m_clfree = sp->mbcl_total - sp->mbcl_active;
7009 } else if (m_class(i) == MC_BIGCL) {
7010 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7011 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7012 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7013 m_16kclusters = sp->mbcl_total;
7014 } else if (m_class(i) == MC_MBUF_CL) {
7015 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7016 } else if (m_class(i) == MC_MBUF_BIGCL) {
7017 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7018 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7019 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7020 }
7021
7022 mem = sp->mbcl_ctotal * sp->mbcl_size;
7023 totmem += mem;
7024 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7025 sp->mbcl_size;
7026
7027 }
7028
7029 /* adjust free counts to include composite caches */
7030 m_clfree += m_mbufclfree;
7031 m_bigclfree += m_mbufbigclfree;
7032 m_16kclfree += m_mbuf16kclfree;
7033
7034 totmbufs = 0;
7035 for (mp = mbtypes; mp->mt_name != NULL; mp++)
7036 totmbufs += mbstat.m_mtypes[mp->mt_type];
7037 if (totmbufs > m_mbufs)
7038 totmbufs = m_mbufs;
7039 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7040 MBUF_DUMP_BUF_CHK();
7041
7042 bzero(&seen, sizeof (seen));
7043 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7044 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7045 seen[mp->mt_type] = 1;
7046 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7047 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7048 MBUF_DUMP_BUF_CHK();
7049 }
7050 }
7051 seen[MT_FREE] = 1;
7052 for (i = 0; i < nmbtypes; i++)
7053 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7054 k = snprintf(c, clen, "\t%u mbufs allocated to "
7055 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7056 MBUF_DUMP_BUF_CHK();
7057 }
7058 if ((m_mbufs - totmbufs) > 0) {
7059 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7060 m_mbufs - totmbufs);
7061 MBUF_DUMP_BUF_CHK();
7062 }
7063 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7064 "%u/%u mbuf 4KB clusters in use\n",
7065 (unsigned int)(mbstat.m_clusters - m_clfree),
7066 (unsigned int)mbstat.m_clusters,
7067 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7068 (unsigned int)mbstat.m_bigclusters);
7069 MBUF_DUMP_BUF_CHK();
7070
7071 if (njcl > 0) {
7072 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7073 m_16kclusters - m_16kclfree, m_16kclusters,
7074 njclbytes / 1024);
7075 MBUF_DUMP_BUF_CHK();
7076 }
7077 totused = totmem - totfree;
7078 if (totmem == 0) {
7079 totpct = 0;
7080 } else if (totused < (ULONG_MAX / 100)) {
7081 totpct = (totused * 100) / totmem;
7082 } else {
7083 u_long totmem1 = totmem / 100;
7084 u_long totused1 = totused / 100;
7085 totpct = (totused1 * 100) / totmem1;
7086 }
7087 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7088 "in use)\n", totmem / 1024, totpct);
7089 MBUF_DUMP_BUF_CHK();
7090
316670eb
A
7091 /* mbuf leak detection statistics */
7092 mleak_update_stats();
7093
7094 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7095 MBUF_DUMP_BUF_CHK();
7096 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7097 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7098 mleak_table.mleak_sample_factor);
7099 MBUF_DUMP_BUF_CHK();
7100 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7101 mleak_table.outstanding_allocs);
7102 MBUF_DUMP_BUF_CHK();
7103 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7104 mleak_table.alloc_recorded, mleak_table.trace_recorded);
7105 MBUF_DUMP_BUF_CHK();
7106 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7107 mleak_table.alloc_collisions, mleak_table.trace_collisions);
7108 MBUF_DUMP_BUF_CHK();
7109 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7110 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7111 MBUF_DUMP_BUF_CHK();
7112 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7113 mleak_table.total_conflicts);
7114 MBUF_DUMP_BUF_CHK();
7115
7116 k = snprintf(c, clen, "top %d outstanding traces:\n",
7117 mleak_stat->ml_cnt);
7118 MBUF_DUMP_BUF_CHK();
7119 for (i = 0; i < mleak_stat->ml_cnt; i++) {
7120 mltr = &mleak_stat->ml_trace[i];
7121 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7122 "%llu hit(s), %llu collision(s)\n", (i + 1),
7123 mltr->mltr_allocs, mltr->mltr_hitcount,
7124 mltr->mltr_collisions);
7125 MBUF_DUMP_BUF_CHK();
7126 }
7127
7128 if (mleak_stat->ml_isaddr64)
7129 k = snprintf(c, clen, MB_LEAK_HDR_64);
7130 else
7131 k = snprintf(c, clen, MB_LEAK_HDR_32);
7132 MBUF_DUMP_BUF_CHK();
7133
7134 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7135 int j;
7136 k = snprintf(c, clen, "%2d: ", (i + 1));
7137 MBUF_DUMP_BUF_CHK();
7138 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7139 mltr = &mleak_stat->ml_trace[j];
7140 if (i < mltr->mltr_depth) {
7141 if (mleak_stat->ml_isaddr64) {
7142 k = snprintf(c, clen, "0x%0llx ",
fe8ab488
A
7143 (uint64_t)VM_KERNEL_UNSLIDE(
7144 mltr->mltr_addr[i]));
316670eb
A
7145 } else {
7146 k = snprintf(c, clen,
7147 "0x%08x ",
7148 (uint32_t)VM_KERNEL_UNSLIDE(
7149 mltr->mltr_addr[i]));
7150 }
7151 } else {
7152 if (mleak_stat->ml_isaddr64)
7153 k = snprintf(c, clen,
7154 MB_LEAK_SPACING_64);
7155 else
7156 k = snprintf(c, clen,
7157 MB_LEAK_SPACING_32);
7158 }
7159 MBUF_DUMP_BUF_CHK();
7160 }
7161 k = snprintf(c, clen, "\n");
7162 MBUF_DUMP_BUF_CHK();
7163 }
7164done:
7165 return (mbuf_dump_buf);
7166}
7167
7168#undef MBUF_DUMP_BUF_CHK
7169
7170/*
7171 * Convert between a regular and a packet header mbuf.  The caller requests the
7172 * desired form via 'hdr'; this routine sets or clears M_PKTHDR and does the rest.
7173 */
7174int
7175m_reinit(struct mbuf *m, int hdr)
7176{
7177 int ret = 0;
7178
7179 if (hdr) {
7180 VERIFY(!(m->m_flags & M_PKTHDR));
7181 if (!(m->m_flags & M_EXT) &&
7182 (m->m_data != m->m_dat || m->m_len > 0)) {
7183 /*
7184 * If there's no external cluster attached and the
7185 * mbuf appears to contain user data, we cannot
7186 * safely convert this to a packet header mbuf,
7187 * as the packet header structure might overlap
7188 * with the data.
7189 */
7190 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7191 "m_data %llx (expected %llx), "
7192 "m_len %d (expected 0)\n",
7193 __func__,
7194 (uint64_t)VM_KERNEL_ADDRPERM(m),
7195 (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7196 (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7197 ret = EBUSY;
7198 } else {
7199 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7200 m->m_flags |= M_PKTHDR;
7201 MBUF_INIT_PKTHDR(m);
7202 }
7203 } else {
7204 /* Check for scratch area overflow */
7205 m_redzone_verify(m);
7206 /* Free the aux data and tags if there is any */
7207 m_tag_delete_chain(m, NULL);
7208 m->m_flags &= ~M_PKTHDR;
7209 }
7210
7211 return (ret);
7212}
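
/*
 * Illustrative sketch, not part of the original source: one way a caller
 * could use m_reinit() to turn a plain mbuf into a packet-header mbuf.
 * The function name below is hypothetical.
 */
static int
example_m_reinit_usage(struct mbuf *m)
{
	/*
	 * M_PKTHDR must not already be set; m_reinit() returns EBUSY if the
	 * mbuf appears to hold data that the packet header would overlap.
	 */
	if (m_reinit(m, 1) != 0)
		return (EBUSY);

	/* m now carries an initialized packet header */
	return (0);
}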
7213
7214void
7215m_scratch_init(struct mbuf *m)
7216{
7217 struct pkthdr *pkt = &m->m_pkthdr;
7218
7219 VERIFY(m->m_flags & M_PKTHDR);
7220
7221 /* See comments in <rdar://problem/14040693> */
7222 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7223 panic_plain("Invalid attempt to modify guarded module-private "
7224 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7225 /* NOTREACHED */
7226 }
7227
7228 bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7229}
7230
7231/*
7232 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7233 * xnu that intend on utilizing the module-private area should directly
7234 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
7235 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7236 * to handing it off to another module, respectively.
7237 */
7238u_int32_t
7239m_scratch_get(struct mbuf *m, u_int8_t **p)
7240{
7241 struct pkthdr *pkt = &m->m_pkthdr;
7242
7243 VERIFY(m->m_flags & M_PKTHDR);
7244
7245 /* See comments in <rdar://problem/14040693> */
7246 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7247 panic_plain("Invalid attempt to access guarded module-private "
7248 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7249 /* NOTREACHED */
7250 }
7251
7252 if (mcltrace) {
7253 mcache_audit_t *mca;
7254
7255 lck_mtx_lock(mbuf_mlock);
7256 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7257 if (mca->mca_uflags & MB_SCVALID)
7258 mcl_audit_scratch(mca);
7259 lck_mtx_unlock(mbuf_mlock);
7260 }
7261
7262 *p = (u_int8_t *)&pkt->pkt_mpriv;
7263 return (sizeof (pkt->pkt_mpriv));
7264}
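
/*
 * Illustrative sketch, not part of the original source: how a client inside
 * xnu would follow the guidance above and use pkt_mpriv directly instead of
 * mbuf_get_driver_scratch().  The function name below is hypothetical.
 */
static void
example_mpriv_usage(struct mbuf *m)
{
	struct pkthdr *pkt = &m->m_pkthdr;

	VERIFY(m->m_flags & M_PKTHDR);

	/* guard the module-private area while this module owns the packet */
	pkt->pkt_flags |= PKTF_PRIV_GUARDED;
	bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));

	/* ... store module-private state in pkt->pkt_mpriv ... */

	/* drop the guard before handing the packet off to another module */
	pkt->pkt_flags &= ~PKTF_PRIV_GUARDED;
}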
7265
7266static void
7267m_redzone_init(struct mbuf *m)
7268{
7269 VERIFY(m->m_flags & M_PKTHDR);
7270 /*
7271 * Each mbuf has a unique red zone pattern, which is the XOR
7272 * of the red zone cookie and the address of the mbuf.
7273 */
7274 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7275}
7276
7277static void
7278m_redzone_verify(struct mbuf *m)
7279{
7280 u_int32_t mb_redzone;
7281
7282 VERIFY(m->m_flags & M_PKTHDR);
7283
7284 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7285 if (m->m_pkthdr.redzone != mb_redzone) {
7286 panic("mbuf %p redzone violation with value 0x%x "
7287 "(instead of 0x%x, using cookie 0x%x)\n",
7288 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7289 /* NOTREACHED */
7290 }
7291}
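
/*
 * Worked example for the red zone scheme above (the cookie and address are
 * made up for illustration): with mb_redzone_cookie == 0x5a5a5a5a and an
 * mbuf at 0xffffff8012345600, m_redzone_init() stores
 * 0x12345600 ^ 0x5a5a5a5a == 0x486e0c5a (only the low 32 bits of the
 * address participate), and m_redzone_verify() recomputes the same value,
 * panicking on any mismatch.
 */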
7292
7293/*
7294 * Request a report of mbuf usage when the class total has set a new peak,
7295 * is at least 1/16th (about 6%) of the max limit, and has grown by at
7296 * least 1/32nd (about 3%) since the previously reported peak.
7297 *
7298 * The values 1/16 and 1/32 allow simple arithmetic with shift operations.
7299 */
7300static boolean_t
7301mbuf_report_usage(mbuf_class_t cl)
7302{
7303 /* if a report is already in progress, nothing to do */
7304 if (mb_peak_newreport)
7305 return (TRUE);
7306
7307 if (m_total(cl) > m_peak(cl) &&
7308 m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7309 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7310 return (TRUE);
7311 return (FALSE);
7312}
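
/*
 * Worked example for the thresholds above (the numbers are hypothetical):
 * with m_maxlimit(cl) == 32768 and m_peak(cl) == 3200, a report is
 * requested once m_total(cl) reaches both 32768 >> 4 == 2048 (about 6% of
 * the limit) and 3200 + (3200 >> 5) == 3300 (about a 3% rise over the
 * previous peak).
 */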
7313
7314__private_extern__ void
7315mbuf_report_peak_usage(void)
7316{
7317 int i = 0;
7318 u_int64_t uptime;
7319 struct nstat_sysinfo_data ns_data;
7320 uint32_t memreleased = 0;
7321
7322 uptime = net_uptime();
7323 lck_mtx_lock(mbuf_mlock);
7324
7325 /* Generate an initial report after 1 week of uptime */
7326 if (!mb_peak_firstreport &&
7327 uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7328 mb_peak_newreport = TRUE;
7329 mb_peak_firstreport = TRUE;
7330 }
7331
7332 if (!mb_peak_newreport) {
7333 lck_mtx_unlock(mbuf_mlock);
7334 return;
7335 }
7336
7337 /*
7338 * Since a report is being generated before 1 week,
7339 * we do not need to force another one later
7340 */
7341 if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7342 mb_peak_firstreport = TRUE;
7343
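	/* Record the current per-class totals as the new peaks */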
7344 for (i = 0; i < NELEM(mbuf_table); i++) {
7345 m_peak(m_class(i)) = m_total(m_class(i));
7346 memreleased += m_release_cnt(i);
7347 }
7348 mb_peak_newreport = FALSE;
7349 lck_mtx_unlock(mbuf_mlock);
7350
7351 bzero(&ns_data, sizeof(ns_data));
7352 ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7353 ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7354 ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7355 ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7356 ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7357 ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7358 ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7359 ns_data.u.mb_stats.memreleased = memreleased;
7360
7361 nstat_sysinfo_send_data(&ns_data);
7362}
7363
7364/*
7365 * Called by the VM when there's memory pressure.
7366 */
7367__private_extern__ void
7368m_drain(void)
7369{
7370 mbuf_class_t mc;
7371 mcl_slab_t *sp, *sp_tmp, *nsp;
7372 unsigned int num, k, interval, released = 0;
7373 unsigned int total_mem = 0, use_mem = 0;
7374 boolean_t ret, purge_caches = FALSE;
7375 ppnum_t offset;
7376 mcache_obj_t *obj;
7377 float per;
7378 static uint64_t last_drain = 0;
7379 static unsigned char scratch[32];
7380 static ppnum_t scratch_pa = 0;
7381
7382 if (mb_drain_maxint == 0 || mb_waiters)
7383 return;
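	/*
	 * Lazily resolve the physical address of the scratch buffer; freed
	 * cluster pages are re-pointed at it in the IOMapper below so that
	 * any stale DMA lands somewhere harmless (and, with mclverify set,
	 * detectably).
	 */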
7384 if (scratch_pa == 0) {
7385 bzero(scratch, sizeof(scratch));
7386 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
7387 VERIFY(scratch_pa);
7388 } else if (mclverify) {
7389 /*
7390 * Panic if a driver wrote to our scratch memory.
7391 */
7392 for (k = 0; k < sizeof(scratch); k++)
7393 if (scratch[k])
7394 panic("suspect DMA to freed address");
7395 }
7396 /*
7397	 * Don't free memory too often, as that could cause excessive waiting
7398	 * times for mbufs.  Skip this pass if the previous drain was within
7399	 * mb_drain_maxint seconds; also purge the caches if it was within
7399	 * five times that interval.
7400 */
7401 lck_mtx_lock(mbuf_mlock);
7402 if (last_drain == 0) {
7403 last_drain = net_uptime();
7404 lck_mtx_unlock(mbuf_mlock);
7405 return;
7406 }
7407 interval = net_uptime() - last_drain;
7408 if (interval <= mb_drain_maxint) {
7409 lck_mtx_unlock(mbuf_mlock);
7410 return;
7411 }
7412 if (interval <= mb_drain_maxint * 5)
7413 purge_caches = TRUE;
7414 last_drain = net_uptime();
7415 /*
7416 * Don't free any memory if we're using 60% or more.
7417 */
7418 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7419 total_mem += m_total(mc) * m_maxsize(mc);
7420 use_mem += m_active(mc) * m_maxsize(mc);
7421 }
7422 per = (float)use_mem / (float)total_mem;
7423 if (per >= 0.6) {
7424 lck_mtx_unlock(mbuf_mlock);
7425 return;
7426 }
7427 /*
7428 * Purge all the caches. This effectively disables
7429 * caching for a few seconds, but the mbuf worker thread will
7430 * re-enable them again.
7431 */
7432 if (purge_caches == TRUE)
7433 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7434 if (m_total(mc) < m_avgtotal(mc))
7435 continue;
7436 lck_mtx_unlock(mbuf_mlock);
7437 ret = mcache_purge_cache(m_cache(mc), FALSE);
7438 lck_mtx_lock(mbuf_mlock);
7439 if (ret == TRUE)
7440 m_purge_cnt(mc)++;
7441 }
7442 /*
7443 * Move the objects from the composite class freelist to
7444 * the rudimentary slabs list, but keep at least 10% of the average
7445 * total in the freelist.
7446 */
7447 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
7448 while (m_cobjlist(mc) &&
7449 m_total(mc) < m_avgtotal(mc) &&
7450 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
7451 obj = m_cobjlist(mc);
7452 m_cobjlist(mc) = obj->obj_next;
7453 obj->obj_next = NULL;
7454 num = cslab_free(mc, obj, 1);
7455 VERIFY(num == 1);
7456 m_free_cnt(mc)++;
7457 m_infree(mc)--;
7458 /* cslab_free() handles m_total */
7459 }
7460 }
7461 /*
7462 * Free the buffers present in the slab list up to 10% of the total
7463 * average per class.
7464 *
7465 * We walk the list backwards in an attempt to reduce fragmentation.
7466 */
7467 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
7468 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
7469 /*
7470 * Process only unused slabs occupying memory.
7471 */
7472 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
7473 sp->sl_base == NULL)
7474 continue;
7475 if (m_total(mc) < m_avgtotal(mc) ||
7476 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
7477 break;
7478 slab_remove(sp, mc);
7479 switch (mc) {
7480 case MC_MBUF:
7481 m_infree(mc) -= NMBPBG;
7482 m_total(mc) -= NMBPBG;
7483 if (mclaudit != NULL)
7484 mcl_audit_free(sp->sl_base, NMBPBG);
7485 break;
7486 case MC_CL:
7487 m_infree(mc) -= NCLPBG;
7488 m_total(mc) -= NCLPBG;
7489 if (mclaudit != NULL)
7490 mcl_audit_free(sp->sl_base, NMBPBG);
7491 break;
7492 case MC_BIGCL:
7493 m_infree(mc)--;
7494 m_total(mc)--;
7495 if (mclaudit != NULL)
7496 mcl_audit_free(sp->sl_base, NMBPBG);
7497 break;
7498 case MC_16KCL:
7499 m_infree(mc)--;
7500 m_total(mc)--;
7501 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
7502 nsp = nsp->sl_next;
7503 VERIFY(nsp->sl_refcnt == 0 &&
7504 nsp->sl_base != NULL &&
7505 nsp->sl_len == 0);
7506 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
7507 0);
7508 nsp->sl_flags = 0;
7509 }
7510 if (mclaudit != NULL)
7511 mcl_audit_free(sp->sl_base, 1);
7512 break;
7513 default:
7514 /*
7515 * The composite classes have their own
7516 * freelist (m_cobjlist), so we only
7517 * process rudimentary classes here.
7518 */
7519 VERIFY(0);
7520 }
7521 m_release_cnt(mc) += m_size(mc);
7522 released += m_size(mc);
7523 offset = ((char *)sp->sl_base - (char *)mbutl) / NBPG;
7524 /*
7525 * Make sure the IOMapper points to a valid, but
7526 * bogus, address. This should prevent further DMA
7527 * accesses to freed memory.
7528 */
7529 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
7530 mcl_paddr[offset] = 0;
7531 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
7532 sp->sl_len);
7533 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
7534 sp->sl_flags = 0;
7535 }
7536 }
7537 mbstat.m_drain++;
7538 mbstat.m_bigclusters = m_total(MC_BIGCL);
7539 mbstat.m_clusters = m_total(MC_CL);
7540 mbstat.m_mbufs = m_total(MC_MBUF);
7541 mbuf_stat_sync();
7542 mbuf_mtypes_sync(TRUE);
7543 lck_mtx_unlock(mbuf_mlock);
7544}
7545
7546static int
7547m_drain_force_sysctl SYSCTL_HANDLER_ARGS
7548{
7549#pragma unused(arg1, arg2)
7550 int val = 0, err;
7551
7552 err = sysctl_handle_int(oidp, &val, 0, req);
7553 if (err != 0 || req->newptr == USER_ADDR_NULL)
7554 return (err);
7555 if (val)
7556 m_drain();
7557
7558 return (err);
7559}
7560
2d21ac55 7561SYSCTL_DECL(_kern_ipc);
6d2010ae 7562SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
fe8ab488 7563 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2d21ac55 7564 0, 0, mbstat_sysctl, "S,mbstat", "");
6d2010ae 7565SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
fe8ab488 7566 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2d21ac55 7567 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6d2010ae 7568SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
fe8ab488 7569 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7570 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7571SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
fe8ab488 7572 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
7573 0, 0, mleak_table_sysctl, "S,mleak_table", "");
7574SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7575 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7576SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7577 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7578SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7579 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
7580SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
7581 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
7582 m_drain_force_sysctl, "I",
7583 "Forces the mbuf garbage collection to run");
7584SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
7585 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
7586 "Minimum time interval between garbage collection");