[apple/xnu.git] / bsd / kern / uipc_mbuf.c (xnu-2422.110.17)
1 /*
2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <dev/random/randomdev.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/simple_lock.h>
86 #include <kern/queue.h>
87 #include <kern/sched_prim.h>
88 #include <kern/cpu_number.h>
89 #include <kern/zalloc.h>
90
91 #include <libkern/OSAtomic.h>
92 #include <libkern/OSDebug.h>
93 #include <libkern/libkern.h>
94
95 #include <IOKit/IOMapper.h>
96
97 #include <machine/limits.h>
98 #include <machine/machine_routines.h>
99
100 #if CONFIG_MACF_NET
101 #include <security/mac_framework.h>
102 #endif /* CONFIG_MACF_NET */
103
104 #include <sys/mcache.h>
105
106 /*
107 * MBUF IMPLEMENTATION NOTES.
108 *
109 * There is a total of 5 per-CPU caches:
110 *
111 * MC_MBUF:
112 * This is a cache of rudimentary objects of MSIZE in size; each
113 * object represents an mbuf structure. This cache preserves only
114 * the m_type field of the mbuf during its transactions.
115 *
116 * MC_CL:
117 * This is a cache of rudimentary objects of MCLBYTES in size; each
118 * object represents an mcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_BIGCL:
122 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
123 * object represents an mbigcluster structure. This cache does not
124 * preserve the contents of the objects during its transactions.
125 *
126 * MC_MBUF_CL:
127 * This is a cache of mbufs each having a cluster attached to it.
128 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
129 * fields of the mbuf related to the external cluster are preserved
130 * during transactions.
131 *
132 * MC_MBUF_BIGCL:
133 * This is a cache of mbufs each having a big cluster attached to it.
134 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
135 * fields of the mbuf related to the external cluster are preserved
136 * during transactions.
137 *
138 * OBJECT ALLOCATION:
139 *
140 * Allocation requests are handled first at the per-CPU (mcache) layer
141 * before falling back to the slab layer. Performance is optimal when
142 * the request is satisfied at the CPU layer because global data/lock
143 * never gets accessed. When the slab layer is entered for allocation,
144 * the slab freelist will be checked first for available objects before
145 * the VM backing store is invoked. Slab layer operations are serialized
146 * for all of the caches as the mbuf global lock is held most of the time.
147 * Allocation paths are different depending on the class of objects:
148 *
149 * a. Rudimentary object:
150 *
151 * { m_get_common(), m_clattach(), m_mclget(),
152 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
153 * composite object allocation }
154 * | ^
155 * | |
156 * | +-----------------------+
157 * v |
158 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
159 * | ^
160 * v |
161 * [CPU cache] -------> (found?) -------+
162 * | |
163 * v |
164 * mbuf_slab_alloc() |
165 * | |
166 * v |
167 * +---------> [freelist] -------> (found?) -------+
168 * | |
169 * | v
170 * | m_clalloc()
171 * | |
172 * | v
173 * +---<<---- kmem_mb_alloc()
174 *
175 * b. Composite object:
176 *
177 * { m_getpackets_internal(), m_allocpacket_internal() }
178 * | ^
179 * | |
180 * | +------ (done) ---------+
181 * v |
182 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
183 * | ^
184 * v |
185 * [CPU cache] -------> (found?) -------+
186 * | |
187 * v |
188 * mbuf_cslab_alloc() |
189 * | |
190 * v |
191 * [freelist] -------> (found?) -------+
192 * | |
193 * v |
194 * (rudimentary object) |
195 * mcache_alloc/mcache_alloc_ext() ------>>-----+
196 *
197 * Auditing notes: If auditing is enabled, buffers will be subjected to
198 * integrity checks by the audit routine. This is done by verifying their
199 * contents against DEADBEEF (free) pattern before returning them to caller.
200 * As part of this step, the routine will also record the transaction and
201 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
202 * also restore any constructed data structure fields if necessary.
203 *
204 * OBJECT DEALLOCATION:
205 *
206 * Freeing an object simply involves placing it into the CPU cache; this
207 * pollutes the cache to benefit subsequent allocations. The slab layer
208 * will only be entered if the object is to be purged out of the cache.
209 * During normal operations, this happens only when the CPU layer resizes
210 * its bucket while it's adjusting to the allocation load. Deallocation
211 * paths are different depending on the class of objects:
212 *
213 * a. Rudimentary object:
214 *
215 * { m_free(), m_freem_list(), composite object deallocation }
216 * | ^
217 * | |
218 * | +------ (done) ---------+
219 * v |
220 * mcache_free/mcache_free_ext() |
221 * | |
222 * v |
223 * mbuf_slab_audit() |
224 * | |
225 * v |
226 * [CPU cache] ---> (not purging?) -----+
227 * | |
228 * v |
229 * mbuf_slab_free() |
230 * | |
231 * v |
232 * [freelist] ----------->>------------+
233 * (objects never get purged to VM)
234 *
235 * b. Composite object:
236 *
237 * { m_free(), m_freem_list() }
238 * | ^
239 * | |
240 * | +------ (done) ---------+
241 * v |
242 * mcache_free/mcache_free_ext() |
243 * | |
244 * v |
245 * mbuf_cslab_audit() |
246 * | |
247 * v |
248 * [CPU cache] ---> (not purging?) -----+
249 * | |
250 * v |
251 * mbuf_cslab_free() |
252 * | |
253 * v |
254 * [freelist] ---> (not purging?) -----+
255 * | |
256 * v |
257 * (rudimentary object) |
258 * mcache_free/mcache_free_ext() ------->>------+
259 *
260 * Auditing notes: If auditing is enabled, the audit routine will save
261 * any constructed data structure fields (if necessary) before filling the
262 * contents of the buffers with DEADBEEF (free) pattern and recording the
263 * transaction. Buffers that are freed (whether at CPU or slab layer) are
264 * expected to contain the free pattern.
265 *
266 * DEBUGGING:
267 *
268 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
269 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
270 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
271 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
272 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
273 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
274 *
275 * Each object is associated with exactly one mcache_audit_t structure that
276 * contains the information related to its last buffer transaction. Given
277 * the address of an object, the audit structure can be retrieved by finding
278 * the position of the object relative to the base address of the cluster:
279 *
280 * +------------+ +=============+
281 * | mbuf addr | | mclaudit[i] |
282 * +------------+ +=============+
283 * | | cl_audit[0] |
284 * i = MTOBG(addr) +-------------+
285 * | +-----> | cl_audit[1] | -----> mcache_audit_t
286 * b = BGTOM(i) | +-------------+
287 * | | | ... |
288 * x = MCLIDX(b, addr) | +-------------+
289 * | | | cl_audit[7] |
290 * +-----------------+ +-------------+
291 * (e.g. x == 1)
292 *
293 * The mclaudit[] array is allocated at initialization time, but its contents
294 * get populated when the corresponding cluster is created. Because a page
295 * can be turned into NMBPBG mbufs, we preserve enough space for the
296 * mbufs so that there is a 1-to-1 mapping between them. A page that never
297 * gets (or has not yet been) turned into mbufs will use only cl_audit[0],
298 * with the remaining entries unused. For a 16KB cluster, only one entry
299 * from the first page is allocated and used for the entire object.
300 */
301
302 /* TODO: should be in header file */
303 /* kernel translator */
304 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
305 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
306 extern vm_map_t mb_map; /* special map */
307
308 /* Global lock */
309 decl_lck_mtx_data(static, mbuf_mlock_data);
310 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
311 static lck_attr_t *mbuf_mlock_attr;
312 static lck_grp_t *mbuf_mlock_grp;
313 static lck_grp_attr_t *mbuf_mlock_grp_attr;
314
315 /* Back-end (common) layer */
316 static void *mbuf_worker_run; /* wait channel for worker thread */
317 static int mbuf_worker_ready; /* worker thread is runnable */
318 static int mbuf_expand_mcl; /* number of cluster creation requests */
319 static int mbuf_expand_big; /* number of big cluster creation requests */
320 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
321 static int ncpu; /* number of CPUs */
322 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
323 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
324 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
325 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
326 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
327 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
328 static unsigned int mb_normalized; /* number of packets "normalized" */
329
330 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
331 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
332
333 typedef enum {
334 MC_MBUF = 0, /* Regular mbuf */
335 MC_CL, /* Cluster */
336 MC_BIGCL, /* Large (4KB) cluster */
337 MC_16KCL, /* Jumbo (16KB) cluster */
338 MC_MBUF_CL, /* mbuf + cluster */
339 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
340 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
341 } mbuf_class_t;
342
343 #define MBUF_CLASS_MIN MC_MBUF
344 #define MBUF_CLASS_MAX MC_MBUF_16KCL
345 #define MBUF_CLASS_LAST MC_16KCL
346 #define MBUF_CLASS_VALID(c) \
347 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
348 #define MBUF_CLASS_COMPOSITE(c) \
349 ((int)(c) > MBUF_CLASS_LAST)
350
351
352 /*
353 * mbuf specific mcache allocation request flags.
354 */
355 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
356
357 /*
358 * Per-cluster slab structure.
359 *
360 * A slab is a cluster control structure that contains one or more object
361 * chunks; the available chunks are chained in the slab's freelist (sl_head).
362 * Each time a chunk is taken out of the slab, the slab's reference count
363 * gets incremented. When all chunks have been taken out, the empty slab
364 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
365 * returned to a slab causes the slab's reference count to be decremented;
366 * it also causes the slab to be reinserted back into the class's slab
367 * list, if that has not already been done.
368 *
369 * Compartmentalizing the object chunks into slabs allows us to easily
370 * merge one or more slabs together when the adjacent slabs are idle, as
371 * well as to convert or move a slab from one class to another; e.g. the
372 * mbuf cluster slab can be converted to a regular cluster slab when all
373 * mbufs in the slab have been freed.
374 *
375 * A slab may also span multiple clusters for chunks larger than
376 * a cluster's size. In this case, only the slab of the first cluster is
377 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
378 * that they are part of the larger slab.
379 *
380 * Each slab controls a page of memory.
381 */
382 typedef struct mcl_slab {
383 struct mcl_slab *sl_next; /* neighboring slab */
384 u_int8_t sl_class; /* controlling mbuf class */
385 int8_t sl_refcnt; /* outstanding allocations */
386 int8_t sl_chunks; /* chunks (bufs) in this slab */
387 u_int16_t sl_flags; /* slab flags (see below) */
388 u_int16_t sl_len; /* slab length */
389 void *sl_base; /* base of allocated memory */
390 void *sl_head; /* first free buffer */
391 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
392 } mcl_slab_t;
393
394 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
395 #define SLF_PARTIAL 0x0002 /* part of another slab */
396 #define SLF_DETACHED 0x0004 /* not in slab freelist */
397
398 /*
399 * The array of slabs is broken into groups of arrays, one per 1MB of kernel
400 * memory to reduce the footprint. Each group is allocated on demand
401 * whenever a new piece of memory mapped in from the VM crosses the 1MB
402 * boundary.
403 */
404 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
405
406 typedef struct mcl_slabg {
407 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
408 } mcl_slabg_t;
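/*
 * A minimal sketch (not the actual routine) of how a buffer address maps
 * to its controlling slab under this grouping: one group per MB of mapped
 * memory, one slab per page within the group.  Here "base" stands for the
 * start of the cluster map (mbutl); slab_get(), declared further below, is
 * the real implementation.
 */
static inline mcl_slab_t *
example_addr_to_slab(mcl_slabg_t **tbl, char *base, void *addr)
{
	size_t off = (size_t)((char *)addr - base);
	unsigned int grp = (unsigned int)(off >> MBSHIFT);	/* which MB */
	unsigned int idx = (unsigned int)((off >> PGSHIFT) % NSLABSPMB);

	return (&tbl[grp]->slg_slab[idx]);	/* slab for that page */
}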
409
410 /*
411 * Number of slabs needed to control a 16KB cluster object.
412 */
413 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
414
415 /*
416 * Per-cluster audit structure.
417 */
418 typedef struct {
419 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
420 } mcl_audit_t;
421
422 typedef struct {
423 struct thread *msa_thread; /* thread doing transaction */
424 struct thread *msa_pthread; /* previous transaction thread */
425 uint32_t msa_tstamp; /* transaction timestamp (ms) */
426 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
427 uint16_t msa_depth; /* pc stack depth */
428 uint16_t msa_pdepth; /* previous transaction pc stack */
429 void *msa_stack[MCACHE_STACK_DEPTH];
430 void *msa_pstack[MCACHE_STACK_DEPTH];
431 } mcl_scratch_audit_t;
432
433 typedef struct {
434 /*
435 * Size of data from the beginning of an mbuf that covers m_hdr,
436 * pkthdr and m_ext structures. If auditing is enabled, we allocate
437 * a shadow mbuf structure of this size inside each audit structure,
438 * and the contents of the real mbuf gets copied into it when the mbuf
439 * is freed. This allows us to pattern-fill the mbuf for integrity
440 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
441 * cluster cache case). Note that we don't save the contents of
442 * clusters when they are freed; we simply pattern-fill them.
443 */
444 u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
445 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
446 } mcl_saved_contents_t;
447
448 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
449
450 #define MCA_SAVED_MBUF_PTR(_mca) \
451 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
452 (_mca)->mca_contents)->sc_mbuf)
453 #define MCA_SAVED_MBUF_SIZE \
454 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
455 #define MCA_SAVED_SCRATCH_PTR(_mca) \
456 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
457
458 /*
459 * mbuf specific mcache audit flags
460 */
461 #define MB_INUSE 0x01 /* object has not been returned to slab */
462 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
463 #define MB_SCVALID 0x04 /* object has valid saved contents */
464
465 /*
466 * Each of the following two arrays holds up to nmbclusters elements.
467 */
468 static mcl_audit_t *mclaudit; /* array of cluster audit information */
469 static unsigned int maxclaudit; /* max # of entries in audit table */
470 static mcl_slabg_t **slabstbl; /* cluster slabs table */
471 static unsigned int maxslabgrp; /* max # of entries in slabs table */
472 static unsigned int slabgrp; /* # of entries in slabs table */
473
474 /* Globals */
475 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
476 int njcl; /* # of clusters for jumbo sizes */
477 int njclbytes; /* size of a jumbo cluster */
478 union mbigcluster *mbutl; /* first mapped cluster address */
479 union mbigcluster *embutl; /* ending virtual address of mclusters */
480 int _max_linkhdr; /* largest link-level header */
481 int _max_protohdr; /* largest protocol header */
482 int max_hdr; /* largest link+protocol header */
483 int max_datalen; /* MHLEN - max_hdr */
484
485 static boolean_t mclverify; /* debug: pattern-checking */
486 static boolean_t mcltrace; /* debug: stack tracing */
487 static boolean_t mclfindleak; /* debug: leak detection */
488 static boolean_t mclexpleak; /* debug: expose leak info to user space */
489
490 static struct timeval mb_start; /* beginning of time */
491
492 /* mbuf leak detection variables */
493 static struct mleak_table mleak_table;
494 static mleak_stat_t *mleak_stat;
495
496 #define MLEAK_STAT_SIZE(n) \
497 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
498
499 struct mallocation {
500 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
501 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
502 u_int32_t count; /* How many objects were requested */
503 u_int64_t hitcount; /* for determining hash effectiveness */
504 };
505
506 struct mtrace {
507 u_int64_t collisions;
508 u_int64_t hitcount;
509 u_int64_t allocs;
510 u_int64_t depth;
511 uintptr_t addr[MLEAK_STACK_DEPTH];
512 };
513
514 /* Size must be a power of two for the zhash to be able to just mask off bits */
515 #define MLEAK_ALLOCATION_MAP_NUM 512
516 #define MLEAK_TRACE_MAP_NUM 256
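/*
 * Because the map sizes above are powers of two, bucket selection can be
 * a simple mask instead of a modulo; a minimal sketch of that idea (the
 * actual hashing is done inside the mleak_log()/mleak_logger() routines
 * declared further below):
 */
static inline uint32_t
example_leak_bucket(uintptr_t key, uint32_t nbuckets)
{
	/* nbuckets is assumed to be a power of two */
	return ((uint32_t)key & (nbuckets - 1));
}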
517
518 /*
519 * Sample factor for how often to record a trace. This can be overridden
520 * via the boot-arg mleak_sample_factor.
521 */
522 #define MLEAK_SAMPLE_FACTOR 500
523
524 /*
525 * Number of top leakers recorded.
526 */
527 #define MLEAK_NUM_TRACES 5
528
529 #define MB_LEAK_SPACING_64 " "
530 #define MB_LEAK_SPACING_32 " "
531
532
533 #define MB_LEAK_HDR_32 "\n\
534 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
535 ---------- ---------- ---------- ---------- ---------- \n\
536 "
537
538 #define MB_LEAK_HDR_64 "\n\
539 trace [1] trace [2] trace [3] \
540 trace [4] trace [5] \n\
541 ------------------ ------------------ ------------------ \
542 ------------------ ------------------ \n\
543 "
544
545 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
546 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
547
548 /* Hashmaps of allocations and their corresponding traces */
549 static struct mallocation *mleak_allocations;
550 static struct mtrace *mleak_traces;
551 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
552
553 /* Lock to protect mleak tables from concurrent modification */
554 decl_lck_mtx_data(static, mleak_lock_data);
555 static lck_mtx_t *mleak_lock = &mleak_lock_data;
556 static lck_attr_t *mleak_lock_attr;
557 static lck_grp_t *mleak_lock_grp;
558 static lck_grp_attr_t *mleak_lock_grp_attr;
559
560 extern u_int32_t high_sb_max;
561
562 /* The minimum number of objects that are allocated, to start. */
563 #define MINCL 32
564 #define MINBIGCL (MINCL >> 1)
565 #define MIN16KCL (MINCL >> 2)
566
567 /* Low watermarks (only map in pages once free counts go below) */
568 #define MBIGCL_LOWAT MINBIGCL
569 #define M16KCL_LOWAT MIN16KCL
570
571 typedef struct {
572 mbuf_class_t mtbl_class; /* class type */
573 mcache_t *mtbl_cache; /* mcache for this buffer class */
574 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
575 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
576 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
577 u_int32_t mtbl_maxsize; /* maximum buffer size */
578 int mtbl_minlimit; /* minimum allowed */
579 int mtbl_maxlimit; /* maximum allowed */
580 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
581 } mbuf_table_t;
582
583 #define m_class(c) mbuf_table[c].mtbl_class
584 #define m_cache(c) mbuf_table[c].mtbl_cache
585 #define m_slablist(c) mbuf_table[c].mtbl_slablist
586 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
587 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
588 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
589 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
590 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
591 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
592 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
593 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
594 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
595 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
596 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
597 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
598 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
599 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
600 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
601 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
602 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
603
604 static mbuf_table_t mbuf_table[] = {
605 /*
606 * The caches for mbufs, regular clusters and big clusters.
607 */
608 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
609 NULL, NULL, 0, 0, 0, 0 },
610 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
611 NULL, NULL, 0, 0, 0, 0 },
612 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
613 NULL, NULL, 0, 0, 0, 0 },
614 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
615 NULL, NULL, 0, 0, 0, 0 },
616 /*
617 * The following are special caches; they serve as intermediate
618 * caches backed by the above rudimentary caches. Each object
619 * in the cache is an mbuf with a cluster attached to it. Unlike
620 * the above caches, these intermediate caches do not directly
621 * deal with the slab structures; instead, the constructed
622 * cached elements are simply stored in the freelists.
623 */
624 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
625 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
626 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
627 };
628
629 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
630
631 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
632 static int mb_waiters; /* number of waiters */
633
634 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
635 static struct timeval mb_wdtstart; /* watchdog start timestamp */
636 static char *mbuf_dump_buf;
637
638 #define MBUF_DUMP_BUF_SIZE 2048
639
640 /*
641 * mbuf watchdog is enabled by default on embedded platforms. It is
642 * also toggleable via the kern.ipc.mb_watchdog sysctl.
643 */
644 static unsigned int mb_watchdog = 0;
645
646 /* Red zone */
647 static u_int32_t mb_redzone_cookie;
648 static void m_redzone_init(struct mbuf *);
649 static void m_redzone_verify(struct mbuf *m);
650
651 /* The following are used to serialize m_clalloc() */
652 static boolean_t mb_clalloc_busy;
653 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
654 static int mb_clalloc_waiters;
655
656 static void mbuf_mtypes_sync(boolean_t);
657 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
658 static void mbuf_stat_sync(void);
659 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
660 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
661 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
662 static char *mbuf_dump(void);
663 static void mbuf_table_init(void);
664 static inline void m_incref(struct mbuf *);
665 static inline u_int32_t m_decref(struct mbuf *);
666 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
667 static void mbuf_worker_thread_init(void);
668 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
669 static void slab_free(mbuf_class_t, mcache_obj_t *);
670 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
671 unsigned int, int);
672 static void mbuf_slab_free(void *, mcache_obj_t *, int);
673 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
674 static void mbuf_slab_notify(void *, u_int32_t);
675 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
676 unsigned int);
677 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
678 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
679 unsigned int, int);
680 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
681 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
682 static int freelist_populate(mbuf_class_t, unsigned int, int);
683 static void freelist_init(mbuf_class_t);
684 static boolean_t mbuf_cached_above(mbuf_class_t, int);
685 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
686 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
687 static int m_howmany(int, size_t);
688 static void mbuf_worker_thread(void);
689 static void mbuf_watchdog(void);
690 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
691
692 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
693 size_t, unsigned int);
694 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
695 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
696 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
697 boolean_t);
698 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
699 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
700 static void mcl_audit_scratch(mcache_audit_t *);
701 static void mcl_audit_mcheck_panic(struct mbuf *);
702 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
703
704 static void mleak_activate(void);
705 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
706 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
707 static void mleak_free(mcache_obj_t *);
708 static void mleak_sort_traces(void);
709 static void mleak_update_stats(void);
710
711 static mcl_slab_t *slab_get(void *);
712 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
713 void *, void *, unsigned int, int, int);
714 static void slab_insert(mcl_slab_t *, mbuf_class_t);
715 static void slab_remove(mcl_slab_t *, mbuf_class_t);
716 static boolean_t slab_inrange(mcl_slab_t *, void *);
717 static void slab_nextptr_panic(mcl_slab_t *, void *);
718 static void slab_detach(mcl_slab_t *);
719 static boolean_t slab_is_detached(mcl_slab_t *);
720
721 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
722 static struct mbuf *m_split0(struct mbuf *, int, int, int);
723
724 /* flags for m_copyback0 */
725 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
726 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
727 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
728 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
729
730 /*
731 * This flag is set for all mbufs that come out of and into the composite
732 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
733 * are marked with such a flag have clusters attached to them, and will be
734 * treated differently when they are freed; instead of being placed back
735 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
736 * are placed back into the appropriate composite cache's freelist, and the
737 * actual freeing is deferred until the composite objects are purged. At
738 * such a time, this flag will be cleared from the mbufs and the objects
739 * will be freed into their own separate freelists.
740 */
741 #define EXTF_COMPOSITE 0x1
742
743 /*
744 * This flag indicates that the external cluster is read-only, i.e. it is
745 * or was referred to by more than one mbuf. Once set, this flag is never
746 * cleared.
747 */
748 #define EXTF_READONLY 0x2
749 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
750
751 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
752 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
753 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
754 #define MBUF_IS_COMPOSITE(m) \
755 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
756
757 /*
758 * Macros used to verify the integrity of the mbuf.
759 */
760 #define _MCHECK(m) { \
761 if ((m)->m_type != MT_FREE) { \
762 if (mclaudit == NULL) \
763 panic("MCHECK: m_type=%d m=%p", \
764 (u_int16_t)(m)->m_type, m); \
765 else \
766 mcl_audit_mcheck_panic(m); \
767 } \
768 }
769
770 #define MBUF_IN_MAP(addr) \
771 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
772
773 #define MRANGE(addr) { \
774 if (!MBUF_IN_MAP(addr)) \
775 panic("MRANGE: address out of range 0x%p", addr); \
776 }
777
778 /*
779 * Macro version of mtod.
780 */
781 #define MTOD(m, t) ((t)((m)->m_data))
782
783 /*
784 * Macros to obtain (4KB) cluster index and base cluster address.
785 */
786
787 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
788 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
789
790 /*
791 * Macro to find the mbuf index relative to a base.
792 */
793 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
794
795 /*
796 * Same thing for 2KB cluster index.
797 */
798 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
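/*
 * A minimal sketch of the audit lookup pictured in the implementation
 * notes above, put together from the MTOBG()/BGTOM()/MCLIDX() macros;
 * mcl_audit_buf2mca() is the routine actually used by the audit code.
 */
static inline mcache_audit_t *
example_addr_to_audit(void *addr)
{
	unsigned int i = MTOBG(addr);		/* index of the 4KB cluster */
	union mbigcluster *b = BGTOM(i);	/* base address of that cluster */
	unsigned int x = MCLIDX(b, addr);	/* mbuf slot within the cluster */

	return (mclaudit[i].cl_audit[x]);
}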
799
800 /*
801 * Macros used during mbuf and cluster initialization.
802 */
803 #define MBUF_INIT_PKTHDR(m) { \
804 (m)->m_pkthdr.rcvif = NULL; \
805 (m)->m_pkthdr.pkt_hdr = NULL; \
806 (m)->m_pkthdr.len = 0; \
807 (m)->m_pkthdr.csum_flags = 0; \
808 (m)->m_pkthdr.csum_data = 0; \
809 (m)->m_pkthdr.vlan_tag = 0; \
810 m_classifier_init(m, 0); \
811 m_tag_init(m, 1); \
812 m_scratch_init(m); \
813 m_redzone_init(m); \
814 }
815
816 #define MBUF_INIT(m, pkthdr, type) { \
817 _MCHECK(m); \
818 (m)->m_next = (m)->m_nextpkt = NULL; \
819 (m)->m_len = 0; \
820 (m)->m_type = type; \
821 if ((pkthdr) == 0) { \
822 (m)->m_data = (m)->m_dat; \
823 (m)->m_flags = 0; \
824 } else { \
825 (m)->m_data = (m)->m_pktdat; \
826 (m)->m_flags = M_PKTHDR; \
827 MBUF_INIT_PKTHDR(m); \
828 } \
829 }
830
831 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
832 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
833 (m)->m_flags |= M_EXT; \
834 (m)->m_ext.ext_size = (size); \
835 (m)->m_ext.ext_free = (free); \
836 (m)->m_ext.ext_arg = (arg); \
837 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
838 &(m)->m_ext.ext_refs; \
839 MEXT_RFA(m) = (rfa); \
840 MEXT_REF(m) = (ref); \
841 MEXT_FLAGS(m) = (flag); \
842 }
843
844 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
845 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
846
847 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
848 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
849
850 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
851 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
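/*
 * A minimal sketch of how these initializers compose, assuming "m" is a
 * freshly allocated raw mbuf (still MT_FREE), "cl" is a 2KB cluster and
 * "rfa" is a struct ext_ref obtained from ref_cache; the real users are
 * the allocation paths and composite-cache constructors later in this file.
 */
static inline void
example_mbuf_cl_setup(struct mbuf *m, void *cl, struct ext_ref *rfa)
{
	MBUF_INIT(m, 0, MT_DATA);	/* plain (non-pkthdr) data mbuf */
	MBUF_CL_INIT(m, cl, rfa, 1, 0);	/* attach the cluster, refcount 1 */
}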
852
853 /*
854 * Macro to convert BSD malloc sleep flag to mcache's
855 */
856 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
857
858 /*
859 * The structure that holds all mbuf class statistics exportable via sysctl.
860 * Similar to the mbstat structure, the mb_stat structure is protected by the
861 * global mbuf lock. It contains additional information about the classes
862 * that allows for a more accurate view of the state of the allocator.
863 */
864 struct mb_stat *mb_stat;
865 struct omb_stat *omb_stat; /* For backwards compatibility */
866
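/*
 * The following two macros evaluate to the byte offset of mbs_class[n],
 * i.e. the space needed for a stats structure holding n class entries in
 * its trailing mbs_class[] array.
 */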
867 #define MB_STAT_SIZE(n) \
868 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
869 #define OMB_STAT_SIZE(n) \
870 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
871
872 /*
873 * The legacy structure holding all of the mbuf allocation statistics.
874 * The actual statistics used by the kernel are stored in the mbuf_table
875 * instead, and are updated atomically while the global mbuf lock is held.
876 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
877 * Unlike before, the kernel no longer relies on the contents of mbstat for
878 * its operations (e.g. cluster expansion) because the structure is exposed
879 * to the outside and could possibly be modified, making it unsafe to rely on.
880 * With the exception of the mbstat.m_mtypes array (see below), all of the
881 * statistics are updated as they change.
882 */
883 struct mbstat mbstat;
884
885 #define MBSTAT_MTYPES_MAX \
886 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
887
888 /*
889 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
890 * atomically and stored in a per-CPU structure which is lock-free; this is
891 * done in order to avoid writing to the global mbstat data structure which
892 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
893 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
894 * array and returned to the application. Any updates for types greater
895 * than or equal to MT_MAX are done atomically on the mbstat; this slows down
896 * performance but is okay since the kernel uses only up to MT_MAX-1 while
897 * anything beyond that (up to type 255) is considered a corner case.
898 */
899 typedef struct {
900 unsigned int cpu_mtypes[MT_MAX];
901 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
902
903 typedef struct {
904 mtypes_cpu_t mbs_cpu[1];
905 } mbuf_mtypes_t;
906
907 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
908
909 #define MBUF_MTYPES_SIZE(n) \
910 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
911
912 #define MTYPES_CPU(p) \
913 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
914
915 #define mtype_stat_add(type, n) { \
916 if ((unsigned)(type) < MT_MAX) { \
917 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
918 atomic_add_32(&mbs->cpu_mtypes[type], n); \
919 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
920 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
921 } \
922 }
923
924 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
925 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
926 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
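/*
 * Example usage (a sketch of the pattern used by the allocation and free
 * paths later in this file): when a free mbuf is handed out as MT_DATA,
 * one counter goes up and the other down, each on the local CPU.
 */
static inline void
example_account_mbuf_alloc(void)
{
	mtype_stat_inc(MT_DATA);	/* one more MT_DATA mbuf in use */
	mtype_stat_dec(MT_FREE);	/* one fewer free mbuf */
}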
927
928 static void
929 mbuf_mtypes_sync(boolean_t locked)
930 {
931 int m, n;
932 mtypes_cpu_t mtc;
933
934 if (locked)
935 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
936
937 bzero(&mtc, sizeof (mtc));
938 for (m = 0; m < ncpu; m++) {
939 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
940 mtypes_cpu_t temp;
941
942 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
943 sizeof (temp.cpu_mtypes));
944
945 for (n = 0; n < MT_MAX; n++)
946 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
947 }
948 if (!locked)
949 lck_mtx_lock(mbuf_mlock);
950 for (n = 0; n < MT_MAX; n++)
951 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
952 if (!locked)
953 lck_mtx_unlock(mbuf_mlock);
954 }
955
956 static int
957 mbstat_sysctl SYSCTL_HANDLER_ARGS
958 {
959 #pragma unused(oidp, arg1, arg2)
960 mbuf_mtypes_sync(FALSE);
961
962 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
963 }
964
965 static void
966 mbuf_stat_sync(void)
967 {
968 mb_class_stat_t *sp;
969 mcache_cpu_t *ccp;
970 mcache_t *cp;
971 int k, m, bktsize;
972
973 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
974
975 for (k = 0; k < NELEM(mbuf_table); k++) {
976 cp = m_cache(k);
977 ccp = &cp->mc_cpu[0];
978 bktsize = ccp->cc_bktsize;
979 sp = mbuf_table[k].mtbl_stats;
980
981 if (cp->mc_flags & MCF_NOCPUCACHE)
982 sp->mbcl_mc_state = MCS_DISABLED;
983 else if (cp->mc_purge_cnt > 0)
984 sp->mbcl_mc_state = MCS_PURGING;
985 else if (bktsize == 0)
986 sp->mbcl_mc_state = MCS_OFFLINE;
987 else
988 sp->mbcl_mc_state = MCS_ONLINE;
989
990 sp->mbcl_mc_cached = 0;
991 for (m = 0; m < ncpu; m++) {
992 ccp = &cp->mc_cpu[m];
993 if (ccp->cc_objs > 0)
994 sp->mbcl_mc_cached += ccp->cc_objs;
995 if (ccp->cc_pobjs > 0)
996 sp->mbcl_mc_cached += ccp->cc_pobjs;
997 }
998 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
999 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1000 sp->mbcl_infree;
1001
1002 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1003 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1004 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1005
1006 /* Calculate total count specific to each class */
1007 sp->mbcl_ctotal = sp->mbcl_total;
1008 switch (m_class(k)) {
1009 case MC_MBUF:
1010 /* Deduct mbufs used in composite caches */
1011 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1012 m_total(MC_MBUF_BIGCL));
1013 break;
1014
1015 case MC_CL:
1016 /* Deduct clusters used in composite cache */
1017 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1018 break;
1019
1020 case MC_BIGCL:
1021 /* Deduct clusters used in composite cache */
1022 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1023 break;
1024
1025 case MC_16KCL:
1026 /* Deduct clusters used in composite cache */
1027 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1028 break;
1029
1030 default:
1031 break;
1032 }
1033 }
1034 }
1035
1036 static int
1037 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1038 {
1039 #pragma unused(oidp, arg1, arg2)
1040 void *statp;
1041 int k, statsz, proc64 = proc_is64bit(req->p);
1042
1043 lck_mtx_lock(mbuf_mlock);
1044 mbuf_stat_sync();
1045
1046 if (!proc64) {
1047 struct omb_class_stat *oc;
1048 struct mb_class_stat *c;
1049
1050 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1051 oc = &omb_stat->mbs_class[0];
1052 c = &mb_stat->mbs_class[0];
1053 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1054 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1055 "%s", c->mbcl_cname);
1056 oc->mbcl_size = c->mbcl_size;
1057 oc->mbcl_total = c->mbcl_total;
1058 oc->mbcl_active = c->mbcl_active;
1059 oc->mbcl_infree = c->mbcl_infree;
1060 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1061 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1062 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1063 oc->mbcl_notified = c->mbcl_notified;
1064 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1065 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1066 oc->mbcl_ctotal = c->mbcl_ctotal;
1067 oc->mbcl_mc_state = c->mbcl_mc_state;
1068 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1069 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1070 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1071 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1072 }
1073 statp = omb_stat;
1074 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1075 } else {
1076 statp = mb_stat;
1077 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1078 }
1079
1080 lck_mtx_unlock(mbuf_mlock);
1081
1082 return (SYSCTL_OUT(req, statp, statsz));
1083 }
1084
1085 static int
1086 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1087 {
1088 #pragma unused(oidp, arg1, arg2)
1089 int i;
1090
1091 /* Ensure leak tracing turned on */
1092 if (!mclfindleak || !mclexpleak)
1093 return (ENXIO);
1094
1095 lck_mtx_lock(mleak_lock);
1096 mleak_update_stats();
1097 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1098 lck_mtx_unlock(mleak_lock);
1099
1100 return (i);
1101 }
1102
1103 static int
1104 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1105 {
1106 #pragma unused(oidp, arg1, arg2)
1107 int i = 0;
1108
1109 /* Ensure leak tracing turned on */
1110 if (!mclfindleak || !mclexpleak)
1111 return (ENXIO);
1112
1113 lck_mtx_lock(mleak_lock);
1114 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1115 lck_mtx_unlock(mleak_lock);
1116
1117 return (i);
1118 }
1119
1120 static inline void
1121 m_incref(struct mbuf *m)
1122 {
1123 UInt32 old, new;
1124 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1125
1126 do {
1127 old = *addr;
1128 new = old + 1;
1129 ASSERT(new != 0);
1130 } while (!OSCompareAndSwap(old, new, addr));
1131
1132 /*
1133 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1134 * we don't clear the flag when the refcount goes back to 1
1135 * to simplify code calling m_mclhasreference().
1136 */
1137 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1138 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1139 }
1140
1141 static inline u_int32_t
1142 m_decref(struct mbuf *m)
1143 {
1144 UInt32 old, new;
1145 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1146
1147 do {
1148 old = *addr;
1149 new = old - 1;
1150 ASSERT(old != 0);
1151 } while (!OSCompareAndSwap(old, new, addr));
1152
1153 return (new);
1154 }
1155
1156 static void
1157 mbuf_table_init(void)
1158 {
1159 unsigned int b, c, s;
1160 int m;
1161
1162 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1163 M_TEMP, M_WAITOK | M_ZERO);
1164 VERIFY(omb_stat != NULL);
1165
1166 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1167 M_TEMP, M_WAITOK | M_ZERO);
1168 VERIFY(mb_stat != NULL);
1169
1170 mb_stat->mbs_cnt = NELEM(mbuf_table);
1171 for (m = 0; m < NELEM(mbuf_table); m++)
1172 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1173
1174 #if CONFIG_MBUF_JUMBO
1175 /*
1176 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1177 * this only on platforms where jumbo cluster pool is enabled.
1178 */
1179 njcl = nmbclusters / 3;
1180 njclbytes = M16KCLBYTES;
1181 #endif /* CONFIG_MBUF_JUMBO */
1182
1183 /*
1184 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1185 * a multiple of 4KB clusters.
1186 */
1187 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1188 if (njcl > 0) {
1189 /*
1190 * Each jumbo cluster takes 8 2KB clusters, so make
1191 * sure that the pool size is evenly divisible by 8;
1192 * njcl is in 2KB unit, hence treated as such.
1193 */
1194 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1195
1196 /* Update nclusters with rounded down value of njcl */
1197 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1198 }
1199
1200 /*
1201 * njcl is valid only on platforms with 16KB jumbo clusters, where
1202 * it is configured to 1/3 of the pool size. On these platforms,
1203 * the remainder is used for 2KB and 4KB clusters. On platforms
1204 * without 16KB jumbo clusters, the entire pool is used for both
1205 * 2KB and 4KB clusters. A 4KB cluster can be split into either
1206 * 16 mbufs or 2 2KB clusters.
1207 *
1208 * +---+---+------------ ... -----------+------- ... -------+
1209 * | c | b | s | njcl |
1210 * +---+---+------------ ... -----------+------- ... -------+
1211 *
1212 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1213 * clusters (1/64th each.)
1214 */
1215 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1216 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1217 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
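/*
 * For example (illustrative numbers only): with a 64 MB pool and no
 * jumbo pool, nclusters == 32768 2KB clusters, so c == 512 (1 MB of
 * pure 2KB clusters), b == 256 (1 MB of pure 4KB clusters), and
 * s == 31744 2KB units (62 MB) of all-purpose space.
 */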
1218
1219 /*
1220 * 1/64th (c) is reserved for 2KB clusters.
1221 */
1222 m_minlimit(MC_CL) = c;
1223 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1224 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1225 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1226
1227 /*
1228 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1229 * It cannot be turned into 2KB clusters or mbufs.
1230 */
1231 m_minlimit(MC_BIGCL) = b;
1232 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1233 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1234 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1235
1236 /*
1237 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1238 */
1239 m_minlimit(MC_MBUF) = 0;
1240 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1241 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1242 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1243
1244 /*
1245 * Set limits for the composite classes.
1246 */
1247 m_minlimit(MC_MBUF_CL) = 0;
1248 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1249 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1250 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1251 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1252
1253 m_minlimit(MC_MBUF_BIGCL) = 0;
1254 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1255 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1256 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1257 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1258
1259 /*
1260 * And for jumbo classes.
1261 */
1262 m_minlimit(MC_16KCL) = 0;
1263 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1264 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1265 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1266
1267 m_minlimit(MC_MBUF_16KCL) = 0;
1268 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1269 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1270 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1271 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1272
1273 /*
1274 * Initialize the legacy mbstat structure.
1275 */
1276 bzero(&mbstat, sizeof (mbstat));
1277 mbstat.m_msize = m_maxsize(MC_MBUF);
1278 mbstat.m_mclbytes = m_maxsize(MC_CL);
1279 mbstat.m_minclsize = MINCLSIZE;
1280 mbstat.m_mlen = MLEN;
1281 mbstat.m_mhlen = MHLEN;
1282 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1283 }
1284
1285 #if defined(__LP64__)
1286 typedef struct ncl_tbl {
1287 uint64_t nt_maxmem; /* memory (sane) size */
1288 uint32_t nt_mbpool; /* mbuf pool size */
1289 } ncl_tbl_t;
1290
1291 /* Non-server */
1292 static ncl_tbl_t ncl_table[] = {
1293 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1294 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1295 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1296 { 0, 0 }
1297 };
1298
1299 /* Server */
1300 static ncl_tbl_t ncl_table_srv[] = {
1301 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1302 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1303 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1304 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1305 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1306 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1307 { 0, 0 }
1308 };
1309 #endif /* __LP64__ */
1310
1311 __private_extern__ unsigned int
1312 mbuf_default_ncl(int server, uint64_t mem)
1313 {
1314 #if !defined(__LP64__)
1315 #pragma unused(server)
1316 unsigned int n;
1317 /*
1318 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1319 */
1320 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1321 n = 32768;
1322 #else
1323 unsigned int n, i;
1324 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1325 /*
1326 * 64-bit kernel (mbuf pool size based on table).
1327 */
1328 n = tbl[0].nt_mbpool;
1329 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1330 if (mem < tbl[i].nt_maxmem)
1331 break;
1332 n = tbl[i].nt_mbpool;
1333 }
1334 n >>= MCLSHIFT;
1335 #endif /* !__LP64__ */
1336 return (n);
1337 }
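/*
 * For example (illustrative): on a 64-bit, non-server configuration with
 * 16 GB of memory, the table above selects a 128 MB pool, so
 * mbuf_default_ncl(0, 16ULL << GBSHIFT) returns
 * (128 << MBSHIFT) >> MCLSHIFT == 65536 2KB clusters.
 */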
1338
1339 __private_extern__ void
1340 mbinit(void)
1341 {
1342 unsigned int m;
1343 unsigned int initmcl = 0;
1344 void *buf;
1345 thread_t thread = THREAD_NULL;
1346
1347 microuptime(&mb_start);
1348
1349 /*
1350 * These MBUF_ values must be equal to their private counterparts.
1351 */
1352 _CASSERT(MBUF_EXT == M_EXT);
1353 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1354 _CASSERT(MBUF_EOR == M_EOR);
1355 _CASSERT(MBUF_LOOP == M_LOOP);
1356 _CASSERT(MBUF_BCAST == M_BCAST);
1357 _CASSERT(MBUF_MCAST == M_MCAST);
1358 _CASSERT(MBUF_FRAG == M_FRAG);
1359 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1360 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1361 _CASSERT(MBUF_PROMISC == M_PROMISC);
1362 _CASSERT(MBUF_HASFCS == M_HASFCS);
1363
1364 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1365 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1366 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1367 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1368 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1369 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1370 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1371 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1372 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1373 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1374 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1375 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1376 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1377 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1378 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1379
1380 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1381 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1382 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1383 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1384 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1385 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1386 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1387 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1388 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1389 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1390 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1391 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1392 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1393
1394 _CASSERT(MBUF_WAITOK == M_WAIT);
1395 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1396 _CASSERT(MBUF_COPYALL == M_COPYALL);
1397
1398 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1399 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1400 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1401 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1402 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1403 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1404 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1405 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1406 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1407 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1408
1409 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1410 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1411 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1412 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1413
1414 /* Module specific scratch space (32-bit alignment requirement) */
1415 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1416 sizeof (uint32_t)));
1417
1418 /* Initialize random red zone cookie value */
1419 _CASSERT(sizeof (mb_redzone_cookie) ==
1420 sizeof (((struct pkthdr *)0)->redzone));
1421 read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1422
1423 /* Make sure we don't save more than we should */
1424 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1425
1426 if (nmbclusters == 0)
1427 nmbclusters = NMBCLUSTERS;
1428
1429 /* This should be a sane (at least even) value by now */
1430 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1431
1432 /* Setup the mbuf table */
1433 mbuf_table_init();
1434
1435 /* Global lock for common layer */
1436 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1437 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1438 mbuf_mlock_attr = lck_attr_alloc_init();
1439 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1440
1441 /*
1442 * Allocate cluster slabs table:
1443 *
1444 * maxslabgrp = (N * 2048) / (1024 * 1024)
1445 *
1446 * Where N is nmbclusters rounded up to the nearest 512. This yields
1447 * mcl_slabg_t units, each one representing 1 MB of memory.
1448 */
1449 maxslabgrp =
1450 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
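/*
 * E.g. with nmbclusters == 32768 (a 64 MB map), this works out to 64
 * slab groups, one mcl_slabg_t per MB of mapped cluster memory.
 */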
1451 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1452 M_TEMP, M_WAITOK | M_ZERO);
1453 VERIFY(slabstbl != NULL);
1454
1455 /*
1456 * Allocate audit structures, if needed:
1457 *
1458 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1459 *
1460 * This yields mcl_audit_t units, each one representing a page.
1461 */
1462 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1463 mbuf_debug |= mcache_getflags();
1464 if (mbuf_debug & MCF_DEBUG) {
1465 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1466 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1467 M_TEMP, M_WAITOK | M_ZERO);
1468 VERIFY(mclaudit != NULL);
1469
1470 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1471 AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1472 VERIFY(mcl_audit_con_cache != NULL);
1473 }
1474 mclverify = (mbuf_debug & MCF_VERIFY);
1475 mcltrace = (mbuf_debug & MCF_TRACE);
1476 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1477 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1478
1479 /* Enable mbuf leak logging, with a lock to protect the tables */
1480
1481 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1482 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1483 mleak_lock_attr = lck_attr_alloc_init();
1484 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1485
1486 mleak_activate();
1487
1488 /* Calculate the number of pages assigned to the cluster pool */
1489 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1490 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1491 M_TEMP, M_WAITOK);
1492 VERIFY(mcl_paddr != NULL);
1493
1494 /* Register with the I/O Bus mapper */
1495 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1496 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1497
1498 embutl = (union mbigcluster *)
1499 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1500 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1501
1502 /* Prime up the freelist */
1503 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1504 if (initmcl != 0) {
1505 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1506 if (initmcl > m_maxlimit(MC_BIGCL))
1507 initmcl = m_maxlimit(MC_BIGCL);
1508 }
1509 if (initmcl < m_minlimit(MC_BIGCL))
1510 initmcl = m_minlimit(MC_BIGCL);
1511
1512 lck_mtx_lock(mbuf_mlock);
1513
1514 /*
1515 * For classes with non-zero minimum limits, populate their freelists
1516 * so that m_total(class) is at least m_minlimit(class).
1517 */
1518 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1519 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1520 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1521 freelist_init(m_class(MC_CL));
1522
1523 for (m = 0; m < NELEM(mbuf_table); m++) {
1524 /* Make sure we didn't miss any */
1525 VERIFY(m_minlimit(m_class(m)) == 0 ||
1526 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1527 }
1528
1529 lck_mtx_unlock(mbuf_mlock);
1530
1531 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1532 NULL, &thread);
1533 thread_deallocate(thread);
1534
1535 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1536 0, 0, MCR_SLEEP);
1537
1538 /* Create the cache for each class */
1539 for (m = 0; m < NELEM(mbuf_table); m++) {
1540 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1541 u_int32_t flags;
1542
1543 flags = mbuf_debug;
1544 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1545 m_class(m) == MC_MBUF_16KCL) {
1546 allocfunc = mbuf_cslab_alloc;
1547 freefunc = mbuf_cslab_free;
1548 auditfunc = mbuf_cslab_audit;
1549 logfunc = mleak_logger;
1550 } else {
1551 allocfunc = mbuf_slab_alloc;
1552 freefunc = mbuf_slab_free;
1553 auditfunc = mbuf_slab_audit;
1554 logfunc = mleak_logger;
1555 }
1556
1557 /*
1558 * Disable per-CPU caches for jumbo classes if there
1559 * is no jumbo cluster pool available in the system.
1560 * The cache itself is still created (but will never
1561 * be populated) since it simplifies the code.
1562 */
1563 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1564 njcl == 0)
1565 flags |= MCF_NOCPUCACHE;
1566
1567 if (!mclfindleak)
1568 flags |= MCF_NOLEAKLOG;
1569
1570 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1571 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1572 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1573 }
1574
1575 /*
1576 * Allocate the structure for per-CPU statistics, aligned on a
1577 * CPU cache line boundary; this code assumes that we never
1578 * uninitialize this framework, since the original address
1579 * before alignment is not saved.
1580 */
1581 ncpu = ml_get_max_cpus();
1582 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1583 M_TEMP, M_WAITOK);
1584 VERIFY(buf != NULL);
1585
1586 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1587 CPU_CACHE_LINE_SIZE);
1588 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1589
1590 /*
1591 * Set the max limit on sb_max to be 1/16th of the size of the
1592 * memory allocated for mbuf clusters.
1593 */
1594 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1595 if (high_sb_max < sb_max) {
1596 /* sb_max is too large for this configuration, scale it down */
1597 if (high_sb_max > (1 << MBSHIFT)) {
1598 /* We have at least 16 MB of mbuf pool */
1599 sb_max = high_sb_max;
1600 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1601 /*
1602 * If we have more than 1 MB of mbuf pool, cap the size of
1603 * the max socket buffer at 1 MB
1604 */
1605 sb_max = high_sb_max = (1 << MBSHIFT);
1606 } else {
1607 sb_max = high_sb_max;
1608 }
1609 }
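/*
 * Worked example (illustrative, assuming MCLSHIFT == 11 and
 * MBSHIFT == 20): with a hypothetical nmbclusters of 4096 the pool
 * is 8 MB and high_sb_max == 4096 << 7 == 512 KB.  If that is below
 * the configured sb_max, the first test fails (512 KB <= 1 MB) but
 * the pool exceeds 1 MB, so sb_max is capped at 1 MB.  With
 * nmbclusters == 32768, high_sb_max == 4 MB, and sb_max would be
 * lowered to 4 MB instead.
 */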
1610
1611 /* allocate space for mbuf_dump_buf */
1612 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1613 VERIFY(mbuf_dump_buf != NULL);
1614
1615 if (mbuf_debug & MCF_DEBUG) {
1616 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1617 (int)_MLEN, (int)_MHLEN);
1618 }
1619
1620 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1621 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1622 (nclusters << MCLSHIFT) >> MBSHIFT,
1623 (njcl << MCLSHIFT) >> MBSHIFT);
1624 }
1625
1626 /*
1627 * Obtain a slab of object(s) from the class's freelist.
1628 */
1629 static mcache_obj_t *
1630 slab_alloc(mbuf_class_t class, int wait)
1631 {
1632 mcl_slab_t *sp;
1633 mcache_obj_t *buf;
1634
1635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1636
1637 VERIFY(class != MC_16KCL || njcl > 0);
1638
1639 /* This should always be NULL for us */
1640 VERIFY(m_cobjlist(class) == NULL);
1641
1642 /*
1643 * Treat composite objects as having a longer lifespan by taking
1644 * a slab from the reverse direction, in the hope that this could
1645 * reduce the probability of fragmentation for slabs that hold
1646 * more than one buffer chunk (e.g. mbuf slabs). For other
1647 * slabs, this probably doesn't make much of a difference.
1648 */
1649 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1650 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1651 else
1652 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1653
1654 if (sp == NULL) {
1655 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1656 /* The slab list for this class is empty */
1657 return (NULL);
1658 }
1659
1660 VERIFY(m_infree(class) > 0);
1661 VERIFY(!slab_is_detached(sp));
1662 VERIFY(sp->sl_class == class &&
1663 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1664 buf = sp->sl_head;
1665 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1666
1667 if (class == MC_MBUF) {
1668 sp->sl_head = buf->obj_next;
1669 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1670 } else if (class == MC_CL) {
1671 sp->sl_head = buf->obj_next;
1672 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1673 } else {
1674 sp->sl_head = NULL;
1675 }
1676 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1677 slab_nextptr_panic(sp, sp->sl_head);
1678 /* In case sl_head is in the map but not in the slab */
1679 VERIFY(slab_inrange(sp, sp->sl_head));
1680 /* NOTREACHED */
1681 }
1682
1683 /* Increment slab reference */
1684 sp->sl_refcnt++;
1685
1686 if (mclaudit != NULL) {
1687 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1688 mca->mca_uflags = 0;
1689 /* Save contents on mbuf objects only */
1690 if (class == MC_MBUF)
1691 mca->mca_uflags |= MB_SCVALID;
1692 }
1693
1694 if (class == MC_CL) {
1695 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1696 /*
1697 * A 2K cluster slab can have at most NCLPBG references.
1698 */
1699 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1700 sp->sl_chunks == NCLPBG &&
1701 sp->sl_len == m_maxsize(MC_BIGCL));
1702 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1703 } else if (class == MC_BIGCL) {
1704 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1705 m_infree(MC_MBUF_BIGCL);
1706 /*
1707 * A 4K cluster slab can have at most 1 reference.
1708 */
1709 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1710 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1711 } else if (class == MC_16KCL) {
1712 mcl_slab_t *nsp;
1713 int k;
1714
1715 --m_infree(MC_16KCL);
1716 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1717 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1718 /*
1719 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1720 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1721 * most 1 reference.
1722 */
1723 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1724 nsp = nsp->sl_next;
1725 /* Next slab must already be present */
1726 VERIFY(nsp != NULL);
1727 nsp->sl_refcnt++;
1728 VERIFY(!slab_is_detached(nsp));
1729 VERIFY(nsp->sl_class == MC_16KCL &&
1730 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1731 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1732 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1733 nsp->sl_head == NULL);
1734 }
1735 } else {
1736 VERIFY(class == MC_MBUF);
1737 --m_infree(MC_MBUF);
1738 /*
1739 * If auditing is turned on, this check is
1740 * deferred until later in mbuf_slab_audit().
1741 */
1742 if (mclaudit == NULL)
1743 _MCHECK((struct mbuf *)buf);
1744 /*
1745 * Since we have incremented the reference count above,
1746 * an mbuf slab (formerly a 4KB cluster slab that was cut
1747 * up into mbufs) must have a reference count between 1
1748 * and NMBPBG at this point.
1749 */
1750 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1751 sp->sl_chunks == NMBPBG &&
1752 sp->sl_len == m_maxsize(MC_BIGCL));
1753 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1754 }
1755
1756 /* If empty, remove this slab from the class's freelist */
1757 if (sp->sl_head == NULL) {
1758 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1759 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1760 slab_remove(sp, class);
1761 }
1762
1763 return (buf);
1764 }
1765
1766 /*
1767 * Place a slab of object(s) back into a class's slab list.
1768 */
1769 static void
1770 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1771 {
1772 mcl_slab_t *sp;
1773
1774 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1775
1776 VERIFY(class != MC_16KCL || njcl > 0);
1777 VERIFY(buf->obj_next == NULL);
1778 sp = slab_get(buf);
1779 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1780 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1781
1782 /* Decrement slab reference */
1783 sp->sl_refcnt--;
1784
1785 if (class == MC_CL) {
1786 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1787 /*
1788 * A slab that has been split into 2KB clusters can have
1789 * at most 1 outstanding reference at this point.
1790 */
1791 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1792 sp->sl_chunks == NCLPBG &&
1793 sp->sl_len == m_maxsize(MC_BIGCL));
1794 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1795 (slab_is_detached(sp) && sp->sl_head == NULL));
1796 } else if (class == MC_BIGCL) {
1797 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1798 /*
1799 * A 4KB cluster slab can have at most 1 reference
1800 * which must be 0 at this point.
1801 */
1802 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1803 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1804 VERIFY(slab_is_detached(sp));
1805 } else if (class == MC_16KCL) {
1806 mcl_slab_t *nsp;
1807 int k;
1808 /*
1809 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1810 * must now have a reference count of 0.
1811 */
1812 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1813 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1814 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1815 VERIFY(slab_is_detached(sp));
1816 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1817 nsp = nsp->sl_next;
1818 /* Next slab must already be present */
1819 VERIFY(nsp != NULL);
1820 nsp->sl_refcnt--;
1821 VERIFY(slab_is_detached(nsp));
1822 VERIFY(nsp->sl_class == MC_16KCL &&
1823 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1824 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1825 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1826 nsp->sl_head == NULL);
1827 }
1828 } else {
1829 /*
1830 * A slab that has been split into mbufs can have at most NMBPBG
1831 * references. Since we have decremented one reference above,
1832 * the count must now be between 0 and NMBPBG-1.
1833 */
1834 VERIFY(class == MC_MBUF);
1835 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1836 sp->sl_chunks == NMBPBG &&
1837 sp->sl_len == m_maxsize(MC_BIGCL));
1838 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1839 (slab_is_detached(sp) && sp->sl_head == NULL));
1840 }
1841
1842 /*
1843 * When auditing is enabled, ensure that the buffer still
1844 * contains the free pattern; otherwise it was corrupted
1845 * while sitting at the CPU cache layer.
1846 */
1847 if (mclaudit != NULL) {
1848 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1849 if (mclverify) {
1850 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1851 }
1852 mca->mca_uflags &= ~MB_SCVALID;
1853 }
1854
1855 if (class == MC_CL) {
1856 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1857 buf->obj_next = sp->sl_head;
1858 } else if (class == MC_BIGCL) {
1859 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1860 m_infree(MC_MBUF_BIGCL);
1861 } else if (class == MC_16KCL) {
1862 ++m_infree(MC_16KCL);
1863 } else {
1864 ++m_infree(MC_MBUF);
1865 buf->obj_next = sp->sl_head;
1866 }
1867 sp->sl_head = buf;
1868
1869 /*
1870 * If a slab has been split into either one which holds 2KB
1871 * clusters or one which holds mbufs, turn it back into one which
1872 * holds a single 4KB cluster.
1873 */
1874 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1875 m_total(class) > m_minlimit(class) &&
1876 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1877 int i = NMBPBG;
1878
1879 m_total(MC_BIGCL)++;
1880 mbstat.m_bigclusters = m_total(MC_BIGCL);
1881 m_total(MC_MBUF) -= NMBPBG;
1882 mbstat.m_mbufs = m_total(MC_MBUF);
1883 m_infree(MC_MBUF) -= NMBPBG;
1884 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1885
1886 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1887 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1888
1889 while (i--) {
1890 struct mbuf *m = sp->sl_head;
1891 VERIFY(m != NULL);
1892 sp->sl_head = m->m_next;
1893 m->m_next = NULL;
1894 }
1895 VERIFY(sp->sl_head == NULL);
1896
1897 /* Remove the slab from the mbuf class's slab list */
1898 slab_remove(sp, class);
1899
1900 /* Reinitialize it as a 4KB cluster slab */
1901 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1902 sp->sl_len, 0, 1);
1903
1904 if (mclverify) {
1905 mcache_set_pattern(MCACHE_FREE_PATTERN,
1906 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1907 }
1908 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1909 m_infree(MC_MBUF_BIGCL);
1910
1911 VERIFY(slab_is_detached(sp));
1912 /* And finally switch class */
1913 class = MC_BIGCL;
1914 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1915 m_total(class) > m_minlimit(class) &&
1916 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1917 int i = NCLPBG;
1918
1919 m_total(MC_BIGCL)++;
1920 mbstat.m_bigclusters = m_total(MC_BIGCL);
1921 m_total(MC_CL) -= NCLPBG;
1922 mbstat.m_clusters = m_total(MC_CL);
1923 m_infree(MC_CL) -= NCLPBG;
1924 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1925 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1926
1927 while (i--) {
1928 union mcluster *c = sp->sl_head;
1929 VERIFY(c != NULL);
1930 sp->sl_head = c->mcl_next;
1931 c->mcl_next = NULL;
1932 }
1933 VERIFY(sp->sl_head == NULL);
1934
1935 /* Remove the slab from the 2KB cluster class's slab list */
1936 slab_remove(sp, class);
1937
1938 /* Reinitialize it as a 4KB cluster slab */
1939 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1940 sp->sl_len, 0, 1);
1941
1942 if (mclverify) {
1943 mcache_set_pattern(MCACHE_FREE_PATTERN,
1944 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1945 }
1946 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1947 m_infree(MC_MBUF_BIGCL);
1948
1949 VERIFY(slab_is_detached(sp));
1950 /* And finally switch class */
1951 class = MC_BIGCL;
1952 }
1953
1954 /* Reinsert the slab to the class's slab list */
1955 if (slab_is_detached(sp))
1956 slab_insert(sp, class);
1957 }
1958
1959 /*
1960 * Common allocator for rudimentary objects called by the CPU cache layer
1961 * during an allocation request whenever there is no available element in the
1962 * bucket layer. It returns one or more elements from the appropriate global
1963 * freelist. If the freelist is empty, it will attempt to populate it and
1964 * retry the allocation.
1965 */
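/*
 * Sketch of the contract (illustrative numbers): a request for
 * num == 4 that can only produce 3 objects returns 3.  The objects
 * are chained via obj_next, *plist ends up pointing at the obj_next
 * field of the last object returned, and m_alloc_cnt(class) is
 * bumped by 3.
 */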
1966 static unsigned int
1967 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1968 {
1969 mbuf_class_t class = (mbuf_class_t)arg;
1970 unsigned int need = num;
1971 mcache_obj_t **list = *plist;
1972
1973 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1974 ASSERT(need > 0);
1975
1976 lck_mtx_lock(mbuf_mlock);
1977
1978 for (;;) {
1979 if ((*list = slab_alloc(class, wait)) != NULL) {
1980 (*list)->obj_next = NULL;
1981 list = *plist = &(*list)->obj_next;
1982
1983 if (--need == 0) {
1984 /*
1985 * If the number of elements in the freelist has
1986 * dropped below the low watermark, asynchronously
1987 * populate the freelist now rather than doing
1988 * it later when we run out of elements.
1989 */
1990 if (!mbuf_cached_above(class, wait) &&
1991 m_infree(class) < m_total(class) >> 5) {
1992 (void) freelist_populate(class, 1,
1993 M_DONTWAIT);
1994 }
1995 break;
1996 }
1997 } else {
1998 VERIFY(m_infree(class) == 0 || class == MC_CL);
1999
2000 (void) freelist_populate(class, 1,
2001 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2002
2003 if (m_infree(class) > 0)
2004 continue;
2005
2006 /* Check if there's anything at the cache layer */
2007 if (mbuf_cached_above(class, wait))
2008 break;
2009
2010 /* watchdog checkpoint */
2011 mbuf_watchdog();
2012
2013 /* We have nothing and cannot block; give up */
2014 if (wait & MCR_NOSLEEP) {
2015 if (!(wait & MCR_TRYHARD)) {
2016 m_fail_cnt(class)++;
2017 mbstat.m_drops++;
2018 break;
2019 }
2020 }
2021
2022 /*
2023 * If the freelist is still empty and the caller is
2024 * willing to be blocked, sleep on the wait channel
2025 * until an element is available. Otherwise, if
2026 * MCR_TRYHARD is set, do our best to satisfy the
2027 * request without having to go to sleep.
2028 */
2029 if (mbuf_worker_ready &&
2030 mbuf_sleep(class, need, wait))
2031 break;
2032
2033 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2034 }
2035 }
2036
2037 m_alloc_cnt(class) += num - need;
2038 lck_mtx_unlock(mbuf_mlock);
2039
2040 return (num - need);
2041 }
2042
2043 /*
2044 * Common de-allocator for rudimentary objects called by the CPU cache
2045 * layer when one or more elements need to be returned to the appropriate
2046 * global freelist.
2047 */
2048 static void
2049 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2050 {
2051 mbuf_class_t class = (mbuf_class_t)arg;
2052 mcache_obj_t *nlist;
2053 unsigned int num = 0;
2054 int w;
2055
2056 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2057
2058 lck_mtx_lock(mbuf_mlock);
2059
2060 for (;;) {
2061 nlist = list->obj_next;
2062 list->obj_next = NULL;
2063 slab_free(class, list);
2064 ++num;
2065 if ((list = nlist) == NULL)
2066 break;
2067 }
2068 m_free_cnt(class) += num;
2069
2070 if ((w = mb_waiters) > 0)
2071 mb_waiters = 0;
2072
2073 lck_mtx_unlock(mbuf_mlock);
2074
2075 if (w != 0)
2076 wakeup(mb_waitchan);
2077 }
2078
2079 /*
2080 * Common auditor for rudimentary objects called by the CPU cache layer
2081 * during an allocation or free request. For the former, this is called
2082 * after the objects are obtained from either the bucket or slab layer
2083 * and before they are returned to the caller. For the latter, this is
2084 * called immediately during free and before placing the objects into
2085 * the bucket or slab layer.
2086 */
2087 static void
2088 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2089 {
2090 mbuf_class_t class = (mbuf_class_t)arg;
2091 mcache_audit_t *mca;
2092
2093 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2094
2095 while (list != NULL) {
2096 lck_mtx_lock(mbuf_mlock);
2097 mca = mcl_audit_buf2mca(class, list);
2098
2099 /* Do the sanity checks */
2100 if (class == MC_MBUF) {
2101 mcl_audit_mbuf(mca, list, FALSE, alloc);
2102 ASSERT(mca->mca_uflags & MB_SCVALID);
2103 } else {
2104 mcl_audit_cluster(mca, list, m_maxsize(class),
2105 alloc, TRUE);
2106 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2107 }
2108 /* Record this transaction */
2109 if (mcltrace)
2110 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2111
2112 if (alloc)
2113 mca->mca_uflags |= MB_INUSE;
2114 else
2115 mca->mca_uflags &= ~MB_INUSE;
2116 /* Unpair the object (unconditionally) */
2117 mca->mca_uptr = NULL;
2118 lck_mtx_unlock(mbuf_mlock);
2119
2120 list = list->obj_next;
2121 }
2122 }
2123
2124 /*
2125 * Common notify routine for all caches. It is called by mcache when
2126 * one or more objects get freed. We use this indication to trigger
2127 * the wakeup of any sleeping threads so that they can retry their
2128 * allocation requests.
2129 */
2130 static void
2131 mbuf_slab_notify(void *arg, u_int32_t reason)
2132 {
2133 mbuf_class_t class = (mbuf_class_t)arg;
2134 int w;
2135
2136 ASSERT(MBUF_CLASS_VALID(class));
2137
2138 if (reason != MCN_RETRYALLOC)
2139 return;
2140
2141 lck_mtx_lock(mbuf_mlock);
2142 if ((w = mb_waiters) > 0) {
2143 m_notified(class)++;
2144 mb_waiters = 0;
2145 }
2146 lck_mtx_unlock(mbuf_mlock);
2147
2148 if (w != 0)
2149 wakeup(mb_waitchan);
2150 }
2151
2152 /*
2153 * Obtain object(s) from the composite class's freelist.
2154 */
2155 static unsigned int
2156 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2157 {
2158 unsigned int need = num;
2159 mcl_slab_t *sp, *clsp, *nsp;
2160 struct mbuf *m;
2161 mcache_obj_t **list = *plist;
2162 void *cl;
2163
2164 VERIFY(need > 0);
2165 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2166 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2167
2168 /* Get what we can from the freelist */
2169 while ((*list = m_cobjlist(class)) != NULL) {
2170 MRANGE(*list);
2171
2172 m = (struct mbuf *)*list;
2173 sp = slab_get(m);
2174 cl = m->m_ext.ext_buf;
2175 clsp = slab_get(cl);
2176 VERIFY(m->m_flags == M_EXT && cl != NULL);
2177 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2178
2179 if (class == MC_MBUF_CL) {
2180 VERIFY(clsp->sl_refcnt >= 1 &&
2181 clsp->sl_refcnt <= NCLPBG);
2182 } else {
2183 VERIFY(clsp->sl_refcnt == 1);
2184 }
2185
2186 if (class == MC_MBUF_16KCL) {
2187 int k;
2188 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2189 nsp = nsp->sl_next;
2190 /* Next slab must already be present */
2191 VERIFY(nsp != NULL);
2192 VERIFY(nsp->sl_refcnt == 1);
2193 }
2194 }
2195
2196 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2197 !MBUF_IN_MAP(m_cobjlist(class))) {
2198 slab_nextptr_panic(sp, m_cobjlist(class));
2199 /* NOTREACHED */
2200 }
2201 (*list)->obj_next = NULL;
2202 list = *plist = &(*list)->obj_next;
2203
2204 if (--need == 0)
2205 break;
2206 }
2207 m_infree(class) -= (num - need);
2208
2209 return (num - need);
2210 }
2211
2212 /*
2213 * Place object(s) back into a composite class's freelist.
2214 */
2215 static unsigned int
2216 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2217 {
2218 mcache_obj_t *o, *tail;
2219 unsigned int num = 0;
2220 struct mbuf *m, *ms;
2221 mcache_audit_t *mca = NULL;
2222 mcache_obj_t *ref_list = NULL;
2223 mcl_slab_t *clsp, *nsp;
2224 void *cl;
2225 mbuf_class_t cl_class;
2226
2227 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2228 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2229 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2230
2231 if (class == MC_MBUF_CL) {
2232 cl_class = MC_CL;
2233 } else if (class == MC_MBUF_BIGCL) {
2234 cl_class = MC_BIGCL;
2235 } else {
2236 VERIFY(class == MC_MBUF_16KCL);
2237 cl_class = MC_16KCL;
2238 }
2239
2240 o = tail = list;
2241
2242 while ((m = ms = (struct mbuf *)o) != NULL) {
2243 mcache_obj_t *rfa, *nexto = o->obj_next;
2244
2245 /* Do the mbuf sanity checks */
2246 if (mclaudit != NULL) {
2247 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2248 if (mclverify) {
2249 mcache_audit_free_verify(mca, m, 0,
2250 m_maxsize(MC_MBUF));
2251 }
2252 ms = MCA_SAVED_MBUF_PTR(mca);
2253 }
2254
2255 /* Do the cluster sanity checks */
2256 cl = ms->m_ext.ext_buf;
2257 clsp = slab_get(cl);
2258 if (mclverify) {
2259 size_t size = m_maxsize(cl_class);
2260 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2261 (mcache_obj_t *)cl), cl, 0, size);
2262 }
2263 VERIFY(ms->m_type == MT_FREE);
2264 VERIFY(ms->m_flags == M_EXT);
2265 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2266 if (cl_class == MC_CL) {
2267 VERIFY(clsp->sl_refcnt >= 1 &&
2268 clsp->sl_refcnt <= NCLPBG);
2269 } else {
2270 VERIFY(clsp->sl_refcnt == 1);
2271 }
2272 if (cl_class == MC_16KCL) {
2273 int k;
2274 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2275 nsp = nsp->sl_next;
2276 /* Next slab must already be present */
2277 VERIFY(nsp != NULL);
2278 VERIFY(nsp->sl_refcnt == 1);
2279 }
2280 }
2281
2282 /*
2283 * If we're asked to purge, restore the actual mbuf using the
2284 * contents of the shadow structure (if auditing is enabled)
2285 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2286 * about to free it and the attached cluster into their caches.
2287 */
2288 if (purged) {
2289 /* Restore constructed mbuf fields */
2290 if (mclaudit != NULL)
2291 mcl_audit_restore_mbuf(m, mca, TRUE);
2292
2293 MEXT_REF(m) = 0;
2294 MEXT_FLAGS(m) = 0;
2295
2296 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2297 rfa->obj_next = ref_list;
2298 ref_list = rfa;
2299 MEXT_RFA(m) = NULL;
2300
2301 m->m_type = MT_FREE;
2302 m->m_flags = m->m_len = 0;
2303 m->m_next = m->m_nextpkt = NULL;
2304
2305 /* Save mbuf fields and make auditing happy */
2306 if (mclaudit != NULL)
2307 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2308
2309 VERIFY(m_total(class) > 0);
2310 m_total(class)--;
2311
2312 /* Free the mbuf */
2313 o->obj_next = NULL;
2314 slab_free(MC_MBUF, o);
2315
2316 /* And free the cluster */
2317 ((mcache_obj_t *)cl)->obj_next = NULL;
2318 if (class == MC_MBUF_CL)
2319 slab_free(MC_CL, cl);
2320 else if (class == MC_MBUF_BIGCL)
2321 slab_free(MC_BIGCL, cl);
2322 else
2323 slab_free(MC_16KCL, cl);
2324 }
2325
2326 ++num;
2327 tail = o;
2328 o = nexto;
2329 }
2330
2331 if (!purged) {
2332 tail->obj_next = m_cobjlist(class);
2333 m_cobjlist(class) = list;
2334 m_infree(class) += num;
2335 } else if (ref_list != NULL) {
2336 mcache_free_ext(ref_cache, ref_list);
2337 }
2338
2339 return (num);
2340 }
2341
2342 /*
2343 * Common allocator for composite objects called by the CPU cache layer
2344 * during an allocation request whenever there is no available element in
2345 * the bucket layer. It returns one or more composite elements from the
2346 * appropriate global freelist. If the freelist is empty, it will attempt
2347 * to obtain the rudimentary objects from their caches and construct them
2348 * into composite mbuf + cluster objects.
2349 */
2350 static unsigned int
2351 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2352 int wait)
2353 {
2354 mbuf_class_t class = (mbuf_class_t)arg;
2355 mbuf_class_t cl_class = 0;
2356 unsigned int num = 0, cnum = 0, want = needed;
2357 mcache_obj_t *ref_list = NULL;
2358 mcache_obj_t *mp_list = NULL;
2359 mcache_obj_t *clp_list = NULL;
2360 mcache_obj_t **list;
2361 struct ext_ref *rfa;
2362 struct mbuf *m;
2363 void *cl;
2364
2365 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2366 ASSERT(needed > 0);
2367
2368 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2369
2370 /* There should not be any slab for this class */
2371 VERIFY(m_slab_cnt(class) == 0 &&
2372 m_slablist(class).tqh_first == NULL &&
2373 m_slablist(class).tqh_last == NULL);
2374
2375 lck_mtx_lock(mbuf_mlock);
2376
2377 /* Try using the freelist first */
2378 num = cslab_alloc(class, plist, needed);
2379 list = *plist;
2380 if (num == needed) {
2381 m_alloc_cnt(class) += num;
2382 lck_mtx_unlock(mbuf_mlock);
2383 return (needed);
2384 }
2385
2386 lck_mtx_unlock(mbuf_mlock);
2387
2388 /*
2389 * We could not satisfy the request using the freelist alone;
2390 * allocate from the appropriate rudimentary caches and use
2391 * whatever we can get to construct the composite objects.
2392 */
2393 needed -= num;
2394
2395 /*
2396 * Mark these allocation requests as coming from a composite cache.
2397 * Also, if the caller is willing to be blocked, mark the request
2398 * with MCR_FAILOK so that we don't end up sleeping at the mbuf
2399 * slab layer waiting for individual objects when one or more
2400 * already-constructed composite objects are available.
2401 */
2402 wait |= MCR_COMP;
2403 if (!(wait & MCR_NOSLEEP))
2404 wait |= MCR_FAILOK;
2405
2406 /* allocate mbufs */
2407 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2408 if (needed == 0) {
2409 ASSERT(mp_list == NULL);
2410 goto fail;
2411 }
2412
2413 /* allocate clusters */
2414 if (class == MC_MBUF_CL) {
2415 cl_class = MC_CL;
2416 } else if (class == MC_MBUF_BIGCL) {
2417 cl_class = MC_BIGCL;
2418 } else {
2419 VERIFY(class == MC_MBUF_16KCL);
2420 cl_class = MC_16KCL;
2421 }
2422 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2423 if (needed == 0) {
2424 ASSERT(clp_list == NULL);
2425 goto fail;
2426 }
2427
2428 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2429 if (needed == 0) {
2430 ASSERT(ref_list == NULL);
2431 goto fail;
2432 }
2433
2434 /*
2435 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2436 * leftovers will get freed accordingly before we return to the caller.
2437 */
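/*
 * Illustrative numbers: if the mcache_alloc_ext() calls above yield
 * 8 mbufs, then 6 clusters (requested 8), then 6 ref structures
 * (requested 6), "needed" ends up as 6; the 2 surplus mbufs left on
 * mp_list are returned to their cache at the "fail" label below.
 */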
2438 for (cnum = 0; cnum < needed; cnum++) {
2439 struct mbuf *ms;
2440
2441 m = ms = (struct mbuf *)mp_list;
2442 mp_list = mp_list->obj_next;
2443
2444 cl = clp_list;
2445 clp_list = clp_list->obj_next;
2446 ((mcache_obj_t *)cl)->obj_next = NULL;
2447
2448 rfa = (struct ext_ref *)ref_list;
2449 ref_list = ref_list->obj_next;
2450 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2451
2452 /*
2453 * If auditing is enabled, construct the shadow mbuf
2454 * in the audit structure instead of in the actual one.
2455 * mbuf_cslab_audit() will take care of restoring the
2456 * contents after the integrity check.
2457 */
2458 if (mclaudit != NULL) {
2459 mcache_audit_t *mca, *cl_mca;
2460
2461 lck_mtx_lock(mbuf_mlock);
2462 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2463 ms = MCA_SAVED_MBUF_PTR(mca);
2464 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2465
2466 /*
2467 * Pair them up. Note that this is done at the time
2468 * the mbuf+cluster objects are constructed. This
2469 * information should be treated as "best effort"
2470 * debugging hint since more than one mbufs can refer
2471 * to a cluster. In that case, the cluster might not
2472 * be freed along with the mbuf it was paired with.
2473 */
2474 mca->mca_uptr = cl_mca;
2475 cl_mca->mca_uptr = mca;
2476
2477 ASSERT(mca->mca_uflags & MB_SCVALID);
2478 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2479 lck_mtx_unlock(mbuf_mlock);
2480
2481 /* Technically, they are in the freelist */
2482 if (mclverify) {
2483 size_t size;
2484
2485 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2486 m_maxsize(MC_MBUF));
2487
2488 if (class == MC_MBUF_CL)
2489 size = m_maxsize(MC_CL);
2490 else if (class == MC_MBUF_BIGCL)
2491 size = m_maxsize(MC_BIGCL);
2492 else
2493 size = m_maxsize(MC_16KCL);
2494
2495 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2496 size);
2497 }
2498 }
2499
2500 MBUF_INIT(ms, 0, MT_FREE);
2501 if (class == MC_MBUF_16KCL) {
2502 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2503 } else if (class == MC_MBUF_BIGCL) {
2504 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2505 } else {
2506 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2507 }
2508 VERIFY(ms->m_flags == M_EXT);
2509 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2510
2511 *list = (mcache_obj_t *)m;
2512 (*list)->obj_next = NULL;
2513 list = *plist = &(*list)->obj_next;
2514 }
2515
2516 fail:
2517 /*
2518 * Free up what's left of the above.
2519 */
2520 if (mp_list != NULL)
2521 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2522 if (clp_list != NULL)
2523 mcache_free_ext(m_cache(cl_class), clp_list);
2524 if (ref_list != NULL)
2525 mcache_free_ext(ref_cache, ref_list);
2526
2527 lck_mtx_lock(mbuf_mlock);
2528 if (num > 0 || cnum > 0) {
2529 m_total(class) += cnum;
2530 VERIFY(m_total(class) <= m_maxlimit(class));
2531 m_alloc_cnt(class) += num + cnum;
2532 }
2533 if ((num + cnum) < want)
2534 m_fail_cnt(class) += (want - (num + cnum));
2535 lck_mtx_unlock(mbuf_mlock);
2536
2537 return (num + cnum);
2538 }
2539
2540 /*
2541 * Common de-allocator for composite objects called by the CPU cache
2542 * layer when one or more elements need to be returned to the appropriate
2543 * global freelist.
2544 */
2545 static void
2546 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2547 {
2548 mbuf_class_t class = (mbuf_class_t)arg;
2549 unsigned int num;
2550 int w;
2551
2552 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2553
2554 lck_mtx_lock(mbuf_mlock);
2555
2556 num = cslab_free(class, list, purged);
2557 m_free_cnt(class) += num;
2558
2559 if ((w = mb_waiters) > 0)
2560 mb_waiters = 0;
2561
2562 lck_mtx_unlock(mbuf_mlock);
2563
2564 if (w != 0)
2565 wakeup(mb_waitchan);
2566 }
2567
2568 /*
2569 * Common auditor for composite objects called by the CPU cache layer
2570 * during an allocation or free request. For the former, this is called
2571 * after the objects are obtained from either the bucket or slab layer
2572 * and before they are returned to the caller. For the latter, this is
2573 * called immediately during free and before placing the objects into
2574 * the bucket or slab layer.
2575 */
2576 static void
2577 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2578 {
2579 mbuf_class_t class = (mbuf_class_t)arg;
2580 mcache_audit_t *mca;
2581 struct mbuf *m, *ms;
2582 mcl_slab_t *clsp, *nsp;
2583 size_t size;
2584 void *cl;
2585
2586 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2587
2588 while ((m = ms = (struct mbuf *)list) != NULL) {
2589 lck_mtx_lock(mbuf_mlock);
2590 /* Do the mbuf sanity checks and record its transaction */
2591 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2592 mcl_audit_mbuf(mca, m, TRUE, alloc);
2593 if (mcltrace)
2594 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2595
2596 if (alloc)
2597 mca->mca_uflags |= MB_COMP_INUSE;
2598 else
2599 mca->mca_uflags &= ~MB_COMP_INUSE;
2600
2601 /*
2602 * Use the shadow mbuf in the audit structure if we are
2603 * freeing, since the contents of the actual mbuf have been
2604 * pattern-filled by the above call to mcl_audit_mbuf().
2605 */
2606 if (!alloc && mclverify)
2607 ms = MCA_SAVED_MBUF_PTR(mca);
2608
2609 /* Do the cluster sanity checks and record its transaction */
2610 cl = ms->m_ext.ext_buf;
2611 clsp = slab_get(cl);
2612 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2613 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2614 if (class == MC_MBUF_CL)
2615 VERIFY(clsp->sl_refcnt >= 1 &&
2616 clsp->sl_refcnt <= NCLPBG);
2617 else
2618 VERIFY(clsp->sl_refcnt == 1);
2619
2620 if (class == MC_MBUF_16KCL) {
2621 int k;
2622 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2623 nsp = nsp->sl_next;
2624 /* Next slab must already be present */
2625 VERIFY(nsp != NULL);
2626 VERIFY(nsp->sl_refcnt == 1);
2627 }
2628 }
2629
2630 mca = mcl_audit_buf2mca(MC_CL, cl);
2631 if (class == MC_MBUF_CL)
2632 size = m_maxsize(MC_CL);
2633 else if (class == MC_MBUF_BIGCL)
2634 size = m_maxsize(MC_BIGCL);
2635 else
2636 size = m_maxsize(MC_16KCL);
2637 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2638 if (mcltrace)
2639 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2640
2641 if (alloc)
2642 mca->mca_uflags |= MB_COMP_INUSE;
2643 else
2644 mca->mca_uflags &= ~MB_COMP_INUSE;
2645 lck_mtx_unlock(mbuf_mlock);
2646
2647 list = list->obj_next;
2648 }
2649 }
2650
2651 /*
2652 * Allocate some number of mbuf clusters and place on cluster freelist.
2653 */
2654 static int
2655 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2656 {
2657 int i;
2658 vm_size_t size = 0;
2659 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2660 vm_offset_t page = 0;
2661 mcache_audit_t *mca_list = NULL;
2662 mcache_obj_t *con_list = NULL;
2663 mcl_slab_t *sp;
2664
2665 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2666 bufsize == m_maxsize(MC_16KCL));
2667
2668 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2669
2670 /*
2671 * Multiple threads may attempt to populate the cluster map one
2672 * after another. Since we drop the lock below prior to acquiring
2673 * the physical page(s), our view of the cluster map may no longer
2674 * be accurate, and we could end up over-committing the pages beyond
2675 * the maximum allowed for each class. To prevent this, the entire
2676 * operation (including the page mapping) is serialized.
2677 */
2678 while (mb_clalloc_busy) {
2679 mb_clalloc_waiters++;
2680 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2681 (PZERO-1), "m_clalloc", NULL);
2682 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2683 }
2684
2685 /* We are busy now; tell everyone else to go away */
2686 mb_clalloc_busy = TRUE;
2687
2688 /*
2689 * Honor the caller's wish to block or not block. We have a way
2690 * to grow the pool asynchronously using the mbuf worker thread.
2691 */
2692 i = m_howmany(num, bufsize);
2693 if (i == 0 || (wait & M_DONTWAIT))
2694 goto out;
2695
2696 lck_mtx_unlock(mbuf_mlock);
2697
2698 size = round_page(i * bufsize);
2699 page = kmem_mb_alloc(mb_map, size, large_buffer);
2700
2701 /*
2702 * If we did ask for "n" 16KB physically contiguous chunks
2703 * and didn't get them, then try again without this
2704 * restriction.
2705 */
2706 if (large_buffer && page == 0)
2707 page = kmem_mb_alloc(mb_map, size, 0);
2708
2709 if (page == 0) {
2710 if (bufsize == m_maxsize(MC_BIGCL)) {
2711 /* The allocation failed; for a 4KB request, fall back to a single page */
2712 size = NBPG;
2713 page = kmem_mb_alloc(mb_map, size, 0);
2714 }
2715
2716 if (page == 0) {
2717 lck_mtx_lock(mbuf_mlock);
2718 goto out;
2719 }
2720 }
2721
2722 VERIFY(IS_P2ALIGNED(page, NBPG));
2723 numpages = size / NBPG;
2724
2725 /* If auditing is enabled, allocate the audit structures now */
2726 if (mclaudit != NULL) {
2727 int needed;
2728
2729 /*
2730 * Yes, I realize this is a waste of memory for clusters
2731 * that never get transformed into mbufs, as we may end
2732 * up with NMBPBG-1 unused audit structures per cluster.
2733 * But doing so tremendously simplifies the allocation
2734 * strategy, since at this point we are not holding the
2735 * mbuf lock and the caller is okay to be blocked.
2736 */
2737 if (bufsize == m_maxsize(MC_BIGCL)) {
2738 needed = numpages * NMBPBG;
2739
2740 i = mcache_alloc_ext(mcl_audit_con_cache,
2741 &con_list, needed, MCR_SLEEP);
2742
2743 VERIFY(con_list != NULL && i == needed);
2744 } else {
2745 needed = numpages / NSLABSP16KB;
2746 }
2747
2748 i = mcache_alloc_ext(mcache_audit_cache,
2749 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2750
2751 VERIFY(mca_list != NULL && i == needed);
2752 }
2753
2754 lck_mtx_lock(mbuf_mlock);
2755
2756 for (i = 0; i < numpages; i++, page += NBPG) {
2757 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2758 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2759
2760 /*
2761 * If there is a mapper, the appropriate I/O page is returned;
2762 * zero out the page first to discard its past contents and
2763 * avoid exposing leftover kernel memory.
2764 */
2765 VERIFY(offset < mcl_pages);
2766 if (mcl_paddr_base != 0) {
2767 bzero((void *)(uintptr_t) page, page_size);
2768 new_page = IOMapperInsertPage(mcl_paddr_base,
2769 offset, new_page);
2770 }
2771 mcl_paddr[offset] = new_page;
2772
2773 /* Pattern-fill this fresh page */
2774 if (mclverify) {
2775 mcache_set_pattern(MCACHE_FREE_PATTERN,
2776 (caddr_t)page, NBPG);
2777 }
2778 if (bufsize == m_maxsize(MC_BIGCL)) {
2779 union mbigcluster *mbc = (union mbigcluster *)page;
2780
2781 /* One for the entire page */
2782 sp = slab_get(mbc);
2783 if (mclaudit != NULL) {
2784 mcl_audit_init(mbc, &mca_list, &con_list,
2785 AUDIT_CONTENTS_SIZE, NMBPBG);
2786 }
2787 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2788 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2789 mbc, mbc, bufsize, 0, 1);
2790
2791 /* Insert this slab */
2792 slab_insert(sp, MC_BIGCL);
2793
2794 /* Update stats now since slab_get() drops the lock */
2795 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2796 m_infree(MC_MBUF_BIGCL);
2797 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2798 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2799 } else if ((i % NSLABSP16KB) == 0) {
2800 union m16kcluster *m16kcl = (union m16kcluster *)page;
2801 mcl_slab_t *nsp;
2802 int k;
2803
2804 VERIFY(njcl > 0);
2805 /* One for the entire 16KB */
2806 sp = slab_get(m16kcl);
2807 if (mclaudit != NULL)
2808 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2809
2810 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2811 slab_init(sp, MC_16KCL, SLF_MAPPED,
2812 m16kcl, m16kcl, bufsize, 0, 1);
2813
2814 /*
2815 * 2nd-Nth page's slab is part of the first one,
2816 * where N is NSLABSP16KB.
2817 */
2818 for (k = 1; k < NSLABSP16KB; k++) {
2819 nsp = slab_get(((union mbigcluster *)page) + k);
2820 VERIFY(nsp->sl_refcnt == 0 &&
2821 nsp->sl_flags == 0);
2822 slab_init(nsp, MC_16KCL,
2823 SLF_MAPPED | SLF_PARTIAL,
2824 m16kcl, NULL, 0, 0, 0);
2825 }
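/*
 * For example, with 4KB pages (assumed) NSLABSP16KB == 4: one 16KB
 * cluster owns 4 consecutive slab entries, where only the first
 * carries the buffer and its length, and the remaining 3 are marked
 * SLF_MAPPED | SLF_PARTIAL with a zero length.
 */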
2826
2827 /* Insert this slab */
2828 slab_insert(sp, MC_16KCL);
2829
2830 /* Update stats now since slab_get() drops the lock */
2831 m_infree(MC_16KCL)++;
2832 m_total(MC_16KCL)++;
2833 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2834 }
2835 }
2836 VERIFY(mca_list == NULL && con_list == NULL);
2837
2838 /* We're done; let others enter */
2839 mb_clalloc_busy = FALSE;
2840 if (mb_clalloc_waiters > 0) {
2841 mb_clalloc_waiters = 0;
2842 wakeup(mb_clalloc_waitchan);
2843 }
2844
2845 if (bufsize == m_maxsize(MC_BIGCL))
2846 return (numpages);
2847
2848 VERIFY(bufsize == m_maxsize(MC_16KCL));
2849 return (numpages / NSLABSP16KB);
2850
2851 out:
2852 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2853
2854 /* We're done; let others enter */
2855 mb_clalloc_busy = FALSE;
2856 if (mb_clalloc_waiters > 0) {
2857 mb_clalloc_waiters = 0;
2858 wakeup(mb_clalloc_waitchan);
2859 }
2860
2861 /*
2862 * When non-blocking, we kick the worker thread if we have to grow the
2863 * pool or if the number of free clusters is less than requested.
2864 */
2865 if (bufsize == m_maxsize(MC_BIGCL)) {
2866 if (i > 0) {
2867 /*
2868 * Remember total number of 4KB clusters needed
2869 * at this time.
2870 */
2871 i += m_total(MC_BIGCL);
2872 if (i > mbuf_expand_big) {
2873 mbuf_expand_big = i;
2874 if (mbuf_worker_ready)
2875 wakeup((caddr_t)&mbuf_worker_run);
2876 }
2877 }
2878
2879 if (m_infree(MC_BIGCL) >= num)
2880 return (1);
2881 } else {
2882 if (i > 0) {
2883 /*
2884 * Remember total number of 16KB clusters needed
2885 * at this time.
2886 */
2887 i += m_total(MC_16KCL);
2888 if (i > mbuf_expand_16k) {
2889 mbuf_expand_16k = i;
2890 if (mbuf_worker_ready)
2891 wakeup((caddr_t)&mbuf_worker_run);
2892 }
2893 }
2894
2895 if (m_infree(MC_16KCL) >= num)
2896 return (1);
2897 }
2898 return (0);
2899 }
2900
2901 /*
2902 * Populate the global freelist of the corresponding buffer class.
2903 */
2904 static int
2905 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2906 {
2907 mcache_obj_t *o = NULL;
2908 int i, numpages = 0, count;
2909
2910 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2911 class == MC_16KCL);
2912
2913 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2914
2915 switch (class) {
2916 case MC_MBUF:
2917 case MC_CL:
2918 case MC_BIGCL:
2919 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2920 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2921
2922 /* Respect the 4KB clusters minimum limit */
2923 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2924 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2925 if (class != MC_BIGCL || (wait & MCR_COMP))
2926 return (0);
2927 }
2928 if (class == MC_BIGCL)
2929 return (i != 0);
2930 break;
2931
2932 case MC_16KCL:
2933 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2934 /* NOTREACHED */
2935
2936 default:
2937 VERIFY(0);
2938 /* NOTREACHED */
2939 }
2940
2941 VERIFY(class == MC_MBUF || class == MC_CL);
2942
2943 /* How many objects will we cut the page into? */
2944 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
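/*
 * With the common configuration (assumed here) of 256-byte mbufs,
 * 2KB clusters and 4KB pages, NMBPBG == 16 and NCLPBG == 2: each 4KB
 * slab taken from MC_BIGCL below is carved into 16 mbufs or 2
 * clusters, and m_total()/m_infree() grow by that amount per page.
 */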
2945
2946 for (count = 0; count < numpages; count++) {
2947
2948 /* respect totals, minlimit, maxlimit */
2949 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2950 m_total(class) >= m_maxlimit(class))
2951 break;
2952
2953 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2954 break;
2955
2956 struct mbuf *m = (struct mbuf *)o;
2957 union mcluster *c = (union mcluster *)o;
2958 mcl_slab_t *sp = slab_get(o);
2959 mcache_audit_t *mca = NULL;
2960
2961 VERIFY(slab_is_detached(sp) &&
2962 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2963
2964 /*
2965 * Make sure that the cluster is unmolested
2966 * while in the freelist
2967 */
2968 if (mclverify) {
2969 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2970 mcache_audit_free_verify(mca, o, 0,
2971 m_maxsize(MC_BIGCL));
2972 }
2973
2974 /* Reinitialize it as an mbuf or 2K slab */
2975 slab_init(sp, class, sp->sl_flags,
2976 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2977
2978 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2979 VERIFY(sp->sl_head == NULL);
2980
2981 VERIFY(m_total(MC_BIGCL) > 0);
2982 m_total(MC_BIGCL)--;
2983 mbstat.m_bigclusters = m_total(MC_BIGCL);
2984
2985 m_total(class) += numobj;
2986 m_infree(class) += numobj;
2987
2988 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2989 VERIFY(m_total(class) <= m_maxlimit(class));
2990
2991 i = numobj;
2992 if (class == MC_MBUF) {
2993 mbstat.m_mbufs = m_total(MC_MBUF);
2994 mtype_stat_add(MT_FREE, NMBPBG);
2995 while (i--) {
2996 /*
2997 * If auditing is enabled, construct the
2998 * shadow mbuf in the audit structure
2999 * instead of the actual one.
3000 * mbuf_slab_audit() will take care of
3001 * restoring the contents after the
3002 * integrity check.
3003 */
3004 if (mclaudit != NULL) {
3005 struct mbuf *ms;
3006 mca = mcl_audit_buf2mca(MC_MBUF,
3007 (mcache_obj_t *)m);
3008 ms = MCA_SAVED_MBUF_PTR(mca);
3009 ms->m_type = MT_FREE;
3010 } else {
3011 m->m_type = MT_FREE;
3012 }
3013 m->m_next = sp->sl_head;
3014 sp->sl_head = (void *)m++;
3015 }
3016 } else { /* MC_CL */
3017 mbstat.m_clfree =
3018 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3019 mbstat.m_clusters = m_total(MC_CL);
3020 while (i--) {
3021 c->mcl_next = sp->sl_head;
3022 sp->sl_head = (void *)c++;
3023 }
3024 }
3025
3026 /* Insert into the mbuf or 2k slab list */
3027 slab_insert(sp, class);
3028
3029 if ((i = mb_waiters) > 0)
3030 mb_waiters = 0;
3031 if (i != 0)
3032 wakeup(mb_waitchan);
3033 }
3034 return (count != 0);
3035 }
3036
3037 /*
3038 * For each class, initialize the freelist to hold m_minlimit() objects.
3039 */
3040 static void
3041 freelist_init(mbuf_class_t class)
3042 {
3043 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3044
3045 VERIFY(class == MC_CL || class == MC_BIGCL);
3046 VERIFY(m_total(class) == 0);
3047 VERIFY(m_minlimit(class) > 0);
3048
3049 while (m_total(class) < m_minlimit(class))
3050 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3051
3052 VERIFY(m_total(class) >= m_minlimit(class));
3053 }
3054
3055 /*
3056 * (Inaccurately) check if it might be worth a trip back to the
3057 * mcache layer due to the availability of objects there. We'll
3058 * end up back here if there's nothing up there.
3059 */
3060 static boolean_t
3061 mbuf_cached_above(mbuf_class_t class, int wait)
3062 {
3063 switch (class) {
3064 case MC_MBUF:
3065 if (wait & MCR_COMP)
3066 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3067 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3068 break;
3069
3070 case MC_CL:
3071 if (wait & MCR_COMP)
3072 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3073 break;
3074
3075 case MC_BIGCL:
3076 if (wait & MCR_COMP)
3077 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3078 break;
3079
3080 case MC_16KCL:
3081 if (wait & MCR_COMP)
3082 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3083 break;
3084
3085 case MC_MBUF_CL:
3086 case MC_MBUF_BIGCL:
3087 case MC_MBUF_16KCL:
3088 break;
3089
3090 default:
3091 VERIFY(0);
3092 /* NOTREACHED */
3093 }
3094
3095 return (!mcache_bkt_isempty(m_cache(class)));
3096 }
3097
3098 /*
3099 * If possible, convert constructed objects to raw ones.
3100 */
3101 static boolean_t
3102 mbuf_steal(mbuf_class_t class, unsigned int num)
3103 {
3104 mcache_obj_t *top = NULL;
3105 mcache_obj_t **list = &top;
3106 unsigned int tot = 0;
3107
3108 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3109
3110 switch (class) {
3111 case MC_MBUF:
3112 case MC_CL:
3113 case MC_BIGCL:
3114 case MC_16KCL:
3115 return (FALSE);
3116
3117 case MC_MBUF_CL:
3118 case MC_MBUF_BIGCL:
3119 case MC_MBUF_16KCL:
3120 /* Get the required number of constructed objects if possible */
3121 if (m_infree(class) > m_minlimit(class)) {
3122 tot = cslab_alloc(class, &list,
3123 MIN(num, m_infree(class)));
3124 }
3125
3126 /* And destroy them to get back the raw objects */
3127 if (top != NULL)
3128 (void) cslab_free(class, top, 1);
3129 break;
3130
3131 default:
3132 VERIFY(0);
3133 /* NOTREACHED */
3134 }
3135
3136 return (tot == num);
3137 }
3138
3139 static void
3140 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3141 {
3142 int m, bmap = 0;
3143
3144 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3145
3146 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3147 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3148 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3149
3150 /*
3151 * This logic can be made smarter; for now, simply mark
3152 * all other related classes as potential victims.
3153 */
3154 switch (class) {
3155 case MC_MBUF:
3156 m_wantpurge(MC_CL)++;
3157 m_wantpurge(MC_BIGCL)++;
3158 m_wantpurge(MC_MBUF_CL)++;
3159 m_wantpurge(MC_MBUF_BIGCL)++;
3160 break;
3161
3162 case MC_CL:
3163 m_wantpurge(MC_MBUF)++;
3164 m_wantpurge(MC_BIGCL)++;
3165 m_wantpurge(MC_MBUF_BIGCL)++;
3166 if (!comp)
3167 m_wantpurge(MC_MBUF_CL)++;
3168 break;
3169
3170 case MC_BIGCL:
3171 m_wantpurge(MC_MBUF)++;
3172 m_wantpurge(MC_CL)++;
3173 m_wantpurge(MC_MBUF_CL)++;
3174 if (!comp)
3175 m_wantpurge(MC_MBUF_BIGCL)++;
3176 break;
3177
3178 case MC_16KCL:
3179 if (!comp)
3180 m_wantpurge(MC_MBUF_16KCL)++;
3181 break;
3182
3183 default:
3184 VERIFY(0);
3185 /* NOTREACHED */
3186 }
3187
3188 /*
3189 * Run through each marked class and check if we really need to
3190 * purge (and therefore temporarily disable) the per-CPU cache
3191 * layer used by the class. If so, remember the classes since
3192 * we are going to drop the lock below prior to purging.
3193 */
3194 for (m = 0; m < NELEM(mbuf_table); m++) {
3195 if (m_wantpurge(m) > 0) {
3196 m_wantpurge(m) = 0;
3197 /*
3198 * Try hard to steal the required number of objects
3199 * from the freelist of other mbuf classes. Only
3200 * purge and disable the per-CPU cache layer when
3201 * we don't have enough; it's the last resort.
3202 */
3203 if (!mbuf_steal(m, num))
3204 bmap |= (1 << m);
3205 }
3206 }
3207
3208 lck_mtx_unlock(mbuf_mlock);
3209
3210 if (bmap != 0) {
3211 /* signal the domains to drain */
3212 net_drain_domains();
3213
3214 /* Sigh; we have no other choice but to ask mcache to purge */
3215 for (m = 0; m < NELEM(mbuf_table); m++) {
3216 if ((bmap & (1 << m)) &&
3217 mcache_purge_cache(m_cache(m))) {
3218 lck_mtx_lock(mbuf_mlock);
3219 m_purge_cnt(m)++;
3220 mbstat.m_drain++;
3221 lck_mtx_unlock(mbuf_mlock);
3222 }
3223 }
3224 } else {
3225 /*
3226 * Request mcache to reap extra elements from all of its caches;
3227 * note that all reaps are serialized and happen only at a fixed
3228 * interval.
3229 */
3230 mcache_reap();
3231 }
3232 lck_mtx_lock(mbuf_mlock);
3233 }
3234
3235 static inline struct mbuf *
3236 m_get_common(int wait, short type, int hdr)
3237 {
3238 struct mbuf *m;
3239 int mcflags = MSLEEPF(wait);
3240
3241 /* Is this due to a non-blocking retry? If so, then try harder */
3242 if (mcflags & MCR_NOSLEEP)
3243 mcflags |= MCR_TRYHARD;
3244
3245 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3246 if (m != NULL) {
3247 MBUF_INIT(m, hdr, type);
3248 mtype_stat_inc(type);
3249 mtype_stat_dec(MT_FREE);
3250 #if CONFIG_MACF_NET
3251 if (hdr && mac_init_mbuf(m, wait) != 0) {
3252 m_free(m);
3253 return (NULL);
3254 }
3255 #endif /* CONFIG_MACF_NET */
3256 }
3257 return (m);
3258 }
3259
3260 /*
3261 * Space allocation routines; these are also available as macros
3262 * for critical paths.
3263 */
3264 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3265 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3266 #define _M_RETRY(wait, type) _M_GET(wait, type)
3267 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3268 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3269 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
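/*
 * Typical caller usage (sketch; not part of this file):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_free(m);
 */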
3270
3271 struct mbuf *
3272 m_get(int wait, int type)
3273 {
3274 return (_M_GET(wait, type));
3275 }
3276
3277 struct mbuf *
3278 m_gethdr(int wait, int type)
3279 {
3280 return (_M_GETHDR(wait, type));
3281 }
3282
3283 struct mbuf *
3284 m_retry(int wait, int type)
3285 {
3286 return (_M_RETRY(wait, type));
3287 }
3288
3289 struct mbuf *
3290 m_retryhdr(int wait, int type)
3291 {
3292 return (_M_RETRYHDR(wait, type));
3293 }
3294
3295 struct mbuf *
3296 m_getclr(int wait, int type)
3297 {
3298 struct mbuf *m;
3299
3300 _MGET(m, wait, type);
3301 if (m != NULL)
3302 bzero(MTOD(m, caddr_t), MLEN);
3303 return (m);
3304 }
3305
3306 struct mbuf *
3307 m_free(struct mbuf *m)
3308 {
3309 struct mbuf *n = m->m_next;
3310
3311 if (m->m_type == MT_FREE)
3312 panic("m_free: freeing an already freed mbuf");
3313
3314 if (m->m_flags & M_PKTHDR) {
3315 /* Check for scratch area overflow */
3316 m_redzone_verify(m);
3317 /* Free the aux data and tags if there is any */
3318 m_tag_delete_chain(m, NULL);
3319 }
3320
3321 if (m->m_flags & M_EXT) {
3322 u_int32_t refcnt;
3323 u_int32_t composite;
3324
3325 refcnt = m_decref(m);
3326 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3327 if (refcnt == 0 && !composite) {
3328 if (m->m_ext.ext_free == NULL) {
3329 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3330 } else if (m->m_ext.ext_free == m_bigfree) {
3331 mcache_free(m_cache(MC_BIGCL),
3332 m->m_ext.ext_buf);
3333 } else if (m->m_ext.ext_free == m_16kfree) {
3334 mcache_free(m_cache(MC_16KCL),
3335 m->m_ext.ext_buf);
3336 } else {
3337 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3338 m->m_ext.ext_size, m->m_ext.ext_arg);
3339 }
3340 mcache_free(ref_cache, MEXT_RFA(m));
3341 MEXT_RFA(m) = NULL;
3342 } else if (refcnt == 0 && composite) {
3343 VERIFY(m->m_type != MT_FREE);
3344
3345 mtype_stat_dec(m->m_type);
3346 mtype_stat_inc(MT_FREE);
3347
3348 m->m_type = MT_FREE;
3349 m->m_flags = M_EXT;
3350 m->m_len = 0;
3351 m->m_next = m->m_nextpkt = NULL;
3352
3353 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3354
3355 /* "Free" into the intermediate cache */
3356 if (m->m_ext.ext_free == NULL) {
3357 mcache_free(m_cache(MC_MBUF_CL), m);
3358 } else if (m->m_ext.ext_free == m_bigfree) {
3359 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3360 } else {
3361 VERIFY(m->m_ext.ext_free == m_16kfree);
3362 mcache_free(m_cache(MC_MBUF_16KCL), m);
3363 }
3364 return (n);
3365 }
3366 }
3367
3368 if (m->m_type != MT_FREE) {
3369 mtype_stat_dec(m->m_type);
3370 mtype_stat_inc(MT_FREE);
3371 }
3372
3373 m->m_type = MT_FREE;
3374 m->m_flags = m->m_len = 0;
3375 m->m_next = m->m_nextpkt = NULL;
3376
3377 mcache_free(m_cache(MC_MBUF), m);
3378
3379 return (n);
3380 }
3381
3382 __private_extern__ struct mbuf *
3383 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3384 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3385 int wait)
3386 {
3387 struct ext_ref *rfa = NULL;
3388
3389 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3390 return (NULL);
3391
3392 if (m->m_flags & M_EXT) {
3393 u_int32_t refcnt;
3394 u_int32_t composite;
3395
3396 refcnt = m_decref(m);
3397 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3398 if (refcnt == 0 && !composite) {
3399 if (m->m_ext.ext_free == NULL) {
3400 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3401 } else if (m->m_ext.ext_free == m_bigfree) {
3402 mcache_free(m_cache(MC_BIGCL),
3403 m->m_ext.ext_buf);
3404 } else if (m->m_ext.ext_free == m_16kfree) {
3405 mcache_free(m_cache(MC_16KCL),
3406 m->m_ext.ext_buf);
3407 } else {
3408 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3409 m->m_ext.ext_size, m->m_ext.ext_arg);
3410 }
3411 /* Re-use the reference structure */
3412 rfa = MEXT_RFA(m);
3413 } else if (refcnt == 0 && composite) {
3414 VERIFY(m->m_type != MT_FREE);
3415
3416 mtype_stat_dec(m->m_type);
3417 mtype_stat_inc(MT_FREE);
3418
3419 m->m_type = MT_FREE;
3420 m->m_flags = M_EXT;
3421 m->m_len = 0;
3422 m->m_next = m->m_nextpkt = NULL;
3423
3424 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3425
3426 /* "Free" into the intermediate cache */
3427 if (m->m_ext.ext_free == NULL) {
3428 mcache_free(m_cache(MC_MBUF_CL), m);
3429 } else if (m->m_ext.ext_free == m_bigfree) {
3430 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3431 } else {
3432 VERIFY(m->m_ext.ext_free == m_16kfree);
3433 mcache_free(m_cache(MC_MBUF_16KCL), m);
3434 }
3435 /*
3436 * Allocate a new mbuf, since we didn't divorce
3437 * the composite mbuf + cluster pair above.
3438 */
3439 if ((m = _M_GETHDR(wait, type)) == NULL)
3440 return (NULL);
3441 }
3442 }
3443
3444 if (rfa == NULL &&
3445 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3446 m_free(m);
3447 return (NULL);
3448 }
3449
3450 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3451
3452 return (m);
3453 }
3454
3455 /*
3456 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3457 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3458 */
3459 struct mbuf *
3460 m_getcl(int wait, int type, int flags)
3461 {
3462 struct mbuf *m;
3463 int mcflags = MSLEEPF(wait);
3464 int hdr = (flags & M_PKTHDR);
3465
3466 /* Is this due to a non-blocking retry? If so, then try harder */
3467 if (mcflags & MCR_NOSLEEP)
3468 mcflags |= MCR_TRYHARD;
3469
3470 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3471 if (m != NULL) {
3472 u_int32_t flag;
3473 struct ext_ref *rfa;
3474 void *cl;
3475
3476 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3477 cl = m->m_ext.ext_buf;
3478 rfa = MEXT_RFA(m);
3479
3480 ASSERT(cl != NULL && rfa != NULL);
3481 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3482
3483 flag = MEXT_FLAGS(m);
3484
3485 MBUF_INIT(m, hdr, type);
3486 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3487
3488 mtype_stat_inc(type);
3489 mtype_stat_dec(MT_FREE);
3490 #if CONFIG_MACF_NET
3491 if (hdr && mac_init_mbuf(m, wait) != 0) {
3492 m_freem(m);
3493 return (NULL);
3494 }
3495 #endif /* MAC_NET */
3496 }
3497 return (m);
3498 }
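
/*
 * Illustrative sketch (not from the original source): typical non-blocking
 * use of m_getcl() to obtain an mbuf with a 2KB cluster and a packet header
 * already attached.
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR)) == NULL)
 *		return (ENOBUFS);
 *	m->m_len = m->m_pkthdr.len = 0;
 */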
3499
3500 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3501 struct mbuf *
3502 m_mclget(struct mbuf *m, int wait)
3503 {
3504 struct ext_ref *rfa;
3505
3506 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3507 return (m);
3508
3509 m->m_ext.ext_buf = m_mclalloc(wait);
3510 if (m->m_ext.ext_buf != NULL) {
3511 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3512 } else {
3513 mcache_free(ref_cache, rfa);
3514 }
3515 return (m);
3516 }
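
/*
 * Illustrative sketch (not from the original source): m_mclget() returns
 * the mbuf whether or not the cluster allocation succeeded, so callers are
 * expected to test M_EXT afterwards.
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		m = m_mclget(m, M_DONTWAIT);
 *		if (!(m->m_flags & M_EXT)) {
 *			m_free(m);		 no cluster was attached
 *			m = NULL;
 *		}
 *	}
 */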
3517
3518 /* Allocate an mbuf cluster */
3519 caddr_t
3520 m_mclalloc(int wait)
3521 {
3522 int mcflags = MSLEEPF(wait);
3523
3524 /* Is this due to a non-blocking retry? If so, then try harder */
3525 if (mcflags & MCR_NOSLEEP)
3526 mcflags |= MCR_TRYHARD;
3527
3528 return (mcache_alloc(m_cache(MC_CL), mcflags));
3529 }
3530
3531 /* Free an mbuf cluster */
3532 void
3533 m_mclfree(caddr_t p)
3534 {
3535 mcache_free(m_cache(MC_CL), p);
3536 }
3537
3538 /*
3539  * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3540 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3541 */
3542 int
3543 m_mclhasreference(struct mbuf *m)
3544 {
3545 if (!(m->m_flags & M_EXT))
3546 return (0);
3547
3548 ASSERT(MEXT_RFA(m) != NULL);
3549
3550 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3551 }
3552
3553 __private_extern__ caddr_t
3554 m_bigalloc(int wait)
3555 {
3556 int mcflags = MSLEEPF(wait);
3557
3558 /* Is this due to a non-blocking retry? If so, then try harder */
3559 if (mcflags & MCR_NOSLEEP)
3560 mcflags |= MCR_TRYHARD;
3561
3562 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3563 }
3564
3565 __private_extern__ void
3566 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3567 {
3568 mcache_free(m_cache(MC_BIGCL), p);
3569 }
3570
3571 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3572 __private_extern__ struct mbuf *
3573 m_mbigget(struct mbuf *m, int wait)
3574 {
3575 struct ext_ref *rfa;
3576
3577 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3578 return (m);
3579
3580 m->m_ext.ext_buf = m_bigalloc(wait);
3581 if (m->m_ext.ext_buf != NULL) {
3582 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3583 } else {
3584 mcache_free(ref_cache, rfa);
3585 }
3586 return (m);
3587 }
3588
3589 __private_extern__ caddr_t
3590 m_16kalloc(int wait)
3591 {
3592 int mcflags = MSLEEPF(wait);
3593
3594 /* Is this due to a non-blocking retry? If so, then try harder */
3595 if (mcflags & MCR_NOSLEEP)
3596 mcflags |= MCR_TRYHARD;
3597
3598 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3599 }
3600
3601 __private_extern__ void
3602 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3603 {
3604 mcache_free(m_cache(MC_16KCL), p);
3605 }
3606
3607 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3608 __private_extern__ struct mbuf *
3609 m_m16kget(struct mbuf *m, int wait)
3610 {
3611 struct ext_ref *rfa;
3612
3613 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3614 return (m);
3615
3616 m->m_ext.ext_buf = m_16kalloc(wait);
3617 if (m->m_ext.ext_buf != NULL) {
3618 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3619 } else {
3620 mcache_free(ref_cache, rfa);
3621 }
3622 return (m);
3623 }
3624
3625 /*
3626 * "Move" mbuf pkthdr from "from" to "to".
3627 * "from" must have M_PKTHDR set, and "to" must be empty.
3628 */
3629 void
3630 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3631 {
3632 VERIFY(from->m_flags & M_PKTHDR);
3633
3634 /* Check for scratch area overflow */
3635 m_redzone_verify(from);
3636
3637 if (to->m_flags & M_PKTHDR) {
3638 /* Check for scratch area overflow */
3639 m_redzone_verify(to);
3640 /* We will be taking over the tags of 'to' */
3641 m_tag_delete_chain(to, NULL);
3642 }
3643 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3644 m_classifier_init(from, 0); /* purge classifier info */
3645 m_tag_init(from, 1); /* purge all tags from src */
3646 m_scratch_init(from); /* clear src scratch area */
3647 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3648 if ((to->m_flags & M_EXT) == 0)
3649 to->m_data = to->m_pktdat;
3650 m_redzone_init(to); /* setup red zone on dst */
3651 }
3652
3653 /*
3654 * Duplicate "from"'s mbuf pkthdr in "to".
3655 * "from" must have M_PKTHDR set, and "to" must be empty.
3656 * In particular, this does a deep copy of the packet tags.
3657 */
3658 static int
3659 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3660 {
3661 VERIFY(from->m_flags & M_PKTHDR);
3662
3663 /* Check for scratch area overflow */
3664 m_redzone_verify(from);
3665
3666 if (to->m_flags & M_PKTHDR) {
3667 /* Check for scratch area overflow */
3668 m_redzone_verify(to);
3669 /* We will be taking over the tags of 'to' */
3670 m_tag_delete_chain(to, NULL);
3671 }
3672 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3673 if ((to->m_flags & M_EXT) == 0)
3674 to->m_data = to->m_pktdat;
3675 to->m_pkthdr = from->m_pkthdr;
3676 m_redzone_init(to); /* setup red zone on dst */
3677 m_tag_init(to, 0); /* preserve dst static tags */
3678 return (m_tag_copy_chain(to, from, how));
3679 }
3680
3681 void
3682 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3683 {
3684 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3685 #if PF_ECN
3686 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3687 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3688 #endif /* PF_ECN */
3689 }
3690
3691 void
3692 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
3693 {
3694 VERIFY(m->m_flags & M_PKTHDR);
3695
3696 m->m_pkthdr.pkt_proto = 0;
3697 m->m_pkthdr.pkt_flowsrc = 0;
3698 m->m_pkthdr.pkt_flowid = 0;
3699 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
3700 /* preserve service class and interface info for loopback packets */
3701 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
3702 (void) m_set_service_class(m, MBUF_SC_BE);
3703 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
3704 m->m_pkthdr.pkt_ifainfo = 0;
3705 #if MEASURE_BW
3706 m->m_pkthdr.pkt_bwseq = 0;
3707 #endif /* MEASURE_BW */
3708 }
3709
3710 void
3711 m_copy_classifier(struct mbuf *to, struct mbuf *from)
3712 {
3713 VERIFY(to->m_flags & M_PKTHDR);
3714 VERIFY(from->m_flags & M_PKTHDR);
3715
3716 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
3717 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
3718 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
3719 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
3720 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
3721 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
3722 to->m_pkthdr.ipsec_policy = from->m_pkthdr.ipsec_policy;
3723 #if MEASURE_BW
3724 to->m_pkthdr.pkt_bwseq = from->m_pkthdr.pkt_bwseq;
3725 #endif /* MEASURE_BW */
3726 }
3727
3728 /*
3729 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3730  * if wantall is not set, return however many are available. Set up the
3731 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3732 * are chained on the m_nextpkt field. Any packets requested beyond this
3733 * are chained onto the last packet header's m_next field. The size of
3734 * the cluster is controlled by the parameter bufsize.
3735 */
3736 __private_extern__ struct mbuf *
3737 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3738 int wait, int wantall, size_t bufsize)
3739 {
3740 struct mbuf *m;
3741 struct mbuf **np, *top;
3742 unsigned int pnum, needed = *num_needed;
3743 mcache_obj_t *mp_list = NULL;
3744 int mcflags = MSLEEPF(wait);
3745 u_int32_t flag;
3746 struct ext_ref *rfa;
3747 mcache_t *cp;
3748 void *cl;
3749
3750 ASSERT(bufsize == m_maxsize(MC_CL) ||
3751 bufsize == m_maxsize(MC_BIGCL) ||
3752 bufsize == m_maxsize(MC_16KCL));
3753
3754 /*
3755 * Caller must first check for njcl because this
3756 * routine is internal and not exposed/used via KPI.
3757 */
3758 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3759
3760 top = NULL;
3761 np = &top;
3762 pnum = 0;
3763
3764 /*
3765 * The caller doesn't want all the requested buffers; only some.
3766 * Try hard to get what we can, but don't block. This effectively
3767 * overrides MCR_SLEEP, since this thread will not go to sleep
3768 * if we can't get all the buffers.
3769 */
3770 if (!wantall || (mcflags & MCR_NOSLEEP))
3771 mcflags |= MCR_TRYHARD;
3772
3773 /* Allocate the composite mbuf + cluster elements from the cache */
3774 if (bufsize == m_maxsize(MC_CL))
3775 cp = m_cache(MC_MBUF_CL);
3776 else if (bufsize == m_maxsize(MC_BIGCL))
3777 cp = m_cache(MC_MBUF_BIGCL);
3778 else
3779 cp = m_cache(MC_MBUF_16KCL);
3780 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3781
3782 for (pnum = 0; pnum < needed; pnum++) {
3783 m = (struct mbuf *)mp_list;
3784 mp_list = mp_list->obj_next;
3785
3786 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3787 cl = m->m_ext.ext_buf;
3788 rfa = MEXT_RFA(m);
3789
3790 ASSERT(cl != NULL && rfa != NULL);
3791 VERIFY(MBUF_IS_COMPOSITE(m));
3792
3793 flag = MEXT_FLAGS(m);
3794
3795 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3796 if (bufsize == m_maxsize(MC_16KCL)) {
3797 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3798 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3799 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3800 } else {
3801 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3802 }
3803
3804 if (num_with_pkthdrs > 0) {
3805 --num_with_pkthdrs;
3806 #if CONFIG_MACF_NET
3807 if (mac_mbuf_label_init(m, wait) != 0) {
3808 m_freem(m);
3809 break;
3810 }
3811 #endif /* MAC_NET */
3812 }
3813
3814 *np = m;
3815 if (num_with_pkthdrs > 0)
3816 np = &m->m_nextpkt;
3817 else
3818 np = &m->m_next;
3819 }
3820 ASSERT(pnum != *num_needed || mp_list == NULL);
3821 if (mp_list != NULL)
3822 mcache_free_ext(cp, mp_list);
3823
3824 if (pnum > 0) {
3825 mtype_stat_add(MT_DATA, pnum);
3826 mtype_stat_sub(MT_FREE, pnum);
3827 }
3828
3829 if (wantall && (pnum != *num_needed)) {
3830 if (top != NULL)
3831 m_freem_list(top);
3832 return (NULL);
3833 }
3834
3835 if (pnum > *num_needed) {
3836 printf("%s: File a radar related to <rdar://10146739>. \
3837 needed = %u, pnum = %u, num_needed = %u \n",
3838 __func__, needed, pnum, *num_needed);
3839 }
3840
3841 *num_needed = pnum;
3842 return (top);
3843 }
3844
3845 /*
3846  * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3847  * wantall is not set, return however many are available. The size of
3848  * each mbuf in the list is controlled by the parameter packetlen. Each
3849  * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf
3850  * in the chain is called a segment. If maxsegments is not NULL and the
3851  * value pointed to is not zero, it specifies the maximum number of segments
3852  * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3853  * is zero, the caller places no restriction on the number of segments.
3854  * The actual number of segments of an mbuf chain is returned in the value
3855  * pointed to by maxsegments.
3856 */
3857 __private_extern__ struct mbuf *
3858 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3859 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3860 {
3861 struct mbuf **np, *top, *first = NULL;
3862 size_t bufsize, r_bufsize;
3863 unsigned int num = 0;
3864 unsigned int nsegs = 0;
3865 unsigned int needed, resid;
3866 int mcflags = MSLEEPF(wait);
3867 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3868 mcache_t *cp = NULL, *rcp = NULL;
3869
3870 if (*numlist == 0)
3871 return (NULL);
3872
3873 top = NULL;
3874 np = &top;
3875
3876 if (wantsize == 0) {
3877 if (packetlen <= MINCLSIZE) {
3878 bufsize = packetlen;
3879 } else if (packetlen > m_maxsize(MC_CL)) {
3880 /* Use 4KB if jumbo cluster pool isn't available */
3881 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3882 bufsize = m_maxsize(MC_BIGCL);
3883 else
3884 bufsize = m_maxsize(MC_16KCL);
3885 } else {
3886 bufsize = m_maxsize(MC_CL);
3887 }
3888 } else if (wantsize == m_maxsize(MC_CL) ||
3889 wantsize == m_maxsize(MC_BIGCL) ||
3890 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3891 bufsize = wantsize;
3892 } else {
3893 return (NULL);
3894 }
3895
3896 if (bufsize <= MHLEN) {
3897 nsegs = 1;
3898 } else if (bufsize <= MINCLSIZE) {
3899 if (maxsegments != NULL && *maxsegments == 1) {
3900 bufsize = m_maxsize(MC_CL);
3901 nsegs = 1;
3902 } else {
3903 nsegs = 2;
3904 }
3905 } else if (bufsize == m_maxsize(MC_16KCL)) {
3906 VERIFY(njcl > 0);
3907 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3908 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3909 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3910 } else {
3911 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3912 }
3913 if (maxsegments != NULL) {
3914 if (*maxsegments && nsegs > *maxsegments) {
3915 *maxsegments = nsegs;
3916 return (NULL);
3917 }
3918 *maxsegments = nsegs;
3919 }
3920
3921 /*
3922 * The caller doesn't want all the requested buffers; only some.
3923 * Try hard to get what we can, but don't block. This effectively
3924 * overrides MCR_SLEEP, since this thread will not go to sleep
3925 * if we can't get all the buffers.
3926 */
3927 if (!wantall || (mcflags & MCR_NOSLEEP))
3928 mcflags |= MCR_TRYHARD;
3929
3930 /*
3931 * Simple case where all elements in the lists/chains are mbufs.
3932 * Unless bufsize is greater than MHLEN, each segment chain is made
3933 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3934 * of 2 mbufs; the second one is used for the residual data, i.e.
3935 * the remaining data that cannot fit into the first mbuf.
3936 */
3937 if (bufsize <= MINCLSIZE) {
3938 /* Allocate the elements in one shot from the mbuf cache */
3939 ASSERT(bufsize <= MHLEN || nsegs == 2);
3940 cp = m_cache(MC_MBUF);
3941 needed = mcache_alloc_ext(cp, &mp_list,
3942 (*numlist) * nsegs, mcflags);
3943
3944 /*
3945 * The number of elements must be even if we are to use an
3946 * mbuf (instead of a cluster) to store the residual data.
3947 * If we couldn't allocate the requested number of mbufs,
3948 * trim the number down (if it's odd) in order to avoid
3949 * creating a partial segment chain.
3950 */
3951 if (bufsize > MHLEN && (needed & 0x1))
3952 needed--;
3953
3954 while (num < needed) {
3955 struct mbuf *m;
3956
3957 m = (struct mbuf *)mp_list;
3958 mp_list = mp_list->obj_next;
3959 ASSERT(m != NULL);
3960
3961 MBUF_INIT(m, 1, MT_DATA);
3962 #if CONFIG_MACF_NET
3963 if (mac_init_mbuf(m, wait) != 0) {
3964 m_free(m);
3965 break;
3966 }
3967 #endif /* MAC_NET */
3968 num++;
3969 if (bufsize > MHLEN) {
3970 /* A second mbuf for this segment chain */
3971 m->m_next = (struct mbuf *)mp_list;
3972 mp_list = mp_list->obj_next;
3973 ASSERT(m->m_next != NULL);
3974
3975 MBUF_INIT(m->m_next, 0, MT_DATA);
3976 num++;
3977 }
3978 *np = m;
3979 np = &m->m_nextpkt;
3980 }
3981 ASSERT(num != *numlist || mp_list == NULL);
3982
3983 if (num > 0) {
3984 mtype_stat_add(MT_DATA, num);
3985 mtype_stat_sub(MT_FREE, num);
3986 }
3987 num /= nsegs;
3988
3989 /* We've got them all; return to caller */
3990 if (num == *numlist)
3991 return (top);
3992
3993 goto fail;
3994 }
3995
3996 /*
3997 * Complex cases where elements are made up of one or more composite
3998 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3999 * be illustrated as follows:
4000 *
4001 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4002 *
4003 * Every composite mbuf + cluster element comes from the intermediate
4004 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4005 * the last composite element will come from the MC_MBUF_CL cache,
4006  * unless the residual data is larger than 2KB, in which case the
4007  * big cluster composite cache (MC_MBUF_BIGCL) is used instead. Residual
4008 * data is defined as extra data beyond the first element that cannot
4009 * fit into the previous element, i.e. there is no residual data if
4010 * the chain only has 1 segment.
4011 */
4012 r_bufsize = bufsize;
4013 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4014 if (resid > 0) {
4015 /* There is residual data; figure out the cluster size */
4016 if (wantsize == 0 && packetlen > MINCLSIZE) {
4017 /*
4018 * Caller didn't request that all of the segments
4019  * in the chain use the same cluster size; use the
4020  * smallest cluster size that can hold the residual data.
4021 */
4022 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4023 r_bufsize = m_maxsize(MC_16KCL);
4024 else if (resid > m_maxsize(MC_CL))
4025 r_bufsize = m_maxsize(MC_BIGCL);
4026 else
4027 r_bufsize = m_maxsize(MC_CL);
4028 } else {
4029 /* Use the same cluster size as the other segments */
4030 resid = 0;
4031 }
4032 }
4033
4034 needed = *numlist;
4035 if (resid > 0) {
4036 /*
4037 * Attempt to allocate composite mbuf + cluster elements for
4038 * the residual data in each chain; record the number of such
4039 * elements that can be allocated so that we know how many
4040 * segment chains we can afford to create.
4041 */
4042 if (r_bufsize <= m_maxsize(MC_CL))
4043 rcp = m_cache(MC_MBUF_CL);
4044 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4045 rcp = m_cache(MC_MBUF_BIGCL);
4046 else
4047 rcp = m_cache(MC_MBUF_16KCL);
4048 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4049
4050 if (needed == 0)
4051 goto fail;
4052
4053 /* This is temporarily reduced for calculation */
4054 ASSERT(nsegs > 1);
4055 nsegs--;
4056 }
4057
4058 /*
4059 * Attempt to allocate the rest of the composite mbuf + cluster
4060 * elements for the number of segment chains that we need.
4061 */
4062 if (bufsize <= m_maxsize(MC_CL))
4063 cp = m_cache(MC_MBUF_CL);
4064 else if (bufsize <= m_maxsize(MC_BIGCL))
4065 cp = m_cache(MC_MBUF_BIGCL);
4066 else
4067 cp = m_cache(MC_MBUF_16KCL);
4068 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4069
4070 /* Round it down to avoid creating a partial segment chain */
4071 needed = (needed / nsegs) * nsegs;
4072 if (needed == 0)
4073 goto fail;
4074
4075 if (resid > 0) {
4076 /*
4077 * We're about to construct the chain(s); take into account
4078 * the number of segments we have created above to hold the
4079 * residual data for each chain, as well as restore the
4080 * original count of segments per chain.
4081 */
4082 ASSERT(nsegs > 0);
4083 needed += needed / nsegs;
4084 nsegs++;
4085 }
4086
4087 for (;;) {
4088 struct mbuf *m;
4089 u_int32_t flag;
4090 struct ext_ref *rfa;
4091 void *cl;
4092 int pkthdr;
4093
4094 ++num;
4095 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4096 m = (struct mbuf *)mp_list;
4097 mp_list = mp_list->obj_next;
4098 } else {
4099 m = (struct mbuf *)rmp_list;
4100 rmp_list = rmp_list->obj_next;
4101 }
4102 ASSERT(m != NULL);
4103 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4104 VERIFY(m->m_ext.ext_free == NULL ||
4105 m->m_ext.ext_free == m_bigfree ||
4106 m->m_ext.ext_free == m_16kfree);
4107
4108 cl = m->m_ext.ext_buf;
4109 rfa = MEXT_RFA(m);
4110
4111 ASSERT(cl != NULL && rfa != NULL);
4112 VERIFY(MBUF_IS_COMPOSITE(m));
4113
4114 flag = MEXT_FLAGS(m);
4115
4116 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4117 if (pkthdr)
4118 first = m;
4119 MBUF_INIT(m, pkthdr, MT_DATA);
4120 if (m->m_ext.ext_free == m_16kfree) {
4121 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4122 } else if (m->m_ext.ext_free == m_bigfree) {
4123 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4124 } else {
4125 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4126 }
4127 #if CONFIG_MACF_NET
4128 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4129 --num;
4130 m_freem(m);
4131 break;
4132 }
4133 #endif /* MAC_NET */
4134
4135 *np = m;
4136 if ((num % nsegs) == 0)
4137 np = &first->m_nextpkt;
4138 else
4139 np = &m->m_next;
4140
4141 if (num == needed)
4142 break;
4143 }
4144
4145 if (num > 0) {
4146 mtype_stat_add(MT_DATA, num);
4147 mtype_stat_sub(MT_FREE, num);
4148 }
4149
4150 num /= nsegs;
4151
4152 /* We've got them all; return to caller */
4153 if (num == *numlist) {
4154 ASSERT(mp_list == NULL && rmp_list == NULL);
4155 return (top);
4156 }
4157
4158 fail:
4159 /* Free up what's left of the above */
4160 if (mp_list != NULL)
4161 mcache_free_ext(cp, mp_list);
4162 if (rmp_list != NULL)
4163 mcache_free_ext(rcp, rmp_list);
4164 if (wantall && top != NULL) {
4165 m_freem(top);
4166 return (NULL);
4167 }
4168 *numlist = num;
4169 return (top);
4170 }
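
/*
 * Illustrative sketch (not from the original source): requesting a batch of
 * 9KB packets through m_allocpacket_internal(), letting the routine pick the
 * cluster size and capping each chain at four segments. The byte counts are
 * arbitrary example values.
 *
 *	unsigned int npkt = 8, nseg = 4;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&npkt, 9000, &nseg, M_DONTWAIT, 0, 0);
 *	 on success, npkt holds how many chains were built and nseg the
 *	 segments per chain; if the cap cannot be met, NULL is returned
 *	 with nseg set to the number of segments that would be required
 */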
4171
4172 /*
4173 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4174 * packets on the receive ring.
4175 */
4176 __private_extern__ struct mbuf *
4177 m_getpacket_how(int wait)
4178 {
4179 unsigned int num_needed = 1;
4180
4181 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4182 m_maxsize(MC_CL)));
4183 }
4184
4185 /*
4186 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4187 * packets on the receive ring.
4188 */
4189 struct mbuf *
4190 m_getpacket(void)
4191 {
4192 unsigned int num_needed = 1;
4193
4194 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4195 m_maxsize(MC_CL)));
4196 }
4197
4198 /*
4199 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4200 * if this can't be met, return however many are available. Set up the
4201 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4202 * are chained on the m_nextpkt field. Any packets requested beyond this are
4203 * chained onto the last packet header's m_next field.
4204 */
4205 struct mbuf *
4206 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4207 {
4208 unsigned int n = num_needed;
4209
4210 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4211 m_maxsize(MC_CL)));
4212 }
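
/*
 * Illustrative sketch (not from the original source): a driver pre-filling
 * part of a receive ring with cluster-backed packets via m_getpackets().
 * drv_post_rx_buffer() and sc are hypothetical driver helpers.
 *
 *	struct mbuf *pkts, *m;
 *	int want = 32;
 *
 *	pkts = m_getpackets(want, want, M_DONTWAIT);
 *	while (pkts != NULL) {
 *		m = pkts;
 *		pkts = pkts->m_nextpkt;
 *		m->m_nextpkt = NULL;
 *		drv_post_rx_buffer(sc, m);
 *	}
 */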
4213
4214 /*
4215 * Return a list of mbuf hdrs set up as packet hdrs chained together
4216 * on the m_nextpkt field.
4217 */
4218 struct mbuf *
4219 m_getpackethdrs(int num_needed, int how)
4220 {
4221 struct mbuf *m;
4222 struct mbuf **np, *top;
4223
4224 top = NULL;
4225 np = &top;
4226
4227 while (num_needed--) {
4228 m = _M_RETRYHDR(how, MT_DATA);
4229 if (m == NULL)
4230 break;
4231
4232 *np = m;
4233 np = &m->m_nextpkt;
4234 }
4235
4236 return (top);
4237 }
4238
4239 /*
4240 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4241 * of packets freed. Used by the drivers.
4242 */
4243 int
4244 m_freem_list(struct mbuf *m)
4245 {
4246 struct mbuf *nextpkt;
4247 mcache_obj_t *mp_list = NULL;
4248 mcache_obj_t *mcl_list = NULL;
4249 mcache_obj_t *mbc_list = NULL;
4250 mcache_obj_t *m16k_list = NULL;
4251 mcache_obj_t *m_mcl_list = NULL;
4252 mcache_obj_t *m_mbc_list = NULL;
4253 mcache_obj_t *m_m16k_list = NULL;
4254 mcache_obj_t *ref_list = NULL;
4255 int pktcount = 0;
4256 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4257
4258 while (m != NULL) {
4259 pktcount++;
4260
4261 nextpkt = m->m_nextpkt;
4262 m->m_nextpkt = NULL;
4263
4264 while (m != NULL) {
4265 struct mbuf *next = m->m_next;
4266 mcache_obj_t *o, *rfa;
4267 u_int32_t refcnt, composite;
4268
4269 if (m->m_type == MT_FREE)
4270 panic("m_free: freeing an already freed mbuf");
4271
4272 if (m->m_type != MT_FREE)
4273 mt_free++;
4274
4275 if (m->m_flags & M_PKTHDR) {
4276 /* Check for scratch area overflow */
4277 m_redzone_verify(m);
4278 /* Free the aux data and tags if there is any */
4279 m_tag_delete_chain(m, NULL);
4280 }
4281
4282 if (!(m->m_flags & M_EXT))
4283 goto simple_free;
4284
4285 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4286 refcnt = m_decref(m);
4287 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4288 if (refcnt == 0 && !composite) {
4289 if (m->m_ext.ext_free == NULL) {
4290 o->obj_next = mcl_list;
4291 mcl_list = o;
4292 } else if (m->m_ext.ext_free == m_bigfree) {
4293 o->obj_next = mbc_list;
4294 mbc_list = o;
4295 } else if (m->m_ext.ext_free == m_16kfree) {
4296 o->obj_next = m16k_list;
4297 m16k_list = o;
4298 } else {
4299 (*(m->m_ext.ext_free))((caddr_t)o,
4300 m->m_ext.ext_size,
4301 m->m_ext.ext_arg);
4302 }
4303 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4304 rfa->obj_next = ref_list;
4305 ref_list = rfa;
4306 MEXT_RFA(m) = NULL;
4307 } else if (refcnt == 0 && composite) {
4308 VERIFY(m->m_type != MT_FREE);
4309 /*
4310 * Amortize the costs of atomic operations
4311 * by doing them at the end, if possible.
4312 */
4313 if (m->m_type == MT_DATA)
4314 mt_data++;
4315 else if (m->m_type == MT_HEADER)
4316 mt_header++;
4317 else if (m->m_type == MT_SONAME)
4318 mt_soname++;
4319 else if (m->m_type == MT_TAG)
4320 mt_tag++;
4321 else
4322 mtype_stat_dec(m->m_type);
4323
4324 m->m_type = MT_FREE;
4325 m->m_flags = M_EXT;
4326 m->m_len = 0;
4327 m->m_next = m->m_nextpkt = NULL;
4328
4329 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4330
4331 /* "Free" into the intermediate cache */
4332 o = (mcache_obj_t *)m;
4333 if (m->m_ext.ext_free == NULL) {
4334 o->obj_next = m_mcl_list;
4335 m_mcl_list = o;
4336 } else if (m->m_ext.ext_free == m_bigfree) {
4337 o->obj_next = m_mbc_list;
4338 m_mbc_list = o;
4339 } else {
4340 VERIFY(m->m_ext.ext_free == m_16kfree);
4341 o->obj_next = m_m16k_list;
4342 m_m16k_list = o;
4343 }
4344 m = next;
4345 continue;
4346 }
4347 simple_free:
4348 /*
4349 * Amortize the costs of atomic operations
4350 * by doing them at the end, if possible.
4351 */
4352 if (m->m_type == MT_DATA)
4353 mt_data++;
4354 else if (m->m_type == MT_HEADER)
4355 mt_header++;
4356 else if (m->m_type == MT_SONAME)
4357 mt_soname++;
4358 else if (m->m_type == MT_TAG)
4359 mt_tag++;
4360 else if (m->m_type != MT_FREE)
4361 mtype_stat_dec(m->m_type);
4362
4363 m->m_type = MT_FREE;
4364 m->m_flags = m->m_len = 0;
4365 m->m_next = m->m_nextpkt = NULL;
4366
4367 ((mcache_obj_t *)m)->obj_next = mp_list;
4368 mp_list = (mcache_obj_t *)m;
4369
4370 m = next;
4371 }
4372
4373 m = nextpkt;
4374 }
4375
4376 if (mt_free > 0)
4377 mtype_stat_add(MT_FREE, mt_free);
4378 if (mt_data > 0)
4379 mtype_stat_sub(MT_DATA, mt_data);
4380 if (mt_header > 0)
4381 mtype_stat_sub(MT_HEADER, mt_header);
4382 if (mt_soname > 0)
4383 mtype_stat_sub(MT_SONAME, mt_soname);
4384 if (mt_tag > 0)
4385 mtype_stat_sub(MT_TAG, mt_tag);
4386
4387 if (mp_list != NULL)
4388 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4389 if (mcl_list != NULL)
4390 mcache_free_ext(m_cache(MC_CL), mcl_list);
4391 if (mbc_list != NULL)
4392 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4393 if (m16k_list != NULL)
4394 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4395 if (m_mcl_list != NULL)
4396 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4397 if (m_mbc_list != NULL)
4398 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4399 if (m_m16k_list != NULL)
4400 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4401 if (ref_list != NULL)
4402 mcache_free_ext(ref_cache, ref_list);
4403
4404 return (pktcount);
4405 }
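
/*
 * Illustrative sketch (not from the original source): freeing a completed
 * transmit queue in one call and noting how many packets were released.
 * done_head and the tx_completed counter are hypothetical.
 *
 *	int freed;
 *
 *	freed = m_freem_list(done_head);
 *	done_head = NULL;
 *	sc->tx_completed += freed;
 */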
4406
4407 void
4408 m_freem(struct mbuf *m)
4409 {
4410 while (m != NULL)
4411 m = m_free(m);
4412 }
4413
4414 /*
4415 * Mbuffer utility routines.
4416 */
4417
4418 /*
4419 * Compute the amount of space available before the current start
4420 * of data in an mbuf.
4421 */
4422 int
4423 m_leadingspace(struct mbuf *m)
4424 {
4425 if (m->m_flags & M_EXT) {
4426 if (MCLHASREFERENCE(m))
4427 return (0);
4428 return (m->m_data - m->m_ext.ext_buf);
4429 }
4430 if (m->m_flags & M_PKTHDR)
4431 return (m->m_data - m->m_pktdat);
4432 return (m->m_data - m->m_dat);
4433 }
4434
4435 /*
4436 * Compute the amount of space available after the end of data in an mbuf.
4437 */
4438 int
4439 m_trailingspace(struct mbuf *m)
4440 {
4441 if (m->m_flags & M_EXT) {
4442 if (MCLHASREFERENCE(m))
4443 return (0);
4444 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4445 (m->m_data + m->m_len));
4446 }
4447 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4448 }
4449
4450 /*
4451 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4452 * copy junk along. Does not adjust packet header length.
4453 */
4454 struct mbuf *
4455 m_prepend(struct mbuf *m, int len, int how)
4456 {
4457 struct mbuf *mn;
4458
4459 _MGET(mn, how, m->m_type);
4460 if (mn == NULL) {
4461 m_freem(m);
4462 return (NULL);
4463 }
4464 if (m->m_flags & M_PKTHDR) {
4465 M_COPY_PKTHDR(mn, m);
4466 m->m_flags &= ~M_PKTHDR;
4467 }
4468 mn->m_next = m;
4469 m = mn;
4470 if (len < MHLEN)
4471 MH_ALIGN(m, len);
4472 m->m_len = len;
4473 return (m);
4474 }
4475
4476 /*
4477 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4478 * chain, copy junk along, and adjust length.
4479 */
4480 struct mbuf *
4481 m_prepend_2(struct mbuf *m, int len, int how)
4482 {
4483 if (M_LEADINGSPACE(m) >= len) {
4484 m->m_data -= len;
4485 m->m_len += len;
4486 } else {
4487 m = m_prepend(m, len, how);
4488 }
4489 if ((m) && (m->m_flags & M_PKTHDR))
4490 m->m_pkthdr.len += len;
4491 return (m);
4492 }
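
/*
 * Illustrative sketch (not from the original source): prepending a 14-byte
 * Ethernet header with m_prepend_2(), which grows in place when there is
 * leading space and falls back to m_prepend() otherwise. ETHER_HDR_LEN is
 * assumed from <net/ethernet.h>; eh is a hypothetical header template.
 *
 *	m = m_prepend_2(m, ETHER_HDR_LEN, M_DONTWAIT);
 *	if (m == NULL)
 *		return (ENOBUFS);	 original chain already freed
 *	bcopy(&eh, mtod(m, caddr_t), ETHER_HDR_LEN);
 */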
4493
4494 /*
4495 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4496 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4497 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4498 */
4499 int MCFail;
4500
4501 struct mbuf *
4502 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4503 {
4504 struct mbuf *n, *mhdr = NULL, **np;
4505 int off = off0;
4506 struct mbuf *top;
4507 int copyhdr = 0;
4508
4509 if (off < 0 || len < 0)
4510 panic("m_copym: invalid offset %d or len %d", off, len);
4511
4512 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4513 mhdr = m;
4514 copyhdr = 1;
4515 }
4516
4517 while (off >= m->m_len) {
4518 if (m->m_next == NULL)
4519 panic("m_copym: invalid mbuf chain");
4520 off -= m->m_len;
4521 m = m->m_next;
4522 }
4523 np = &top;
4524 top = NULL;
4525
4526 while (len > 0) {
4527 if (m == NULL) {
4528 if (len != M_COPYALL)
4529 panic("m_copym: len != M_COPYALL");
4530 break;
4531 }
4532
4533 n = _M_RETRY(wait, m->m_type);
4534 *np = n;
4535
4536 if (n == NULL)
4537 goto nospace;
4538
4539 if (copyhdr != 0) {
4540 if (mode == M_COPYM_MOVE_HDR) {
4541 M_COPY_PKTHDR(n, mhdr);
4542 } else if (mode == M_COPYM_COPY_HDR) {
4543 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4544 goto nospace;
4545 }
4546 if (len == M_COPYALL)
4547 n->m_pkthdr.len -= off0;
4548 else
4549 n->m_pkthdr.len = len;
4550 copyhdr = 0;
4551 }
4552 if (len == M_COPYALL) {
4553 if (MIN(len, (m->m_len - off)) == len) {
4554 printf("m->m_len %d - off %d = %d, %d\n",
4555 m->m_len, off, m->m_len - off,
4556 MIN(len, (m->m_len - off)));
4557 }
4558 }
4559 n->m_len = MIN(len, (m->m_len - off));
4560 if (n->m_len == M_COPYALL) {
4561 printf("n->m_len == M_COPYALL, fixing\n");
4562 n->m_len = MHLEN;
4563 }
4564 if (m->m_flags & M_EXT) {
4565 n->m_ext = m->m_ext;
4566 m_incref(m);
4567 n->m_data = m->m_data + off;
4568 n->m_flags |= M_EXT;
4569 } else {
4570 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4571 (unsigned)n->m_len);
4572 }
4573 if (len != M_COPYALL)
4574 len -= n->m_len;
4575 off = 0;
4576 m = m->m_next;
4577 np = &n->m_next;
4578 }
4579
4580 if (top == NULL)
4581 MCFail++;
4582
4583 return (top);
4584 nospace:
4585
4586 m_freem(top);
4587 MCFail++;
4588 return (NULL);
4589 }
4590
4591
4592 struct mbuf *
4593 m_copym(struct mbuf *m, int off0, int len, int wait)
4594 {
4595 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4596 }
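
/*
 * Illustrative sketch (not from the original source): taking a reference-
 * counted copy of an entire packet, e.g. before handing the original to a
 * consumer that will modify or free it. Cluster data is shared, not copied.
 *
 *	struct mbuf *n;
 *
 *	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *	if (n == NULL)
 *		return (ENOBUFS);
 */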
4597
4598 /*
4599 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4600 * within this routine. Also, the last mbuf and offset accessed are passed
4601 * out and can be passed back in to avoid having to rescan the entire mbuf
4602 * list (normally hung off of the socket).
4603 */
4604 struct mbuf *
4605 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4606 struct mbuf **m_lastm, int *m_off, uint32_t mode)
4607 {
4608 struct mbuf *n, **np = NULL;
4609 int off = off0, len = len0;
4610 struct mbuf *top = NULL;
4611 int mcflags = MSLEEPF(wait);
4612 int copyhdr = 0;
4613 int type = 0;
4614 mcache_obj_t *list = NULL;
4615 int needed = 0;
4616
4617 if (off == 0 && (m->m_flags & M_PKTHDR))
4618 copyhdr = 1;
4619
4620 if (*m_lastm != NULL) {
4621 m = *m_lastm;
4622 off = *m_off;
4623 } else {
4624 while (off >= m->m_len) {
4625 off -= m->m_len;
4626 m = m->m_next;
4627 }
4628 }
4629
4630 n = m;
4631 while (len > 0) {
4632 needed++;
4633 ASSERT(n != NULL);
4634 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4635 n = n->m_next;
4636 }
4637 needed++;
4638 len = len0;
4639
4640 /*
4641 * If the caller doesn't want to be put to sleep, mark it with
4642 * MCR_TRYHARD so that we may reclaim buffers from other places
4643 * before giving up.
4644 */
4645 if (mcflags & MCR_NOSLEEP)
4646 mcflags |= MCR_TRYHARD;
4647
4648 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4649 mcflags) != needed)
4650 goto nospace;
4651
4652 needed = 0;
4653 while (len > 0) {
4654 n = (struct mbuf *)list;
4655 list = list->obj_next;
4656 ASSERT(n != NULL && m != NULL);
4657
4658 type = (top == NULL) ? MT_HEADER : m->m_type;
4659 MBUF_INIT(n, (top == NULL), type);
4660 #if CONFIG_MACF_NET
4661 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4662 mtype_stat_inc(MT_HEADER);
4663 mtype_stat_dec(MT_FREE);
4664 m_free(n);
4665 goto nospace;
4666 }
4667 #endif /* MAC_NET */
4668
4669 if (top == NULL) {
4670 top = n;
4671 np = &top->m_next;
4672 continue;
4673 } else {
4674 needed++;
4675 *np = n;
4676 }
4677
4678 if (copyhdr) {
4679 if (mode == M_COPYM_MOVE_HDR) {
4680 M_COPY_PKTHDR(n, m);
4681 } else if (mode == M_COPYM_COPY_HDR) {
4682 if (m_dup_pkthdr(n, m, wait) == 0)
4683 goto nospace;
4684 }
4685 n->m_pkthdr.len = len;
4686 copyhdr = 0;
4687 }
4688 n->m_len = MIN(len, (m->m_len - off));
4689
4690 if (m->m_flags & M_EXT) {
4691 n->m_ext = m->m_ext;
4692 m_incref(m);
4693 n->m_data = m->m_data + off;
4694 n->m_flags |= M_EXT;
4695 } else {
4696 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4697 (unsigned)n->m_len);
4698 }
4699 len -= n->m_len;
4700
4701 if (len == 0) {
4702 if ((off + n->m_len) == m->m_len) {
4703 *m_lastm = m->m_next;
4704 *m_off = 0;
4705 } else {
4706 *m_lastm = m;
4707 *m_off = off + n->m_len;
4708 }
4709 break;
4710 }
4711 off = 0;
4712 m = m->m_next;
4713 np = &n->m_next;
4714 }
4715
4716 mtype_stat_inc(MT_HEADER);
4717 mtype_stat_add(type, needed);
4718 mtype_stat_sub(MT_FREE, needed + 1);
4719
4720 ASSERT(list == NULL);
4721 return (top);
4722
4723 nospace:
4724 if (list != NULL)
4725 mcache_free_ext(m_cache(MC_MBUF), list);
4726 if (top != NULL)
4727 m_freem(top);
4728 MCFail++;
4729 return (NULL);
4730 }
4731
4732 /*
4733 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4734 * continuing for "len" bytes, into the indicated buffer.
4735 */
4736 void
4737 m_copydata(struct mbuf *m, int off, int len, void *vp)
4738 {
4739 unsigned count;
4740 char *cp = vp;
4741
4742 if (off < 0 || len < 0)
4743 panic("m_copydata: invalid offset %d or len %d", off, len);
4744
4745 while (off > 0) {
4746 if (m == NULL)
4747 panic("m_copydata: invalid mbuf chain");
4748 if (off < m->m_len)
4749 break;
4750 off -= m->m_len;
4751 m = m->m_next;
4752 }
4753 while (len > 0) {
4754 if (m == NULL)
4755 panic("m_copydata: invalid mbuf chain");
4756 count = MIN(m->m_len - off, len);
4757 bcopy(MTOD(m, caddr_t) + off, cp, count);
4758 len -= count;
4759 cp += count;
4760 off = 0;
4761 m = m->m_next;
4762 }
4763 }
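
/*
 * Illustrative sketch (not from the original source): pulling a fixed-size
 * header out of a possibly fragmented chain into a stack buffer, without
 * modifying the chain. The IP header and offset 0 are example choices.
 *
 *	struct ip iph;
 *
 *	if (m->m_pkthdr.len < (int)sizeof (iph))
 *		return (EINVAL);
 *	m_copydata(m, 0, sizeof (iph), &iph);
 */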
4764
4765 /*
4766 * Concatenate mbuf chain n to m. Both chains must be of the same type
4767 * (e.g. MT_DATA). The m_pkthdr, if present, is not updated.
4768 */
4769 void
4770 m_cat(struct mbuf *m, struct mbuf *n)
4771 {
4772 while (m->m_next)
4773 m = m->m_next;
4774 while (n) {
4775 if ((m->m_flags & M_EXT) ||
4776 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4777 /* just join the two chains */
4778 m->m_next = n;
4779 return;
4780 }
4781 /* splat the data from one into the other */
4782 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4783 (u_int)n->m_len);
4784 m->m_len += n->m_len;
4785 n = m_free(n);
4786 }
4787 }
4788
4789 void
4790 m_adj(struct mbuf *mp, int req_len)
4791 {
4792 int len = req_len;
4793 struct mbuf *m;
4794 int count;
4795
4796 if ((m = mp) == NULL)
4797 return;
4798 if (len >= 0) {
4799 /*
4800 * Trim from head.
4801 */
4802 while (m != NULL && len > 0) {
4803 if (m->m_len <= len) {
4804 len -= m->m_len;
4805 m->m_len = 0;
4806 m = m->m_next;
4807 } else {
4808 m->m_len -= len;
4809 m->m_data += len;
4810 len = 0;
4811 }
4812 }
4813 m = mp;
4814 if (m->m_flags & M_PKTHDR)
4815 m->m_pkthdr.len -= (req_len - len);
4816 } else {
4817 /*
4818 * Trim from tail. Scan the mbuf chain,
4819 * calculating its length and finding the last mbuf.
4820 * If the adjustment only affects this mbuf, then just
4821 * adjust and return. Otherwise, rescan and truncate
4822 * after the remaining size.
4823 */
4824 len = -len;
4825 count = 0;
4826 for (;;) {
4827 count += m->m_len;
4828 if (m->m_next == (struct mbuf *)0)
4829 break;
4830 m = m->m_next;
4831 }
4832 if (m->m_len >= len) {
4833 m->m_len -= len;
4834 m = mp;
4835 if (m->m_flags & M_PKTHDR)
4836 m->m_pkthdr.len -= len;
4837 return;
4838 }
4839 count -= len;
4840 if (count < 0)
4841 count = 0;
4842 /*
4843 * Correct length for chain is "count".
4844 * Find the mbuf with last data, adjust its length,
4845 * and toss data from remaining mbufs on chain.
4846 */
4847 m = mp;
4848 if (m->m_flags & M_PKTHDR)
4849 m->m_pkthdr.len = count;
4850 for (; m; m = m->m_next) {
4851 if (m->m_len >= count) {
4852 m->m_len = count;
4853 break;
4854 }
4855 count -= m->m_len;
4856 }
4857 while ((m = m->m_next))
4858 m->m_len = 0;
4859 }
4860 }
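
/*
 * Illustrative sketch (not from the original source): trimming a received
 * frame with m_adj(); a positive length trims from the head, a negative
 * one from the tail. The Ethernet constants are assumed from
 * <net/ethernet.h>.
 *
 *	m_adj(m, ETHER_HDR_LEN);	 strip the link-layer header
 *	m_adj(m, -ETHER_CRC_LEN);	 strip the trailing FCS
 */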
4861
4862 /*
4863 * Rearrange an mbuf chain so that len bytes are contiguous
4864 * and in the data area of an mbuf (so that mtod and dtom
4865 * will work for a structure of size len). Returns the resulting
4866 * mbuf chain on success, frees it and returns null on failure.
4867 * If there is room, it will add up to max_protohdr-len extra bytes to the
4868 * contiguous region in an attempt to avoid being called next time.
4869 */
4870 int MPFail;
4871
4872 struct mbuf *
4873 m_pullup(struct mbuf *n, int len)
4874 {
4875 struct mbuf *m;
4876 int count;
4877 int space;
4878
4879 /*
4880 * If first mbuf has no cluster, and has room for len bytes
4881 * without shifting current data, pullup into it,
4882 * otherwise allocate a new mbuf to prepend to the chain.
4883 */
4884 if ((n->m_flags & M_EXT) == 0 &&
4885 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4886 if (n->m_len >= len)
4887 return (n);
4888 m = n;
4889 n = n->m_next;
4890 len -= m->m_len;
4891 } else {
4892 if (len > MHLEN)
4893 goto bad;
4894 _MGET(m, M_DONTWAIT, n->m_type);
4895 if (m == 0)
4896 goto bad;
4897 m->m_len = 0;
4898 if (n->m_flags & M_PKTHDR) {
4899 M_COPY_PKTHDR(m, n);
4900 n->m_flags &= ~M_PKTHDR;
4901 }
4902 }
4903 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4904 do {
4905 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4906 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4907 (unsigned)count);
4908 len -= count;
4909 m->m_len += count;
4910 n->m_len -= count;
4911 space -= count;
4912 if (n->m_len)
4913 n->m_data += count;
4914 else
4915 n = m_free(n);
4916 } while (len > 0 && n);
4917 if (len > 0) {
4918 (void) m_free(m);
4919 goto bad;
4920 }
4921 m->m_next = n;
4922 return (m);
4923 bad:
4924 m_freem(n);
4925 MPFail++;
4926 return (0);
4927 }
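
/*
 * Illustrative sketch (not from the original source): the classic m_pullup()
 * idiom used before casting m_data to a header structure; the IP header is
 * an example choice.
 *
 *	if (m->m_len < (int)sizeof (struct ip) &&
 *	    (m = m_pullup(m, sizeof (struct ip))) == NULL)
 *		return;		 chain was freed by m_pullup()
 *	ip = mtod(m, struct ip *);
 */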
4928
4929 /*
4930 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4931 * the amount of empty space before the data in the new mbuf to be specified
4932 * (in the event that the caller expects to prepend later).
4933 */
4934 __private_extern__ int MSFail = 0;
4935
4936 __private_extern__ struct mbuf *
4937 m_copyup(struct mbuf *n, int len, int dstoff)
4938 {
4939 struct mbuf *m;
4940 int count, space;
4941
4942 if (len > (MHLEN - dstoff))
4943 goto bad;
4944 MGET(m, M_DONTWAIT, n->m_type);
4945 if (m == NULL)
4946 goto bad;
4947 m->m_len = 0;
4948 if (n->m_flags & M_PKTHDR) {
4949 m_copy_pkthdr(m, n);
4950 n->m_flags &= ~M_PKTHDR;
4951 }
4952 m->m_data += dstoff;
4953 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4954 do {
4955 count = min(min(max(len, max_protohdr), space), n->m_len);
4956 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4957 (unsigned)count);
4958 len -= count;
4959 m->m_len += count;
4960 n->m_len -= count;
4961 space -= count;
4962 if (n->m_len)
4963 n->m_data += count;
4964 else
4965 n = m_free(n);
4966 } while (len > 0 && n);
4967 if (len > 0) {
4968 (void) m_free(m);
4969 goto bad;
4970 }
4971 m->m_next = n;
4972 return (m);
4973 bad:
4974 m_freem(n);
4975 MSFail++;
4976 return (NULL);
4977 }
4978
4979 /*
4980 * Partition an mbuf chain in two pieces, returning the tail --
4981 * all but the first len0 bytes. In case of failure, it returns NULL and
4982 * attempts to restore the chain to its original state.
4983 */
4984 struct mbuf *
4985 m_split(struct mbuf *m0, int len0, int wait)
4986 {
4987 return (m_split0(m0, len0, wait, 1));
4988 }
4989
4990 static struct mbuf *
4991 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4992 {
4993 struct mbuf *m, *n;
4994 unsigned len = len0, remain;
4995
4996 for (m = m0; m && len > m->m_len; m = m->m_next)
4997 len -= m->m_len;
4998 if (m == NULL)
4999 return (NULL);
5000 remain = m->m_len - len;
5001 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5002 _MGETHDR(n, wait, m0->m_type);
5003 if (n == NULL)
5004 return (NULL);
5005 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5006 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5007 m0->m_pkthdr.len = len0;
5008 if (m->m_flags & M_EXT)
5009 goto extpacket;
5010 if (remain > MHLEN) {
5011 /* m can't be the lead packet */
5012 MH_ALIGN(n, 0);
5013 n->m_next = m_split(m, len, wait);
5014 if (n->m_next == NULL) {
5015 (void) m_free(n);
5016 return (NULL);
5017 } else
5018 return (n);
5019 } else
5020 MH_ALIGN(n, remain);
5021 } else if (remain == 0) {
5022 n = m->m_next;
5023 m->m_next = NULL;
5024 return (n);
5025 } else {
5026 _MGET(n, wait, m->m_type);
5027 if (n == NULL)
5028 return (NULL);
5029 M_ALIGN(n, remain);
5030 }
5031 extpacket:
5032 if (m->m_flags & M_EXT) {
5033 n->m_flags |= M_EXT;
5034 n->m_ext = m->m_ext;
5035 m_incref(m);
5036 n->m_data = m->m_data + len;
5037 } else {
5038 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5039 }
5040 n->m_len = remain;
5041 m->m_len = len;
5042 n->m_next = m->m_next;
5043 m->m_next = NULL;
5044 return (n);
5045 }
5046
5047 /*
5048 * Routine to copy from device local memory into mbufs.
5049 */
5050 struct mbuf *
5051 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5052 void (*copy)(const void *, void *, size_t))
5053 {
5054 struct mbuf *m;
5055 struct mbuf *top = NULL, **mp = &top;
5056 int off = off0, len;
5057 char *cp;
5058 char *epkt;
5059
5060 cp = buf;
5061 epkt = cp + totlen;
5062 if (off) {
5063 /*
5064 * If 'off' is non-zero, packet is trailer-encapsulated,
5065 * so we have to skip the type and length fields.
5066 */
5067 cp += off + 2 * sizeof (u_int16_t);
5068 totlen -= 2 * sizeof (u_int16_t);
5069 }
5070 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5071 if (m == NULL)
5072 return (NULL);
5073 m->m_pkthdr.rcvif = ifp;
5074 m->m_pkthdr.len = totlen;
5075 m->m_len = MHLEN;
5076
5077 while (totlen > 0) {
5078 if (top != NULL) {
5079 _MGET(m, M_DONTWAIT, MT_DATA);
5080 if (m == NULL) {
5081 m_freem(top);
5082 return (NULL);
5083 }
5084 m->m_len = MLEN;
5085 }
5086 len = MIN(totlen, epkt - cp);
5087 if (len >= MINCLSIZE) {
5088 MCLGET(m, M_DONTWAIT);
5089 if (m->m_flags & M_EXT) {
5090 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5091 } else {
5092 /* give up when it's out of cluster mbufs */
5093 if (top != NULL)
5094 m_freem(top);
5095 m_freem(m);
5096 return (NULL);
5097 }
5098 } else {
5099 /*
5100 * Place initial small packet/header at end of mbuf.
5101 */
5102 if (len < m->m_len) {
5103 if (top == NULL &&
5104 len + max_linkhdr <= m->m_len)
5105 m->m_data += max_linkhdr;
5106 m->m_len = len;
5107 } else {
5108 len = m->m_len;
5109 }
5110 }
5111 if (copy)
5112 copy(cp, MTOD(m, caddr_t), (unsigned)len);
5113 else
5114 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5115 cp += len;
5116 *mp = m;
5117 mp = &m->m_next;
5118 totlen -= len;
5119 if (cp == epkt)
5120 cp = buf;
5121 }
5122 return (top);
5123 }
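
/*
 * Illustrative sketch (not from the original source): a driver copying a
 * received frame out of device-local memory into a fresh chain. Passing a
 * zero offset and a NULL copy routine makes m_devget() fall back to bcopy().
 * rxdesc and the hand-off to the stack are hypothetical.
 *
 *	struct mbuf *m;
 *
 *	m = m_devget((char *)rxdesc->buf, rxdesc->len, 0, ifp, NULL);
 *	if (m != NULL)
 *		drv_input(ifp, m);
 */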
5124
5125 #ifndef MBUF_GROWTH_NORMAL_THRESH
5126 #define MBUF_GROWTH_NORMAL_THRESH 25
5127 #endif
5128
5129 /*
5130 * Cluster freelist allocation check.
5131 */
5132 static int
5133 m_howmany(int num, size_t bufsize)
5134 {
5135 int i = 0, j = 0;
5136 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5137 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5138 u_int32_t sumclusters, freeclusters;
5139 u_int32_t percent_pool, percent_kmem;
5140 u_int32_t mb_growth, mb_growth_thresh;
5141
5142 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5143 bufsize == m_maxsize(MC_16KCL));
5144
5145 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5146
5147 /* Numbers in 2K cluster units */
5148 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5149 m_clusters = m_total(MC_CL);
5150 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5151 m_16kclusters = m_total(MC_16KCL);
5152 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5153
5154 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5155 m_clfree = m_infree(MC_CL);
5156 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5157 m_16kclfree = m_infree(MC_16KCL);
5158 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5159
5160 /* Bail if we've maxed out the mbuf memory map */
5161 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5162 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5163 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5164 return (0);
5165 }
5166
5167 if (bufsize == m_maxsize(MC_BIGCL)) {
5168 /* Under minimum */
5169 if (m_bigclusters < m_minlimit(MC_BIGCL))
5170 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5171
5172 percent_pool =
5173 ((sumclusters - freeclusters) * 100) / sumclusters;
5174 percent_kmem = (sumclusters * 100) / nclusters;
5175
5176 /*
5177 * If a light/normal user, grow conservatively (75%)
5178 * If a heavy user, grow aggressively (50%)
5179 */
5180 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5181 mb_growth = MB_GROWTH_NORMAL;
5182 else
5183 mb_growth = MB_GROWTH_AGGRESSIVE;
5184
5185 if (percent_kmem < 5) {
5186 /* For initial allocations */
5187 i = num;
5188 } else {
5189 /* Return if >= MBIGCL_LOWAT clusters available */
5190 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5191 m_total(MC_BIGCL) >=
5192 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5193 return (0);
5194
5195 /* Ensure at least num clusters are accessible */
5196 if (num >= m_infree(MC_BIGCL))
5197 i = num - m_infree(MC_BIGCL);
5198 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5199 j = num - (m_total(MC_BIGCL) -
5200 m_minlimit(MC_BIGCL));
5201
5202 i = MAX(i, j);
5203
5204 /*
5205 * Grow pool if percent_pool > 75 (normal growth)
5206 * or percent_pool > 50 (aggressive growth).
5207 */
5208 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5209 if (percent_pool > mb_growth_thresh)
5210 j = ((sumclusters + num) >> mb_growth) -
5211 freeclusters;
5212 i = MAX(i, j);
5213 }
5214
5215 /* Check to ensure we didn't go over limits */
5216 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5217 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5218 if ((i << 1) + sumclusters >= nclusters)
5219 i = (nclusters - sumclusters) >> 1;
5220 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5221 VERIFY(sumclusters + (i << 1) <= nclusters);
5222
5223 } else { /* 16K CL */
5224 VERIFY(njcl > 0);
5225 /* Under minimum */
5226 if (m_16kclusters < MIN16KCL)
5227 return (MIN16KCL - m_16kclusters);
5228 if (m_16kclfree >= M16KCL_LOWAT)
5229 return (0);
5230
5231 /* Ensure at least num clusters are available */
5232 if (num >= m_16kclfree)
5233 i = num - m_16kclfree;
5234
5235 /* Always grow 16KCL pool aggressively */
5236 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5237 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5238 i = MAX(i, j);
5239
5240 /* Check to ensure we don't go over limit */
5241 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5242 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5243 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5244 }
5245 return (i);
5246 }
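
/*
 * Worked example (not from the original source; macro values assumed): the
 * threshold above is 100 - (100 / (1 << mb_growth)). If MB_GROWTH_NORMAL
 * evaluates to 2, this gives 100 - 25 = 75, matching the "grow
 * conservatively (75%)" note; a value of 1 for MB_GROWTH_AGGRESSIVE gives
 * 100 - 50 = 50.
 */
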
5247 /*
5248 * Return the number of bytes in the mbuf chain, m.
5249 */
5250 unsigned int
5251 m_length(struct mbuf *m)
5252 {
5253 struct mbuf *m0;
5254 unsigned int pktlen;
5255
5256 if (m->m_flags & M_PKTHDR)
5257 return (m->m_pkthdr.len);
5258
5259 pktlen = 0;
5260 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5261 pktlen += m0->m_len;
5262 return (pktlen);
5263 }
5264
5265 /*
5266 * Copy data from a buffer back into the indicated mbuf chain,
5267 * starting "off" bytes from the beginning, extending the mbuf
5268 * chain if necessary.
5269 */
5270 void
5271 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5272 {
5273 #if DEBUG
5274 struct mbuf *origm = m0;
5275 int error;
5276 #endif /* DEBUG */
5277
5278 if (m0 == NULL)
5279 return;
5280
5281 #if DEBUG
5282 error =
5283 #endif /* DEBUG */
5284 m_copyback0(&m0, off, len, cp,
5285 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5286
5287 #if DEBUG
5288 if (error != 0 || (m0 != NULL && origm != m0))
5289 panic("m_copyback");
5290 #endif /* DEBUG */
5291 }
5292
5293 struct mbuf *
5294 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5295 {
5296 int error;
5297
5298 /* don't support chain expansion */
5299 VERIFY(off + len <= m_length(m0));
5300
5301 error = m_copyback0(&m0, off, len, cp,
5302 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5303 if (error) {
5304 /*
5305 * no way to recover from partial success.
5306 * just free the chain.
5307 */
5308 m_freem(m0);
5309 return (NULL);
5310 }
5311 return (m0);
5312 }
5313
5314 /*
5315 * m_makewritable: ensure the specified range is writable.
5316 */
5317 int
5318 m_makewritable(struct mbuf **mp, int off, int len, int how)
5319 {
5320 int error;
5321 #if DEBUG
5322 struct mbuf *n;
5323 int origlen, reslen;
5324
5325 origlen = m_length(*mp);
5326 #endif /* DEBUG */
5327
5328 #if 0 /* M_COPYALL is large enough */
5329 if (len == M_COPYALL)
5330 len = m_length(*mp) - off; /* XXX */
5331 #endif
5332
5333 error = m_copyback0(mp, off, len, NULL,
5334 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5335
5336 #if DEBUG
5337 reslen = 0;
5338 for (n = *mp; n; n = n->m_next)
5339 reslen += n->m_len;
5340 if (origlen != reslen)
5341 panic("m_makewritable: length changed");
5342 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5343 panic("m_makewritable: inconsist");
5344 #endif /* DEBUG */
5345
5346 return (error);
5347 }
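
/*
 * Illustrative sketch (not from the original source): making a header
 * region writable before modifying it in place, as a caller such as a
 * packet filter might. hlen is a hypothetical header length, and freeing
 * the chain on error is one possible caller policy.
 *
 *	int hlen = 40;	 hypothetical header length
 *
 *	if (m_makewritable(&m, 0, hlen, M_DONTWAIT) != 0) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 *	 the first hlen bytes of the chain may now be written safely
 */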
5348
5349 static int
5350 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5351 int how)
5352 {
5353 int mlen;
5354 struct mbuf *m, *n;
5355 struct mbuf **mp;
5356 int totlen = 0;
5357 const char *cp = vp;
5358
5359 VERIFY(mp0 != NULL);
5360 VERIFY(*mp0 != NULL);
5361 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5362 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5363
5364 /*
5365 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5366 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5367 */
5368
5369 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5370
5371 mp = mp0;
5372 m = *mp;
5373 while (off > (mlen = m->m_len)) {
5374 off -= mlen;
5375 totlen += mlen;
5376 if (m->m_next == NULL) {
5377 int tspace;
5378 extend:
5379 if (!(flags & M_COPYBACK0_EXTEND))
5380 goto out;
5381
5382 /*
5383 * try to make some space at the end of "m".
5384 */
5385
5386 mlen = m->m_len;
5387 if (off + len >= MINCLSIZE &&
5388 !(m->m_flags & M_EXT) && m->m_len == 0) {
5389 MCLGET(m, how);
5390 }
5391 tspace = M_TRAILINGSPACE(m);
5392 if (tspace > 0) {
5393 tspace = MIN(tspace, off + len);
5394 VERIFY(tspace > 0);
5395 bzero(mtod(m, char *) + m->m_len,
5396 MIN(off, tspace));
5397 m->m_len += tspace;
5398 off += mlen;
5399 totlen -= mlen;
5400 continue;
5401 }
5402
5403 /*
5404 * need to allocate an mbuf.
5405 */
5406
5407 if (off + len >= MINCLSIZE) {
5408 n = m_getcl(how, m->m_type, 0);
5409 } else {
5410 n = _M_GET(how, m->m_type);
5411 }
5412 if (n == NULL) {
5413 goto out;
5414 }
5415 n->m_len = 0;
5416 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5417 bzero(mtod(n, char *), MIN(n->m_len, off));
5418 m->m_next = n;
5419 }
5420 mp = &m->m_next;
5421 m = m->m_next;
5422 }
5423 while (len > 0) {
5424 mlen = m->m_len - off;
5425 if (mlen != 0 && m_mclhasreference(m)) {
5426 char *datap;
5427 int eatlen;
5428
5429 /*
5430 * this mbuf is read-only.
5431 * allocate a new writable mbuf and try again.
5432 */
5433
5434 #if DIAGNOSTIC
5435 if (!(flags & M_COPYBACK0_COW))
5436 panic("m_copyback0: read-only");
5437 #endif /* DIAGNOSTIC */
5438
5439 /*
5440 * if we're going to write into the middle of
5441 * a mbuf, split it first.
5442 */
5443 if (off > 0 && len < mlen) {
5444 n = m_split0(m, off, how, 0);
5445 if (n == NULL)
5446 goto enobufs;
5447 m->m_next = n;
5448 mp = &m->m_next;
5449 m = n;
5450 off = 0;
5451 continue;
5452 }
5453
5454 /*
5455 * XXX TODO coalesce into the trailingspace of
5456 * the previous mbuf when possible.
5457 */
5458
5459 /*
5460 * allocate a new mbuf. copy packet header if needed.
5461 */
5462 n = _M_GET(how, m->m_type);
5463 if (n == NULL)
5464 goto enobufs;
5465 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5466 M_COPY_PKTHDR(n, m);
5467 n->m_len = MHLEN;
5468 } else {
5469 if (len >= MINCLSIZE)
5470 MCLGET(n, M_DONTWAIT);
5471 n->m_len =
5472 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5473 }
5474 if (n->m_len > len)
5475 n->m_len = len;
5476
5477 /*
5478 * free the region which has been overwritten,
5479 * copying data from old mbufs if requested.
5480 */
5481 if (flags & M_COPYBACK0_PRESERVE)
5482 datap = mtod(n, char *);
5483 else
5484 datap = NULL;
5485 eatlen = n->m_len;
5486 VERIFY(off == 0 || eatlen >= mlen);
5487 if (off > 0) {
5488 VERIFY(len >= mlen);
5489 m->m_len = off;
5490 m->m_next = n;
5491 if (datap) {
5492 m_copydata(m, off, mlen, datap);
5493 datap += mlen;
5494 }
5495 eatlen -= mlen;
5496 mp = &m->m_next;
5497 m = m->m_next;
5498 }
5499 while (m != NULL && m_mclhasreference(m) &&
5500 n->m_type == m->m_type && eatlen > 0) {
5501 mlen = MIN(eatlen, m->m_len);
5502 if (datap) {
5503 m_copydata(m, 0, mlen, datap);
5504 datap += mlen;
5505 }
5506 m->m_data += mlen;
5507 m->m_len -= mlen;
5508 eatlen -= mlen;
5509 if (m->m_len == 0)
5510 *mp = m = m_free(m);
5511 }
5512 if (eatlen > 0)
5513 n->m_len -= eatlen;
5514 n->m_next = m;
5515 *mp = m = n;
5516 continue;
5517 }
5518 mlen = MIN(mlen, len);
5519 if (flags & M_COPYBACK0_COPYBACK) {
5520 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5521 cp += mlen;
5522 }
5523 len -= mlen;
5524 mlen += off;
5525 off = 0;
5526 totlen += mlen;
5527 if (len == 0)
5528 break;
5529 if (m->m_next == NULL) {
5530 goto extend;
5531 }
5532 mp = &m->m_next;
5533 m = m->m_next;
5534 }
5535 out:
5536 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5537 VERIFY(flags & M_COPYBACK0_EXTEND);
5538 m->m_pkthdr.len = totlen;
5539 }
5540
5541 return (0);
5542
5543 enobufs:
5544 return (ENOBUFS);
5545 }
5546
5547 uint64_t
5548 mcl_to_paddr(char *addr)
5549 {
5550 vm_offset_t base_phys;
5551
5552 if (!MBUF_IN_MAP(addr))
5553 return (0);
5554 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5555
5556 if (base_phys == 0)
5557 return (0);
5558 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5559 }
5560
5561 /*
5562 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5563 * And really copy the thing. That way, we don't "precompute" checksums
5564 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5565 * small packets, don't dup into a cluster. That way received packets
5566 * don't take up too much room in the sockbuf (cf. sbspace()).
5567 */
5568 int MDFail;
5569
5570 struct mbuf *
5571 m_dup(struct mbuf *m, int how)
5572 {
5573 struct mbuf *n, **np;
5574 struct mbuf *top;
5575 int copyhdr = 0;
5576
5577 np = &top;
5578 top = NULL;
5579 if (m->m_flags & M_PKTHDR)
5580 copyhdr = 1;
5581
5582 /*
5583 * Quick check: if we have one mbuf and its data fits in an
5584 * mbuf with packet header, just copy and go.
5585 */
5586 if (m->m_next == NULL) {
5587 /* Then just move the data into an mbuf and be done... */
5588 if (copyhdr) {
5589 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5590 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5591 return (NULL);
5592 n->m_len = m->m_len;
5593 m_dup_pkthdr(n, m, how);
5594 bcopy(m->m_data, n->m_data, m->m_len);
5595 return (n);
5596 }
5597 } else if (m->m_len <= MLEN) {
5598 if ((n = _M_GET(how, m->m_type)) == NULL)
5599 return (NULL);
5600 bcopy(m->m_data, n->m_data, m->m_len);
5601 n->m_len = m->m_len;
5602 return (n);
5603 }
5604 }
5605 while (m != NULL) {
5606 #if BLUE_DEBUG
5607 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5608 m->m_data);
5609 #endif
5610 if (copyhdr)
5611 n = _M_GETHDR(how, m->m_type);
5612 else
5613 n = _M_GET(how, m->m_type);
5614 if (n == NULL)
5615 goto nospace;
5616 if (m->m_flags & M_EXT) {
5617 if (m->m_len <= m_maxsize(MC_CL))
5618 MCLGET(n, how);
5619 else if (m->m_len <= m_maxsize(MC_BIGCL))
5620 n = m_mbigget(n, how);
5621 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5622 n = m_m16kget(n, how);
5623 if (!(n->m_flags & M_EXT)) {
5624 (void) m_free(n);
5625 goto nospace;
5626 }
5627 }
5628 *np = n;
5629 if (copyhdr) {
5630 /* Don't use M_COPY_PKTHDR: preserve m_data */
5631 m_dup_pkthdr(n, m, how);
5632 copyhdr = 0;
5633 if (!(n->m_flags & M_EXT))
5634 n->m_data = n->m_pktdat;
5635 }
5636 n->m_len = m->m_len;
5637 /*
5638 * Get the dup on the same boundary as the original.
5639 * Assume that the two mbufs have the same offset to the data
5640 * area (up to word boundaries).
5641 */
5642 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5643 m = m->m_next;
5644 np = &n->m_next;
5645 #if BLUE_DEBUG
5646 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5647 n->m_data);
5648 #endif
5649 }
5650
5651 if (top == NULL)
5652 MDFail++;
5653 return (top);
5654
5655 nospace:
5656 m_freem(top);
5657 MDFail++;
5658 return (NULL);
5659 }
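
/*
 * Illustrative sketch (not compiled in): a hypothetical caller that needs
 * a private, deep copy of a chain -- e.g. before modifying payload bytes
 * that may still be referenced elsewhere -- could use m_dup() as shown
 * below.  The example_deep_copy() helper is not part of this file.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *m)
{
	struct mbuf *copy;

	/* Deep-copy the whole chain; M_DONTWAIT fails instead of blocking */
	copy = m_dup(m, M_DONTWAIT);
	if (copy == NULL)
		return (NULL);	/* allocation failed; caller still owns "m" */

	/* "copy" shares no data with "m" and may be modified freely */
	return (copy);
}
#endif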
5660
5661 #define MBUF_MULTIPAGES(m) \
5662 (((m)->m_flags & M_EXT) && \
5663 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5664 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5665 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5666
5667 static struct mbuf *
5668 m_expand(struct mbuf *m, struct mbuf **last)
5669 {
5670 struct mbuf *top = NULL;
5671 struct mbuf **nm = &top;
5672 uintptr_t data0, data;
5673 unsigned int len0, len;
5674
5675 VERIFY(MBUF_MULTIPAGES(m));
5676 VERIFY(m->m_next == NULL);
5677 data0 = (uintptr_t)m->m_data;
5678 len0 = m->m_len;
5679 *last = top;
5680
5681 for (;;) {
5682 struct mbuf *n;
5683
5684 data = data0;
5685 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5686 len = NBPG;
5687 else if (!IS_P2ALIGNED(data, NBPG) &&
5688 P2ROUNDUP(data, NBPG) < (data + len0))
5689 len = P2ROUNDUP(data, NBPG) - data;
5690 else
5691 len = len0;
5692
5693 VERIFY(len > 0);
5694 VERIFY(m->m_flags & M_EXT);
5695 m->m_data = (void *)data;
5696 m->m_len = len;
5697
5698 *nm = *last = m;
5699 nm = &m->m_next;
5700 m->m_next = NULL;
5701
5702 data0 += len;
5703 len0 -= len;
5704 if (len0 == 0)
5705 break;
5706
5707 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5708 if (n == NULL) {
5709 m_freem(top);
5710 top = *last = NULL;
5711 break;
5712 }
5713
5714 n->m_ext = m->m_ext;
5715 m_incref(m);
5716 n->m_flags |= M_EXT;
5717 m = n;
5718 }
5719 return (top);
5720 }
5721
5722 struct mbuf *
5723 m_normalize(struct mbuf *m)
5724 {
5725 struct mbuf *top = NULL;
5726 struct mbuf **nm = &top;
5727 boolean_t expanded = FALSE;
5728
5729 while (m != NULL) {
5730 struct mbuf *n;
5731
5732 n = m->m_next;
5733 m->m_next = NULL;
5734
5735 /* Does the data cross one or more page boundaries? */
5736 if (MBUF_MULTIPAGES(m)) {
5737 struct mbuf *last;
5738 if ((m = m_expand(m, &last)) == NULL) {
5739 m_freem(n);
5740 m_freem(top);
5741 top = NULL;
5742 break;
5743 }
5744 *nm = m;
5745 nm = &last->m_next;
5746 expanded = TRUE;
5747 } else {
5748 *nm = m;
5749 nm = &m->m_next;
5750 }
5751 m = n;
5752 }
5753 if (expanded)
5754 atomic_add_32(&mb_normalized, 1);
5755 return (top);
5756 }
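
/*
 * Illustrative sketch (not compiled in): a hypothetical caller whose
 * hardware requires that no mbuf's data span a page boundary could run
 * the chain through m_normalize() first.  On allocation failure the
 * routine frees the chain and returns NULL, so the caller must not
 * reuse the original pointer.  example_single_page_segments() is not
 * part of this file.
 */
#if 0
static struct mbuf *
example_single_page_segments(struct mbuf *m)
{
	/* Splits any mbuf whose data crosses a page boundary */
	return (m_normalize(m));
}
#endif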
5757
5758 /*
5759 * Append the specified data to the indicated mbuf chain.
5760 * Extend the mbuf chain if the new data does not fit in
5761 * existing space.
5762 *
5763 * Return 1 if able to complete the job; otherwise 0.
5764 */
5765 int
5766 m_append(struct mbuf *m0, int len, caddr_t cp)
5767 {
5768 struct mbuf *m, *n;
5769 int remainder, space;
5770
5771 for (m = m0; m->m_next != NULL; m = m->m_next)
5772 ;
5773 remainder = len;
5774 space = M_TRAILINGSPACE(m);
5775 if (space > 0) {
5776 /*
5777 * Copy into available space.
5778 */
5779 if (space > remainder)
5780 space = remainder;
5781 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5782 m->m_len += space;
5783 cp += space, remainder -= space;
5784 }
5785 while (remainder > 0) {
5786 /*
5787 * Allocate a new mbuf; could check space
5788 * and allocate a cluster instead.
5789 */
5790 n = m_get(M_WAITOK, m->m_type);
5791 if (n == NULL)
5792 break;
5793 n->m_len = min(MLEN, remainder);
5794 bcopy(cp, mtod(n, caddr_t), n->m_len);
5795 cp += n->m_len;
5796 remainder -= n->m_len;
5797 m->m_next = n;
5798 m = n;
5799 }
5800 if (m0->m_flags & M_PKTHDR)
5801 m0->m_pkthdr.len += len - remainder;
5802 return (remainder == 0);
5803 }
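
/*
 * Illustrative sketch (not compiled in): appending a small trailer to an
 * existing chain with m_append().  The trailer bytes and the chain "m"
 * are hypothetical; note that m_append() may block, since it allocates
 * additional mbufs with M_WAITOK.  example_append_trailer() is not part
 * of this file.
 */
#if 0
static int
example_append_trailer(struct mbuf *m)
{
	u_int8_t trailer[4] = { 0xde, 0xad, 0xbe, 0xef };

	/* Returns 1 on success; m_pkthdr.len is updated when M_PKTHDR is set */
	if (m_append(m, sizeof (trailer), (caddr_t)trailer) == 0)
		return (ENOBUFS);
	return (0);
}
#endif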
5804
5805 struct mbuf *
5806 m_last(struct mbuf *m)
5807 {
5808 while (m->m_next != NULL)
5809 m = m->m_next;
5810 return (m);
5811 }
5812
5813 unsigned int
5814 m_fixhdr(struct mbuf *m0)
5815 {
5816 u_int len;
5817
5818 VERIFY(m0->m_flags & M_PKTHDR);
5819
5820 len = m_length2(m0, NULL);
5821 m0->m_pkthdr.len = len;
5822 return (len);
5823 }
5824
5825 unsigned int
5826 m_length2(struct mbuf *m0, struct mbuf **last)
5827 {
5828 struct mbuf *m;
5829 u_int len;
5830
5831 len = 0;
5832 for (m = m0; m != NULL; m = m->m_next) {
5833 len += m->m_len;
5834 if (m->m_next == NULL)
5835 break;
5836 }
5837 if (last != NULL)
5838 *last = m;
5839 return (len);
5840 }
5841
5842 /*
5843 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5844 * and clusters. If allocation fails and this cannot be completed, NULL will
5845 * be returned, but the passed in chain will be unchanged. Upon success,
5846 * the original chain will be freed, and the new chain will be returned.
5847 *
5848 * If an mbuf without a packet header is passed in, the original
5849 * mbuf chain will be returned unharmed.
5850 *
5851 * If offset is specified, the first mbuf in the chain will have a leading
5852 * space of the amount stated by the "off" parameter.
5853 *
5854 * This routine requires that the m_pkthdr.pkt_hdr field of the original
5855 * mbuf chain is cleared by the caller.
5856 */
5857 struct mbuf *
5858 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5859 {
5860 struct mbuf *m_new = NULL, *m_final = NULL;
5861 int progress = 0, length, pktlen;
5862
5863 if (!(m0->m_flags & M_PKTHDR))
5864 return (m0);
5865
5866 VERIFY(off < MHLEN);
5867 m_fixhdr(m0); /* Needed sanity check */
5868
5869 pktlen = m0->m_pkthdr.len + off;
5870 if (pktlen > MHLEN)
5871 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5872 else
5873 m_final = m_gethdr(how, MT_DATA);
5874
5875 if (m_final == NULL)
5876 goto nospace;
5877
5878 if (off > 0) {
5879 pktlen -= off;
5880 m_final->m_data += off;
5881 }
5882
5883 /*
5884 * Caller must have handled the contents pointed to by this
5885 * pointer before coming here, as otherwise it will point to
5886 * the original mbuf which will get freed upon success.
5887 */
5888 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
5889
5890 if (m_dup_pkthdr(m_final, m0, how) == 0)
5891 goto nospace;
5892
5893 m_new = m_final;
5894
5895 while (progress < pktlen) {
5896 length = pktlen - progress;
5897 if (length > MCLBYTES)
5898 length = MCLBYTES;
5899 length -= ((m_new == m_final) ? off : 0);
5900
5901 if (m_new == NULL) {
5902 if (length > MLEN)
5903 m_new = m_getcl(how, MT_DATA, 0);
5904 else
5905 m_new = m_get(how, MT_DATA);
5906 if (m_new == NULL)
5907 goto nospace;
5908 }
5909
5910 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5911 progress += length;
5912 m_new->m_len = length;
5913 if (m_new != m_final)
5914 m_cat(m_final, m_new);
5915 m_new = NULL;
5916 }
5917 m_freem(m0);
5918 m0 = m_final;
5919 return (m0);
5920 nospace:
5921 if (m_final)
5922 m_freem(m_final);
5923 return (NULL);
5924 }
5925
5926 struct mbuf *
5927 m_defrag(struct mbuf *m0, int how)
5928 {
5929 return (m_defrag_offset(m0, 0, how));
5930 }
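
/*
 * Illustrative sketch (not compiled in): compacting a long chain into the
 * fewest possible mbufs/clusters, e.g. before handing it to a driver that
 * prefers few segments (a hypothetical constraint).  On failure the
 * original chain is left intact.  example_compact() is not part of this
 * file.
 */
#if 0
static struct mbuf *
example_compact(struct mbuf *m)
{
	struct mbuf *d;

	m->m_pkthdr.pkt_hdr = NULL;	/* required by m_defrag_offset() */
	d = m_defrag(m, M_DONTWAIT);
	if (d == NULL)
		return (m);	/* "m" is unchanged and still owned by caller */

	/* "m" has been freed; use the compacted chain instead */
	return (d);
}
#endif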
5931
5932 void
5933 m_mchtype(struct mbuf *m, int t)
5934 {
5935 mtype_stat_inc(t);
5936 mtype_stat_dec(m->m_type);
5937 (m)->m_type = t;
5938 }
5939
5940 void *
5941 m_mtod(struct mbuf *m)
5942 {
5943 return (MTOD(m, void *));
5944 }
5945
5946 struct mbuf *
5947 m_dtom(void *x)
5948 {
5949 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5950 }
5951
5952 void
5953 m_mcheck(struct mbuf *m)
5954 {
5955 _MCHECK(m);
5956 }
5957
5958 /*
5959 * Return a pointer to mbuf/offset of location in mbuf chain.
5960 */
5961 struct mbuf *
5962 m_getptr(struct mbuf *m, int loc, int *off)
5963 {
5964
5965 while (loc >= 0) {
5966 /* Normal end of search. */
5967 if (m->m_len > loc) {
5968 *off = loc;
5969 return (m);
5970 } else {
5971 loc -= m->m_len;
5972 if (m->m_next == NULL) {
5973 if (loc == 0) {
5974 /* Point at the end of valid data. */
5975 *off = m->m_len;
5976 return (m);
5977 }
5978 return (NULL);
5979 }
5980 m = m->m_next;
5981 }
5982 }
5983 return (NULL);
5984 }
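
/*
 * Illustrative sketch (not compiled in): using m_getptr() to locate a
 * byte at a given offset within a chain.  The chain "m" and the offset
 * "loc" are hypothetical, and example_byte_at() is not part of this file.
 */
#if 0
static u_int8_t
example_byte_at(struct mbuf *m, int loc)
{
	struct mbuf *n;
	int off;

	n = m_getptr(m, loc, &off);
	if (n == NULL || off == n->m_len)
		return (0);	/* offset is at or past the end of the chain */

	/* "off" is relative to the start of n's data area */
	return (*(mtod(n, u_int8_t *) + off));
}
#endif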
5985
5986 /*
5987 * Inform the corresponding mcache(s) that there's a waiter below.
5988 */
5989 static void
5990 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5991 {
5992 mcache_waiter_inc(m_cache(class));
5993 if (comp) {
5994 if (class == MC_CL) {
5995 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5996 } else if (class == MC_BIGCL) {
5997 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5998 } else if (class == MC_16KCL) {
5999 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6000 } else {
6001 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6002 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6003 }
6004 }
6005 }
6006
6007 /*
6008 * Inform the corresponding mcache(s) that there's no more waiter below.
6009 */
6010 static void
6011 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6012 {
6013 mcache_waiter_dec(m_cache(class));
6014 if (comp) {
6015 if (class == MC_CL) {
6016 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6017 } else if (class == MC_BIGCL) {
6018 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6019 } else if (class == MC_16KCL) {
6020 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6021 } else {
6022 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6023 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6024 }
6025 }
6026 }
6027
6028 /*
6029 * Called during slab (blocking and non-blocking) allocation. If there
6030 * is at least one waiter, and the time since the first waiter was blocked
6031 * exceeds the watchdog timeout, panic the system.
6032 */
6033 static void
6034 mbuf_watchdog(void)
6035 {
6036 struct timeval now;
6037 unsigned int since;
6038
6039 if (mb_waiters == 0 || !mb_watchdog)
6040 return;
6041
6042 microuptime(&now);
6043 since = now.tv_sec - mb_wdtstart.tv_sec;
6044 if (since >= MB_WDT_MAXTIME) {
6045 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6046 mb_waiters, since, mbuf_dump());
6047 /* NOTREACHED */
6048 }
6049 }
6050
6051 /*
6052 * Called during blocking allocation. Returns TRUE if one or more objects
6053 * are available at the per-CPU cache layer and the allocation should be
6054 * retried at that level.
6055 */
6056 static boolean_t
6057 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6058 {
6059 boolean_t mcache_retry = FALSE;
6060
6061 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6062
6063 /* Check if there's anything at the cache layer */
6064 if (mbuf_cached_above(class, wait)) {
6065 mcache_retry = TRUE;
6066 goto done;
6067 }
6068
6069 /* Nothing? Then try hard to get it from somewhere */
6070 m_reclaim(class, num, (wait & MCR_COMP));
6071
6072 /* We tried hard and got something? */
6073 if (m_infree(class) > 0) {
6074 mbstat.m_wait++;
6075 goto done;
6076 } else if (mbuf_cached_above(class, wait)) {
6077 mbstat.m_wait++;
6078 mcache_retry = TRUE;
6079 goto done;
6080 } else if (wait & MCR_TRYHARD) {
6081 mcache_retry = TRUE;
6082 goto done;
6083 }
6084
6085 /*
6086 * There's really nothing for us right now; inform the
6087 * cache(s) that there is a waiter below and go to sleep.
6088 */
6089 mbuf_waiter_inc(class, (wait & MCR_COMP));
6090
6091 VERIFY(!(wait & MCR_NOSLEEP));
6092
6093 /*
6094 * If this is the first waiter, arm the watchdog timer. Otherwise
6095 * check if we need to panic the system due to watchdog timeout.
6096 */
6097 if (mb_waiters == 0)
6098 microuptime(&mb_wdtstart);
6099 else
6100 mbuf_watchdog();
6101
6102 mb_waiters++;
6103 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6104
6105 /* We are now up; stop getting notified until next round */
6106 mbuf_waiter_dec(class, (wait & MCR_COMP));
6107
6108 /* We waited and got something */
6109 if (m_infree(class) > 0) {
6110 mbstat.m_wait++;
6111 goto done;
6112 } else if (mbuf_cached_above(class, wait)) {
6113 mbstat.m_wait++;
6114 mcache_retry = TRUE;
6115 }
6116 done:
6117 return (mcache_retry);
6118 }
6119
6120 static void
6121 mbuf_worker_thread(void)
6122 {
6123 int mbuf_expand;
6124
6125 while (1) {
6126 lck_mtx_lock(mbuf_mlock);
6127
6128 mbuf_expand = 0;
6129 if (mbuf_expand_mcl) {
6130 int n;
6131
6132 /* Adjust to current number of 2 KB clusters in use */
6133 n = mbuf_expand_mcl -
6134 (m_total(MC_CL) - m_infree(MC_CL));
6135 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6136 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6137 mbuf_expand_mcl = 0;
6138
6139 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6140 mbuf_expand++;
6141 }
6142 if (mbuf_expand_big) {
6143 int n;
6144
6145 /* Adjust to current number of 4 KB clusters in use */
6146 n = mbuf_expand_big -
6147 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6148 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6149 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6150 mbuf_expand_big = 0;
6151
6152 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6153 mbuf_expand++;
6154 }
6155 if (mbuf_expand_16k) {
6156 int n;
6157
6158 /* Adjust to current number of 16 KB clusters in use */
6159 n = mbuf_expand_16k -
6160 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6161 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6162 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6163 mbuf_expand_16k = 0;
6164
6165 if (n > 0)
6166 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6167 }
6168
6169 /*
6170 * Because we can run out of memory before filling the mbuf
6171 * map, we should not allocate more clusters than there are
6172 * mbufs -- otherwise we could have a large number of useless
6173 * clusters allocated.
6174 */
6175 if (mbuf_expand) {
6176 while (m_total(MC_MBUF) <
6177 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6178 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6179 break;
6180 }
6181 }
6182
6183 lck_mtx_unlock(mbuf_mlock);
6184
6185 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6186 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6187 }
6188 }
6189
6190 static void
6191 mbuf_worker_thread_init(void)
6192 {
6193 mbuf_worker_ready++;
6194 mbuf_worker_thread();
6195 }
6196
6197 static mcl_slab_t *
6198 slab_get(void *buf)
6199 {
6200 mcl_slabg_t *slg;
6201 unsigned int ix, k;
6202
6203 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6204
6205 VERIFY(MBUF_IN_MAP(buf));
6206 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6207 VERIFY(ix < maxslabgrp);
6208
6209 if ((slg = slabstbl[ix]) == NULL) {
6210 /*
6211 * In the current implementation, we never shrink the memory
6212 * pool (hence the cluster map); if we attempt to reallocate
6213 * a cluster group when it's already allocated, panic since
6214 * this is a sign of memory corruption (slabstbl[ix] got
6215 * nullified). This also means that there shouldn't be any
6216 * hole in the kernel sub-map for the mbuf pool.
6217 */
6218 ++slabgrp;
6219 VERIFY(ix < slabgrp);
6220 /*
6221 * Slab expansion can only be done single-threaded; when
6222 * we get here, it must be as a result of m_clalloc() which
6223 * is serialized and therefore mb_clalloc_busy must be set.
6224 */
6225 VERIFY(mb_clalloc_busy);
6226 lck_mtx_unlock(mbuf_mlock);
6227
6228 /* This is a new buffer; create the slabs group for it */
6229 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6230 M_WAITOK | M_ZERO);
6231 VERIFY(slg != NULL);
6232
6233 lck_mtx_lock(mbuf_mlock);
6234 /*
6235 * No other thread could have gone into m_clalloc() after
6236 * we dropped the lock above, so verify that it's true.
6237 */
6238 VERIFY(mb_clalloc_busy);
6239
6240 slabstbl[ix] = slg;
6241
6242 /* Chain each slab in the group to its forward neighbor */
6243 for (k = 1; k < NSLABSPMB; k++)
6244 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6245 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6246
6247 /* And chain the last slab in the previous group to this */
6248 if (ix > 0) {
6249 VERIFY(slabstbl[ix - 1]->
6250 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6251 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6252 &slg->slg_slab[0];
6253 }
6254 }
6255
6256 ix = MTOBG(buf) % NSLABSPMB;
6257 VERIFY(ix < NSLABSPMB);
6258
6259 return (&slg->slg_slab[ix]);
6260 }
6261
6262 static void
6263 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6264 void *base, void *head, unsigned int len, int refcnt, int chunks)
6265 {
6266 sp->sl_class = class;
6267 sp->sl_flags = flags;
6268 sp->sl_base = base;
6269 sp->sl_head = head;
6270 sp->sl_len = len;
6271 sp->sl_refcnt = refcnt;
6272 sp->sl_chunks = chunks;
6273 slab_detach(sp);
6274 }
6275
6276 static void
6277 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6278 {
6279 VERIFY(slab_is_detached(sp));
6280 m_slab_cnt(class)++;
6281 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6282 sp->sl_flags &= ~SLF_DETACHED;
6283 if (class == MC_16KCL) {
6284 int k;
6285 for (k = 1; k < NSLABSP16KB; k++) {
6286 sp = sp->sl_next;
6287 /* Next slab must already be present */
6288 VERIFY(sp != NULL);
6289 VERIFY(slab_is_detached(sp));
6290 sp->sl_flags &= ~SLF_DETACHED;
6291 }
6292 }
6293 }
6294
6295 static void
6296 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6297 {
6298 VERIFY(!slab_is_detached(sp));
6299 VERIFY(m_slab_cnt(class) > 0);
6300 m_slab_cnt(class)--;
6301 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6302 slab_detach(sp);
6303 if (class == MC_16KCL) {
6304 int k;
6305 for (k = 1; k < NSLABSP16KB; k++) {
6306 sp = sp->sl_next;
6307 /* Next slab must already be present */
6308 VERIFY(sp != NULL);
6309 VERIFY(!slab_is_detached(sp));
6310 slab_detach(sp);
6311 }
6312 }
6313 }
6314
6315 static boolean_t
6316 slab_inrange(mcl_slab_t *sp, void *buf)
6317 {
6318 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6319 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6320 }
6321
6322 #undef panic
6323
6324 static void
6325 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6326 {
6327 int i;
6328 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6329 uintptr_t buf = (uintptr_t)sp->sl_base;
6330
6331 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6332 void *next = ((mcache_obj_t *)buf)->obj_next;
6333 if (next != addr)
6334 continue;
6335 if (!mclverify) {
6336 if (next != NULL && !MBUF_IN_MAP(next)) {
6337 mcache_t *cp = m_cache(sp->sl_class);
6338 panic("%s: %s buffer %p in slab %p modified "
6339 "after free at offset 0: %p out of range "
6340 "[%p-%p)\n", __func__, cp->mc_name,
6341 (void *)buf, sp, next, mbutl, embutl);
6342 /* NOTREACHED */
6343 }
6344 } else {
6345 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6346 (mcache_obj_t *)buf);
6347 mcl_audit_verify_nextptr(next, mca);
6348 }
6349 }
6350 }
6351
6352 static void
6353 slab_detach(mcl_slab_t *sp)
6354 {
6355 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6356 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6357 sp->sl_flags |= SLF_DETACHED;
6358 }
6359
6360 static boolean_t
6361 slab_is_detached(mcl_slab_t *sp)
6362 {
6363 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6364 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6365 (sp->sl_flags & SLF_DETACHED));
6366 }
6367
6368 static void
6369 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6370 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6371 {
6372 mcache_audit_t *mca, *mca_tail;
6373 mcache_obj_t *con = NULL;
6374 boolean_t save_contents = (con_list != NULL);
6375 unsigned int i, ix;
6376
6377 ASSERT(num <= NMBPBG);
6378 ASSERT(con_list == NULL || con_size != 0);
6379
6380 ix = MTOBG(buf);
6381 VERIFY(ix < maxclaudit);
6382
6383 /* Make sure we haven't been here before */
6384 for (i = 0; i < NMBPBG; i++)
6385 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6386
6387 mca = mca_tail = *mca_list;
6388 if (save_contents)
6389 con = *con_list;
6390
6391 for (i = 0; i < num; i++) {
6392 mcache_audit_t *next;
6393
6394 next = mca->mca_next;
6395 bzero(mca, sizeof (*mca));
6396 mca->mca_next = next;
6397 mclaudit[ix].cl_audit[i] = mca;
6398
6399 /* Attach the contents buffer if requested */
6400 if (save_contents) {
6401 mcl_saved_contents_t *msc =
6402 (mcl_saved_contents_t *)(void *)con;
6403
6404 VERIFY(msc != NULL);
6405 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6406 VERIFY(con_size == sizeof (*msc));
6407 mca->mca_contents_size = con_size;
6408 mca->mca_contents = msc;
6409 con = con->obj_next;
6410 bzero(mca->mca_contents, mca->mca_contents_size);
6411 }
6412
6413 mca_tail = mca;
6414 mca = mca->mca_next;
6415 }
6416
6417 if (save_contents)
6418 *con_list = con;
6419
6420 *mca_list = mca_tail->mca_next;
6421 mca_tail->mca_next = NULL;
6422 }
6423
6424 /*
6425 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6426 * the corresponding audit structure for that buffer.
6427 */
6428 static mcache_audit_t *
6429 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6430 {
6431 mcache_audit_t *mca = NULL;
6432 int ix = MTOBG(o);
6433
6434 VERIFY(ix < maxclaudit);
6435 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6436
6437 switch (class) {
6438 case MC_MBUF:
6439 /*
6440 * For the mbuf case, find the index of the page
6441 * used by the mbuf and use that index to locate the
6442 * base address of the page. Then find out the
6443 * mbuf index relative to the page base and use
6444 * it to locate the audit structure.
6445 */
6446 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6447 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6448 break;
6449
6450 case MC_CL:
6451 /*
6452 * Same thing as above, but for 2KB clusters in a page.
6453 */
6454 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6455 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6456 break;
6457
6458 case MC_BIGCL:
6459 case MC_16KCL:
6460 /*
6461 * Same as above, but only return the first element.
6462 */
6463 mca = mclaudit[ix].cl_audit[0];
6464 break;
6465
6466 default:
6467 VERIFY(0);
6468 /* NOTREACHED */
6469 }
6470
6471 return (mca);
6472 }
6473
6474 static void
6475 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6476 boolean_t alloc)
6477 {
6478 struct mbuf *m = addr;
6479 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6480
6481 VERIFY(mca->mca_contents != NULL &&
6482 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6483
6484 if (mclverify)
6485 mcl_audit_verify_nextptr(next, mca);
6486
6487 if (!alloc) {
6488 /* Save constructed mbuf fields */
6489 mcl_audit_save_mbuf(m, mca);
6490 if (mclverify) {
6491 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6492 m_maxsize(MC_MBUF));
6493 }
6494 ((mcache_obj_t *)m)->obj_next = next;
6495 return;
6496 }
6497
6498 /* Check if the buffer has been corrupted while in freelist */
6499 if (mclverify) {
6500 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6501 }
6502 /* Restore constructed mbuf fields */
6503 mcl_audit_restore_mbuf(m, mca, composite);
6504 }
6505
6506 static void
6507 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6508 {
6509 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6510
6511 if (composite) {
6512 struct mbuf *next = m->m_next;
6513 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6514 MBUF_IS_COMPOSITE(ms));
6515 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6516 /*
6517 * We could have hand-picked the mbuf fields and restore
6518 * them individually, but that will be a maintenance
6519 * headache. Instead, restore everything that was saved;
6520 * the mbuf layer will recheck and reinitialize anyway.
6521 */
6522 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6523 m->m_next = next;
6524 } else {
6525 /*
6526 * For a regular mbuf (no cluster attached) there's nothing
6527 * to restore other than the type field, which is expected
6528 * to be MT_FREE.
6529 */
6530 m->m_type = ms->m_type;
6531 }
6532 _MCHECK(m);
6533 }
6534
6535 static void
6536 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6537 {
6538 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6539 _MCHECK(m);
6540 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
6541 }
6542
6543 static void
6544 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6545 boolean_t save_next)
6546 {
6547 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6548
6549 if (!alloc) {
6550 if (mclverify) {
6551 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6552 }
6553 if (save_next) {
6554 mcl_audit_verify_nextptr(next, mca);
6555 ((mcache_obj_t *)addr)->obj_next = next;
6556 }
6557 } else if (mclverify) {
6558 /* Check if the buffer has been corrupted while in freelist */
6559 mcl_audit_verify_nextptr(next, mca);
6560 mcache_audit_free_verify_set(mca, addr, 0, size);
6561 }
6562 }
6563
6564 static void
6565 mcl_audit_scratch(mcache_audit_t *mca)
6566 {
6567 void *stack[MCACHE_STACK_DEPTH + 1];
6568 mcl_scratch_audit_t *msa;
6569 struct timeval now;
6570
6571 VERIFY(mca->mca_contents != NULL);
6572 msa = MCA_SAVED_SCRATCH_PTR(mca);
6573
6574 msa->msa_pthread = msa->msa_thread;
6575 msa->msa_thread = current_thread();
6576 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
6577 msa->msa_pdepth = msa->msa_depth;
6578 bzero(stack, sizeof (stack));
6579 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
6580 bcopy(&stack[1], msa->msa_stack, sizeof (mca->mca_pstack));
6581
6582 msa->msa_ptstamp = msa->msa_tstamp;
6583 microuptime(&now);
6584 /* tstamp is in ms relative to mb_start */
6585 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
6586 if ((now.tv_sec - mb_start.tv_sec) > 0)
6587 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
6588 }
6589
6590 static void
6591 mcl_audit_mcheck_panic(struct mbuf *m)
6592 {
6593 mcache_audit_t *mca;
6594
6595 MRANGE(m);
6596 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6597
6598 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6599 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6600 /* NOTREACHED */
6601 }
6602
6603 static void
6604 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6605 {
6606 if (next != NULL && !MBUF_IN_MAP(next) &&
6607 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6608 panic("mcl_audit: buffer %p modified after free at offset 0: "
6609 "%p out of range [%p-%p)\n%s\n",
6610 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6611 /* NOTREACHED */
6612 }
6613 }
6614
6615 /* This function turns on mbuf leak detection */
6616 static void
6617 mleak_activate(void)
6618 {
6619 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6620 PE_parse_boot_argn("mleak_sample_factor",
6621 &mleak_table.mleak_sample_factor,
6622 sizeof (mleak_table.mleak_sample_factor));
6623
6624 if (mleak_table.mleak_sample_factor == 0)
6625 mclfindleak = 0;
6626
6627 if (mclfindleak == 0)
6628 return;
6629
6630 vm_size_t alloc_size =
6631 mleak_alloc_buckets * sizeof (struct mallocation);
6632 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6633
6634 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6635 M_TEMP, M_WAITOK | M_ZERO);
6636 VERIFY(mleak_allocations != NULL);
6637
6638 MALLOC(mleak_traces, struct mtrace *, trace_size,
6639 M_TEMP, M_WAITOK | M_ZERO);
6640 VERIFY(mleak_traces != NULL);
6641
6642 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6643 M_TEMP, M_WAITOK | M_ZERO);
6644 VERIFY(mleak_stat != NULL);
6645 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6646 #ifdef __LP64__
6647 mleak_stat->ml_isaddr64 = 1;
6648 #endif /* __LP64__ */
6649 }
6650
6651 static void
6652 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6653 {
6654 int temp;
6655
6656 if (mclfindleak == 0)
6657 return;
6658
6659 if (!alloc)
6660 return (mleak_free(addr));
6661
6662 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6663
6664 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6665 uintptr_t bt[MLEAK_STACK_DEPTH];
6666 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6667 mleak_log(bt, addr, logged, num);
6668 }
6669 }
6670
6671 /*
6672 * This function records the allocation in the mleak_allocations table
6673 * and the backtrace in the mleak_traces table; if the allocation slot is
6674 * in use, replace the old allocation with the new one. If the trace slot
6675 * is in use, return (or increment the refcount if it is the same trace).
6676 */
6677 static boolean_t
6678 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6679 {
6680 struct mallocation *allocation;
6681 struct mtrace *trace;
6682 uint32_t trace_index;
6683
6684 /* Quit if someone else modifying the tables */
6685 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6686 mleak_table.total_conflicts++;
6687 return (FALSE);
6688 }
6689
6690 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6691 mleak_alloc_buckets)];
6692 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6693 trace = &mleak_traces[trace_index];
6694
6695 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6696 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6697
6698 allocation->hitcount++;
6699 trace->hitcount++;
6700
6701 /*
6702 * If the allocation bucket we want is occupied
6703 * and the occupier has the same trace, just bail.
6704 */
6705 if (allocation->element != NULL &&
6706 trace_index == allocation->trace_index) {
6707 mleak_table.alloc_collisions++;
6708 lck_mtx_unlock(mleak_lock);
6709 return (TRUE);
6710 }
6711
6712 /*
6713 * Store the backtrace in the traces array;
6714 * Size of zero = trace bucket is free.
6715 */
6716 if (trace->allocs > 0 &&
6717 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6718 /* Different, unique trace, but the same hash! Bail out. */
6719 trace->collisions++;
6720 mleak_table.trace_collisions++;
6721 lck_mtx_unlock(mleak_lock);
6722 return (TRUE);
6723 } else if (trace->allocs > 0) {
6724 /* Same trace, already added, so increment refcount */
6725 trace->allocs++;
6726 } else {
6727 /* Found an unused trace bucket, so record the trace here */
6728 if (trace->depth != 0) {
6729 /* this slot previously used but not currently in use */
6730 mleak_table.trace_overwrites++;
6731 }
6732 mleak_table.trace_recorded++;
6733 trace->allocs = 1;
6734 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6735 trace->depth = depth;
6736 trace->collisions = 0;
6737 }
6738
6739 /* Step 2: Store the allocation record in the allocations array */
6740 if (allocation->element != NULL) {
6741 /*
6742 * Replace an existing allocation. No need to preserve
6743 * because only a subset of the allocations are being
6744 * recorded anyway.
6745 */
6746 mleak_table.alloc_collisions++;
6747 } else if (allocation->trace_index != 0) {
6748 mleak_table.alloc_overwrites++;
6749 }
6750 allocation->element = addr;
6751 allocation->trace_index = trace_index;
6752 allocation->count = num;
6753 mleak_table.alloc_recorded++;
6754 mleak_table.outstanding_allocs++;
6755
6756 lck_mtx_unlock(mleak_lock);
6757 return (TRUE);
6758 }
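
/*
 * Illustrative sketch (not compiled in): the leak tables above are
 * fixed-size hash tables -- one keyed by allocation address, one keyed
 * by the backtrace hash -- that simply tolerate collisions rather than
 * chaining.  A minimal version of the address-to-bucket lookup, using
 * the same hashaddr() and mleak_alloc_buckets symbols as above, would
 * look like this.  example_alloc_bucket() is not part of this file.
 */
#if 0
static struct mallocation *
example_alloc_bucket(mcache_obj_t *addr)
{
	/* Hash the address into the fixed-size allocations array */
	return (&mleak_allocations[hashaddr((uintptr_t)addr,
	    mleak_alloc_buckets)]);
}
#endif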
6759
6760 static void
6761 mleak_free(mcache_obj_t *addr)
6762 {
6763 while (addr != NULL) {
6764 struct mallocation *allocation = &mleak_allocations
6765 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6766
6767 if (allocation->element == addr &&
6768 allocation->trace_index < mleak_trace_buckets) {
6769 lck_mtx_lock_spin(mleak_lock);
6770 if (allocation->element == addr &&
6771 allocation->trace_index < mleak_trace_buckets) {
6772 struct mtrace *trace;
6773 trace = &mleak_traces[allocation->trace_index];
6774 /* allocs = 0 means trace bucket is unused */
6775 if (trace->allocs > 0)
6776 trace->allocs--;
6777 if (trace->allocs == 0)
6778 trace->depth = 0;
6779 /* NULL element means alloc bucket is unused */
6780 allocation->element = NULL;
6781 mleak_table.outstanding_allocs--;
6782 }
6783 lck_mtx_unlock(mleak_lock);
6784 }
6785 addr = addr->obj_next;
6786 }
6787 }
6788
6789 static void
6790 mleak_sort_traces()
6791 {
6792 int i, j, k;
6793 struct mtrace *swap;
6794
6795 for (i = 0; i < MLEAK_NUM_TRACES; i++)
6796 mleak_top_trace[i] = NULL;
6797
6798 for (i = 0, j = 0; j < MLEAK_NUM_TRACES &&
6799     i < mleak_trace_buckets; i++) {
6800 if (mleak_traces[i].allocs <= 0)
6801 continue;
6802
6803 mleak_top_trace[j] = &mleak_traces[i];
6804 for (k = j; k > 0; k--) {
6805 if (mleak_top_trace[k]->allocs <=
6806 mleak_top_trace[k-1]->allocs)
6807 break;
6808
6809 swap = mleak_top_trace[k-1];
6810 mleak_top_trace[k-1] = mleak_top_trace[k];
6811 mleak_top_trace[k] = swap;
6812 }
6813 j++;
6814 }
6815
6816 j--;
6817 for (; i < mleak_trace_buckets; i++) {
6818 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6819 continue;
6820
6821 mleak_top_trace[j] = &mleak_traces[i];
6822
6823 for (k = j; k > 0; k--) {
6824 if (mleak_top_trace[k]->allocs <=
6825 mleak_top_trace[k-1]->allocs)
6826 break;
6827
6828 swap = mleak_top_trace[k-1];
6829 mleak_top_trace[k-1] = mleak_top_trace[k];
6830 mleak_top_trace[k] = swap;
6831 }
6832 }
6833 }
6834
6835 static void
6836 mleak_update_stats()
6837 {
6838 mleak_trace_stat_t *mltr;
6839 int i;
6840
6841 VERIFY(mleak_stat != NULL);
6842 #ifdef __LP64__
6843 VERIFY(mleak_stat->ml_isaddr64);
6844 #else
6845 VERIFY(!mleak_stat->ml_isaddr64);
6846 #endif /* !__LP64__ */
6847 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6848
6849 mleak_sort_traces();
6850
6851 mltr = &mleak_stat->ml_trace[0];
6852 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6853 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6854 int j;
6855
6856 if (mleak_top_trace[i] == NULL ||
6857 mleak_top_trace[i]->allocs == 0)
6858 continue;
6859
6860 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6861 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6862 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6863 mltr->mltr_depth = mleak_top_trace[i]->depth;
6864
6865 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6866 for (j = 0; j < mltr->mltr_depth; j++)
6867 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6868
6869 mltr++;
6870 }
6871 }
6872
6873 static struct mbtypes {
6874 int mt_type;
6875 const char *mt_name;
6876 } mbtypes[] = {
6877 { MT_DATA, "data" },
6878 { MT_OOBDATA, "oob data" },
6879 { MT_CONTROL, "ancillary data" },
6880 { MT_HEADER, "packet headers" },
6881 { MT_SOCKET, "socket structures" },
6882 { MT_PCB, "protocol control blocks" },
6883 { MT_RTABLE, "routing table entries" },
6884 { MT_HTABLE, "IMP host table entries" },
6885 { MT_ATABLE, "address resolution tables" },
6886 { MT_FTABLE, "fragment reassembly queue headers" },
6887 { MT_SONAME, "socket names and addresses" },
6888 { MT_SOOPTS, "socket options" },
6889 { MT_RIGHTS, "access rights" },
6890 { MT_IFADDR, "interface addresses" },
6891 { MT_TAG, "packet tags" },
6892 { 0, NULL }
6893 };
6894
6895 #define MBUF_DUMP_BUF_CHK() { \
6896 clen -= k; \
6897 if (clen < 1) \
6898 goto done; \
6899 c += k; \
6900 }
6901
6902 static char *
6903 mbuf_dump(void)
6904 {
6905 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6906 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6907 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6908 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6909 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6910 uint8_t seen[256];
6911 struct mbtypes *mp;
6912 mb_class_stat_t *sp;
6913 mleak_trace_stat_t *mltr;
6914 char *c = mbuf_dump_buf;
6915 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6916
6917 mbuf_dump_buf[0] = '\0';
6918
6919 /* synchronize all statistics in the mbuf table */
6920 mbuf_stat_sync();
6921 mbuf_mtypes_sync(TRUE);
6922
6923 sp = &mb_stat->mbs_class[0];
6924 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6925 u_int32_t mem;
6926
6927 if (m_class(i) == MC_MBUF) {
6928 m_mbufs = sp->mbcl_active;
6929 } else if (m_class(i) == MC_CL) {
6930 m_clfree = sp->mbcl_total - sp->mbcl_active;
6931 } else if (m_class(i) == MC_BIGCL) {
6932 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6933 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6934 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6935 m_16kclusters = sp->mbcl_total;
6936 } else if (m_class(i) == MC_MBUF_CL) {
6937 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6938 } else if (m_class(i) == MC_MBUF_BIGCL) {
6939 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6940 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6941 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6942 }
6943
6944 mem = sp->mbcl_ctotal * sp->mbcl_size;
6945 totmem += mem;
6946 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6947 sp->mbcl_size;
6948
6949 }
6950
6951 /* adjust free counts to include composite caches */
6952 m_clfree += m_mbufclfree;
6953 m_bigclfree += m_mbufbigclfree;
6954 m_16kclfree += m_mbuf16kclfree;
6955
6956 totmbufs = 0;
6957 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6958 totmbufs += mbstat.m_mtypes[mp->mt_type];
6959 if (totmbufs > m_mbufs)
6960 totmbufs = m_mbufs;
6961 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6962 MBUF_DUMP_BUF_CHK();
6963
6964 bzero(&seen, sizeof (seen));
6965 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6966 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6967 seen[mp->mt_type] = 1;
6968 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6969 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6970 MBUF_DUMP_BUF_CHK();
6971 }
6972 }
6973 seen[MT_FREE] = 1;
6974 for (i = 0; i < nmbtypes; i++)
6975 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6976 k = snprintf(c, clen, "\t%u mbufs allocated to "
6977 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6978 MBUF_DUMP_BUF_CHK();
6979 }
6980 if ((m_mbufs - totmbufs) > 0) {
6981 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6982 m_mbufs - totmbufs);
6983 MBUF_DUMP_BUF_CHK();
6984 }
6985 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6986 "%u/%u mbuf 4KB clusters in use\n",
6987 (unsigned int)(mbstat.m_clusters - m_clfree),
6988 (unsigned int)mbstat.m_clusters,
6989 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6990 (unsigned int)mbstat.m_bigclusters);
6991 MBUF_DUMP_BUF_CHK();
6992
6993 if (njcl > 0) {
6994 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6995 m_16kclusters - m_16kclfree, m_16kclusters,
6996 njclbytes / 1024);
6997 MBUF_DUMP_BUF_CHK();
6998 }
6999 totused = totmem - totfree;
7000 if (totmem == 0) {
7001 totpct = 0;
7002 } else if (totused < (ULONG_MAX / 100)) {
7003 totpct = (totused * 100) / totmem;
7004 } else {
7005 u_long totmem1 = totmem / 100;
7006 u_long totused1 = totused / 100;
7007 totpct = (totused1 * 100) / totmem1;
7008 }
7009 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7010 "in use)\n", totmem / 1024, totpct);
7011 MBUF_DUMP_BUF_CHK();
7012
7013 /* mbuf leak detection statistics */
7014 mleak_update_stats();
7015
7016 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7017 MBUF_DUMP_BUF_CHK();
7018 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7019 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7020 mleak_table.mleak_sample_factor);
7021 MBUF_DUMP_BUF_CHK();
7022 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7023 mleak_table.outstanding_allocs);
7024 MBUF_DUMP_BUF_CHK();
7025 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7026 mleak_table.alloc_recorded, mleak_table.trace_recorded);
7027 MBUF_DUMP_BUF_CHK();
7028 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7029 mleak_table.alloc_collisions, mleak_table.trace_collisions);
7030 MBUF_DUMP_BUF_CHK();
7031 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7032 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7033 MBUF_DUMP_BUF_CHK();
7034 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7035 mleak_table.total_conflicts);
7036 MBUF_DUMP_BUF_CHK();
7037
7038 k = snprintf(c, clen, "top %d outstanding traces:\n",
7039 mleak_stat->ml_cnt);
7040 MBUF_DUMP_BUF_CHK();
7041 for (i = 0; i < mleak_stat->ml_cnt; i++) {
7042 mltr = &mleak_stat->ml_trace[i];
7043 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7044 "%llu hit(s), %llu collision(s)\n", (i + 1),
7045 mltr->mltr_allocs, mltr->mltr_hitcount,
7046 mltr->mltr_collisions);
7047 MBUF_DUMP_BUF_CHK();
7048 }
7049
7050 if (mleak_stat->ml_isaddr64)
7051 k = snprintf(c, clen, MB_LEAK_HDR_64);
7052 else
7053 k = snprintf(c, clen, MB_LEAK_HDR_32);
7054 MBUF_DUMP_BUF_CHK();
7055
7056 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7057 int j;
7058 k = snprintf(c, clen, "%2d: ", (i + 1));
7059 MBUF_DUMP_BUF_CHK();
7060 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7061 mltr = &mleak_stat->ml_trace[j];
7062 if (i < mltr->mltr_depth) {
7063 if (mleak_stat->ml_isaddr64) {
7064 k = snprintf(c, clen, "0x%0llx ",
7065 mltr->mltr_addr[i]);
7066 } else {
7067 k = snprintf(c, clen,
7068 "0x%08x ",
7069 (u_int32_t)mltr->mltr_addr[i]);
7070 }
7071 } else {
7072 if (mleak_stat->ml_isaddr64)
7073 k = snprintf(c, clen,
7074 MB_LEAK_SPACING_64);
7075 else
7076 k = snprintf(c, clen,
7077 MB_LEAK_SPACING_32);
7078 }
7079 MBUF_DUMP_BUF_CHK();
7080 }
7081 k = snprintf(c, clen, "\n");
7082 MBUF_DUMP_BUF_CHK();
7083 }
7084 done:
7085 return (mbuf_dump_buf);
7086 }
7087
7088 #undef MBUF_DUMP_BUF_CHK
7089
7090 /*
7091 * Convert between a regular and a packet header mbuf. The caller selects the
7092 * direction via 'hdr'; this routine sets or clears M_PKTHDR and does the rest.
7093 */
7094 int
7095 m_reinit(struct mbuf *m, int hdr)
7096 {
7097 int ret = 0;
7098
7099 if (hdr) {
7100 VERIFY(!(m->m_flags & M_PKTHDR));
7101 if (!(m->m_flags & M_EXT) &&
7102 (m->m_data != m->m_dat || m->m_len > 0)) {
7103 /*
7104 * If there's no external cluster attached and the
7105 * mbuf appears to contain user data, we cannot
7106 * safely convert this to a packet header mbuf,
7107 * as the packet header structure might overlap
7108 * with the data.
7109 */
7110 printf("%s: cannot set M_PKTHDR on altered mbuf %p, "
7111 "m_data %p (expected %p), m_len %d (expected 0)\n",
7112 __func__, m, m->m_data, m->m_dat, m->m_len);
7113 ret = EBUSY;
7114 } else {
7115 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7116 m->m_flags |= M_PKTHDR;
7117 MBUF_INIT_PKTHDR(m);
7118 }
7119 } else {
7120 /* Check for scratch area overflow */
7121 m_redzone_verify(m);
7122 /* Free the aux data and tags if there is any */
7123 m_tag_delete_chain(m, NULL);
7124 m->m_flags &= ~M_PKTHDR;
7125 }
7126
7127 return (ret);
7128 }
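
/*
 * Illustrative sketch (not compiled in): converting a freshly allocated
 * plain mbuf into a packet header mbuf with m_reinit().  The conversion
 * fails with EBUSY if the mbuf already appears to hold data and has no
 * external cluster attached.  example_make_pkthdr() is not part of this
 * file.
 */
#if 0
static struct mbuf *
example_make_pkthdr(struct mbuf *m)
{
	if (m_reinit(m, 1) != 0)
		return (NULL);	/* could not convert in place */

	/* m now carries M_PKTHDR and an initialized packet header */
	return (m);
}
#endif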
7129
7130 void
7131 m_scratch_init(struct mbuf *m)
7132 {
7133 VERIFY(m->m_flags & M_PKTHDR);
7134
7135 bzero(&m->m_pkthdr.pkt_mpriv, sizeof (m->m_pkthdr.pkt_mpriv));
7136 }
7137
7138 u_int32_t
7139 m_scratch_get(struct mbuf *m, u_int8_t **p)
7140 {
7141 VERIFY(m->m_flags & M_PKTHDR);
7142
7143 if (mcltrace) {
7144 mcache_audit_t *mca;
7145
7146 lck_mtx_lock(mbuf_mlock);
7147 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7148 if (mca->mca_uflags & MB_SCVALID)
7149 mcl_audit_scratch(mca);
7150 lck_mtx_unlock(mbuf_mlock);
7151 }
7152
7153 *p = (u_int8_t *)&m->m_pkthdr.pkt_mpriv;
7154 return (sizeof (m->m_pkthdr.pkt_mpriv));
7155 }
7156
7157 static void
7158 m_redzone_init(struct mbuf *m)
7159 {
7160 VERIFY(m->m_flags & M_PKTHDR);
7161 /*
7162 * Each mbuf has a unique red zone pattern, which is an XOR
7163 * of the red zone cookie and the address of the mbuf.
7164 */
7165 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7166 }
7167
7168 static void
7169 m_redzone_verify(struct mbuf *m)
7170 {
7171 u_int32_t mb_redzone;
7172
7173 VERIFY(m->m_flags & M_PKTHDR);
7174
7175 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7176 if (m->m_pkthdr.redzone != mb_redzone) {
7177 panic("mbuf %p redzone violation with value 0x%x "
7178 "(instead of 0x%x, using cookie 0x%x)\n",
7179 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7180 /* NOTREACHED */
7181 }
7182 }
7183
7184 SYSCTL_DECL(_kern_ipc);
7185 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
7186 CTLFLAG_RD | CTLFLAG_LOCKED,
7187 0, 0, mbstat_sysctl, "S,mbstat", "");
7188 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
7189 CTLFLAG_RD | CTLFLAG_LOCKED,
7190 0, 0, mb_stat_sysctl, "S,mb_stat", "");
7191 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
7192 CTLFLAG_RD | CTLFLAG_LOCKED,
7193 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
7194 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
7195 CTLFLAG_RD | CTLFLAG_LOCKED,
7196 0, 0, mleak_table_sysctl, "S,mleak_table", "");
7197 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
7198 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
7199 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
7200 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
7201 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
7202 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");