1 /*
2 * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <dev/random/randomdev.h>
83
84 #include <kern/kern_types.h>
85 #include <kern/simple_lock.h>
86 #include <kern/queue.h>
87 #include <kern/sched_prim.h>
88 #include <kern/backtrace.h>
89 #include <kern/cpu_number.h>
90 #include <kern/zalloc.h>
91
92 #include <libkern/OSAtomic.h>
93 #include <libkern/OSDebug.h>
94 #include <libkern/libkern.h>
95
96 #include <IOKit/IOMapper.h>
97
98 #include <machine/limits.h>
99 #include <machine/machine_routines.h>
100
101 #if CONFIG_MACF_NET
102 #include <security/mac_framework.h>
103 #endif /* CONFIG_MACF_NET */
104
105 #include <sys/mcache.h>
106 #include <net/ntstat.h>
107
108 /*
109 * MBUF IMPLEMENTATION NOTES.
110 *
111 * There are a total of 5 per-CPU caches:
112 *
113 * MC_MBUF:
114 * This is a cache of rudimentary objects of MSIZE in size; each
115 * object represents an mbuf structure. This cache preserves only
116 * the m_type field of the mbuf during its transactions.
117 *
118 * MC_CL:
119 * This is a cache of rudimentary objects of MCLBYTES in size; each
120 * object represents an mcluster structure. This cache does not
121 * preserve the contents of the objects during its transactions.
122 *
123 * MC_BIGCL:
124 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
125 * object represents an mbigcluster structure. This cache does not
126 * preserve the contents of the objects during its transactions.
127 *
128 * MC_MBUF_CL:
129 * This is a cache of mbufs each having a cluster attached to it.
130 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
131 * fields of the mbuf related to the external cluster are preserved
132 * during transactions.
133 *
134 * MC_MBUF_BIGCL:
135 * This is a cache of mbufs each having a big cluster attached to it.
136 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
137 * fields of the mbuf related to the external cluster are preserved
138 * during transactions.
139 *
140 * OBJECT ALLOCATION:
141 *
142 * Allocation requests are handled first at the per-CPU (mcache) layer
143 * before falling back to the slab layer. Performance is optimal when
144 * the request is satisfied at the CPU layer because global data/lock
145 * never gets accessed. When the slab layer is entered for allocation,
146 * the slab freelist will be checked first for available objects before
147 * the VM backing store is invoked. Slab layer operations are serialized
148 * for all of the caches as the mbuf global lock is held most of the time.
149 * Allocation paths are different depending on the class of objects:
150 *
151 * a. Rudimentary object:
152 *
153 * { m_get_common(), m_clattach(), m_mclget(),
154 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
155 * composite object allocation }
156 * | ^
157 * | |
158 * | +-----------------------+
159 * v |
160 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
161 * | ^
162 * v |
163 * [CPU cache] -------> (found?) -------+
164 * | |
165 * v |
166 * mbuf_slab_alloc() |
167 * | |
168 * v |
169 * +---------> [freelist] -------> (found?) -------+
170 * | |
171 * | v
172 * | m_clalloc()
173 * | |
174 * | v
175 * +---<<---- kmem_mb_alloc()
176 *
177 * b. Composite object:
178 *
179 * { m_getpackets_internal(), m_allocpacket_internal() }
180 * | ^
181 * | |
182 * | +------ (done) ---------+
183 * v |
184 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
185 * | ^
186 * v |
187 * [CPU cache] -------> (found?) -------+
188 * | |
189 * v |
190 * mbuf_cslab_alloc() |
191 * | |
192 * v |
193 * [freelist] -------> (found?) -------+
194 * | |
195 * v |
196 * (rudimentary object) |
197 * mcache_alloc/mcache_alloc_ext() ------>>-----+
198 *
199 * Auditing notes: If auditing is enabled, buffers will be subjected to
200 * integrity checks by the audit routine. This is done by verifying their
201 * contents against DEADBEEF (free) pattern before returning them to caller.
202 * As part of this step, the routine will also record the transaction and
203 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
204 * also restore any constructed data structure fields if necessary.
205 *
206 * OBJECT DEALLOCATION:
207 *
208 * Freeing an object simply involves placing it into the CPU cache; this
209 * pollutes the cache to benefit subsequent allocations. The slab layer
210 * will only be entered if the object is to be purged out of the cache.
211 * During normal operations, this happens only when the CPU layer resizes
212 * its bucket while it's adjusting to the allocation load. Deallocation
213 * paths are different depending on the class of objects:
214 *
215 * a. Rudimentary object:
216 *
217 * { m_free(), m_freem_list(), composite object deallocation }
218 * | ^
219 * | |
220 * | +------ (done) ---------+
221 * v |
222 * mcache_free/mcache_free_ext() |
223 * | |
224 * v |
225 * mbuf_slab_audit() |
226 * | |
227 * v |
228 * [CPU cache] ---> (not purging?) -----+
229 * | |
230 * v |
231 * mbuf_slab_free() |
232 * | |
233 * v |
234 * [freelist] ----------->>------------+
235 * (objects get purged to VM only on demand)
236 *
237 * b. Composite object:
238 *
239 * { m_free(), m_freem_list() }
240 * | ^
241 * | |
242 * | +------ (done) ---------+
243 * v |
244 * mcache_free/mcache_free_ext() |
245 * | |
246 * v |
247 * mbuf_cslab_audit() |
248 * | |
249 * v |
250 * [CPU cache] ---> (not purging?) -----+
251 * | |
252 * v |
253 * mbuf_cslab_free() |
254 * | |
255 * v |
256 * [freelist] ---> (not purging?) -----+
257 * | |
258 * v |
259 * (rudimentary object) |
260 * mcache_free/mcache_free_ext() ------->>------+
261 *
262 * Auditing notes: If auditing is enabled, the audit routine will save
263 * any constructed data structure fields (if necessary) before filling the
264 * contents of the buffers with DEADBEEF (free) pattern and recording the
265 * transaction. Buffers that are freed (whether at CPU or slab layer) are
266 * expected to contain the free pattern.
267 *
268 * DEBUGGING:
269 *
270 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
271 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
272 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
273 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
274 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
275 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
276 *
277 * Each object is associated with exactly one mcache_audit_t structure that
278 * contains the information related to its last buffer transaction. Given
279 * an address of an object, the audit structure can be retrieved by finding
280 * the position of the object relevant to the base address of the cluster:
281 *
282 * +------------+ +=============+
283 * | mbuf addr | | mclaudit[i] |
284 * +------------+ +=============+
285 * | | cl_audit[0] |
286 * i = MTOBG(addr) +-------------+
287 * | +-----> | cl_audit[1] | -----> mcache_audit_t
288 * b = BGTOM(i) | +-------------+
289 * | | | ... |
290 * x = MCLIDX(b, addr) | +-------------+
291 * | | | cl_audit[7] |
292 * +-----------------+ +-------------+
293 * (e.g. x == 1)
294 *
295 * The mclaudit[] array is allocated at initialization time, but its contents
296 * get populated when the corresponding cluster is created. Because a page
297 * can be turned into NMBPG mbufs, we preserve enough space for all of the
298 * mbufs so that there is a 1-to-1 mapping between them. A page that never
299 * gets (or has not yet been) turned into mbufs will use only cl_audit[0],
300 * with the remaining entries unused. For a 16KB cluster, only one entry from
301 * the first page is allocated and used for the entire object (see the sketch below).
302 */
303
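/*
 * Illustrative sketch only (the authoritative logic lives in
 * mcl_audit_buf2mca() further below): given the address of an mbuf "m",
 * its audit structure can be looked up roughly as
 *
 *	ix  = MTOPG(m);                      page index into mclaudit[]
 *	idx = MBPAGEIDX(PGTOM(ix), m);       mbuf slot within that page
 *	mca = mclaudit[ix].cl_audit[idx];    per-object mcache_audit_t
 *
 * with CLPAGEIDX()/BCLPAGEIDX() playing the same role for 2KB and 4KB
 * clusters, respectively.
 */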
304 /* TODO: should be in header file */
305 /* kernel translater */
306 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int, kern_return_t *);
307 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
308 extern vm_map_t mb_map; /* special map */
309
310 static uint32_t mb_kmem_contig_failed;
311 static uint32_t mb_kmem_failed;
312 static uint32_t mb_kmem_one_failed;
313 /* Timestamp of allocation failures. */
314 static uint64_t mb_kmem_contig_failed_ts;
315 static uint64_t mb_kmem_failed_ts;
316 static uint64_t mb_kmem_one_failed_ts;
317 static uint64_t mb_kmem_contig_failed_size;
318 static uint64_t mb_kmem_failed_size;
319 static uint32_t mb_kmem_stats[6];
320 static const char *mb_kmem_stats_labels[] = { "INVALID_ARGUMENT",
321 "INVALID_ADDRESS",
322 "RESOURCE_SHORTAGE",
323 "NO_SPACE",
324 "KERN_FAILURE",
325 "OTHERS" };
326
327 /* Global lock */
328 decl_lck_mtx_data(static, mbuf_mlock_data);
329 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
330 static lck_attr_t *mbuf_mlock_attr;
331 static lck_grp_t *mbuf_mlock_grp;
332 static lck_grp_attr_t *mbuf_mlock_grp_attr;
333
334 /* Back-end (common) layer */
335 static uint64_t mb_expand_cnt;
336 static uint64_t mb_expand_cl_cnt;
337 static uint64_t mb_expand_cl_total;
338 static uint64_t mb_expand_bigcl_cnt;
339 static uint64_t mb_expand_bigcl_total;
340 static uint64_t mb_expand_16kcl_cnt;
341 static uint64_t mb_expand_16kcl_total;
342 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
343 static uint32_t mbuf_worker_run_cnt;
344 static uint64_t mbuf_worker_last_runtime;
345 static int mbuf_worker_ready; /* worker thread is runnable */
346 static int ncpu; /* number of CPUs */
347 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
348 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
349 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
350 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
351 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
352 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
353 static unsigned int mb_normalized; /* number of packets "normalized" */
354
355 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
356 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
357
358 typedef enum {
359 MC_MBUF = 0, /* Regular mbuf */
360 MC_CL, /* Cluster */
361 MC_BIGCL, /* Large (4KB) cluster */
362 MC_16KCL, /* Jumbo (16KB) cluster */
363 MC_MBUF_CL, /* mbuf + cluster */
364 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
365 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
366 } mbuf_class_t;
367
368 #define MBUF_CLASS_MIN MC_MBUF
369 #define MBUF_CLASS_MAX MC_MBUF_16KCL
370 #define MBUF_CLASS_LAST MC_16KCL
371 #define MBUF_CLASS_VALID(c) \
372 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
373 #define MBUF_CLASS_COMPOSITE(c) \
374 ((int)(c) > MBUF_CLASS_LAST)
375
376
377 /*
378 * mbuf specific mcache allocation request flags.
379 */
380 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
381
382 /*
383 * Per-cluster slab structure.
384 *
385 * A slab is a cluster control structure that contains one or more object
386 * chunks; the available chunks are chained in the slab's freelist (sl_head).
387 * Each time a chunk is taken out of the slab, the slab's reference count
388 * gets incremented. When all chunks have been taken out, the empty slab
389 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
390 * returned to a slab causes the slab's reference count to be decremented;
391 * it also causes the slab to be reinserted into the class's slab list, if
392 * it is not already there.
393 *
394 * Compartmentalizing the object chunks into slabs allows us to easily
395 * merge one or more slabs together when the adjacent slabs are idle, as
396 * well as to convert or move a slab from one class to another; e.g. the
397 * mbuf cluster slab can be converted to a regular cluster slab when all
398 * mbufs in the slab have been freed.
399 *
400 * A slab may also span multiple clusters for chunks larger than
401 * a cluster's size. In this case, only the slab of the first cluster is
402 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
403 * that they are part of the larger slab.
404 *
405 * Each slab controls a page of memory.
406 */
407 typedef struct mcl_slab {
408 struct mcl_slab *sl_next; /* neighboring slab */
409 u_int8_t sl_class; /* controlling mbuf class */
410 int8_t sl_refcnt; /* outstanding allocations */
411 int8_t sl_chunks; /* chunks (bufs) in this slab */
412 u_int16_t sl_flags; /* slab flags (see below) */
413 u_int16_t sl_len; /* slab length */
414 void *sl_base; /* base of allocated memory */
415 void *sl_head; /* first free buffer */
416 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
417 } mcl_slab_t;
418
419 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
420 #define SLF_PARTIAL 0x0002 /* part of another slab */
421 #define SLF_DETACHED 0x0004 /* not in slab freelist */
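/*
 * Illustrative lifecycle sketch (not the exact slab_alloc()/slab_free()
 * logic below), for a slab "sl" of some class "class":
 *
 *	alloc:	buf = sl->sl_head; sl->sl_head = next; sl->sl_refcnt++;
 *		if (sl->sl_head == NULL)	all chunks handed out
 *			slab_remove(sl, class);	slab becomes SLF_DETACHED
 *	free:	chain buf back onto sl->sl_head; sl->sl_refcnt--;
 *		if (slab_is_detached(sl))
 *			slab_insert(sl, class);	back on the slab freelist
 */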
422
423 /*
424 * The array of slabs is broken into groups of arrays, one per 1MB of
425 * kernel memory, to reduce the footprint. Each group is allocated on demand
426 * whenever a new piece of memory mapped in from the VM crosses the 1MB
427 * boundary.
428 */
429 #define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
430
431 typedef struct mcl_slabg {
432 mcl_slab_t *slg_slab; /* group of slabs */
433 } mcl_slabg_t;
434
435 /*
436 * Number of slabs needed to control a 16KB cluster object.
437 */
438 #define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
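/*
 * For illustration, assuming 4KB pages (PAGE_SHIFT == 12):
 * NSLABSPMB = (1MB >> 12) = 256 slabs per 1MB group, and
 * NSLABSP16KB = (16KB >> 12) = 4 slabs per 16KB cluster object.
 */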
439
440 /*
441 * Per-cluster audit structure.
442 */
443 typedef struct {
444 mcache_audit_t **cl_audit; /* array of audits */
445 } mcl_audit_t;
446
447 typedef struct {
448 struct thread *msa_thread; /* thread doing transaction */
449 struct thread *msa_pthread; /* previous transaction thread */
450 uint32_t msa_tstamp; /* transaction timestamp (ms) */
451 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
452 uint16_t msa_depth; /* pc stack depth */
453 uint16_t msa_pdepth; /* previous transaction pc stack */
454 void *msa_stack[MCACHE_STACK_DEPTH];
455 void *msa_pstack[MCACHE_STACK_DEPTH];
456 } mcl_scratch_audit_t;
457
458 typedef struct {
459 /*
460 * Size of data from the beginning of an mbuf that covers m_hdr,
461 * pkthdr and m_ext structures. If auditing is enabled, we allocate
462 * a shadow mbuf structure of this size inside each audit structure,
463 * and the contents of the real mbuf gets copied into it when the mbuf
464 * is freed. This allows us to pattern-fill the mbuf for integrity
465 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
466 * cluster cache case). Note that we don't save the contents of
467 * clusters when they are freed; we simply pattern-fill them.
468 */
469 u_int8_t sc_mbuf[(MSIZE - _MHLEN) + sizeof (_m_ext_t)];
470 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
471 } mcl_saved_contents_t;
472
473 #define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
474
475 #define MCA_SAVED_MBUF_PTR(_mca) \
476 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
477 (_mca)->mca_contents)->sc_mbuf)
478 #define MCA_SAVED_MBUF_SIZE \
479 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
480 #define MCA_SAVED_SCRATCH_PTR(_mca) \
481 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
482
483 /*
484 * mbuf specific mcache audit flags
485 */
486 #define MB_INUSE 0x01 /* object has not been returned to slab */
487 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
488 #define MB_SCVALID 0x04 /* object has valid saved contents */
489
490 /*
491 * Each of the following two arrays hold up to nmbclusters elements.
492 */
493 static mcl_audit_t *mclaudit; /* array of cluster audit information */
494 static unsigned int maxclaudit; /* max # of entries in audit table */
495 static mcl_slabg_t **slabstbl; /* cluster slabs table */
496 static unsigned int maxslabgrp; /* max # of entries in slabs table */
497 static unsigned int slabgrp; /* # of entries in slabs table */
498
499 /* Globals */
500 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
501 int njcl; /* # of clusters for jumbo sizes */
502 int njclbytes; /* size of a jumbo cluster */
503 unsigned char *mbutl; /* first mapped cluster address */
504 unsigned char *embutl; /* ending virtual address of mclusters */
505 int _max_linkhdr; /* largest link-level header */
506 int _max_protohdr; /* largest protocol header */
507 int max_hdr; /* largest link+protocol header */
508 int max_datalen; /* MHLEN - max_hdr */
509
510 static boolean_t mclverify; /* debug: pattern-checking */
511 static boolean_t mcltrace; /* debug: stack tracing */
512 static boolean_t mclfindleak; /* debug: leak detection */
513 static boolean_t mclexpleak; /* debug: expose leak info to user space */
514
515 static struct timeval mb_start; /* beginning of time */
516
517 /* mbuf leak detection variables */
518 static struct mleak_table mleak_table;
519 static mleak_stat_t *mleak_stat;
520
521 #define MLEAK_STAT_SIZE(n) \
522 __builtin_offsetof(mleak_stat_t, ml_trace[n])
523
524 struct mallocation {
525 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
526 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
527 u_int32_t count; /* How many objects were requested */
528 u_int64_t hitcount; /* for determining hash effectiveness */
529 };
530
531 struct mtrace {
532 u_int64_t collisions;
533 u_int64_t hitcount;
534 u_int64_t allocs;
535 u_int64_t depth;
536 uintptr_t addr[MLEAK_STACK_DEPTH];
537 };
538
539 /* Size must be a power of two for the zhash to be able to just mask off bits */
540 #define MLEAK_ALLOCATION_MAP_NUM 512
541 #define MLEAK_TRACE_MAP_NUM 256
542
543 /*
544 * Sample factor for how often to record a trace. This is overwritable
545 * by the boot-arg mleak_sample_factor.
546 */
547 #define MLEAK_SAMPLE_FACTOR 500
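/*
 * Illustration only (the variable names here are hypothetical; the real
 * check lives in mleak_logger() below): with a factor of 500, roughly one
 * allocation out of every 500 has its backtrace captured, e.g.
 *
 *	if ((num_traced_allocs++ % sample_factor) != 0)
 *		return;		recording every backtrace would be too costly
 */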
548
549 /*
550 * Number of top leakers recorded.
551 */
552 #define MLEAK_NUM_TRACES 5
553
554 #define MB_LEAK_SPACING_64 " "
555 #define MB_LEAK_SPACING_32 " "
556
557
558 #define MB_LEAK_HDR_32 "\n\
559 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
560 ---------- ---------- ---------- ---------- ---------- \n\
561 "
562
563 #define MB_LEAK_HDR_64 "\n\
564 trace [1] trace [2] trace [3] \
565 trace [4] trace [5] \n\
566 ------------------ ------------------ ------------------ \
567 ------------------ ------------------ \n\
568 "
569
570 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
571 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
572
573 /* Hashmaps of allocations and their corresponding traces */
574 static struct mallocation *mleak_allocations;
575 static struct mtrace *mleak_traces;
576 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
577
578 /* Lock to protect mleak tables from concurrent modification */
579 decl_lck_mtx_data(static, mleak_lock_data);
580 static lck_mtx_t *mleak_lock = &mleak_lock_data;
581 static lck_attr_t *mleak_lock_attr;
582 static lck_grp_t *mleak_lock_grp;
583 static lck_grp_attr_t *mleak_lock_grp_attr;
584
585 /* Lock to protect the completion callback table */
586 static lck_grp_attr_t *mbuf_tx_compl_tbl_lck_grp_attr = NULL;
587 static lck_attr_t *mbuf_tx_compl_tbl_lck_attr = NULL;
588 static lck_grp_t *mbuf_tx_compl_tbl_lck_grp = NULL;
589 decl_lck_rw_data(, mbuf_tx_compl_tbl_lck_rw_data);
590 lck_rw_t *mbuf_tx_compl_tbl_lock = &mbuf_tx_compl_tbl_lck_rw_data;
591
592 extern u_int32_t high_sb_max;
593
594 /* The minimum number of objects that are allocated, to start. */
595 #define MINCL 32
596 #define MINBIGCL (MINCL >> 1)
597 #define MIN16KCL (MINCL >> 2)
598
599 /* Low watermarks (only map in pages once free counts go below) */
600 #define MBIGCL_LOWAT MINBIGCL
601 #define M16KCL_LOWAT MIN16KCL
602
603 typedef struct {
604 mbuf_class_t mtbl_class; /* class type */
605 mcache_t *mtbl_cache; /* mcache for this buffer class */
606 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
607 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
608 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
609 u_int32_t mtbl_maxsize; /* maximum buffer size */
610 int mtbl_minlimit; /* minimum allowed */
611 int mtbl_maxlimit; /* maximum allowed */
612 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
613 uint32_t mtbl_avgtotal; /* average total on iOS */
614 u_int32_t mtbl_expand; /* worker should expand the class */
615 } mbuf_table_t;
616
617 #define m_class(c) mbuf_table[c].mtbl_class
618 #define m_cache(c) mbuf_table[c].mtbl_cache
619 #define m_slablist(c) mbuf_table[c].mtbl_slablist
620 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
621 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
622 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
623 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
624 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
625 #define m_avgtotal(c) mbuf_table[c].mtbl_avgtotal
626 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
627 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
628 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
629 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
630 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
631 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
632 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
633 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
634 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
635 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
636 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
637 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
638 #define m_peak(c) mbuf_table[c].mtbl_stats->mbcl_peak_reported
639 #define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
640 #define m_region_expand(c) mbuf_table[c].mtbl_expand
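/*
 * Example (illustrative): m_total(MC_CL) expands to
 * mbuf_table[MC_CL].mtbl_stats->mbcl_total, i.e. the number of 2KB
 * clusters currently known to the allocator, while m_maxlimit(MC_CL)
 * is the ceiling on how many may ever be created.
 */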
641
642 static mbuf_table_t mbuf_table[] = {
643 /*
644 * The caches for mbufs, regular clusters and big clusters.
645 * The average total values were based on data gathered from actual
646 * usage patterns on iOS.
647 */
648 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
649 NULL, NULL, 0, 0, 0, 0, 3000, 0 },
650 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
651 NULL, NULL, 0, 0, 0, 0, 2000, 0 },
652 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
653 NULL, NULL, 0, 0, 0, 0, 1000, 0 },
654 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
655 NULL, NULL, 0, 0, 0, 0, 200, 0 },
656 /*
657 * The following are special caches; they serve as intermediate
658 * caches backed by the above rudimentary caches. Each object
659 * in the cache is an mbuf with a cluster attached to it. Unlike
660 * the above caches, these intermediate caches do not directly
661 * deal with the slab structures; instead, the constructed
662 * cached elements are simply stored in the freelists.
663 */
664 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
665 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
666 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
667 };
668
669 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
670
671 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
672 static int mb_waiters; /* number of waiters */
673
674 boolean_t mb_peak_newreport = FALSE;
675 boolean_t mb_peak_firstreport = FALSE;
676
677 /* generate a report by default after 1 week of uptime */
678 #define MBUF_PEAK_FIRST_REPORT_THRESHOLD 604800
679
680 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
681 static struct timeval mb_wdtstart; /* watchdog start timestamp */
682 static char *mbuf_dump_buf;
683
684 #define MBUF_DUMP_BUF_SIZE 3072
685
686 /*
687 * The mbuf watchdog is enabled by default on embedded platforms. It is
688 * also toggleable via the kern.ipc.mb_watchdog sysctl.
689 * Garbage collection is also enabled by default on embedded platforms.
690 * mb_drain_maxint controls the amount of time to wait (in seconds) between
691 * consecutive calls to m_drain().
692 */
693 #if CONFIG_EMBEDDED
694 static unsigned int mb_watchdog = 1;
695 static unsigned int mb_drain_maxint = 60;
696 #else
697 static unsigned int mb_watchdog = 0;
698 static unsigned int mb_drain_maxint = 0;
699 #endif /* CONFIG_EMBEDDED */
700
701 uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
702 uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
703
704 /* Red zone */
705 static u_int32_t mb_redzone_cookie;
706 static void m_redzone_init(struct mbuf *);
707 static void m_redzone_verify(struct mbuf *m);
708
709 /* The following are used to serialize m_clalloc() */
710 static boolean_t mb_clalloc_busy;
711 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
712 static int mb_clalloc_waiters;
713
714 static void mbuf_mtypes_sync(boolean_t);
715 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
716 static void mbuf_stat_sync(void);
717 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
718 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
719 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
720 static char *mbuf_dump(void);
721 static void mbuf_table_init(void);
722 static inline void m_incref(struct mbuf *);
723 static inline u_int16_t m_decref(struct mbuf *);
724 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
725 static void mbuf_worker_thread_init(void);
726 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
727 static void slab_free(mbuf_class_t, mcache_obj_t *);
728 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
729 unsigned int, int);
730 static void mbuf_slab_free(void *, mcache_obj_t *, int);
731 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
732 static void mbuf_slab_notify(void *, u_int32_t);
733 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
734 unsigned int);
735 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
736 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
737 unsigned int, int);
738 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
739 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
740 static int freelist_populate(mbuf_class_t, unsigned int, int);
741 static void freelist_init(mbuf_class_t);
742 static boolean_t mbuf_cached_above(mbuf_class_t, int);
743 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
744 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
745 static int m_howmany(int, size_t);
746 static void mbuf_worker_thread(void);
747 static void mbuf_watchdog(void);
748 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
749
750 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
751 size_t, unsigned int);
752 static void mcl_audit_free(void *, unsigned int);
753 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
754 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
755 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
756 boolean_t);
757 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
758 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
759 static void mcl_audit_scratch(mcache_audit_t *);
760 static void mcl_audit_mcheck_panic(struct mbuf *);
761 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
762
763 static void mleak_activate(void);
764 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
765 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
766 static void mleak_free(mcache_obj_t *);
767 static void mleak_sort_traces(void);
768 static void mleak_update_stats(void);
769
770 static mcl_slab_t *slab_get(void *);
771 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
772 void *, void *, unsigned int, int, int);
773 static void slab_insert(mcl_slab_t *, mbuf_class_t);
774 static void slab_remove(mcl_slab_t *, mbuf_class_t);
775 static boolean_t slab_inrange(mcl_slab_t *, void *);
776 static void slab_nextptr_panic(mcl_slab_t *, void *);
777 static void slab_detach(mcl_slab_t *);
778 static boolean_t slab_is_detached(mcl_slab_t *);
779
780 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
781 static struct mbuf *m_split0(struct mbuf *, int, int, int);
782 __private_extern__ void mbuf_report_peak_usage(void);
783 static boolean_t mbuf_report_usage(mbuf_class_t);
784
785 /* flags for m_copyback0 */
786 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
787 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
788 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
789 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
790
791 /*
792 * This flag is set for all mbufs that come out of and into the composite
793 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
794 * are marked with such a flag have clusters attached to them, and will be
795 * treated differently when they are freed; instead of being placed back
796 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
797 * are placed back into the appropriate composite cache's freelist, and the
798 * actual freeing is deferred until the composite objects are purged. At
799 * such a time, this flag will be cleared from the mbufs and the objects
800 * will be freed into their own separate freelists.
801 */
802 #define EXTF_COMPOSITE 0x1
803
804 /*
805 * This flag indicates that the external cluster is read-only, i.e. it is
806 * or was referred to by more than one mbuf. Once set, this flag is never
807 * cleared.
808 */
809 #define EXTF_READONLY 0x2
810 /*
811 * This flag indicates that the external cluster is paired with the mbuf.
812 * Pairing implies that an external free routine is defined and will be
813 * invoked when the reference count drops to the minimum at m_free time. This
814 * flag is never cleared.
815 */
816 #define EXTF_PAIRED 0x4
817
818 #define EXTF_MASK \
819 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
820
821 #define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
822 #define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
823 #define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
824 #define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
825 #define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
826 #define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
827 #define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
828 #define MBUF_IS_COMPOSITE(m) \
829 (MEXT_REF(m) == MEXT_MINREF(m) && \
830 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
831 /*
832 * This macro can be used to test if the mbuf is paired to an external
833 * cluster. The test for MEXT_PMBUF being equal to the mbuf in question
834 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
835 * and thus survives calls to m_free_paired.
836 */
837 #define MBUF_IS_PAIRED(m) \
838 (((m)->m_flags & M_EXT) && \
839 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
840 MEXT_PMBUF(m) == (m))
841
842 /*
843 * Macros used to verify the integrity of the mbuf.
844 */
845 #define _MCHECK(m) { \
846 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
847 if (mclaudit == NULL) \
848 panic("MCHECK: m_type=%d m=%p", \
849 (u_int16_t)(m)->m_type, m); \
850 else \
851 mcl_audit_mcheck_panic(m); \
852 } \
853 }
854
855 #define MBUF_IN_MAP(addr) \
856 ((unsigned char *)(addr) >= mbutl && \
857 (unsigned char *)(addr) < embutl)
858
859 #define MRANGE(addr) { \
860 if (!MBUF_IN_MAP(addr)) \
861 panic("MRANGE: address out of range 0x%p", addr); \
862 }
863
864 /*
865 * Macro version of mtod.
866 */
867 #define MTOD(m, t) ((t)((m)->m_data))
868
869 /*
870 * Macros to obtain page index given a base cluster address
871 */
872 #define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
873 #define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
874
875 /*
876 * Macro to find the mbuf index relative to a base.
877 */
878 #define MBPAGEIDX(c, m) \
879 (((unsigned char *)(m) - (unsigned char *)(c)) >> MSIZESHIFT)
880
881 /*
882 * Same thing for 2KB cluster index.
883 */
884 #define CLPAGEIDX(c, m) \
885 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
886
887 /*
888 * Macro to find 4KB cluster index relative to a base
889 */
890 #define BCLPAGEIDX(c, m) \
891 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
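/*
 * Example (illustrative, assuming 4KB pages): for an mbuf "m" occupying
 * the third MSIZE-sized slot of its page,
 *
 *	MTOPG(m)			page index of the mbuf within the map
 *	MBPAGEIDX(PGTOM(MTOPG(m)), m)	evaluates to 2 (slots are 0-based)
 *
 * CLPAGEIDX() and BCLPAGEIDX() do the same in 2KB and 4KB cluster units.
 */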
892
893 /*
894 * Macros used during mbuf and cluster initialization.
895 */
896 #define MBUF_INIT_PKTHDR(m) { \
897 (m)->m_pkthdr.rcvif = NULL; \
898 (m)->m_pkthdr.pkt_hdr = NULL; \
899 (m)->m_pkthdr.len = 0; \
900 (m)->m_pkthdr.csum_flags = 0; \
901 (m)->m_pkthdr.csum_data = 0; \
902 (m)->m_pkthdr.vlan_tag = 0; \
903 m_classifier_init(m, 0); \
904 m_tag_init(m, 1); \
905 m_scratch_init(m); \
906 m_redzone_init(m); \
907 }
908
909 #define MBUF_INIT(m, pkthdr, type) { \
910 _MCHECK(m); \
911 (m)->m_next = (m)->m_nextpkt = NULL; \
912 (m)->m_len = 0; \
913 (m)->m_type = type; \
914 if ((pkthdr) == 0) { \
915 (m)->m_data = (m)->m_dat; \
916 (m)->m_flags = 0; \
917 } else { \
918 (m)->m_data = (m)->m_pktdat; \
919 (m)->m_flags = M_PKTHDR; \
920 MBUF_INIT_PKTHDR(m); \
921 } \
922 }
923
924 #define MEXT_INIT(m, buf, size, free, arg, rfa, min, ref, pref, flag, \
925 priv, pm) { \
926 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
927 (m)->m_flags |= M_EXT; \
928 m_set_ext((m), (rfa), (free), (arg)); \
929 (m)->m_ext.ext_size = (size); \
930 MEXT_MINREF(m) = (min); \
931 MEXT_REF(m) = (ref); \
932 MEXT_PREF(m) = (pref); \
933 MEXT_FLAGS(m) = (flag); \
934 MEXT_PRIV(m) = (priv); \
935 MEXT_PMBUF(m) = (pm); \
936 }
937
938 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
939 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
940 ref, 0, flag, 0, NULL)
941
942 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
943 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
944 ref, 0, flag, 0, NULL)
945
946 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
947 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
948 ref, 0, flag, 0, NULL)
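/*
 * Illustrative sketch (not the exact constructor code further below) of
 * how these macros combine to build an mbuf with a 2KB cluster attached,
 * where "cl" and "rfa" stand for a previously allocated cluster and its
 * reference/flags structure:
 *
 *	MBUF_INIT(m, 1, MT_DATA);		pkthdr mbuf of type MT_DATA
 *	MBUF_CL_INIT(m, cl, rfa, 1, 0);		attach "cl" with refcnt 1
 */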
949
950 /*
951 * Macro to convert BSD malloc sleep flag to mcache's
952 */
953 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
954
955 /*
956 * The structure that holds all mbuf class statistics exportable via sysctl.
957 * Similar to mbstat structure, the mb_stat structure is protected by the
958 * global mbuf lock. It contains additional information about the classes
959 * that allows for a more accurate view of the state of the allocator.
960 */
961 struct mb_stat *mb_stat;
962 struct omb_stat *omb_stat; /* For backwards compatibility */
963
964 #define MB_STAT_SIZE(n) \
965 __builtin_offsetof(mb_stat_t, mbs_class[n])
966 #define OMB_STAT_SIZE(n) \
967 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
968
969 /*
970 * The legacy structure holding all of the mbuf allocation statistics.
971 * The actual statistics used by the kernel are stored in the mbuf_table
972 * instead, and are updated atomically while the global mbuf lock is held.
973 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
974 * Unlike before, the kernel no longer relies on the contents of mbstat for
975 * its operations (e.g. cluster expansion) because the structure is exposed
976 * to the outside and could possibly be modified, which makes it unsafe.
977 * With the exception of the mbstat.m_mtypes array (see below), all of the
978 * statistics are updated as they change.
979 */
980 struct mbstat mbstat;
981
982 #define MBSTAT_MTYPES_MAX \
983 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
984
985 /*
986 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
987 * atomically and stored in a per-CPU structure which is lock-free; this is
988 * done in order to avoid writing to the global mbstat data structure which
989 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
990 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
991 * array and returned to the application. Any updates for types greater
992 * than or equal to MT_MAX are done atomically on mbstat; this slows down
993 * performance but is okay since the kernel uses only up to MT_MAX-1 while
994 * anything beyond that (up to type 255) is considered a corner case.
995 */
996 typedef struct {
997 unsigned int cpu_mtypes[MT_MAX];
998 } __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE), packed)) mtypes_cpu_t;
999
1000 typedef struct {
1001 mtypes_cpu_t mbs_cpu[1];
1002 } mbuf_mtypes_t;
1003
1004 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
1005
1006 #define MBUF_MTYPES_SIZE(n) \
1007 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
1008
1009 #define MTYPES_CPU(p) \
1010 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
1011
1012 #define mtype_stat_add(type, n) { \
1013 if ((unsigned)(type) < MT_MAX) { \
1014 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
1015 atomic_add_32(&mbs->cpu_mtypes[type], n); \
1016 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1017 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
1018 } \
1019 }
1020
1021 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1022 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
1023 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
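/*
 * Typical usage (illustrative): when an mbuf changes type, the per-CPU
 * counters are adjusted in pairs, e.g.
 *
 *	mtype_stat_dec(MT_FREE);
 *	mtype_stat_inc(MT_DATA);
 *
 * which avoids touching the shared mbstat structure for types < MT_MAX.
 */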
1024
1025 static void
1026 mbuf_mtypes_sync(boolean_t locked)
1027 {
1028 int m, n;
1029 mtypes_cpu_t mtc;
1030
1031 if (locked)
1032 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1033
1034 bzero(&mtc, sizeof (mtc));
1035 for (m = 0; m < ncpu; m++) {
1036 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
1037 mtypes_cpu_t temp;
1038
1039 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
1040 sizeof (temp.cpu_mtypes));
1041
1042 for (n = 0; n < MT_MAX; n++)
1043 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
1044 }
1045 if (!locked)
1046 lck_mtx_lock(mbuf_mlock);
1047 for (n = 0; n < MT_MAX; n++)
1048 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1049 if (!locked)
1050 lck_mtx_unlock(mbuf_mlock);
1051 }
1052
1053 static int
1054 mbstat_sysctl SYSCTL_HANDLER_ARGS
1055 {
1056 #pragma unused(oidp, arg1, arg2)
1057 mbuf_mtypes_sync(FALSE);
1058
1059 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
1060 }
1061
1062 static void
1063 mbuf_stat_sync(void)
1064 {
1065 mb_class_stat_t *sp;
1066 mcache_cpu_t *ccp;
1067 mcache_t *cp;
1068 int k, m, bktsize;
1069
1070 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1071
1072 for (k = 0; k < NELEM(mbuf_table); k++) {
1073 cp = m_cache(k);
1074 ccp = &cp->mc_cpu[0];
1075 bktsize = ccp->cc_bktsize;
1076 sp = mbuf_table[k].mtbl_stats;
1077
1078 if (cp->mc_flags & MCF_NOCPUCACHE)
1079 sp->mbcl_mc_state = MCS_DISABLED;
1080 else if (cp->mc_purge_cnt > 0)
1081 sp->mbcl_mc_state = MCS_PURGING;
1082 else if (bktsize == 0)
1083 sp->mbcl_mc_state = MCS_OFFLINE;
1084 else
1085 sp->mbcl_mc_state = MCS_ONLINE;
1086
1087 sp->mbcl_mc_cached = 0;
1088 for (m = 0; m < ncpu; m++) {
1089 ccp = &cp->mc_cpu[m];
1090 if (ccp->cc_objs > 0)
1091 sp->mbcl_mc_cached += ccp->cc_objs;
1092 if (ccp->cc_pobjs > 0)
1093 sp->mbcl_mc_cached += ccp->cc_pobjs;
1094 }
1095 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1096 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1097 sp->mbcl_infree;
1098
1099 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1100 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1101 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1102
1103 /* Calculate total count specific to each class */
1104 sp->mbcl_ctotal = sp->mbcl_total;
1105 switch (m_class(k)) {
1106 case MC_MBUF:
1107 /* Deduct mbufs used in composite caches */
1108 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
1109 m_total(MC_MBUF_BIGCL));
1110 break;
1111
1112 case MC_CL:
1113 /* Deduct clusters used in composite cache */
1114 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1115 break;
1116
1117 case MC_BIGCL:
1118 /* Deduct clusters used in composite cache */
1119 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1120 break;
1121
1122 case MC_16KCL:
1123 /* Deduct clusters used in composite cache */
1124 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1125 break;
1126
1127 default:
1128 break;
1129 }
1130 }
1131 }
1132
1133 static int
1134 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1135 {
1136 #pragma unused(oidp, arg1, arg2)
1137 void *statp;
1138 int k, statsz, proc64 = proc_is64bit(req->p);
1139
1140 lck_mtx_lock(mbuf_mlock);
1141 mbuf_stat_sync();
1142
1143 if (!proc64) {
1144 struct omb_class_stat *oc;
1145 struct mb_class_stat *c;
1146
1147 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1148 oc = &omb_stat->mbs_class[0];
1149 c = &mb_stat->mbs_class[0];
1150 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1151 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1152 "%s", c->mbcl_cname);
1153 oc->mbcl_size = c->mbcl_size;
1154 oc->mbcl_total = c->mbcl_total;
1155 oc->mbcl_active = c->mbcl_active;
1156 oc->mbcl_infree = c->mbcl_infree;
1157 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1158 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1159 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1160 oc->mbcl_notified = c->mbcl_notified;
1161 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1162 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1163 oc->mbcl_ctotal = c->mbcl_ctotal;
1164 oc->mbcl_release_cnt = c->mbcl_release_cnt;
1165 oc->mbcl_mc_state = c->mbcl_mc_state;
1166 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1167 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1168 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1169 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1170 }
1171 statp = omb_stat;
1172 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1173 } else {
1174 statp = mb_stat;
1175 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1176 }
1177
1178 lck_mtx_unlock(mbuf_mlock);
1179
1180 return (SYSCTL_OUT(req, statp, statsz));
1181 }
1182
1183 static int
1184 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1185 {
1186 #pragma unused(oidp, arg1, arg2)
1187 int i;
1188
1189 /* Ensure leak tracing turned on */
1190 if (!mclfindleak || !mclexpleak)
1191 return (ENXIO);
1192
1193 lck_mtx_lock(mleak_lock);
1194 mleak_update_stats();
1195 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1196 lck_mtx_unlock(mleak_lock);
1197
1198 return (i);
1199 }
1200
1201 static int
1202 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1203 {
1204 #pragma unused(oidp, arg1, arg2)
1205 int i = 0;
1206
1207 /* Ensure leak tracing turned on */
1208 if (!mclfindleak || !mclexpleak)
1209 return (ENXIO);
1210
1211 lck_mtx_lock(mleak_lock);
1212 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1213 lck_mtx_unlock(mleak_lock);
1214
1215 return (i);
1216 }
1217
1218 static inline void
1219 m_incref(struct mbuf *m)
1220 {
1221 UInt16 old, new;
1222 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1223
1224 do {
1225 old = *addr;
1226 new = old + 1;
1227 ASSERT(new != 0);
1228 } while (!OSCompareAndSwap16(old, new, addr));
1229
1230 /*
1231 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1232 * we don't clear the flag when the refcount goes back to the
1233 * minimum, to simplify code calling m_mclhasreference().
1234 */
1235 if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY))
1236 (void) OSBitOrAtomic16(EXTF_READONLY, &MEXT_FLAGS(m));
1237 }
1238
1239 static inline u_int16_t
1240 m_decref(struct mbuf *m)
1241 {
1242 UInt16 old, new;
1243 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_REF(m);
1244
1245 do {
1246 old = *addr;
1247 new = old - 1;
1248 ASSERT(old != 0);
1249 } while (!OSCompareAndSwap16(old, new, addr));
1250
1251 return (new);
1252 }
1253
1254 static void
1255 mbuf_table_init(void)
1256 {
1257 unsigned int b, c, s;
1258 int m, config_mbuf_jumbo = 0;
1259
1260 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1261 M_TEMP, M_WAITOK | M_ZERO);
1262 VERIFY(omb_stat != NULL);
1263
1264 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1265 M_TEMP, M_WAITOK | M_ZERO);
1266 VERIFY(mb_stat != NULL);
1267
1268 mb_stat->mbs_cnt = NELEM(mbuf_table);
1269 for (m = 0; m < NELEM(mbuf_table); m++)
1270 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1271
1272 #if CONFIG_MBUF_JUMBO
1273 config_mbuf_jumbo = 1;
1274 #endif /* CONFIG_MBUF_JUMBO */
1275
1276 if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1277 /*
1278 * Set aside 1/3 of the mbuf cluster map for jumbo
1279 * clusters; we do this only on platforms where jumbo
1280 * cluster pool is enabled.
1281 */
1282 njcl = nmbclusters / 3;
1283 njclbytes = M16KCLBYTES;
1284 }
1285
1286 /*
1287 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1288 * a multiple of 4KB clusters.
1289 */
1290 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1291 if (njcl > 0) {
1292 /*
1293 * Each jumbo cluster takes 8 2KB clusters, so make
1294 * sure that the pool size is evenly divisible by 8;
1295 * njcl is in 2KB unit, hence treated as such.
1296 */
1297 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1298
1299 /* Update nclusters with rounded down value of njcl */
1300 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1301 }
1302
1303 /*
1304 * njcl is valid only on platforms with 16KB jumbo clusters or
1305 * with 16KB pages, where it is configured to 1/3 of the pool
1306 * size. On these platforms, the remainder is used for 2KB
1307 * and 4KB clusters. On platforms without 16KB jumbo clusters,
1308 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
1309 * cluster can be split either into 16 mbufs or into 2 2KB
1310 * clusters.
1311 *
1312 * +---+---+------------ ... -----------+------- ... -------+
1313 * | c | b | s | njcl |
1314 * +---+---+------------ ... -----------+------- ... -------+
1315 *
1316 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1317 * clusters (1/64th each.)
1318 */
1319 c = P2ROUNDDOWN((nclusters >> 6), NCLPG); /* in 2KB unit */
1320 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1321 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
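/*
 * Worked example (illustrative, assuming 4KB pages, no jumbo pool and
 * nmbclusters = 32768, i.e. a 64MB map):
 *
 *	nclusters = 32768
 *	c = 32768 >> 6		  =   512	(2KB clusters, 1MB)
 *	b = 32768 >> 7		  =   256	(4KB clusters, 1MB)
 *	s = 32768 - (512 + 512)	  = 31744	(in 2KB units, ~62MB)
 */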
1322
1323 /*
1324 * 1/64th (c) is reserved for 2KB clusters.
1325 */
1326 m_minlimit(MC_CL) = c;
1327 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1328 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1329 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1330
1331 /*
1332 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1333 * It cannot be turned into 2KB clusters or mbufs.
1334 */
1335 m_minlimit(MC_BIGCL) = b;
1336 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1337 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1338 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1339
1340 /*
1341 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1342 */
1343 m_minlimit(MC_MBUF) = 0;
1344 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1345 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1346 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1347
1348 /*
1349 * Set limits for the composite classes.
1350 */
1351 m_minlimit(MC_MBUF_CL) = 0;
1352 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1353 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1354 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1355 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1356
1357 m_minlimit(MC_MBUF_BIGCL) = 0;
1358 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1359 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1360 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1361 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1362
1363 /*
1364 * And for jumbo classes.
1365 */
1366 m_minlimit(MC_16KCL) = 0;
1367 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1368 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1369 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1370
1371 m_minlimit(MC_MBUF_16KCL) = 0;
1372 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1373 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1374 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1375 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1376
1377 /*
1378 * Initialize the legacy mbstat structure.
1379 */
1380 bzero(&mbstat, sizeof (mbstat));
1381 mbstat.m_msize = m_maxsize(MC_MBUF);
1382 mbstat.m_mclbytes = m_maxsize(MC_CL);
1383 mbstat.m_minclsize = MINCLSIZE;
1384 mbstat.m_mlen = MLEN;
1385 mbstat.m_mhlen = MHLEN;
1386 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1387 }
1388
1389 #if defined(__LP64__)
1390 typedef struct ncl_tbl {
1391 uint64_t nt_maxmem; /* memory (sane) size */
1392 uint32_t nt_mbpool; /* mbuf pool size */
1393 } ncl_tbl_t;
1394
1395 /* Non-server */
1396 static ncl_tbl_t ncl_table[] = {
1397 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1398 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1399 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1400 { 0, 0 }
1401 };
1402
1403 /* Server */
1404 static ncl_tbl_t ncl_table_srv[] = {
1405 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1406 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1407 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1408 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1409 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1410 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1411 { 0, 0 }
1412 };
1413 #endif /* __LP64__ */
1414
1415 __private_extern__ unsigned int
1416 mbuf_default_ncl(int server, uint64_t mem)
1417 {
1418 #if !defined(__LP64__)
1419 #pragma unused(server)
1420 unsigned int n;
1421 /*
1422 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1423 */
1424 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1425 n = 32768;
1426 #else
1427 unsigned int n, i;
1428 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1429 /*
1430 * 64-bit kernel (mbuf pool size based on table).
1431 */
1432 n = tbl[0].nt_mbpool;
1433 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1434 if (mem < tbl[i].nt_maxmem)
1435 break;
1436 n = tbl[i].nt_mbpool;
1437 }
1438 n >>= MCLSHIFT;
1439 #endif /* !__LP64__ */
1440 return (n);
1441 }
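/*
 * Worked example for the table lookup above (illustrative): a non-server
 * 64-bit system with 8GB of memory ends up with the 96MB pool entry, and
 * 96MB >> MCLSHIFT (2KB clusters) yields n = 49152 clusters.
 */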
1442
1443 __private_extern__ void
1444 mbinit(void)
1445 {
1446 unsigned int m;
1447 unsigned int initmcl = 0;
1448 void *buf;
1449 thread_t thread = THREAD_NULL;
1450
1451 microuptime(&mb_start);
1452
1453 /*
1454 * These MBUF_ values must be equal to their private counterparts.
1455 */
1456 _CASSERT(MBUF_EXT == M_EXT);
1457 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1458 _CASSERT(MBUF_EOR == M_EOR);
1459 _CASSERT(MBUF_LOOP == M_LOOP);
1460 _CASSERT(MBUF_BCAST == M_BCAST);
1461 _CASSERT(MBUF_MCAST == M_MCAST);
1462 _CASSERT(MBUF_FRAG == M_FRAG);
1463 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1464 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1465 _CASSERT(MBUF_PROMISC == M_PROMISC);
1466 _CASSERT(MBUF_HASFCS == M_HASFCS);
1467
1468 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1469 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1470 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1471 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1472 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1473 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1474 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1475 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1476 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1477 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1478 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1479 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1480 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1481 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1482 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1483
1484 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1485 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1486 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
1487 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1488 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
1489 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1490 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1491 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1492 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1493 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1494 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1495 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1496 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1497 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1498
1499 _CASSERT(MBUF_WAITOK == M_WAIT);
1500 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1501 _CASSERT(MBUF_COPYALL == M_COPYALL);
1502
1503 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1504 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1505 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1506 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1507 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1508 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1509 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1510 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1511 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1512 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1513
1514 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1515 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1516 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1517 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1518
1519 /* Module specific scratch space (32-bit alignment requirement) */
1520 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
1521 sizeof (uint32_t)));
1522
1523 /* Initialize random red zone cookie value */
1524 _CASSERT(sizeof (mb_redzone_cookie) ==
1525 sizeof (((struct pkthdr *)0)->redzone));
1526 read_random(&mb_redzone_cookie, sizeof (mb_redzone_cookie));
1527 read_random(&mb_obscure_extref, sizeof (mb_obscure_extref));
1528 read_random(&mb_obscure_extfree, sizeof (mb_obscure_extfree));
1529 mb_obscure_extref |= 0x3;
1530 mb_obscure_extfree |= 0x3;
1531
1532 /* Make sure we don't save more than we should */
1533 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof (struct mbuf));
1534
1535 if (nmbclusters == 0)
1536 nmbclusters = NMBCLUSTERS;
1537
1538 /* This should be a sane (at least even) value by now */
1539 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1540
1541 /* Setup the mbuf table */
1542 mbuf_table_init();
1543
1544 /* Global lock for common layer */
1545 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1546 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1547 mbuf_mlock_attr = lck_attr_alloc_init();
1548 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1549
1550 /*
1551 * Allocate cluster slabs table:
1552 *
1553 * maxslabgrp = (N * 2048) / (1024 * 1024)
1554 *
1555 * Where N is nmbclusters rounded up to the nearest 512. This yields
1556 * mcl_slabg_t units, each one representing 1 MB of memory.
1557 */
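/*
 * Worked example (assuming 2KB clusters, MCLBYTES == 2048): with
 * nmbclusters = 32768, N = P2ROUNDUP(32768, 512) = 32768, so
 * maxslabgrp = (32768 * 2048) / (1024 * 1024) = 64 slab groups,
 * i.e. one mcl_slabg_t per MB of a 64MB cluster map.
 */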
1558 maxslabgrp =
1559 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
1560 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1561 M_TEMP, M_WAITOK | M_ZERO);
1562 VERIFY(slabstbl != NULL);
1563
1564 /*
1565 * Allocate audit structures, if needed:
1566 *
1567 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
1568 *
1569 * This yields mcl_audit_t units, each one representing a page.
1570 */
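/*
 * Worked example: continuing the 64-slab-group case above with 4KB
 * pages (PAGE_SHIFT == 12), maxclaudit = (64 << 20) >> 12 = 16384
 * audit entries, one per page of the cluster map; they are only
 * allocated when MCF_DEBUG is set below.
 */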
1571 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1572 mbuf_debug |= mcache_getflags();
1573 if (mbuf_debug & MCF_DEBUG) {
1574 int l;
1575 mcl_audit_t *mclad;
1576 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
1577 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1578 M_TEMP, M_WAITOK | M_ZERO);
1579 VERIFY(mclaudit != NULL);
1580 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
1581 MALLOC(mclad[l].cl_audit, mcache_audit_t **,
1582 NMBPG * sizeof(mcache_audit_t *),
1583 M_TEMP, M_WAITOK | M_ZERO);
1584 VERIFY(mclad[l].cl_audit != NULL);
1585 }
1586
1587 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1588 AUDIT_CONTENTS_SIZE, sizeof (u_int64_t), 0, MCR_SLEEP);
1589 VERIFY(mcl_audit_con_cache != NULL);
1590 }
1591 mclverify = (mbuf_debug & MCF_VERIFY);
1592 mcltrace = (mbuf_debug & MCF_TRACE);
1593 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1594 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1595
1596 /* Enable mbuf leak logging, with a lock to protect the tables */
1597
1598 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1599 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1600 mleak_lock_attr = lck_attr_alloc_init();
1601 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1602
1603 mleak_activate();
1604
1605 /*
1606 * Allocate structure for per-CPU statistics that's aligned
1607 * on the CPU cache boundary; this code assumes that we never
1608 * uninitialize this framework, since the original address
1609 * before alignment is not saved.
1610 */
1611 ncpu = ml_get_max_cpus();
1612 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_LINE_SIZE,
1613 M_TEMP, M_WAITOK);
1614 VERIFY(buf != NULL);
1615
1616 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf,
1617 CPU_CACHE_LINE_SIZE);
1618 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1619
1620 /* Calculate the number of pages assigned to the cluster pool */
1621 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
1622 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1623 M_TEMP, M_WAITOK);
1624 VERIFY(mcl_paddr != NULL);
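/*
 * Worked example: with nmbclusters = 32768 and 4KB pages, mcl_pages =
 * (32768 << 11) / 4096 = 16384, i.e. one ppnum_t entry per page of
 * the 64MB cluster region.
 */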
1625
1626 /* Register with the I/O Bus mapper */
1627 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1628 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1629
1630 embutl = (mbutl + (nmbclusters * MCLBYTES));
1631 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
1632
1633 /* Prime up the freelist */
1634 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1635 if (initmcl != 0) {
1636 initmcl >>= NCLPBGSHIFT; /* convert to 4KB cluster units */
1637 if (initmcl > m_maxlimit(MC_BIGCL))
1638 initmcl = m_maxlimit(MC_BIGCL);
1639 }
1640 if (initmcl < m_minlimit(MC_BIGCL))
1641 initmcl = m_minlimit(MC_BIGCL);
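/*
 * Example (assuming 2KB clusters and 4KB big clusters, so
 * NCLPBGSHIFT == 1): a boot-arg of initmcl=4096 requests 4096 2KB
 * clusters, which becomes 4096 >> 1 = 2048 4KB units and is then
 * clamped to the [m_minlimit(MC_BIGCL), m_maxlimit(MC_BIGCL)] range
 * before the freelist is primed below.
 */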
1642
1643 lck_mtx_lock(mbuf_mlock);
1644
1645 /*
1646 * For classes with non-zero minimum limits, populate their freelists
1647 * so that m_total(class) is at least m_minlimit(class).
1648 */
1649 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1650 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1651 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1652 freelist_init(m_class(MC_CL));
1653
1654 for (m = 0; m < NELEM(mbuf_table); m++) {
1655 /* Make sure we didn't miss any */
1656 VERIFY(m_minlimit(m_class(m)) == 0 ||
1657 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1658
1659 /* populate the initial sizes and report from there on */
1660 m_peak(m_class(m)) = m_total(m_class(m));
1661 }
1662 mb_peak_newreport = FALSE;
1663
1664 lck_mtx_unlock(mbuf_mlock);
1665
1666 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1667 NULL, &thread);
1668 thread_deallocate(thread);
1669
1670 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1671 0, 0, MCR_SLEEP);
1672
1673 /* Create the cache for each class */
1674 for (m = 0; m < NELEM(mbuf_table); m++) {
1675 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1676 u_int32_t flags;
1677
1678 flags = mbuf_debug;
1679 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1680 m_class(m) == MC_MBUF_16KCL) {
1681 allocfunc = mbuf_cslab_alloc;
1682 freefunc = mbuf_cslab_free;
1683 auditfunc = mbuf_cslab_audit;
1684 logfunc = mleak_logger;
1685 } else {
1686 allocfunc = mbuf_slab_alloc;
1687 freefunc = mbuf_slab_free;
1688 auditfunc = mbuf_slab_audit;
1689 logfunc = mleak_logger;
1690 }
1691
1692 /*
1693 * Disable per-CPU caches for jumbo classes if there
1694 * is no jumbo cluster pool available in the system.
1695 * The cache itself is still created (but will never
1696 * be populated) since it simplifies the code.
1697 */
1698 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1699 njcl == 0)
1700 flags |= MCF_NOCPUCACHE;
1701
1702 if (!mclfindleak)
1703 flags |= MCF_NOLEAKLOG;
1704
1705 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1706 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1707 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1708 }
1709
1710 /*
1711 * Set the upper limit on sb_max to 1/16th of the size of the
1712 * memory allocated for mbuf clusters.
1713 */
1714 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1715 if (high_sb_max < sb_max) {
1716 /* sb_max is too large for this configuration, scale it down */
1717 if (high_sb_max > (1 << MBSHIFT)) {
1718 /* We have at least 16 MB of mbuf pool */
1719 sb_max = high_sb_max;
1720 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1721 /*
1722 * If we have more than 1 MB of mbuf pool, cap the size of
1723 * the maximum socket buffer at 1 MB
1724 */
1725 sb_max = high_sb_max = (1 << MBSHIFT);
1726 } else {
1727 sb_max = high_sb_max;
1728 }
1729 }
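/*
 * Worked example: with a 64MB cluster pool (nmbclusters = 32768),
 * high_sb_max = 64MB >> 4 = 4MB.  If the configured sb_max is larger
 * than that, the first branch above applies (a pool over 16MB implies
 * high_sb_max > 1MB) and sb_max is scaled down to 4MB.
 */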
1730
1731 /* allocate space for mbuf_dump_buf */
1732 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1733 VERIFY(mbuf_dump_buf != NULL);
1734
1735 if (mbuf_debug & MCF_DEBUG) {
1736 printf("%s: MLEN %d, MHLEN %d\n", __func__,
1737 (int)_MLEN, (int)_MHLEN);
1738 }
1739
1740 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
1741 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1742 (nclusters << MCLSHIFT) >> MBSHIFT,
1743 (njcl << MCLSHIFT) >> MBSHIFT);
1744
1745 /* initialize the lock for the tx completion callback table */
1746 mbuf_tx_compl_tbl_lck_grp_attr = lck_grp_attr_alloc_init();
1747 if (mbuf_tx_compl_tbl_lck_grp_attr == NULL) {
1748 panic("%s: lck_grp_attr_alloc_init failed", __func__);
1749 /* NOTREACHED */
1750 }
1751 mbuf_tx_compl_tbl_lck_grp = lck_grp_alloc_init("mbuf_tx_compl_tbl",
1752 mbuf_tx_compl_tbl_lck_grp_attr);
1753 if (mbuf_tx_compl_tbl_lck_grp == NULL) {
1754 panic("%s: lck_grp_alloc_init failed", __func__);
1755 /* NOTREACHED */
1756 }
1757 mbuf_tx_compl_tbl_lck_attr = lck_attr_alloc_init();
1758 if (mbuf_tx_compl_tbl_lck_attr == NULL) {
1759 panic("%s: lck_attr_alloc_init failed", __func__);
1760 /* NOTREACHED */
1761 }
1762 lck_rw_init(mbuf_tx_compl_tbl_lock, mbuf_tx_compl_tbl_lck_grp,
1763 mbuf_tx_compl_tbl_lck_attr);
1764
1765 }
1766
1767 /*
1768 * Obtain a slab of object(s) from the class's freelist.
1769 */
1770 static mcache_obj_t *
1771 slab_alloc(mbuf_class_t class, int wait)
1772 {
1773 mcl_slab_t *sp;
1774 mcache_obj_t *buf;
1775
1776 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1777
1778 /* This should always be NULL for us */
1779 VERIFY(m_cobjlist(class) == NULL);
1780
1781 /*
1782 * Treat composite objects as having a longer lifespan by using
1783 * a slab from the reverse direction, in the hope that this could
1784 * reduce the probability of fragmentation for slabs that hold
1785 * more than one buffer chunk (e.g. mbuf slabs). For other
1786 * slabs, this probably doesn't make much of a difference.
1787 */
1788 if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
1789 && (wait & MCR_COMP))
1790 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1791 else
1792 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1793
1794 if (sp == NULL) {
1795 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1796 /* The slab list for this class is empty */
1797 return (NULL);
1798 }
1799
1800 VERIFY(m_infree(class) > 0);
1801 VERIFY(!slab_is_detached(sp));
1802 VERIFY(sp->sl_class == class &&
1803 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1804 buf = sp->sl_head;
1805 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1806 sp->sl_head = buf->obj_next;
1807 /* Increment slab reference */
1808 sp->sl_refcnt++;
1809
1810 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
1811
1812 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1813 slab_nextptr_panic(sp, sp->sl_head);
1814 /* In case sl_head is in the map but not in the slab */
1815 VERIFY(slab_inrange(sp, sp->sl_head));
1816 /* NOTREACHED */
1817 }
1818
1819 if (mclaudit != NULL) {
1820 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1821 mca->mca_uflags = 0;
1822 /* Save contents on mbuf objects only */
1823 if (class == MC_MBUF)
1824 mca->mca_uflags |= MB_SCVALID;
1825 }
1826
1827 if (class == MC_CL) {
1828 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1829 /*
1830 * A 2K cluster slab can have at most NCLPG references.
1831 */
1832 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
1833 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1834 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
1835 } else if (class == MC_BIGCL) {
1836 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1837 m_infree(MC_MBUF_BIGCL);
1838 /*
1839 * A 4K cluster slab can have NBCLPG references.
1840 */
1841 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
1842 sp->sl_len == PAGE_SIZE &&
1843 (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
1844 } else if (class == MC_16KCL) {
1845 mcl_slab_t *nsp;
1846 int k;
1847
1848 --m_infree(MC_16KCL);
1849 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1850 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1851 /*
1852 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1853 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1854 * most 1 reference.
1855 */
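/*
 * Example: with 4KB pages, NSLABSP16KB == 16KB/4KB == 4, so the
 * loop below bumps the refcnt of slabs 2..4; each shares sl_base
 * with the first slab and carries SLF_PARTIAL.
 */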
1856 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1857 nsp = nsp->sl_next;
1858 /* Next slab must already be present */
1859 VERIFY(nsp != NULL);
1860 nsp->sl_refcnt++;
1861 VERIFY(!slab_is_detached(nsp));
1862 VERIFY(nsp->sl_class == MC_16KCL &&
1863 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1864 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1865 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1866 nsp->sl_head == NULL);
1867 }
1868 } else {
1869 VERIFY(class == MC_MBUF);
1870 --m_infree(MC_MBUF);
1871 /*
1872 * If auditing is turned on, this check is
1873 * deferred until later in mbuf_slab_audit().
1874 */
1875 if (mclaudit == NULL)
1876 _MCHECK((struct mbuf *)buf);
1877 /*
1878 * Since we have incremented the reference count above,
1879 * an mbuf slab (formerly a 4KB cluster slab that was cut
1880 * up into mbufs) must have a reference count between 1
1881 * and NMBPG at this point.
1882 */
1883 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
1884 sp->sl_chunks == NMBPG &&
1885 sp->sl_len == PAGE_SIZE);
1886 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
1887 }
1888
1889 /* If empty, remove this slab from the class's freelist */
1890 if (sp->sl_head == NULL) {
1891 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
1892 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
1893 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
1894 slab_remove(sp, class);
1895 }
1896
1897 return (buf);
1898 }
1899
1900 /*
1901 * Place a slab of object(s) back into a class's slab list.
1902 */
1903 static void
1904 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1905 {
1906 mcl_slab_t *sp;
1907 boolean_t reinit_supercl = false;
1908 mbuf_class_t super_class;
1909
1910 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1911
1912 VERIFY(class != MC_16KCL || njcl > 0);
1913 VERIFY(buf->obj_next == NULL);
1914
1915 /*
1916 * Synchronize with m_clalloc, since it reads m_total while we
1917 * are modifying it here.
1918 */
1919 while (mb_clalloc_busy) {
1920 mb_clalloc_waiters++;
1921 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
1922 (PZERO-1), "m_clalloc", NULL);
1923 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1924 }
1925
1926 /* We are busy now; tell everyone else to go away */
1927 mb_clalloc_busy = TRUE;
1928
1929 sp = slab_get(buf);
1930 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1931 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1932
1933 /* Decrement slab reference */
1934 sp->sl_refcnt--;
1935
1936 if (class == MC_CL) {
1937 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1938 /*
1939 * A slab that has been split for 2KB clusters can have
1940 * at most 1 outstanding reference at this point.
1941 */
1942 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
1943 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
1944 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
1945 (slab_is_detached(sp) && sp->sl_head == NULL));
1946 } else if (class == MC_BIGCL) {
1947 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1948
1949 /* A 4KB cluster slab can have NBCLPG references at most */
1950 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
1951 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
1952 (slab_is_detached(sp) && sp->sl_head == NULL));
1953 } else if (class == MC_16KCL) {
1954 mcl_slab_t *nsp;
1955 int k;
1956 /*
1957 * A 16KB cluster takes NSLABSP16KB slabs, all of which must
1958 * now have 0 references.
1959 */
1960 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
1961 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1962 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1963 VERIFY(slab_is_detached(sp));
1964 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1965 nsp = nsp->sl_next;
1966 /* Next slab must already be present */
1967 VERIFY(nsp != NULL);
1968 nsp->sl_refcnt--;
1969 VERIFY(slab_is_detached(nsp));
1970 VERIFY(nsp->sl_class == MC_16KCL &&
1971 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1972 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1973 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1974 nsp->sl_head == NULL);
1975 }
1976 } else {
1977 /*
1978 * A slab that has been split for mbufs has at most
1979 * NMBPG references. Since we have decremented
1980 * one reference above, it must now be between 0 and
1981 * NMBPG-1.
1982 */
1983 VERIFY(class == MC_MBUF);
1984 VERIFY(sp->sl_refcnt >= 0 &&
1985 sp->sl_refcnt <= (NMBPG - 1) &&
1986 sp->sl_chunks == NMBPG &&
1987 sp->sl_len == PAGE_SIZE);
1988 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
1989 (slab_is_detached(sp) && sp->sl_head == NULL));
1990 }
1991
1992 /*
1993 * When auditing is enabled, ensure that the buffer still
1994 * contains the free pattern. Otherwise it got corrupted
1995 * while at the CPU cache layer.
1996 */
1997 if (mclaudit != NULL) {
1998 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1999 if (mclverify) {
2000 mcache_audit_free_verify(mca, buf, 0,
2001 m_maxsize(class));
2002 }
2003 mca->mca_uflags &= ~MB_SCVALID;
2004 }
2005
2006 if (class == MC_CL) {
2007 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2008 buf->obj_next = sp->sl_head;
2009 } else if (class == MC_BIGCL) {
2010 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2011 m_infree(MC_MBUF_BIGCL);
2012 buf->obj_next = sp->sl_head;
2013 } else if (class == MC_16KCL) {
2014 ++m_infree(MC_16KCL);
2015 } else {
2016 ++m_infree(MC_MBUF);
2017 buf->obj_next = sp->sl_head;
2018 }
2019 sp->sl_head = buf;
2020
2021 /*
2022 * If a slab has been split into either 2KB clusters or mbufs,
2023 * turn it back into one which holds a single 4KB or 16KB
2024 * cluster, depending on the page size.
2025 */
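/*
 * Per-page object counts, assuming 4KB pages and 256-byte mbufs:
 * NMBPG == 16 mbufs, NCLPG == 2 2KB clusters, NBCLPG == 1 4KB
 * cluster.  A split slab is coalesced back into the super class
 * below only once its refcnt drops to 0 and the class can spare
 * the objects without dipping under m_minlimit().
 */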
2026 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2027 super_class = MC_BIGCL;
2028 } else {
2029 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2030 super_class = MC_16KCL;
2031 }
2032 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2033 m_total(class) >= (m_minlimit(class) + NMBPG) &&
2034 m_total(super_class) < m_maxlimit(super_class)) {
2035 int i = NMBPG;
2036
2037 m_total(MC_MBUF) -= NMBPG;
2038 mbstat.m_mbufs = m_total(MC_MBUF);
2039 m_infree(MC_MBUF) -= NMBPG;
2040 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2041
2042 while (i--) {
2043 struct mbuf *m = sp->sl_head;
2044 VERIFY(m != NULL);
2045 sp->sl_head = m->m_next;
2046 m->m_next = NULL;
2047 }
2048 reinit_supercl = true;
2049 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2050 m_total(class) >= (m_minlimit(class) + NCLPG) &&
2051 m_total(super_class) < m_maxlimit(super_class)) {
2052 int i = NCLPG;
2053
2054 m_total(MC_CL) -= NCLPG;
2055 mbstat.m_clusters = m_total(MC_CL);
2056 m_infree(MC_CL) -= NCLPG;
2057
2058 while (i--) {
2059 union mcluster *c = sp->sl_head;
2060 VERIFY(c != NULL);
2061 sp->sl_head = c->mcl_next;
2062 c->mcl_next = NULL;
2063 }
2064 reinit_supercl = true;
2065 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2066 sp->sl_refcnt == 0 &&
2067 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2068 m_total(super_class) < m_maxlimit(super_class)) {
2069 int i = NBCLPG;
2070
2071 VERIFY(super_class == MC_16KCL);
2072 m_total(MC_BIGCL) -= NBCLPG;
2073 mbstat.m_bigclusters = m_total(MC_BIGCL);
2074 m_infree(MC_BIGCL) -= NBCLPG;
2075
2076 while (i--) {
2077 union mbigcluster *bc = sp->sl_head;
2078 VERIFY(bc != NULL);
2079 sp->sl_head = bc->mbc_next;
2080 bc->mbc_next = NULL;
2081 }
2082 reinit_supercl = true;
2083 }
2084
2085 if (reinit_supercl) {
2086 VERIFY(sp->sl_head == NULL);
2087 VERIFY(m_total(class) >= m_minlimit(class));
2088 slab_remove(sp, class);
2089
2090 /* Reinitialize it as a cluster for the super class */
2091 m_total(super_class)++;
2092 m_infree(super_class)++;
2093 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2094 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2095
2096 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2097 sp->sl_base, PAGE_SIZE, 0, 1);
2098 if (mclverify)
2099 mcache_set_pattern(MCACHE_FREE_PATTERN,
2100 (caddr_t)sp->sl_base, sp->sl_len);
2101 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2102
2103 if (super_class == MC_BIGCL) {
2104 mbstat.m_bigclusters = m_total(MC_BIGCL);
2105 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2106 m_infree(MC_MBUF_BIGCL);
2107 }
2108
2109 VERIFY(slab_is_detached(sp));
2110 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2111
2112 /* And finally switch class */
2113 class = super_class;
2114 }
2115
2116 /* Reinsert the slab to the class's slab list */
2117 if (slab_is_detached(sp))
2118 slab_insert(sp, class);
2119
2120 /* We're done; let others enter */
2121 mb_clalloc_busy = FALSE;
2122 if (mb_clalloc_waiters > 0) {
2123 mb_clalloc_waiters = 0;
2124 wakeup(mb_clalloc_waitchan);
2125 }
2126 }
2127
2128 /*
2129 * Common allocator for rudimentary objects called by the CPU cache layer
2130 * during an allocation request whenever there is no available element in the
2131 * bucket layer. It returns one or more elements from the appropriate global
2132 * freelist. If the freelist is empty, it will attempt to populate it and
2133 * retry the allocation.
2134 */
2135 static unsigned int
2136 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2137 {
2138 mbuf_class_t class = (mbuf_class_t)arg;
2139 unsigned int need = num;
2140 mcache_obj_t **list = *plist;
2141
2142 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2143 ASSERT(need > 0);
2144
2145 lck_mtx_lock(mbuf_mlock);
2146
2147 for (;;) {
2148 if ((*list = slab_alloc(class, wait)) != NULL) {
2149 (*list)->obj_next = NULL;
2150 list = *plist = &(*list)->obj_next;
2151
2152 if (--need == 0) {
2153 /*
2154 * If the number of elements in the freelist has
2155 * dropped below the low watermark (1/32 of the total),
2156 * asynchronously populate the freelist now rather
2157 * than doing it later when we run out of elements.
2158 */
2159 if (!mbuf_cached_above(class, wait) &&
2160 m_infree(class) < (m_total(class) >> 5)) {
2161 (void) freelist_populate(class, 1,
2162 M_DONTWAIT);
2163 }
2164 break;
2165 }
2166 } else {
2167 VERIFY(m_infree(class) == 0 || class == MC_CL);
2168
2169 (void) freelist_populate(class, 1,
2170 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2171
2172 if (m_infree(class) > 0)
2173 continue;
2174
2175 /* Check if there's anything at the cache layer */
2176 if (mbuf_cached_above(class, wait))
2177 break;
2178
2179 /* watchdog checkpoint */
2180 mbuf_watchdog();
2181
2182 /* We have nothing and cannot block; give up */
2183 if (wait & MCR_NOSLEEP) {
2184 if (!(wait & MCR_TRYHARD)) {
2185 m_fail_cnt(class)++;
2186 mbstat.m_drops++;
2187 break;
2188 }
2189 }
2190
2191 /*
2192 * If the freelist is still empty and the caller is
2193 * willing to be blocked, sleep on the wait channel
2194 * until an element is available. Otherwise, if
2195 * MCR_TRYHARD is set, do our best to satisfy the
2196 * request without having to go to sleep.
2197 */
2198 if (mbuf_worker_ready &&
2199 mbuf_sleep(class, need, wait))
2200 break;
2201
2202 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2203 }
2204 }
2205
2206 m_alloc_cnt(class) += num - need;
2207 lck_mtx_unlock(mbuf_mlock);
2208
2209 return (num - need);
2210 }
2211
2212 /*
2213 * Common de-allocator for rudimentary objects called by the CPU cache
2214 * layer when one or more elements need to be returned to the appropriate
2215 * global freelist.
2216 */
2217 static void
2218 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2219 {
2220 mbuf_class_t class = (mbuf_class_t)arg;
2221 mcache_obj_t *nlist;
2222 unsigned int num = 0;
2223 int w;
2224
2225 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2226
2227 lck_mtx_lock(mbuf_mlock);
2228
2229 for (;;) {
2230 nlist = list->obj_next;
2231 list->obj_next = NULL;
2232 slab_free(class, list);
2233 ++num;
2234 if ((list = nlist) == NULL)
2235 break;
2236 }
2237 m_free_cnt(class) += num;
2238
2239 if ((w = mb_waiters) > 0)
2240 mb_waiters = 0;
2241
2242 lck_mtx_unlock(mbuf_mlock);
2243
2244 if (w != 0)
2245 wakeup(mb_waitchan);
2246 }
2247
2248 /*
2249 * Common auditor for rudimentary objects called by the CPU cache layer
2250 * during an allocation or free request. For the former, this is called
2251 * after the objects are obtained from either the bucket or slab layer
2252 * and before they are returned to the caller. For the latter, this is
2253 * called immediately during free and before placing the objects into
2254 * the bucket or slab layer.
2255 */
2256 static void
2257 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2258 {
2259 mbuf_class_t class = (mbuf_class_t)arg;
2260 mcache_audit_t *mca;
2261
2262 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2263
2264 while (list != NULL) {
2265 lck_mtx_lock(mbuf_mlock);
2266 mca = mcl_audit_buf2mca(class, list);
2267
2268 /* Do the sanity checks */
2269 if (class == MC_MBUF) {
2270 mcl_audit_mbuf(mca, list, FALSE, alloc);
2271 ASSERT(mca->mca_uflags & MB_SCVALID);
2272 } else {
2273 mcl_audit_cluster(mca, list, m_maxsize(class),
2274 alloc, TRUE);
2275 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2276 }
2277 /* Record this transaction */
2278 if (mcltrace)
2279 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
2280
2281 if (alloc)
2282 mca->mca_uflags |= MB_INUSE;
2283 else
2284 mca->mca_uflags &= ~MB_INUSE;
2285 /* Unpair the object (unconditionally) */
2286 mca->mca_uptr = NULL;
2287 lck_mtx_unlock(mbuf_mlock);
2288
2289 list = list->obj_next;
2290 }
2291 }
2292
2293 /*
2294 * Common notify routine for all caches. It is called by mcache when
2295 * one or more objects get freed. We use this indication to trigger
2296 * the wakeup of any sleeping threads so that they can retry their
2297 * allocation requests.
2298 */
2299 static void
2300 mbuf_slab_notify(void *arg, u_int32_t reason)
2301 {
2302 mbuf_class_t class = (mbuf_class_t)arg;
2303 int w;
2304
2305 ASSERT(MBUF_CLASS_VALID(class));
2306
2307 if (reason != MCN_RETRYALLOC)
2308 return;
2309
2310 lck_mtx_lock(mbuf_mlock);
2311 if ((w = mb_waiters) > 0) {
2312 m_notified(class)++;
2313 mb_waiters = 0;
2314 }
2315 lck_mtx_unlock(mbuf_mlock);
2316
2317 if (w != 0)
2318 wakeup(mb_waitchan);
2319 }
2320
2321 /*
2322 * Obtain object(s) from the composite class's freelist.
2323 */
2324 static unsigned int
2325 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2326 {
2327 unsigned int need = num;
2328 mcl_slab_t *sp, *clsp, *nsp;
2329 struct mbuf *m;
2330 mcache_obj_t **list = *plist;
2331 void *cl;
2332
2333 VERIFY(need > 0);
2334 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2335 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2336
2337 /* Get what we can from the freelist */
2338 while ((*list = m_cobjlist(class)) != NULL) {
2339 MRANGE(*list);
2340
2341 m = (struct mbuf *)*list;
2342 sp = slab_get(m);
2343 cl = m->m_ext.ext_buf;
2344 clsp = slab_get(cl);
2345 VERIFY(m->m_flags == M_EXT && cl != NULL);
2346 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
2347
2348 if (class == MC_MBUF_CL) {
2349 VERIFY(clsp->sl_refcnt >= 1 &&
2350 clsp->sl_refcnt <= NCLPG);
2351 } else {
2352 VERIFY(clsp->sl_refcnt >= 1 &&
2353 clsp->sl_refcnt <= NBCLPG);
2354 }
2355
2356 if (class == MC_MBUF_16KCL) {
2357 int k;
2358 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2359 nsp = nsp->sl_next;
2360 /* Next slab must already be present */
2361 VERIFY(nsp != NULL);
2362 VERIFY(nsp->sl_refcnt == 1);
2363 }
2364 }
2365
2366 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2367 !MBUF_IN_MAP(m_cobjlist(class))) {
2368 slab_nextptr_panic(sp, m_cobjlist(class));
2369 /* NOTREACHED */
2370 }
2371 (*list)->obj_next = NULL;
2372 list = *plist = &(*list)->obj_next;
2373
2374 if (--need == 0)
2375 break;
2376 }
2377 m_infree(class) -= (num - need);
2378
2379 return (num - need);
2380 }
2381
2382 /*
2383 * Place object(s) back into a composite class's freelist.
2384 */
2385 static unsigned int
2386 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2387 {
2388 mcache_obj_t *o, *tail;
2389 unsigned int num = 0;
2390 struct mbuf *m, *ms;
2391 mcache_audit_t *mca = NULL;
2392 mcache_obj_t *ref_list = NULL;
2393 mcl_slab_t *clsp, *nsp;
2394 void *cl;
2395 mbuf_class_t cl_class;
2396
2397 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2398 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2399 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2400
2401 if (class == MC_MBUF_CL) {
2402 cl_class = MC_CL;
2403 } else if (class == MC_MBUF_BIGCL) {
2404 cl_class = MC_BIGCL;
2405 } else {
2406 VERIFY(class == MC_MBUF_16KCL);
2407 cl_class = MC_16KCL;
2408 }
2409
2410 o = tail = list;
2411
2412 while ((m = ms = (struct mbuf *)o) != NULL) {
2413 mcache_obj_t *rfa, *nexto = o->obj_next;
2414
2415 /* Do the mbuf sanity checks */
2416 if (mclaudit != NULL) {
2417 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2418 if (mclverify) {
2419 mcache_audit_free_verify(mca, m, 0,
2420 m_maxsize(MC_MBUF));
2421 }
2422 ms = MCA_SAVED_MBUF_PTR(mca);
2423 }
2424
2425 /* Do the cluster sanity checks */
2426 cl = ms->m_ext.ext_buf;
2427 clsp = slab_get(cl);
2428 if (mclverify) {
2429 size_t size = m_maxsize(cl_class);
2430 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2431 (mcache_obj_t *)cl), cl, 0, size);
2432 }
2433 VERIFY(ms->m_type == MT_FREE);
2434 VERIFY(ms->m_flags == M_EXT);
2435 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2436 if (cl_class == MC_CL) {
2437 VERIFY(clsp->sl_refcnt >= 1 &&
2438 clsp->sl_refcnt <= NCLPG);
2439 } else {
2440 VERIFY(clsp->sl_refcnt >= 1 &&
2441 clsp->sl_refcnt <= NBCLPG);
2442 }
2443 if (cl_class == MC_16KCL) {
2444 int k;
2445 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2446 nsp = nsp->sl_next;
2447 /* Next slab must already be present */
2448 VERIFY(nsp != NULL);
2449 VERIFY(nsp->sl_refcnt == 1);
2450 }
2451 }
2452
2453 /*
2454 * If we're asked to purge, restore the actual mbuf using
2455 * contents of the shadow structure (if auditing is enabled)
2456 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2457 * about to free it and the attached cluster into their caches.
2458 */
2459 if (purged) {
2460 /* Restore constructed mbuf fields */
2461 if (mclaudit != NULL)
2462 mcl_audit_restore_mbuf(m, mca, TRUE);
2463
2464 MEXT_MINREF(m) = 0;
2465 MEXT_REF(m) = 0;
2466 MEXT_PREF(m) = 0;
2467 MEXT_FLAGS(m) = 0;
2468 MEXT_PRIV(m) = 0;
2469 MEXT_PMBUF(m) = NULL;
2470 MEXT_TOKEN(m) = 0;
2471
2472 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
2473 m_set_ext(m, NULL, NULL, NULL);
2474 rfa->obj_next = ref_list;
2475 ref_list = rfa;
2476
2477 m->m_type = MT_FREE;
2478 m->m_flags = m->m_len = 0;
2479 m->m_next = m->m_nextpkt = NULL;
2480
2481 /* Save mbuf fields and make auditing happy */
2482 if (mclaudit != NULL)
2483 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2484
2485 VERIFY(m_total(class) > 0);
2486 m_total(class)--;
2487
2488 /* Free the mbuf */
2489 o->obj_next = NULL;
2490 slab_free(MC_MBUF, o);
2491
2492 /* And free the cluster */
2493 ((mcache_obj_t *)cl)->obj_next = NULL;
2494 if (class == MC_MBUF_CL)
2495 slab_free(MC_CL, cl);
2496 else if (class == MC_MBUF_BIGCL)
2497 slab_free(MC_BIGCL, cl);
2498 else
2499 slab_free(MC_16KCL, cl);
2500 }
2501
2502 ++num;
2503 tail = o;
2504 o = nexto;
2505 }
2506
2507 if (!purged) {
2508 tail->obj_next = m_cobjlist(class);
2509 m_cobjlist(class) = list;
2510 m_infree(class) += num;
2511 } else if (ref_list != NULL) {
2512 mcache_free_ext(ref_cache, ref_list);
2513 }
2514
2515 return (num);
2516 }
2517
2518 /*
2519 * Common allocator for composite objects called by the CPU cache layer
2520 * during an allocation request whenever there is no available element in
2521 * the bucket layer. It returns one or more composite elements from the
2522 * appropriate global freelist. If the freelist is empty, it will attempt
2523 * to obtain the rudimentary objects from their caches and construct them
2524 * into composite mbuf + cluster objects.
2525 */
2526 static unsigned int
2527 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2528 int wait)
2529 {
2530 mbuf_class_t class = (mbuf_class_t)arg;
2531 mbuf_class_t cl_class = 0;
2532 unsigned int num = 0, cnum = 0, want = needed;
2533 mcache_obj_t *ref_list = NULL;
2534 mcache_obj_t *mp_list = NULL;
2535 mcache_obj_t *clp_list = NULL;
2536 mcache_obj_t **list;
2537 struct ext_ref *rfa;
2538 struct mbuf *m;
2539 void *cl;
2540
2541 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2542 ASSERT(needed > 0);
2543
2544 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2545
2546 /* There should not be any slab for this class */
2547 VERIFY(m_slab_cnt(class) == 0 &&
2548 m_slablist(class).tqh_first == NULL &&
2549 m_slablist(class).tqh_last == NULL);
2550
2551 lck_mtx_lock(mbuf_mlock);
2552
2553 /* Try using the freelist first */
2554 num = cslab_alloc(class, plist, needed);
2555 list = *plist;
2556 if (num == needed) {
2557 m_alloc_cnt(class) += num;
2558 lck_mtx_unlock(mbuf_mlock);
2559 return (needed);
2560 }
2561
2562 lck_mtx_unlock(mbuf_mlock);
2563
2564 /*
2565 * We could not satisfy the request using the freelist alone;
2566 * allocate from the appropriate rudimentary caches and use
2567 * whatever we can get to construct the composite objects.
2568 */
2569 needed -= num;
2570
2571 /*
2572 * Mark these allocation requests as coming from a composite cache.
2573 * Also, if the caller is willing to be blocked, mark the request
2574 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2575 * slab layer waiting for the individual object when one or more
2576 * of the already-constructed composite objects are available.
2577 */
2578 wait |= MCR_COMP;
2579 if (!(wait & MCR_NOSLEEP))
2580 wait |= MCR_FAILOK;
2581
2582 /* allocate mbufs */
2583 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2584 if (needed == 0) {
2585 ASSERT(mp_list == NULL);
2586 goto fail;
2587 }
2588
2589 /* allocate clusters */
2590 if (class == MC_MBUF_CL) {
2591 cl_class = MC_CL;
2592 } else if (class == MC_MBUF_BIGCL) {
2593 cl_class = MC_BIGCL;
2594 } else {
2595 VERIFY(class == MC_MBUF_16KCL);
2596 cl_class = MC_16KCL;
2597 }
2598 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2599 if (needed == 0) {
2600 ASSERT(clp_list == NULL);
2601 goto fail;
2602 }
2603
2604 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2605 if (needed == 0) {
2606 ASSERT(ref_list == NULL);
2607 goto fail;
2608 }
2609
2610 /*
2611 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2612 * leftovers will get freed accordingly before we return to the caller.
2613 */
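/*
 * Hypothetical example: if 8 composites were still needed and the
 * three mcache_alloc_ext() calls above returned 8 mbufs, then 5
 * clusters, then 5 ext_ref structures, "needed" ends up as 5 and
 * the 3 spare mbufs are released in the fail: path below.
 */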
2614 for (cnum = 0; cnum < needed; cnum++) {
2615 struct mbuf *ms;
2616
2617 m = ms = (struct mbuf *)mp_list;
2618 mp_list = mp_list->obj_next;
2619
2620 cl = clp_list;
2621 clp_list = clp_list->obj_next;
2622 ((mcache_obj_t *)cl)->obj_next = NULL;
2623
2624 rfa = (struct ext_ref *)ref_list;
2625 ref_list = ref_list->obj_next;
2626 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2627
2628 /*
2629 * If auditing is enabled, construct the shadow mbuf
2630 * in the audit structure instead of in the actual one.
2631 * mbuf_cslab_audit() will take care of restoring the
2632 * contents after the integrity check.
2633 */
2634 if (mclaudit != NULL) {
2635 mcache_audit_t *mca, *cl_mca;
2636
2637 lck_mtx_lock(mbuf_mlock);
2638 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2639 ms = MCA_SAVED_MBUF_PTR(mca);
2640 cl_mca = mcl_audit_buf2mca(cl_class,
2641 (mcache_obj_t *)cl);
2642
2643 /*
2644 * Pair them up. Note that this is done at the time
2645 * the mbuf+cluster objects are constructed. This
2646 * information should be treated as a "best effort"
2647 * debugging hint, since more than one mbuf can refer
2648 * to a cluster. In that case, the cluster might not
2649 * be freed along with the mbuf it was paired with.
2650 */
2651 mca->mca_uptr = cl_mca;
2652 cl_mca->mca_uptr = mca;
2653
2654 ASSERT(mca->mca_uflags & MB_SCVALID);
2655 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2656 lck_mtx_unlock(mbuf_mlock);
2657
2658 /* Technically, they are in the freelist */
2659 if (mclverify) {
2660 size_t size;
2661
2662 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2663 m_maxsize(MC_MBUF));
2664
2665 if (class == MC_MBUF_CL)
2666 size = m_maxsize(MC_CL);
2667 else if (class == MC_MBUF_BIGCL)
2668 size = m_maxsize(MC_BIGCL);
2669 else
2670 size = m_maxsize(MC_16KCL);
2671
2672 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2673 size);
2674 }
2675 }
2676
2677 MBUF_INIT(ms, 0, MT_FREE);
2678 if (class == MC_MBUF_16KCL) {
2679 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2680 } else if (class == MC_MBUF_BIGCL) {
2681 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2682 } else {
2683 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2684 }
2685 VERIFY(ms->m_flags == M_EXT);
2686 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2687
2688 *list = (mcache_obj_t *)m;
2689 (*list)->obj_next = NULL;
2690 list = *plist = &(*list)->obj_next;
2691 }
2692
2693 fail:
2694 /*
2695 * Free up what's left of the above.
2696 */
2697 if (mp_list != NULL)
2698 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2699 if (clp_list != NULL)
2700 mcache_free_ext(m_cache(cl_class), clp_list);
2701 if (ref_list != NULL)
2702 mcache_free_ext(ref_cache, ref_list);
2703
2704 lck_mtx_lock(mbuf_mlock);
2705 if (num > 0 || cnum > 0) {
2706 m_total(class) += cnum;
2707 VERIFY(m_total(class) <= m_maxlimit(class));
2708 m_alloc_cnt(class) += num + cnum;
2709 }
2710 if ((num + cnum) < want)
2711 m_fail_cnt(class) += (want - (num + cnum));
2712 lck_mtx_unlock(mbuf_mlock);
2713
2714 return (num + cnum);
2715 }
2716
2717 /*
2718 * Common de-allocator for composite objects called by the CPU cache
2719 * layer when one or more elements need to be returned to the appropriate
2720 * global freelist.
2721 */
2722 static void
2723 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2724 {
2725 mbuf_class_t class = (mbuf_class_t)arg;
2726 unsigned int num;
2727 int w;
2728
2729 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2730
2731 lck_mtx_lock(mbuf_mlock);
2732
2733 num = cslab_free(class, list, purged);
2734 m_free_cnt(class) += num;
2735
2736 if ((w = mb_waiters) > 0)
2737 mb_waiters = 0;
2738
2739 lck_mtx_unlock(mbuf_mlock);
2740
2741 if (w != 0)
2742 wakeup(mb_waitchan);
2743 }
2744
2745 /*
2746 * Common auditor for composite objects called by the CPU cache layer
2747 * during an allocation or free request. For the former, this is called
2748 * after the objects are obtained from either the bucket or slab layer
2749 * and before they are returned to the caller. For the latter, this is
2750 * called immediately during free and before placing the objects into
2751 * the bucket or slab layer.
2752 */
2753 static void
2754 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2755 {
2756 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
2757 mcache_audit_t *mca;
2758 struct mbuf *m, *ms;
2759 mcl_slab_t *clsp, *nsp;
2760 size_t cl_size;
2761 void *cl;
2762
2763 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2764 if (class == MC_MBUF_CL)
2765 cl_class = MC_CL;
2766 else if (class == MC_MBUF_BIGCL)
2767 cl_class = MC_BIGCL;
2768 else
2769 cl_class = MC_16KCL;
2770 cl_size = m_maxsize(cl_class);
2771
2772 while ((m = ms = (struct mbuf *)list) != NULL) {
2773 lck_mtx_lock(mbuf_mlock);
2774 /* Do the mbuf sanity checks and record its transaction */
2775 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2776 mcl_audit_mbuf(mca, m, TRUE, alloc);
2777 if (mcltrace)
2778 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
2779
2780 if (alloc)
2781 mca->mca_uflags |= MB_COMP_INUSE;
2782 else
2783 mca->mca_uflags &= ~MB_COMP_INUSE;
2784
2785 /*
2786 * Use the shadow mbuf in the audit structure if we are
2787 * freeing, since the contents of the actual mbuf have been
2788 * pattern-filled by the above call to mcl_audit_mbuf().
2789 */
2790 if (!alloc && mclverify)
2791 ms = MCA_SAVED_MBUF_PTR(mca);
2792
2793 /* Do the cluster sanity checks and record its transaction */
2794 cl = ms->m_ext.ext_buf;
2795 clsp = slab_get(cl);
2796 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2797 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2798 if (class == MC_MBUF_CL)
2799 VERIFY(clsp->sl_refcnt >= 1 &&
2800 clsp->sl_refcnt <= NCLPG);
2801 else
2802 VERIFY(clsp->sl_refcnt >= 1 &&
2803 clsp->sl_refcnt <= NBCLPG);
2804
2805 if (class == MC_MBUF_16KCL) {
2806 int k;
2807 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2808 nsp = nsp->sl_next;
2809 /* Next slab must already be present */
2810 VERIFY(nsp != NULL);
2811 VERIFY(nsp->sl_refcnt == 1);
2812 }
2813 }
2814
2815
2816 mca = mcl_audit_buf2mca(cl_class, cl);
2817 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
2818 if (mcltrace)
2819 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
2820
2821 if (alloc)
2822 mca->mca_uflags |= MB_COMP_INUSE;
2823 else
2824 mca->mca_uflags &= ~MB_COMP_INUSE;
2825 lck_mtx_unlock(mbuf_mlock);
2826
2827 list = list->obj_next;
2828 }
2829 }
2830
2831 static void
2832 m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
2833 uint64_t alloc_size, kern_return_t error)
2834 {
2835
2836 *cnt = *cnt + 1;
2837 *ts = net_uptime();
2838 if (size) {
2839 *size = alloc_size;
2840 }
2841 _CASSERT(sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]) ==
2842 sizeof(mb_kmem_stats_labels) / sizeof(mb_kmem_stats_labels[0]));
2843 switch (error) {
2844 case KERN_SUCCESS:
2845 break;
2846 case KERN_INVALID_ARGUMENT:
2847 mb_kmem_stats[0]++;
2848 break;
2849 case KERN_INVALID_ADDRESS:
2850 mb_kmem_stats[1]++;
2851 break;
2852 case KERN_RESOURCE_SHORTAGE:
2853 mb_kmem_stats[2]++;
2854 break;
2855 case KERN_NO_SPACE:
2856 mb_kmem_stats[3]++;
2857 break;
2858 case KERN_FAILURE:
2859 mb_kmem_stats[4]++;
2860 break;
2861 default:
2862 mb_kmem_stats[5]++;
2863 break;
2864 }
2865 }
2866
2867 /*
2868 * Allocate some number of mbuf clusters and place on cluster freelist.
2869 */
2870 static int
2871 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2872 {
2873 int i, count = 0;
2874 vm_size_t size = 0;
2875 int numpages = 0, large_buffer;
2876 vm_offset_t page = 0;
2877 mcache_audit_t *mca_list = NULL;
2878 mcache_obj_t *con_list = NULL;
2879 mcl_slab_t *sp;
2880 mbuf_class_t class;
2881 kern_return_t error;
2882
2883 /* Set if a single buffer allocation requires multiple pages */
2884 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
2885 PAGE_SIZE < M16KCLBYTES);
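/*
 * e.g. with 4KB pages and 16KB jumbo clusters, a 16KB request
 * spans 4 pages, so large_buffer makes the first kmem_mb_alloc()
 * attempt below ask for physically contiguous memory.
 */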
2886 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2887 bufsize == m_maxsize(MC_16KCL));
2888
2889 VERIFY((bufsize == PAGE_SIZE) ||
2890 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
2891
2892 if (bufsize == m_size(MC_BIGCL))
2893 class = MC_BIGCL;
2894 else
2895 class = MC_16KCL;
2896
2897 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2898
2899 /*
2900 * Multiple threads may attempt to populate the cluster map one
2901 * after another. Since we drop the lock below prior to acquiring
2902 * the physical page(s), our view of the cluster map may no longer
2903 * be accurate, and we could end up over-committing the pages beyond
2904 * the maximum allowed for each class. To prevent it, this entire
2905 * operation (including the page mapping) is serialized.
2906 */
2907 while (mb_clalloc_busy) {
2908 mb_clalloc_waiters++;
2909 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2910 (PZERO-1), "m_clalloc", NULL);
2911 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2912 }
2913
2914 /* We are busy now; tell everyone else to go away */
2915 mb_clalloc_busy = TRUE;
2916
2917 /*
2918 * Honor the caller's wish to block or not block. We have a way
2919 * to grow the pool asynchronously using the mbuf worker thread.
2920 */
2921 i = m_howmany(num, bufsize);
2922 if (i <= 0 || (wait & M_DONTWAIT))
2923 goto out;
2924
2925 lck_mtx_unlock(mbuf_mlock);
2926
2927 size = round_page(i * bufsize);
2928 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
2929
2930 /*
2931 * If we did ask for "n" 16KB physically contiguous chunks
2932 * and didn't get them, try again without the contiguity
2933 * restriction.
2934 */
2935 net_update_uptime();
2936 if (large_buffer && page == 0) {
2937 m_vm_error_stats(&mb_kmem_contig_failed,
2938 &mb_kmem_contig_failed_ts,
2939 &mb_kmem_contig_failed_size,
2940 size, error);
2941 page = kmem_mb_alloc(mb_map, size, 0, &error);
2942 }
2943
2944 if (page == 0) {
2945 m_vm_error_stats(&mb_kmem_failed,
2946 &mb_kmem_failed_ts,
2947 &mb_kmem_failed_size,
2948 size, error);
2949 #if PAGE_SIZE == 4096
2950 if (bufsize == m_maxsize(MC_BIGCL)) {
2951 #else
2952 if (bufsize >= m_maxsize(MC_BIGCL)) {
2953 #endif
2954 /* Try for 1 page if failed */
2955 size = PAGE_SIZE;
2956 page = kmem_mb_alloc(mb_map, size, 0, &error);
2957 }
2958
2959 if (page == 0) {
2960 m_vm_error_stats(&mb_kmem_one_failed,
2961 &mb_kmem_one_failed_ts,
2962 NULL, size, error);
2963 lck_mtx_lock(mbuf_mlock);
2964 goto out;
2965 }
2966 }
2967
2968 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
2969 numpages = size / PAGE_SIZE;
2970
2971 /* If auditing is enabled, allocate the audit structures now */
2972 if (mclaudit != NULL) {
2973 int needed;
2974
2975 /*
2976 * Yes, I realize this is a waste of memory for clusters
2977 * that never get transformed into mbufs, as we may end
2978 * up with NMBPG-1 unused audit structures per cluster.
2979 * But doing so tremendously simplifies the allocation
2980 * strategy, since at this point we are not holding the
2981 * mbuf lock and the caller is okay to be blocked.
2982 */
2983 if (bufsize == PAGE_SIZE) {
2984 needed = numpages * NMBPG;
2985
2986 i = mcache_alloc_ext(mcl_audit_con_cache,
2987 &con_list, needed, MCR_SLEEP);
2988
2989 VERIFY(con_list != NULL && i == needed);
2990 } else {
2991 /*
2992 * If multiple 4K pages are being used for a
2993 * 16K cluster, only one audit structure is needed per cluster.
2994 */
2995 needed = numpages / NSLABSP16KB;
2996 }
2997
2998 i = mcache_alloc_ext(mcache_audit_cache,
2999 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3000
3001 VERIFY(mca_list != NULL && i == needed);
3002 }
3003
3004 lck_mtx_lock(mbuf_mlock);
3005
3006 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3007 ppnum_t offset =
3008 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3009 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3010
3011 /*
3012 * If there is a mapper, the appropriate I/O page is
3013 * returned; zero out the page to discard its past
3014 * contents to prevent exposing leftover kernel memory.
3015 */
3016 VERIFY(offset < mcl_pages);
3017 if (mcl_paddr_base != 0) {
3018 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3019 new_page = IOMapperInsertPage(mcl_paddr_base,
3020 offset, new_page);
3021 }
3022 mcl_paddr[offset] = new_page;
3023
3024 /* Pattern-fill this fresh page */
3025 if (mclverify) {
3026 mcache_set_pattern(MCACHE_FREE_PATTERN,
3027 (caddr_t)page, PAGE_SIZE);
3028 }
3029 if (bufsize == PAGE_SIZE) {
3030 mcache_obj_t *buf;
3031 /* One for the entire page */
3032 sp = slab_get((void *)page);
3033 if (mclaudit != NULL) {
3034 mcl_audit_init((void *)page,
3035 &mca_list, &con_list,
3036 AUDIT_CONTENTS_SIZE, NMBPG);
3037 }
3038 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3039 slab_init(sp, class, SLF_MAPPED, (void *)page,
3040 (void *)page, PAGE_SIZE, 0, 1);
3041 buf = (mcache_obj_t *)page;
3042 buf->obj_next = NULL;
3043
3044 /* Insert this slab */
3045 slab_insert(sp, class);
3046
3047 /* Update stats now since slab_get drops the lock */
3048 ++m_infree(class);
3049 ++m_total(class);
3050 VERIFY(m_total(class) <= m_maxlimit(class));
3051 if (class == MC_BIGCL) {
3052 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3053 m_infree(MC_MBUF_BIGCL);
3054 mbstat.m_bigclusters = m_total(MC_BIGCL);
3055 }
3056 ++count;
3057 } else if ((bufsize > PAGE_SIZE) &&
3058 (i % NSLABSP16KB) == 0) {
3059 union m16kcluster *m16kcl = (union m16kcluster *)page;
3060 mcl_slab_t *nsp;
3061 int k;
3062
3063 /* One for the entire 16KB */
3064 sp = slab_get(m16kcl);
3065 if (mclaudit != NULL)
3066 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3067
3068 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3069 slab_init(sp, MC_16KCL, SLF_MAPPED,
3070 m16kcl, m16kcl, bufsize, 0, 1);
3071 m16kcl->m16kcl_next = NULL;
3072
3073 /*
3074 * 2nd-Nth page's slab is part of the first one,
3075 * where N is NSLABSP16KB.
3076 */
3077 for (k = 1; k < NSLABSP16KB; k++) {
3078 nsp = slab_get(((union mbigcluster *)page) + k);
3079 VERIFY(nsp->sl_refcnt == 0 &&
3080 nsp->sl_flags == 0);
3081 slab_init(nsp, MC_16KCL,
3082 SLF_MAPPED | SLF_PARTIAL,
3083 m16kcl, NULL, 0, 0, 0);
3084 }
3085 /* Insert this slab */
3086 slab_insert(sp, MC_16KCL);
3087
3088 /* Update stats now since slab_get drops the lock */
3089 ++m_infree(MC_16KCL);
3090 ++m_total(MC_16KCL);
3091 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3092 ++count;
3093 }
3094 }
3095 VERIFY(mca_list == NULL && con_list == NULL);
3096
3097 if (!mb_peak_newreport && mbuf_report_usage(class))
3098 mb_peak_newreport = TRUE;
3099
3100 /* We're done; let others enter */
3101 mb_clalloc_busy = FALSE;
3102 if (mb_clalloc_waiters > 0) {
3103 mb_clalloc_waiters = 0;
3104 wakeup(mb_clalloc_waitchan);
3105 }
3106
3107 return (count);
3108 out:
3109 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3110
3111 /* We're done; let others enter */
3112 mb_clalloc_busy = FALSE;
3113 if (mb_clalloc_waiters > 0) {
3114 mb_clalloc_waiters = 0;
3115 wakeup(mb_clalloc_waitchan);
3116 }
3117
3118 /*
3119 * When non-blocking, we kick the worker thread if we have to grow the
3120 * pool or if the number of free clusters is less than requested.
3121 */
3122 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3123 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3124 mbuf_worker_needs_wakeup = FALSE;
3125 }
3126 if (class == MC_BIGCL) {
3127 if (i > 0) {
3128 /*
3129 * Remember total number of 4KB clusters needed
3130 * at this time.
3131 */
3132 i += m_total(MC_BIGCL);
3133 if (i > m_region_expand(MC_BIGCL)) {
3134 m_region_expand(MC_BIGCL) = i;
3135 }
3136 }
3137 if (m_infree(MC_BIGCL) >= num)
3138 return (1);
3139 } else {
3140 if (i > 0) {
3141 /*
3142 * Remember total number of 16KB clusters needed
3143 * at this time.
3144 */
3145 i += m_total(MC_16KCL);
3146 if (i > m_region_expand(MC_16KCL)) {
3147 m_region_expand(MC_16KCL) = i;
3148 }
3149 }
3150 if (m_infree(MC_16KCL) >= num)
3151 return (1);
3152 }
3153 return (0);
3154 }
3155
3156 /*
3157 * Populate the global freelist of the corresponding buffer class.
3158 */
3159 static int
3160 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
3161 {
3162 mcache_obj_t *o = NULL;
3163 int i, numpages = 0, count;
3164 mbuf_class_t super_class;
3165
3166 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
3167 class == MC_16KCL);
3168
3169 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3170
3171 VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
3172 PAGE_SIZE == m_maxsize(MC_16KCL));
3173
3174 if (m_maxsize(class) >= PAGE_SIZE)
3175 return(m_clalloc(num, wait, m_maxsize(class)) != 0);
3176
3177 /*
3178 * The rest of the function allocates pages and slices them
3179 * up into objects of the requested class size.
3180 */
3181
3182 numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
3183
3184 /* Currently assume that pages are 4K or 16K */
3185 if (PAGE_SIZE == m_maxsize(MC_BIGCL))
3186 super_class = MC_BIGCL;
3187 else
3188 super_class = MC_16KCL;
3189
3190 i = m_clalloc(numpages, wait, m_maxsize(super_class));
3191
3192 /* how many objects will we cut the page into? */
3193 int numobj = PAGE_SIZE / m_maxsize(class);
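/*
 * Worked example, assuming 4KB pages and 256-byte mbufs: a request
 * for 64 mbufs gives numpages = (64 * 256 + 4095) / 4096 = 4, and
 * each page is cut into numobj = 4096 / 256 = 16 mbufs; for MC_CL
 * the same page would yield 4096 / 2048 = 2 clusters.
 */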
3194
3195 for (count = 0; count < numpages; count++) {
3196 /* respect totals, minlimit, maxlimit */
3197 if (m_total(super_class) <= m_minlimit(super_class) ||
3198 m_total(class) >= m_maxlimit(class))
3199 break;
3200
3201 if ((o = slab_alloc(super_class, wait)) == NULL)
3202 break;
3203
3204 struct mbuf *m = (struct mbuf *)o;
3205 union mcluster *c = (union mcluster *)o;
3206 union mbigcluster *mbc = (union mbigcluster *)o;
3207 mcl_slab_t *sp = slab_get(o);
3208 mcache_audit_t *mca = NULL;
3209
3210 /*
3211 * Since one full page will be converted to MC_MBUF or
3212 * MC_CL, verify that the reference count matches that
3213 * assumption.
3214 */
3215 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
3216 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
3217 /*
3218 * Make sure that the cluster was left unmolested
3219 * while it was on the freelist.
3220 */
3221 if (mclverify) {
3222 mca = mcl_audit_buf2mca(super_class,
3223 (mcache_obj_t *)o);
3224 mcache_audit_free_verify(mca,
3225 (mcache_obj_t *)o, 0, m_maxsize(super_class));
3226 }
3227
3228 /* Reinitialize it as an mbuf or 2K or 4K slab */
3229 slab_init(sp, class, sp->sl_flags,
3230 sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
3231
3232 VERIFY(sp->sl_head == NULL);
3233
3234 VERIFY(m_total(super_class) >= 1);
3235 m_total(super_class)--;
3236
3237 if (super_class == MC_BIGCL)
3238 mbstat.m_bigclusters = m_total(MC_BIGCL);
3239
3240 m_total(class) += numobj;
3241 VERIFY(m_total(class) <= m_maxlimit(class));
3242 m_infree(class) += numobj;
3243
3244 if (!mb_peak_newreport && mbuf_report_usage(class))
3245 mb_peak_newreport = TRUE;
3246
3247 i = numobj;
3248 if (class == MC_MBUF) {
3249 mbstat.m_mbufs = m_total(MC_MBUF);
3250 mtype_stat_add(MT_FREE, NMBPG);
3251 while (i--) {
3252 /*
3253 * If auditing is enabled, construct the
3254 * shadow mbuf in the audit structure
3255 * instead of the actual one.
3256 * mbuf_slab_audit() will take care of
3257 * restoring the contents after the
3258 * integrity check.
3259 */
3260 if (mclaudit != NULL) {
3261 struct mbuf *ms;
3262 mca = mcl_audit_buf2mca(MC_MBUF,
3263 (mcache_obj_t *)m);
3264 ms = MCA_SAVED_MBUF_PTR(mca);
3265 ms->m_type = MT_FREE;
3266 } else {
3267 m->m_type = MT_FREE;
3268 }
3269 m->m_next = sp->sl_head;
3270 sp->sl_head = (void *)m++;
3271 }
3272 } else if (class == MC_CL) { /* MC_CL */
3273 mbstat.m_clfree =
3274 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
3275 mbstat.m_clusters = m_total(MC_CL);
3276 while (i--) {
3277 c->mcl_next = sp->sl_head;
3278 sp->sl_head = (void *)c++;
3279 }
3280 } else {
3281 VERIFY(class == MC_BIGCL);
3282 mbstat.m_bigclusters = m_total(MC_BIGCL);
3283 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3284 m_infree(MC_MBUF_BIGCL);
3285 while (i--) {
3286 mbc->mbc_next = sp->sl_head;
3287 sp->sl_head = (void *)mbc++;
3288 }
3289 }
3290
3291 /* Insert into the mbuf or 2k or 4k slab list */
3292 slab_insert(sp, class);
3293
3294 if ((i = mb_waiters) > 0)
3295 mb_waiters = 0;
3296 if (i != 0)
3297 wakeup(mb_waitchan);
3298 }
3299 return (count != 0);
3300 }
3301
3302 /*
3303 * For each class, initialize the freelist to hold m_minlimit() objects.
3304 */
3305 static void
3306 freelist_init(mbuf_class_t class)
3307 {
3308 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3309
3310 VERIFY(class == MC_CL || class == MC_BIGCL);
3311 VERIFY(m_total(class) == 0);
3312 VERIFY(m_minlimit(class) > 0);
3313
3314 while (m_total(class) < m_minlimit(class))
3315 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3316
3317 VERIFY(m_total(class) >= m_minlimit(class));
3318 }
3319
3320 /*
3321 * (Inaccurately) check if it might be worth a trip back to the
3322 * mcache layer due to the availability of objects there. We'll
3323 * end up back here if there's nothing up there.
3324 */
3325 static boolean_t
3326 mbuf_cached_above(mbuf_class_t class, int wait)
3327 {
3328 switch (class) {
3329 case MC_MBUF:
3330 if (wait & MCR_COMP)
3331 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3332 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3333 break;
3334
3335 case MC_CL:
3336 if (wait & MCR_COMP)
3337 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3338 break;
3339
3340 case MC_BIGCL:
3341 if (wait & MCR_COMP)
3342 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3343 break;
3344
3345 case MC_16KCL:
3346 if (wait & MCR_COMP)
3347 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3348 break;
3349
3350 case MC_MBUF_CL:
3351 case MC_MBUF_BIGCL:
3352 case MC_MBUF_16KCL:
3353 break;
3354
3355 default:
3356 VERIFY(0);
3357 /* NOTREACHED */
3358 }
3359
3360 return (!mcache_bkt_isempty(m_cache(class)));
3361 }
3362
3363 /*
3364 * If possible, convert constructed objects to raw ones.
3365 */
3366 static boolean_t
3367 mbuf_steal(mbuf_class_t class, unsigned int num)
3368 {
3369 mcache_obj_t *top = NULL;
3370 mcache_obj_t **list = &top;
3371 unsigned int tot = 0;
3372
3373 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3374
3375 switch (class) {
3376 case MC_MBUF:
3377 case MC_CL:
3378 case MC_BIGCL:
3379 case MC_16KCL:
3380 return (FALSE);
3381
3382 case MC_MBUF_CL:
3383 case MC_MBUF_BIGCL:
3384 case MC_MBUF_16KCL:
3385 /* Get the required number of constructed objects if possible */
3386 if (m_infree(class) > m_minlimit(class)) {
3387 tot = cslab_alloc(class, &list,
3388 MIN(num, m_infree(class)));
3389 }
3390
3391 /* And destroy them to get back the raw objects */
3392 if (top != NULL)
3393 (void) cslab_free(class, top, 1);
3394 break;
3395
3396 default:
3397 VERIFY(0);
3398 /* NOTREACHED */
3399 }
3400
3401 return (tot == num);
3402 }
3403
3404 static void
3405 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3406 {
3407 int m, bmap = 0;
3408
3409 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3410
3411 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3412 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3413 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3414
3415 /*
3416 * This logic can be made smarter; for now, simply mark
3417 * all other related classes as potential victims.
3418 */
3419 switch (class) {
3420 case MC_MBUF:
3421 m_wantpurge(MC_CL)++;
3422 m_wantpurge(MC_BIGCL)++;
3423 m_wantpurge(MC_MBUF_CL)++;
3424 m_wantpurge(MC_MBUF_BIGCL)++;
3425 break;
3426
3427 case MC_CL:
3428 m_wantpurge(MC_MBUF)++;
3429 m_wantpurge(MC_BIGCL)++;
3430 m_wantpurge(MC_MBUF_BIGCL)++;
3431 if (!comp)
3432 m_wantpurge(MC_MBUF_CL)++;
3433 break;
3434
3435 case MC_BIGCL:
3436 m_wantpurge(MC_MBUF)++;
3437 m_wantpurge(MC_CL)++;
3438 m_wantpurge(MC_MBUF_CL)++;
3439 if (!comp)
3440 m_wantpurge(MC_MBUF_BIGCL)++;
3441 break;
3442
3443 case MC_16KCL:
3444 if (!comp)
3445 m_wantpurge(MC_MBUF_16KCL)++;
3446 break;
3447
3448 default:
3449 VERIFY(0);
3450 /* NOTREACHED */
3451 }
3452
3453 /*
3454 * Run through each marked class and check if we really need to
3455 * purge (and therefore temporarily disable) the per-CPU caches
3456 * layer used by the class. If so, remember the classes since
3457 * we are going to drop the lock below prior to purging.
3458 */
3459 for (m = 0; m < NELEM(mbuf_table); m++) {
3460 if (m_wantpurge(m) > 0) {
3461 m_wantpurge(m) = 0;
3462 /*
3463 * Try hard to steal the required number of objects
3464 * from the freelist of other mbuf classes. Only
3465 * purge and disable the per-CPU caches layer when
3466 * we don't have enough; it's the last resort.
3467 */
3468 if (!mbuf_steal(m, num))
3469 bmap |= (1 << m);
3470 }
3471 }
3472
3473 lck_mtx_unlock(mbuf_mlock);
3474
3475 if (bmap != 0) {
3476 /* signal the domains to drain */
3477 net_drain_domains();
3478
3479 /* Sigh; we have no other choices but to ask mcache to purge */
3480 for (m = 0; m < NELEM(mbuf_table); m++) {
3481 if ((bmap & (1 << m)) &&
3482 mcache_purge_cache(m_cache(m), TRUE)) {
3483 lck_mtx_lock(mbuf_mlock);
3484 m_purge_cnt(m)++;
3485 mbstat.m_drain++;
3486 lck_mtx_unlock(mbuf_mlock);
3487 }
3488 }
3489 } else {
3490 /*
3491 * Request mcache to reap extra elements from all of its caches;
3492 * note that all reaps are serialized and happen only at a fixed
3493 * interval.
3494 */
3495 mcache_reap();
3496 }
3497 lck_mtx_lock(mbuf_mlock);
3498 }
3499
3500 static inline struct mbuf *
3501 m_get_common(int wait, short type, int hdr)
3502 {
3503 struct mbuf *m;
3504 int mcflags = MSLEEPF(wait);
3505
3506 /* Is this due to a non-blocking retry? If so, then try harder */
3507 if (mcflags & MCR_NOSLEEP)
3508 mcflags |= MCR_TRYHARD;
3509
3510 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3511 if (m != NULL) {
3512 MBUF_INIT(m, hdr, type);
3513 mtype_stat_inc(type);
3514 mtype_stat_dec(MT_FREE);
3515 #if CONFIG_MACF_NET
3516 if (hdr && mac_init_mbuf(m, wait) != 0) {
3517 m_free(m);
3518 return (NULL);
3519 }
3520 #endif /* MAC_NET */
3521 }
3522 return (m);
3523 }
3524
3525 /*
3526 * Space allocation routines; these are also available as macros
3527 * for critical paths.
3528 */
3529 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3530 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3531 #define _M_RETRY(wait, type) _M_GET(wait, type)
3532 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3533 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3534 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3535
3536 struct mbuf *
3537 m_get(int wait, int type)
3538 {
3539 return (_M_GET(wait, type));
3540 }
3541
3542 struct mbuf *
3543 m_gethdr(int wait, int type)
3544 {
3545 return (_M_GETHDR(wait, type));
3546 }
3547
3548 struct mbuf *
3549 m_retry(int wait, int type)
3550 {
3551 return (_M_RETRY(wait, type));
3552 }
3553
3554 struct mbuf *
3555 m_retryhdr(int wait, int type)
3556 {
3557 return (_M_RETRYHDR(wait, type));
3558 }
3559
3560 struct mbuf *
3561 m_getclr(int wait, int type)
3562 {
3563 struct mbuf *m;
3564
3565 _MGET(m, wait, type);
3566 if (m != NULL)
3567 bzero(MTOD(m, caddr_t), MLEN);
3568 return (m);
3569 }
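
/*
 * A minimal usage sketch for the allocators above: non-blocking
 * allocation of a packet-header mbuf followed by release of the chain.
 * The `example_' identifier is hypothetical and the block is guarded out.
 */
#if 0
static void
example_mbuf_roundtrip(void)
{
	struct mbuf *m;

	/* may return NULL when the mbuf caches are exhausted */
	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return;

	/* ... fill MTOD(m, caddr_t), set m->m_len and m_pkthdr.len ... */

	m_freem(m);		/* frees the mbuf and any chained data */
}
#endif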
3570
3571 static int
3572 m_free_paired(struct mbuf *m)
3573 {
3574 VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
3575
3576 membar_sync();
3577 if (MEXT_PMBUF(m) == m) {
3578 volatile UInt16 *addr = (volatile UInt16 *)&MEXT_PREF(m);
3579 int16_t oprefcnt, prefcnt;
3580
3581 /*
3582 * Paired ref count might be negative in case we lose
3583 * against another thread clearing MEXT_PMBUF, in the
3584 * event it occurs after the above memory barrier sync.
3585 * In that case just ignore as things have been unpaired.
3586 */
3587 do {
3588 oprefcnt = *addr;
3589 prefcnt = oprefcnt - 1;
3590 } while (!OSCompareAndSwap16(oprefcnt, prefcnt, addr));
3591
3592 if (prefcnt > 1) {
3593 return (1);
3594 } else if (prefcnt == 1) {
3595 (*(m_get_ext_free(m)))(m->m_ext.ext_buf,
3596 m->m_ext.ext_size, m_get_ext_arg(m));
3597 return (1);
3598 } else if (prefcnt == 0) {
3599 VERIFY(MBUF_IS_PAIRED(m));
3600
3601 /*
3602 * Restore minref to its natural value, so that
3603 * the caller will be able to free the cluster
3604 * as appropriate.
3605 */
3606 MEXT_MINREF(m) = 0;
3607
3608 /*
3609 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
3610 * as it is immutable. atomic_set_ptr also causes
3611 * memory barrier sync.
3612 */
3613 atomic_set_ptr(&MEXT_PMBUF(m), NULL);
3614
3615 switch (m->m_ext.ext_size) {
3616 case MCLBYTES:
3617 m_set_ext(m, m_get_rfa(m), NULL, NULL);
3618 break;
3619
3620 case MBIGCLBYTES:
3621 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
3622 break;
3623
3624 case M16KCLBYTES:
3625 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
3626 break;
3627
3628 default:
3629 VERIFY(0);
3630 /* NOTREACHED */
3631 }
3632 }
3633 }
3634
3635 /*
3636 * Tell caller the unpair has occurred, and that the reference
3637 * count on the external cluster held for the paired mbuf should
3638 * now be dropped.
3639 */
3640 return (0);
3641 }
3642
3643 struct mbuf *
3644 m_free(struct mbuf *m)
3645 {
3646 struct mbuf *n = m->m_next;
3647
3648 if (m->m_type == MT_FREE)
3649 panic("m_free: freeing an already freed mbuf");
3650
3651 if (m->m_flags & M_PKTHDR) {
3652 /* Check for scratch area overflow */
3653 m_redzone_verify(m);
3654 /* Free the aux data and tags if there is any */
3655 m_tag_delete_chain(m, NULL);
3656
3657 m_do_tx_compl_callback(m, NULL);
3658 }
3659
3660 if (m->m_flags & M_EXT) {
3661 u_int16_t refcnt;
3662 u_int32_t composite;
3663 m_ext_free_func_t m_free_func;
3664
3665 if (MBUF_IS_PAIRED(m) && m_free_paired(m))
3666 return (n);
3667
3668 refcnt = m_decref(m);
3669 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3670 m_free_func = m_get_ext_free(m);
3671
3672 if (refcnt == MEXT_MINREF(m) && !composite) {
3673 if (m_free_func == NULL) {
3674 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3675 } else if (m_free_func == m_bigfree) {
3676 mcache_free(m_cache(MC_BIGCL),
3677 m->m_ext.ext_buf);
3678 } else if (m_free_func == m_16kfree) {
3679 mcache_free(m_cache(MC_16KCL),
3680 m->m_ext.ext_buf);
3681 } else {
3682 (*m_free_func)(m->m_ext.ext_buf,
3683 m->m_ext.ext_size, m_get_ext_arg(m));
3684 }
3685 mcache_free(ref_cache, m_get_rfa(m));
3686 m_set_ext(m, NULL, NULL, NULL);
3687 } else if (refcnt == MEXT_MINREF(m) && composite) {
3688 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
3689 VERIFY(m->m_type != MT_FREE);
3690
3691 mtype_stat_dec(m->m_type);
3692 mtype_stat_inc(MT_FREE);
3693
3694 m->m_type = MT_FREE;
3695 m->m_flags = M_EXT;
3696 m->m_len = 0;
3697 m->m_next = m->m_nextpkt = NULL;
3698
3699 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3700
3701 /* "Free" into the intermediate cache */
3702 if (m_free_func == NULL) {
3703 mcache_free(m_cache(MC_MBUF_CL), m);
3704 } else if (m_free_func == m_bigfree) {
3705 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3706 } else {
3707 VERIFY(m_free_func == m_16kfree);
3708 mcache_free(m_cache(MC_MBUF_16KCL), m);
3709 }
3710 return (n);
3711 }
3712 }
3713
3714 if (m->m_type != MT_FREE) {
3715 mtype_stat_dec(m->m_type);
3716 mtype_stat_inc(MT_FREE);
3717 }
3718
3719 m->m_type = MT_FREE;
3720 m->m_flags = m->m_len = 0;
3721 m->m_next = m->m_nextpkt = NULL;
3722
3723 mcache_free(m_cache(MC_MBUF), m);
3724
3725 return (n);
3726 }
3727
3728 __private_extern__ struct mbuf *
3729 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3730 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3731 int wait, int pair)
3732 {
3733 struct ext_ref *rfa = NULL;
3734
3735 /*
3736 * If pairing is requested and an existing mbuf is provided, reject
3737 * it if it's already been paired to another cluster. Otherwise,
3738 * allocate a new one or free any existing below.
3739 */
3740 if ((m != NULL && MBUF_IS_PAIRED(m)) ||
3741 (m == NULL && (m = _M_GETHDR(wait, type)) == NULL))
3742 return (NULL);
3743
3744 if (m->m_flags & M_EXT) {
3745 u_int16_t refcnt;
3746 u_int32_t composite;
3747 m_ext_free_func_t m_free_func;
3748
3749 refcnt = m_decref(m);
3750 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3751 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
3752 m_free_func = m_get_ext_free(m);
3753 if (refcnt == MEXT_MINREF(m) && !composite) {
3754 if (m_free_func == NULL) {
3755 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3756 } else if (m_free_func == m_bigfree) {
3757 mcache_free(m_cache(MC_BIGCL),
3758 m->m_ext.ext_buf);
3759 } else if (m_free_func == m_16kfree) {
3760 mcache_free(m_cache(MC_16KCL),
3761 m->m_ext.ext_buf);
3762 } else {
3763 (*m_free_func)(m->m_ext.ext_buf,
3764 m->m_ext.ext_size, m_get_ext_arg(m));
3765 }
3766 /* Re-use the reference structure */
3767 rfa = m_get_rfa(m);
3768 } else if (refcnt == MEXT_MINREF(m) && composite) {
3769 VERIFY(m->m_type != MT_FREE);
3770
3771 mtype_stat_dec(m->m_type);
3772 mtype_stat_inc(MT_FREE);
3773
3774 m->m_type = MT_FREE;
3775 m->m_flags = M_EXT;
3776 m->m_len = 0;
3777 m->m_next = m->m_nextpkt = NULL;
3778
3779 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3780
3781 /* "Free" into the intermediate cache */
3782 if (m_free_func == NULL) {
3783 mcache_free(m_cache(MC_MBUF_CL), m);
3784 } else if (m_free_func == m_bigfree) {
3785 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3786 } else {
3787 VERIFY(m_free_func == m_16kfree);
3788 mcache_free(m_cache(MC_MBUF_16KCL), m);
3789 }
3790 /*
3791 * Allocate a new mbuf, since we didn't divorce
3792 * the composite mbuf + cluster pair above.
3793 */
3794 if ((m = _M_GETHDR(wait, type)) == NULL)
3795 return (NULL);
3796 }
3797 }
3798
3799 if (rfa == NULL &&
3800 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3801 m_free(m);
3802 return (NULL);
3803 }
3804
3805 if (!pair) {
3806 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
3807 0, 1, 0, 0, 0, NULL);
3808 } else {
3809 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
3810 1, 1, 1, EXTF_PAIRED, 0, m);
3811 }
3812
3813 return (m);
3814 }
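
/*
 * Sketch of attaching driver-owned external storage via m_clattach();
 * `example_extfree', `example_attach' and EXAMPLE_EXTSIZE are
 * hypothetical, and the block is guarded out.
 */
#if 0
#define EXAMPLE_EXTSIZE	2048

static void
example_extfree(caddr_t buf, __unused u_int size, __unused caddr_t arg)
{
	/* return `buf' to the driver's private pool here */
}

static struct mbuf *
example_attach(caddr_t buf, int wait)
{
	/* m == NULL: a fresh pkthdr mbuf is allocated; pair == 0 */
	return (m_clattach(NULL, MT_DATA, buf, example_extfree,
	    EXAMPLE_EXTSIZE, NULL, wait, 0));
}
#endif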
3815
3816 /*
3817 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3818 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3819 */
3820 struct mbuf *
3821 m_getcl(int wait, int type, int flags)
3822 {
3823 struct mbuf *m;
3824 int mcflags = MSLEEPF(wait);
3825 int hdr = (flags & M_PKTHDR);
3826
3827 /* Is this due to a non-blocking retry? If so, then try harder */
3828 if (mcflags & MCR_NOSLEEP)
3829 mcflags |= MCR_TRYHARD;
3830
3831 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3832 if (m != NULL) {
3833 u_int16_t flag;
3834 struct ext_ref *rfa;
3835 void *cl;
3836
3837 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3838 cl = m->m_ext.ext_buf;
3839 rfa = m_get_rfa(m);
3840
3841 ASSERT(cl != NULL && rfa != NULL);
3842 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
3843
3844 flag = MEXT_FLAGS(m);
3845
3846 MBUF_INIT(m, hdr, type);
3847 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3848
3849 mtype_stat_inc(type);
3850 mtype_stat_dec(MT_FREE);
3851 #if CONFIG_MACF_NET
3852 if (hdr && mac_init_mbuf(m, wait) != 0) {
3853 m_freem(m);
3854 return (NULL);
3855 }
3856 #endif /* MAC_NET */
3857 }
3858 return (m);
3859 }
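
/*
 * Sketch of the common m_getcl() pattern: one call yields an mbuf with
 * a 2KB cluster already attached. `example_getcl' is hypothetical.
 */
#if 0
static struct mbuf *
example_getcl(void)
{
	struct mbuf *m;

	m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);

	/* m_data points into the cluster; up to MCLBYTES of room */
	return (m);
}
#endif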
3860
3861 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3862 struct mbuf *
3863 m_mclget(struct mbuf *m, int wait)
3864 {
3865 struct ext_ref *rfa;
3866
3867 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3868 return (m);
3869
3870 m->m_ext.ext_buf = m_mclalloc(wait);
3871 if (m->m_ext.ext_buf != NULL) {
3872 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3873 } else {
3874 mcache_free(ref_cache, rfa);
3875 }
3876 return (m);
3877 }
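
/*
 * Sketch of the two-step alternative to m_getcl(): allocate the mbuf,
 * then attach a cluster and check M_EXT for success. The `example_'
 * name is hypothetical.
 */
#if 0
static struct mbuf *
example_add_cluster(int wait)
{
	struct mbuf *m;

	if ((m = m_gethdr(wait, MT_DATA)) == NULL)
		return (NULL);

	m = m_mclget(m, wait);
	if (!(m->m_flags & M_EXT)) {
		/* no cluster could be attached; the mbuf is still valid */
		m_free(m);
		return (NULL);
	}
	return (m);
}
#endif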
3878
3879 /* Allocate an mbuf cluster */
3880 caddr_t
3881 m_mclalloc(int wait)
3882 {
3883 int mcflags = MSLEEPF(wait);
3884
3885 /* Is this due to a non-blocking retry? If so, then try harder */
3886 if (mcflags & MCR_NOSLEEP)
3887 mcflags |= MCR_TRYHARD;
3888
3889 return (mcache_alloc(m_cache(MC_CL), mcflags));
3890 }
3891
3892 /* Free an mbuf cluster */
3893 void
3894 m_mclfree(caddr_t p)
3895 {
3896 mcache_free(m_cache(MC_CL), p);
3897 }
3898
3899 /*
3900 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
3901 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3902 */
3903 int
3904 m_mclhasreference(struct mbuf *m)
3905 {
3906 if (!(m->m_flags & M_EXT))
3907 return (0);
3908
3909 ASSERT(m_get_rfa(m) != NULL);
3910
3911 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3912 }
3913
3914 __private_extern__ caddr_t
3915 m_bigalloc(int wait)
3916 {
3917 int mcflags = MSLEEPF(wait);
3918
3919 /* Is this due to a non-blocking retry? If so, then try harder */
3920 if (mcflags & MCR_NOSLEEP)
3921 mcflags |= MCR_TRYHARD;
3922
3923 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3924 }
3925
3926 __private_extern__ void
3927 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3928 {
3929 mcache_free(m_cache(MC_BIGCL), p);
3930 }
3931
3932 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3933 __private_extern__ struct mbuf *
3934 m_mbigget(struct mbuf *m, int wait)
3935 {
3936 struct ext_ref *rfa;
3937
3938 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3939 return (m);
3940
3941 m->m_ext.ext_buf = m_bigalloc(wait);
3942 if (m->m_ext.ext_buf != NULL) {
3943 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3944 } else {
3945 mcache_free(ref_cache, rfa);
3946 }
3947 return (m);
3948 }
3949
3950 __private_extern__ caddr_t
3951 m_16kalloc(int wait)
3952 {
3953 int mcflags = MSLEEPF(wait);
3954
3955 /* Is this due to a non-blocking retry? If so, then try harder */
3956 if (mcflags & MCR_NOSLEEP)
3957 mcflags |= MCR_TRYHARD;
3958
3959 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3960 }
3961
3962 __private_extern__ void
3963 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3964 {
3965 mcache_free(m_cache(MC_16KCL), p);
3966 }
3967
3968 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3969 __private_extern__ struct mbuf *
3970 m_m16kget(struct mbuf *m, int wait)
3971 {
3972 struct ext_ref *rfa;
3973
3974 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3975 return (m);
3976
3977 m->m_ext.ext_buf = m_16kalloc(wait);
3978 if (m->m_ext.ext_buf != NULL) {
3979 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3980 } else {
3981 mcache_free(ref_cache, rfa);
3982 }
3983 return (m);
3984 }
3985
3986 /*
3987 * "Move" mbuf pkthdr from "from" to "to".
3988 * "from" must have M_PKTHDR set, and "to" must be empty.
3989 */
3990 void
3991 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3992 {
3993 VERIFY(from->m_flags & M_PKTHDR);
3994
3995 /* Check for scratch area overflow */
3996 m_redzone_verify(from);
3997
3998 if (to->m_flags & M_PKTHDR) {
3999 /* Check for scratch area overflow */
4000 m_redzone_verify(to);
4001 /* We will be taking over the tags of 'to' */
4002 m_tag_delete_chain(to, NULL);
4003 }
4004 to->m_pkthdr = from->m_pkthdr; /* especially tags */
4005 m_classifier_init(from, 0); /* purge classifier info */
4006 m_tag_init(from, 1); /* purge all tags from src */
4007 m_scratch_init(from); /* clear src scratch area */
4008 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4009 if ((to->m_flags & M_EXT) == 0)
4010 to->m_data = to->m_pktdat;
4011 m_redzone_init(to); /* setup red zone on dst */
4012 }
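
/*
 * Sketch of the usual m_copy_pkthdr() scenario, mirroring m_prepend()
 * below: move the packet header (including its tags) onto a new leading
 * mbuf. `example_new_leader' is hypothetical.
 */
#if 0
static struct mbuf *
example_new_leader(struct mbuf *pkt, int wait)
{
	struct mbuf *lead;

	if (!(pkt->m_flags & M_PKTHDR) ||
	    (lead = m_gethdr(wait, pkt->m_type)) == NULL)
		return (pkt);

	m_copy_pkthdr(lead, pkt);	/* tags now belong to `lead' */
	pkt->m_flags &= ~M_PKTHDR;
	lead->m_len = 0;
	lead->m_next = pkt;
	return (lead);
}
#endif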
4013
4014 /*
4015 * Duplicate "from"'s mbuf pkthdr in "to".
4016 * "from" must have M_PKTHDR set, and "to" must be empty.
4017 * In particular, this does a deep copy of the packet tags.
4018 */
4019 static int
4020 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
4021 {
4022 VERIFY(from->m_flags & M_PKTHDR);
4023
4024 /* Check for scratch area overflow */
4025 m_redzone_verify(from);
4026
4027 if (to->m_flags & M_PKTHDR) {
4028 /* Check for scratch area overflow */
4029 m_redzone_verify(to);
4030 /* We will be taking over the tags of 'to' */
4031 m_tag_delete_chain(to, NULL);
4032 }
4033 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
4034 if ((to->m_flags & M_EXT) == 0)
4035 to->m_data = to->m_pktdat;
4036 to->m_pkthdr = from->m_pkthdr;
4037 m_redzone_init(to); /* setup red zone on dst */
4038 m_tag_init(to, 0); /* preserve dst static tags */
4039 return (m_tag_copy_chain(to, from, how));
4040 }
4041
4042 void
4043 m_copy_pftag(struct mbuf *to, struct mbuf *from)
4044 {
4045 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
4046 #if PF_ECN
4047 m_pftag(to)->pftag_hdr = NULL;
4048 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
4049 #endif /* PF_ECN */
4050 }
4051
4052 void
4053 m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
4054 {
4055 VERIFY(m->m_flags & M_PKTHDR);
4056
4057 m->m_pkthdr.pkt_proto = 0;
4058 m->m_pkthdr.pkt_flowsrc = 0;
4059 m->m_pkthdr.pkt_flowid = 0;
4060 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
4061 /* preserve service class and interface info for loopback packets */
4062 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP))
4063 (void) m_set_service_class(m, MBUF_SC_BE);
4064 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO))
4065 m->m_pkthdr.pkt_ifainfo = 0;
4066 /*
4067 * Preserve timestamp if requested
4068 */
4069 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID))
4070 m->m_pkthdr.pkt_timestamp = 0;
4071 }
4072
4073 void
4074 m_copy_classifier(struct mbuf *to, struct mbuf *from)
4075 {
4076 VERIFY(to->m_flags & M_PKTHDR);
4077 VERIFY(from->m_flags & M_PKTHDR);
4078
4079 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
4080 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
4081 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
4082 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
4083 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
4084 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
4085 }
4086
4087 /*
4088 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4089 * if wantall is not set, return however many are available. Set up the
4090 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
4091 * are chained on the m_nextpkt field. Any packets requested beyond this
4092 * are chained onto the last packet header's m_next field. The size of
4093 * the cluster is controlled by the parameter bufsize.
4094 */
4095 __private_extern__ struct mbuf *
4096 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
4097 int wait, int wantall, size_t bufsize)
4098 {
4099 struct mbuf *m;
4100 struct mbuf **np, *top;
4101 unsigned int pnum, needed = *num_needed;
4102 mcache_obj_t *mp_list = NULL;
4103 int mcflags = MSLEEPF(wait);
4104 u_int16_t flag;
4105 struct ext_ref *rfa;
4106 mcache_t *cp;
4107 void *cl;
4108
4109 ASSERT(bufsize == m_maxsize(MC_CL) ||
4110 bufsize == m_maxsize(MC_BIGCL) ||
4111 bufsize == m_maxsize(MC_16KCL));
4112
4113 /*
4114 * Caller must first check for njcl because this
4115 * routine is internal and not exposed/used via KPI.
4116 */
4117 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
4118
4119 top = NULL;
4120 np = &top;
4121 pnum = 0;
4122
4123 /*
4124 * The caller doesn't want all the requested buffers; only some.
4125 * Try hard to get what we can, but don't block. This effectively
4126 * overrides MCR_SLEEP, since this thread will not go to sleep
4127 * if we can't get all the buffers.
4128 */
4129 if (!wantall || (mcflags & MCR_NOSLEEP))
4130 mcflags |= MCR_TRYHARD;
4131
4132 /* Allocate the composite mbuf + cluster elements from the cache */
4133 if (bufsize == m_maxsize(MC_CL))
4134 cp = m_cache(MC_MBUF_CL);
4135 else if (bufsize == m_maxsize(MC_BIGCL))
4136 cp = m_cache(MC_MBUF_BIGCL);
4137 else
4138 cp = m_cache(MC_MBUF_16KCL);
4139 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
4140
4141 for (pnum = 0; pnum < needed; pnum++) {
4142 m = (struct mbuf *)mp_list;
4143 mp_list = mp_list->obj_next;
4144
4145 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4146 cl = m->m_ext.ext_buf;
4147 rfa = m_get_rfa(m);
4148
4149 ASSERT(cl != NULL && rfa != NULL);
4150 VERIFY(MBUF_IS_COMPOSITE(m));
4151
4152 flag = MEXT_FLAGS(m);
4153
4154 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
4155 if (bufsize == m_maxsize(MC_16KCL)) {
4156 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4157 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4158 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4159 } else {
4160 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4161 }
4162
4163 if (num_with_pkthdrs > 0) {
4164 --num_with_pkthdrs;
4165 #if CONFIG_MACF_NET
4166 if (mac_mbuf_label_init(m, wait) != 0) {
4167 m_freem(m);
4168 break;
4169 }
4170 #endif /* MAC_NET */
4171 }
4172
4173 *np = m;
4174 if (num_with_pkthdrs > 0)
4175 np = &m->m_nextpkt;
4176 else
4177 np = &m->m_next;
4178 }
4179 ASSERT(pnum != *num_needed || mp_list == NULL);
4180 if (mp_list != NULL)
4181 mcache_free_ext(cp, mp_list);
4182
4183 if (pnum > 0) {
4184 mtype_stat_add(MT_DATA, pnum);
4185 mtype_stat_sub(MT_FREE, pnum);
4186 }
4187
4188 if (wantall && (pnum != *num_needed)) {
4189 if (top != NULL)
4190 m_freem_list(top);
4191 return (NULL);
4192 }
4193
4194 if (pnum > *num_needed) {
4195 printf("%s: File a radar related to <rdar://10146739>. \
4196 needed = %u, pnum = %u, num_needed = %u \n",
4197 __func__, needed, pnum, *num_needed);
4198 }
4199
4200 *num_needed = pnum;
4201 return (top);
4202 }
4203
4204 /*
4205 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
4206 * wantall is not set, return however many are available. The size of
4207 * each mbuf in the list is controlled by the parameter packetlen. Each
4208 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
4209 * in the chain is called a segment. If maxsegments is not NULL and the
4210 * value pointed to is not zero, it specifies the maximum number of segments
4211 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
4212 * is zero, the caller has no restriction on the number of segments.
4213 * The actual number of segments of an mbuf chain is returned in the value
4214 * pointed to by maxsegments.
4215 */
4216 __private_extern__ struct mbuf *
4217 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
4218 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
4219 {
4220 struct mbuf **np, *top, *first = NULL;
4221 size_t bufsize, r_bufsize;
4222 unsigned int num = 0;
4223 unsigned int nsegs = 0;
4224 unsigned int needed, resid;
4225 int mcflags = MSLEEPF(wait);
4226 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
4227 mcache_t *cp = NULL, *rcp = NULL;
4228
4229 if (*numlist == 0)
4230 return (NULL);
4231
4232 top = NULL;
4233 np = &top;
4234
4235 if (wantsize == 0) {
4236 if (packetlen <= MINCLSIZE) {
4237 bufsize = packetlen;
4238 } else if (packetlen > m_maxsize(MC_CL)) {
4239 /* Use 4KB if jumbo cluster pool isn't available */
4240 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
4241 bufsize = m_maxsize(MC_BIGCL);
4242 else
4243 bufsize = m_maxsize(MC_16KCL);
4244 } else {
4245 bufsize = m_maxsize(MC_CL);
4246 }
4247 } else if (wantsize == m_maxsize(MC_CL) ||
4248 wantsize == m_maxsize(MC_BIGCL) ||
4249 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
4250 bufsize = wantsize;
4251 } else {
4252 return (NULL);
4253 }
4254
4255 if (bufsize <= MHLEN) {
4256 nsegs = 1;
4257 } else if (bufsize <= MINCLSIZE) {
4258 if (maxsegments != NULL && *maxsegments == 1) {
4259 bufsize = m_maxsize(MC_CL);
4260 nsegs = 1;
4261 } else {
4262 nsegs = 2;
4263 }
4264 } else if (bufsize == m_maxsize(MC_16KCL)) {
4265 VERIFY(njcl > 0);
4266 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
4267 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4268 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
4269 } else {
4270 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
4271 }
4272 if (maxsegments != NULL) {
4273 if (*maxsegments && nsegs > *maxsegments) {
4274 *maxsegments = nsegs;
4275 return (NULL);
4276 }
4277 *maxsegments = nsegs;
4278 }
4279
4280 /*
4281 * The caller doesn't want all the requested buffers; only some.
4282 * Try hard to get what we can, but don't block. This effectively
4283 * overrides MCR_SLEEP, since this thread will not go to sleep
4284 * if we can't get all the buffers.
4285 */
4286 if (!wantall || (mcflags & MCR_NOSLEEP))
4287 mcflags |= MCR_TRYHARD;
4288
4289 /*
4290 * Simple case where all elements in the lists/chains are mbufs.
4291 * Unless bufsize is greater than MHLEN, each segment chain is made
4292 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
4293 * of 2 mbufs; the second one is used for the residual data, i.e.
4294 * the remaining data that cannot fit into the first mbuf.
4295 */
4296 if (bufsize <= MINCLSIZE) {
4297 /* Allocate the elements in one shot from the mbuf cache */
4298 ASSERT(bufsize <= MHLEN || nsegs == 2);
4299 cp = m_cache(MC_MBUF);
4300 needed = mcache_alloc_ext(cp, &mp_list,
4301 (*numlist) * nsegs, mcflags);
4302
4303 /*
4304 * The number of elements must be even if we are to use an
4305 * mbuf (instead of a cluster) to store the residual data.
4306 * If we couldn't allocate the requested number of mbufs,
4307 * trim the number down (if it's odd) in order to avoid
4308 * creating a partial segment chain.
4309 */
4310 if (bufsize > MHLEN && (needed & 0x1))
4311 needed--;
4312
4313 while (num < needed) {
4314 struct mbuf *m;
4315
4316 m = (struct mbuf *)mp_list;
4317 mp_list = mp_list->obj_next;
4318 ASSERT(m != NULL);
4319
4320 MBUF_INIT(m, 1, MT_DATA);
4321 #if CONFIG_MACF_NET
4322 if (mac_init_mbuf(m, wait) != 0) {
4323 m_free(m);
4324 break;
4325 }
4326 #endif /* MAC_NET */
4327 num++;
4328 if (bufsize > MHLEN) {
4329 /* A second mbuf for this segment chain */
4330 m->m_next = (struct mbuf *)mp_list;
4331 mp_list = mp_list->obj_next;
4332 ASSERT(m->m_next != NULL);
4333
4334 MBUF_INIT(m->m_next, 0, MT_DATA);
4335 num++;
4336 }
4337 *np = m;
4338 np = &m->m_nextpkt;
4339 }
4340 ASSERT(num != *numlist || mp_list == NULL);
4341
4342 if (num > 0) {
4343 mtype_stat_add(MT_DATA, num);
4344 mtype_stat_sub(MT_FREE, num);
4345 }
4346 num /= nsegs;
4347
4348 /* We've got them all; return to caller */
4349 if (num == *numlist)
4350 return (top);
4351
4352 goto fail;
4353 }
4354
4355 /*
4356 * Complex cases where elements are made up of one or more composite
4357 * mbufs + cluster, depending on packetlen. Each N-segment chain can
4358 * be illustrated as follows:
4359 *
4360 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
4361 *
4362 * Every composite mbuf + cluster element comes from the intermediate
4363 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
4364 * the last composite element will come from the MC_MBUF_CL cache,
4365 * unless the residual data is larger than 2KB where we use the
4366 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
4367 * data is defined as extra data beyond the first element that cannot
4368 * fit into the previous element, i.e. there is no residual data if
4369 * the chain only has 1 segment.
4370 */
4371 r_bufsize = bufsize;
4372 resid = packetlen > bufsize ? packetlen % bufsize : 0;
4373 if (resid > 0) {
4374 /* There is residual data; figure out the cluster size */
4375 if (wantsize == 0 && packetlen > MINCLSIZE) {
4376 /*
4377 * Caller didn't request that all of the segments
4378 * in the chain use the same cluster size; use the
4379 * smaller of the cluster sizes.
4380 */
4381 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
4382 r_bufsize = m_maxsize(MC_16KCL);
4383 else if (resid > m_maxsize(MC_CL))
4384 r_bufsize = m_maxsize(MC_BIGCL);
4385 else
4386 r_bufsize = m_maxsize(MC_CL);
4387 } else {
4388 /* Use the same cluster size as the other segments */
4389 resid = 0;
4390 }
4391 }
4392
4393 needed = *numlist;
4394 if (resid > 0) {
4395 /*
4396 * Attempt to allocate composite mbuf + cluster elements for
4397 * the residual data in each chain; record the number of such
4398 * elements that can be allocated so that we know how many
4399 * segment chains we can afford to create.
4400 */
4401 if (r_bufsize <= m_maxsize(MC_CL))
4402 rcp = m_cache(MC_MBUF_CL);
4403 else if (r_bufsize <= m_maxsize(MC_BIGCL))
4404 rcp = m_cache(MC_MBUF_BIGCL);
4405 else
4406 rcp = m_cache(MC_MBUF_16KCL);
4407 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
4408
4409 if (needed == 0)
4410 goto fail;
4411
4412 /* This is temporarily reduced for calculation */
4413 ASSERT(nsegs > 1);
4414 nsegs--;
4415 }
4416
4417 /*
4418 * Attempt to allocate the rest of the composite mbuf + cluster
4419 * elements for the number of segment chains that we need.
4420 */
4421 if (bufsize <= m_maxsize(MC_CL))
4422 cp = m_cache(MC_MBUF_CL);
4423 else if (bufsize <= m_maxsize(MC_BIGCL))
4424 cp = m_cache(MC_MBUF_BIGCL);
4425 else
4426 cp = m_cache(MC_MBUF_16KCL);
4427 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
4428
4429 /* Round it down to avoid creating a partial segment chain */
4430 needed = (needed / nsegs) * nsegs;
4431 if (needed == 0)
4432 goto fail;
4433
4434 if (resid > 0) {
4435 /*
4436 * We're about to construct the chain(s); take into account
4437 * the number of segments we have created above to hold the
4438 * residual data for each chain, as well as restore the
4439 * original count of segments per chain.
4440 */
4441 ASSERT(nsegs > 0);
4442 needed += needed / nsegs;
4443 nsegs++;
4444 }
4445
4446 for (;;) {
4447 struct mbuf *m;
4448 u_int16_t flag;
4449 struct ext_ref *rfa;
4450 void *cl;
4451 int pkthdr;
4452 m_ext_free_func_t m_free_func;
4453
4454 ++num;
4455 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
4456 m = (struct mbuf *)mp_list;
4457 mp_list = mp_list->obj_next;
4458 } else {
4459 m = (struct mbuf *)rmp_list;
4460 rmp_list = rmp_list->obj_next;
4461 }
4462 m_free_func = m_get_ext_free(m);
4463 ASSERT(m != NULL);
4464 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4465 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
4466 m_free_func == m_16kfree);
4467
4468 cl = m->m_ext.ext_buf;
4469 rfa = m_get_rfa(m);
4470
4471 ASSERT(cl != NULL && rfa != NULL);
4472 VERIFY(MBUF_IS_COMPOSITE(m));
4473
4474 flag = MEXT_FLAGS(m);
4475
4476 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4477 if (pkthdr)
4478 first = m;
4479 MBUF_INIT(m, pkthdr, MT_DATA);
4480 if (m_free_func == m_16kfree) {
4481 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4482 } else if (m_free_func == m_bigfree) {
4483 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4484 } else {
4485 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4486 }
4487 #if CONFIG_MACF_NET
4488 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4489 --num;
4490 m_freem(m);
4491 break;
4492 }
4493 #endif /* MAC_NET */
4494
4495 *np = m;
4496 if ((num % nsegs) == 0)
4497 np = &first->m_nextpkt;
4498 else
4499 np = &m->m_next;
4500
4501 if (num == needed)
4502 break;
4503 }
4504
4505 if (num > 0) {
4506 mtype_stat_add(MT_DATA, num);
4507 mtype_stat_sub(MT_FREE, num);
4508 }
4509
4510 num /= nsegs;
4511
4512 /* We've got them all; return to caller */
4513 if (num == *numlist) {
4514 ASSERT(mp_list == NULL && rmp_list == NULL);
4515 return (top);
4516 }
4517
4518 fail:
4519 /* Free up what's left of the above */
4520 if (mp_list != NULL)
4521 mcache_free_ext(cp, mp_list);
4522 if (rmp_list != NULL)
4523 mcache_free_ext(rcp, rmp_list);
4524 if (wantall && top != NULL) {
4525 m_freem(top);
4526 return (NULL);
4527 }
4528 *numlist = num;
4529 return (top);
4530 }
4531
4532 /*
4533 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4534 * packets on the receive ring.
4535 */
4536 __private_extern__ struct mbuf *
4537 m_getpacket_how(int wait)
4538 {
4539 unsigned int num_needed = 1;
4540
4541 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4542 m_maxsize(MC_CL)));
4543 }
4544
4545 /*
4546 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4547 * packets on the receive ring.
4548 */
4549 struct mbuf *
4550 m_getpacket(void)
4551 {
4552 unsigned int num_needed = 1;
4553
4554 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4555 m_maxsize(MC_CL)));
4556 }
4557
4558 /*
4559 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4560 * if this can't be met, return however many are available. Set up the
4561 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4562 * are chained on the m_nextpkt field. Any packets requested beyond this are
4563 * chained onto the last packet header's m_next field.
4564 */
4565 struct mbuf *
4566 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4567 {
4568 unsigned int n = num_needed;
4569
4570 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4571 m_maxsize(MC_CL)));
4572 }
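
/*
 * Sketch of a driver-style use of m_getpackets(): request one 2KB
 * cluster packet per receive slot, accepting fewer under memory
 * pressure. `example_fill_rx_ring' is hypothetical.
 */
#if 0
static struct mbuf *
example_fill_rx_ring(int slots)
{
	/*
	 * All `slots' packets get pkthdrs and are chained on m_nextpkt;
	 * the list may be shorter than requested since wantall is 0.
	 */
	return (m_getpackets(slots, slots, M_DONTWAIT));
}
#endif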
4573
4574 /*
4575 * Return a list of mbuf hdrs set up as packet hdrs chained together
4576 * on the m_nextpkt field
4577 */
4578 struct mbuf *
4579 m_getpackethdrs(int num_needed, int how)
4580 {
4581 struct mbuf *m;
4582 struct mbuf **np, *top;
4583
4584 top = NULL;
4585 np = &top;
4586
4587 while (num_needed--) {
4588 m = _M_RETRYHDR(how, MT_DATA);
4589 if (m == NULL)
4590 break;
4591
4592 *np = m;
4593 np = &m->m_nextpkt;
4594 }
4595
4596 return (top);
4597 }
4598
4599 /*
4600 * Free an mbuf list (m_nextpkt) while following m_next. Returns the number
4601 * of packets freed. Used by the drivers.
4602 */
4603 int
4604 m_freem_list(struct mbuf *m)
4605 {
4606 struct mbuf *nextpkt;
4607 mcache_obj_t *mp_list = NULL;
4608 mcache_obj_t *mcl_list = NULL;
4609 mcache_obj_t *mbc_list = NULL;
4610 mcache_obj_t *m16k_list = NULL;
4611 mcache_obj_t *m_mcl_list = NULL;
4612 mcache_obj_t *m_mbc_list = NULL;
4613 mcache_obj_t *m_m16k_list = NULL;
4614 mcache_obj_t *ref_list = NULL;
4615 int pktcount = 0;
4616 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4617
4618 while (m != NULL) {
4619 pktcount++;
4620
4621 nextpkt = m->m_nextpkt;
4622 m->m_nextpkt = NULL;
4623
4624 while (m != NULL) {
4625 struct mbuf *next = m->m_next;
4626 mcache_obj_t *o, *rfa;
4627 u_int32_t composite;
4628 u_int16_t refcnt;
4629 m_ext_free_func_t m_free_func;
4630
4631 if (m->m_type == MT_FREE)
4632 panic("m_free: freeing an already freed mbuf");
4633
4634 if (m->m_flags & M_PKTHDR) {
4635 /* Check for scratch area overflow */
4636 m_redzone_verify(m);
4637 /* Free the aux data and tags if there is any */
4638 m_tag_delete_chain(m, NULL);
4639 }
4640
4641 if (!(m->m_flags & M_EXT)) {
4642 mt_free++;
4643 goto simple_free;
4644 }
4645
4646 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4647 m = next;
4648 continue;
4649 }
4650
4651 mt_free++;
4652
4653 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4654 refcnt = m_decref(m);
4655 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4656 m_free_func = m_get_ext_free(m);
4657 if (refcnt == MEXT_MINREF(m) && !composite) {
4658 if (m_free_func == NULL) {
4659 o->obj_next = mcl_list;
4660 mcl_list = o;
4661 } else if (m_free_func == m_bigfree) {
4662 o->obj_next = mbc_list;
4663 mbc_list = o;
4664 } else if (m_free_func == m_16kfree) {
4665 o->obj_next = m16k_list;
4666 m16k_list = o;
4667 } else {
4668 (*(m_free_func))((caddr_t)o,
4669 m->m_ext.ext_size,
4670 m_get_ext_arg(m));
4671 }
4672 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
4673 rfa->obj_next = ref_list;
4674 ref_list = rfa;
4675 m_set_ext(m, NULL, NULL, NULL);
4676 } else if (refcnt == MEXT_MINREF(m) && composite) {
4677 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4678 VERIFY(m->m_type != MT_FREE);
4679 /*
4680 * Amortize the costs of atomic operations
4681 * by doing them at the end, if possible.
4682 */
4683 if (m->m_type == MT_DATA)
4684 mt_data++;
4685 else if (m->m_type == MT_HEADER)
4686 mt_header++;
4687 else if (m->m_type == MT_SONAME)
4688 mt_soname++;
4689 else if (m->m_type == MT_TAG)
4690 mt_tag++;
4691 else
4692 mtype_stat_dec(m->m_type);
4693
4694 m->m_type = MT_FREE;
4695 m->m_flags = M_EXT;
4696 m->m_len = 0;
4697 m->m_next = m->m_nextpkt = NULL;
4698
4699 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4700
4701 /* "Free" into the intermediate cache */
4702 o = (mcache_obj_t *)m;
4703 if (m_free_func == NULL) {
4704 o->obj_next = m_mcl_list;
4705 m_mcl_list = o;
4706 } else if (m_free_func == m_bigfree) {
4707 o->obj_next = m_mbc_list;
4708 m_mbc_list = o;
4709 } else {
4710 VERIFY(m_free_func == m_16kfree);
4711 o->obj_next = m_m16k_list;
4712 m_m16k_list = o;
4713 }
4714 m = next;
4715 continue;
4716 }
4717 simple_free:
4718 /*
4719 * Amortize the costs of atomic operations
4720 * by doing them at the end, if possible.
4721 */
4722 if (m->m_type == MT_DATA)
4723 mt_data++;
4724 else if (m->m_type == MT_HEADER)
4725 mt_header++;
4726 else if (m->m_type == MT_SONAME)
4727 mt_soname++;
4728 else if (m->m_type == MT_TAG)
4729 mt_tag++;
4730 else if (m->m_type != MT_FREE)
4731 mtype_stat_dec(m->m_type);
4732
4733 m->m_type = MT_FREE;
4734 m->m_flags = m->m_len = 0;
4735 m->m_next = m->m_nextpkt = NULL;
4736
4737 ((mcache_obj_t *)m)->obj_next = mp_list;
4738 mp_list = (mcache_obj_t *)m;
4739
4740 m = next;
4741 }
4742
4743 m = nextpkt;
4744 }
4745
4746 if (mt_free > 0)
4747 mtype_stat_add(MT_FREE, mt_free);
4748 if (mt_data > 0)
4749 mtype_stat_sub(MT_DATA, mt_data);
4750 if (mt_header > 0)
4751 mtype_stat_sub(MT_HEADER, mt_header);
4752 if (mt_soname > 0)
4753 mtype_stat_sub(MT_SONAME, mt_soname);
4754 if (mt_tag > 0)
4755 mtype_stat_sub(MT_TAG, mt_tag);
4756
4757 if (mp_list != NULL)
4758 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4759 if (mcl_list != NULL)
4760 mcache_free_ext(m_cache(MC_CL), mcl_list);
4761 if (mbc_list != NULL)
4762 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4763 if (m16k_list != NULL)
4764 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4765 if (m_mcl_list != NULL)
4766 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4767 if (m_mbc_list != NULL)
4768 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4769 if (m_m16k_list != NULL)
4770 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4771 if (ref_list != NULL)
4772 mcache_free_ext(ref_cache, ref_list);
4773
4774 return (pktcount);
4775 }
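
/*
 * Sketch of batched freeing with m_freem_list(); `example_drop_batch'
 * is hypothetical.
 */
#if 0
static int
example_drop_batch(struct mbuf *pkt_list)
{
	/* frees every packet on the m_nextpkt chain in one pass */
	return (m_freem_list(pkt_list));
}
#endif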
4776
4777 void
4778 m_freem(struct mbuf *m)
4779 {
4780 while (m != NULL)
4781 m = m_free(m);
4782 }
4783
4784 /*
4785 * Mbuffer utility routines.
4786 */
4787
4788 /*
4789 * Compute the amount of space available before the current start
4790 * of data in an mbuf.
4791 */
4792 int
4793 m_leadingspace(struct mbuf *m)
4794 {
4795 if (m->m_flags & M_EXT) {
4796 if (MCLHASREFERENCE(m))
4797 return (0);
4798 return (m->m_data - m->m_ext.ext_buf);
4799 }
4800 if (m->m_flags & M_PKTHDR)
4801 return (m->m_data - m->m_pktdat);
4802 return (m->m_data - m->m_dat);
4803 }
4804
4805 /*
4806 * Compute the amount of space available after the end of data in an mbuf.
4807 */
4808 int
4809 m_trailingspace(struct mbuf *m)
4810 {
4811 if (m->m_flags & M_EXT) {
4812 if (MCLHASREFERENCE(m))
4813 return (0);
4814 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4815 (m->m_data + m->m_len));
4816 }
4817 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4818 }
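
/*
 * Sketch of an in-place append guarded by m_trailingspace(); note that
 * the routine also refuses shared (read-only) clusters, for which
 * m_trailingspace() reports 0. `example_append' is hypothetical.
 */
#if 0
static int
example_append(struct mbuf *m, const void *src, int len)
{
	if (m_trailingspace(m) < len)
		return (0);	/* caller must grow the chain instead */

	bcopy(src, MTOD(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	/* caller adjusts m_pkthdr.len on the leading mbuf if needed */
	return (1);
}
#endif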
4819
4820 /*
4821 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4822 * copy junk along. Does not adjust packet header length.
4823 */
4824 struct mbuf *
4825 m_prepend(struct mbuf *m, int len, int how)
4826 {
4827 struct mbuf *mn;
4828
4829 _MGET(mn, how, m->m_type);
4830 if (mn == NULL) {
4831 m_freem(m);
4832 return (NULL);
4833 }
4834 if (m->m_flags & M_PKTHDR) {
4835 M_COPY_PKTHDR(mn, m);
4836 m->m_flags &= ~M_PKTHDR;
4837 }
4838 mn->m_next = m;
4839 m = mn;
4840 if (m->m_flags & M_PKTHDR) {
4841 VERIFY(len <= MHLEN);
4842 MH_ALIGN(m, len);
4843 } else {
4844 VERIFY(len <= MLEN);
4845 M_ALIGN(m, len);
4846 }
4847 m->m_len = len;
4848 return (m);
4849 }
4850
4851 /*
4852 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4853 * chain, copy junk along, and adjust length.
4854 */
4855 struct mbuf *
4856 m_prepend_2(struct mbuf *m, int len, int how, int align)
4857 {
4858 if (M_LEADINGSPACE(m) >= len &&
4859 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
4860 m->m_data -= len;
4861 m->m_len += len;
4862 } else {
4863 m = m_prepend(m, len, how);
4864 }
4865 if ((m) && (m->m_flags & M_PKTHDR))
4866 m->m_pkthdr.len += len;
4867 return (m);
4868 }
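
/*
 * Sketch of prepending a protocol header with m_prepend_2(); hdrlen
 * must fit in an mbuf (<= MHLEN) in case a new mbuf is needed.
 * `example_prepend_hdr' is hypothetical.
 */
#if 0
static struct mbuf *
example_prepend_hdr(struct mbuf *m, int hdrlen, int wait)
{
	/* align == 1 keeps the new data 32-bit aligned */
	m = m_prepend_2(m, hdrlen, wait, 1);
	if (m == NULL)
		return (NULL);	/* the original chain was freed */

	/* write the header at MTOD(m, caddr_t); m_pkthdr.len is updated */
	return (m);
}
#endif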
4869
4870 /*
4871 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4872 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
4873 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4874 */
4875 int MCFail;
4876
4877 struct mbuf *
4878 m_copym_mode(struct mbuf *m, int off0, int len, int wait, uint32_t mode)
4879 {
4880 struct mbuf *n, *mhdr = NULL, **np;
4881 int off = off0;
4882 struct mbuf *top;
4883 int copyhdr = 0;
4884
4885 if (off < 0 || len < 0)
4886 panic("m_copym: invalid offset %d or len %d", off, len);
4887
4888 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
4889 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
4890
4891 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
4892 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
4893 mhdr = m;
4894 copyhdr = 1;
4895 }
4896
4897 while (off >= m->m_len) {
4898 if (m->m_next == NULL)
4899 panic("m_copym: invalid mbuf chain");
4900 off -= m->m_len;
4901 m = m->m_next;
4902 }
4903 np = &top;
4904 top = NULL;
4905
4906 while (len > 0) {
4907 if (m == NULL) {
4908 if (len != M_COPYALL)
4909 panic("m_copym: len != M_COPYALL");
4910 break;
4911 }
4912
4913 if (copyhdr)
4914 n = _M_RETRYHDR(wait, m->m_type);
4915 else
4916 n = _M_RETRY(wait, m->m_type);
4917 *np = n;
4918
4919 if (n == NULL)
4920 goto nospace;
4921
4922 if (copyhdr != 0) {
4923 if ((mode == M_COPYM_MOVE_HDR) ||
4924 (mode == M_COPYM_MUST_MOVE_HDR)) {
4925 M_COPY_PKTHDR(n, mhdr);
4926 } else if ((mode == M_COPYM_COPY_HDR) ||
4927 (mode == M_COPYM_MUST_COPY_HDR)) {
4928 if (m_dup_pkthdr(n, mhdr, wait) == 0)
4929 goto nospace;
4930 }
4931 if (len == M_COPYALL)
4932 n->m_pkthdr.len -= off0;
4933 else
4934 n->m_pkthdr.len = len;
4935 copyhdr = 0;
4936 /*
4937 * There is data to copy from the packet header mbuf
4938 * if it is empty or it is before the starting offset
4939 */
4940 if (mhdr != m) {
4941 np = &n->m_next;
4942 continue;
4943 }
4944 }
4945 n->m_len = MIN(len, (m->m_len - off));
4946 if (m->m_flags & M_EXT) {
4947 n->m_ext = m->m_ext;
4948 m_incref(m);
4949 n->m_data = m->m_data + off;
4950 n->m_flags |= M_EXT;
4951 } else {
4952 /*
4953 * Limit to the capacity of the destination
4954 */
4955 if (n->m_flags & M_PKTHDR)
4956 n->m_len = MIN(n->m_len, MHLEN);
4957 else
4958 n->m_len = MIN(n->m_len, MLEN);
4959
4960 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
4961 panic("%s n %p copy overflow",
4962 __func__, n);
4963
4964 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4965 (unsigned)n->m_len);
4966 }
4967 if (len != M_COPYALL)
4968 len -= n->m_len;
4969 off = 0;
4970 m = m->m_next;
4971 np = &n->m_next;
4972 }
4973
4974 if (top == NULL)
4975 MCFail++;
4976
4977 return (top);
4978 nospace:
4979
4980 m_freem(top);
4981 MCFail++;
4982 return (NULL);
4983 }
4984
4985
4986 struct mbuf *
4987 m_copym(struct mbuf *m, int off0, int len, int wait)
4988 {
4989 return (m_copym_mode(m, off0, len, wait, M_COPYM_MOVE_HDR));
4990 }
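
/*
 * Sketch of copying a whole chain with m_copym(); cluster data is
 * shared by reference, and with the M_COPYM_MOVE_HDR mode used by this
 * wrapper the pkthdr (and its tags) moves to the copy.
 * `example_copy_chain' is hypothetical.
 */
#if 0
static struct mbuf *
example_copy_chain(struct mbuf *pkt, int wait)
{
	return (m_copym(pkt, 0, M_COPYALL, wait));
}
#endif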
4991
4992 /*
4993 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4994 * within this routine. Also, the last mbuf and offset accessed are passed
4995 * out and can be passed back in to avoid having to rescan the entire mbuf
4996 * list (normally hung off of the socket).
4997 */
4998 struct mbuf *
4999 m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
5000 struct mbuf **m_lastm, int *m_off, uint32_t mode)
5001 {
5002 struct mbuf *m = m0, *n, **np = NULL;
5003 int off = off0, len = len0;
5004 struct mbuf *top = NULL;
5005 int mcflags = MSLEEPF(wait);
5006 int copyhdr = 0;
5007 int type = 0;
5008 mcache_obj_t *list = NULL;
5009 int needed = 0;
5010
5011 if (off == 0 && (m->m_flags & M_PKTHDR))
5012 copyhdr = 1;
5013
5014 if (m_lastm != NULL && *m_lastm != NULL) {
5015 m = *m_lastm;
5016 off = *m_off;
5017 } else {
5018 while (off >= m->m_len) {
5019 off -= m->m_len;
5020 m = m->m_next;
5021 }
5022 }
5023
5024 n = m;
5025 while (len > 0) {
5026 needed++;
5027 ASSERT(n != NULL);
5028 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
5029 n = n->m_next;
5030 }
5031 needed++;
5032 len = len0;
5033
5034 /*
5035 * If the caller doesn't want to be put to sleep, mark it with
5036 * MCR_TRYHARD so that we may reclaim buffers from other places
5037 * before giving up.
5038 */
5039 if (mcflags & MCR_NOSLEEP)
5040 mcflags |= MCR_TRYHARD;
5041
5042 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
5043 mcflags) != needed)
5044 goto nospace;
5045
5046 needed = 0;
5047 while (len > 0) {
5048 n = (struct mbuf *)list;
5049 list = list->obj_next;
5050 ASSERT(n != NULL && m != NULL);
5051
5052 type = (top == NULL) ? MT_HEADER : m->m_type;
5053 MBUF_INIT(n, (top == NULL), type);
5054 #if CONFIG_MACF_NET
5055 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
5056 mtype_stat_inc(MT_HEADER);
5057 mtype_stat_dec(MT_FREE);
5058 m_free(n);
5059 goto nospace;
5060 }
5061 #endif /* MAC_NET */
5062
5063 if (top == NULL) {
5064 top = n;
5065 np = &top->m_next;
5066 continue;
5067 } else {
5068 needed++;
5069 *np = n;
5070 }
5071
5072 if (copyhdr) {
5073 if ((mode == M_COPYM_MOVE_HDR) ||
5074 (mode == M_COPYM_MUST_MOVE_HDR)) {
5075 M_COPY_PKTHDR(n, m);
5076 } else if ((mode == M_COPYM_COPY_HDR) ||
5077 (mode == M_COPYM_MUST_COPY_HDR)) {
5078 if (m_dup_pkthdr(n, m, wait) == 0)
5079 goto nospace;
5080 }
5081 n->m_pkthdr.len = len;
5082 copyhdr = 0;
5083 }
5084 n->m_len = MIN(len, (m->m_len - off));
5085
5086 if (m->m_flags & M_EXT) {
5087 n->m_ext = m->m_ext;
5088 m_incref(m);
5089 n->m_data = m->m_data + off;
5090 n->m_flags |= M_EXT;
5091 } else {
5092 if (MTOD(n, char *) + n->m_len > ((char *)n) + MSIZE)
5093 panic("%s n %p copy overflow",
5094 __func__, n);
5095
5096 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
5097 (unsigned)n->m_len);
5098 }
5099 len -= n->m_len;
5100
5101 if (len == 0) {
5102 if (m_lastm != NULL && m_off != NULL) {
5103 if ((off + n->m_len) == m->m_len) {
5104 *m_lastm = m->m_next;
5105 *m_off = 0;
5106 } else {
5107 *m_lastm = m;
5108 *m_off = off + n->m_len;
5109 }
5110 }
5111 break;
5112 }
5113 off = 0;
5114 m = m->m_next;
5115 np = &n->m_next;
5116 }
5117
5118 mtype_stat_inc(MT_HEADER);
5119 mtype_stat_add(type, needed);
5120 mtype_stat_sub(MT_FREE, needed + 1);
5121
5122 ASSERT(list == NULL);
5123 return (top);
5124
5125 nospace:
5126 if (list != NULL)
5127 mcache_free_ext(m_cache(MC_MBUF), list);
5128 if (top != NULL)
5129 m_freem(top);
5130 MCFail++;
5131 return (NULL);
5132 }
5133
5134 /*
5135 * Copy data from an mbuf chain starting "off" bytes from the beginning,
5136 * continuing for "len" bytes, into the indicated buffer.
5137 */
5138 void
5139 m_copydata(struct mbuf *m, int off, int len, void *vp)
5140 {
5141 int off0 = off, len0 = len;
5142 struct mbuf *m0 = m;
5143 unsigned count;
5144 char *cp = vp;
5145
5146 if (__improbable(off < 0 || len < 0)) {
5147 panic("%s: invalid offset %d or len %d", __func__, off, len);
5148 /* NOTREACHED */
5149 }
5150
5151 while (off > 0) {
5152 if (__improbable(m == NULL)) {
5153 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5154 __func__, m0, off0, len0);
5155 /* NOTREACHED */
5156 }
5157 if (off < m->m_len)
5158 break;
5159 off -= m->m_len;
5160 m = m->m_next;
5161 }
5162 while (len > 0) {
5163 if (__improbable(m == NULL)) {
5164 panic("%s: invalid mbuf chain %p [off %d, len %d]",
5165 __func__, m0, off0, len0);
5166 /* NOTREACHED */
5167 }
5168 count = MIN(m->m_len - off, len);
5169 bcopy(MTOD(m, caddr_t) + off, cp, count);
5170 len -= count;
5171 cp += count;
5172 off = 0;
5173 m = m->m_next;
5174 }
5175 }
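
/*
 * Sketch of linearizing part of a chain with m_copydata(); the chain
 * must contain at least off + len bytes or the routine panics.
 * `example_peek' is hypothetical.
 */
#if 0
static void
example_peek(struct mbuf *m, int off, int len, void *dst)
{
	/* copies len bytes starting at off, crossing mbuf boundaries */
	m_copydata(m, off, len, dst);
}
#endif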
5176
5177 /*
5178 * Concatenate mbuf chain n to m. Both chains must be of the same type
5179 * (e.g. MT_DATA). The m_pkthdr, if any, is not updated.
5180 */
5181 void
5182 m_cat(struct mbuf *m, struct mbuf *n)
5183 {
5184 while (m->m_next)
5185 m = m->m_next;
5186 while (n) {
5187 if ((m->m_flags & M_EXT) ||
5188 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
5189 /* just join the two chains */
5190 m->m_next = n;
5191 return;
5192 }
5193 /* splat the data from one into the other */
5194 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5195 (u_int)n->m_len);
5196 m->m_len += n->m_len;
5197 n = m_free(n);
5198 }
5199 }
5200
5201 void
5202 m_adj(struct mbuf *mp, int req_len)
5203 {
5204 int len = req_len;
5205 struct mbuf *m;
5206 int count;
5207
5208 if ((m = mp) == NULL)
5209 return;
5210 if (len >= 0) {
5211 /*
5212 * Trim from head.
5213 */
5214 while (m != NULL && len > 0) {
5215 if (m->m_len <= len) {
5216 len -= m->m_len;
5217 m->m_len = 0;
5218 m = m->m_next;
5219 } else {
5220 m->m_len -= len;
5221 m->m_data += len;
5222 len = 0;
5223 }
5224 }
5225 m = mp;
5226 if (m->m_flags & M_PKTHDR)
5227 m->m_pkthdr.len -= (req_len - len);
5228 } else {
5229 /*
5230 * Trim from tail. Scan the mbuf chain,
5231 * calculating its length and finding the last mbuf.
5232 * If the adjustment only affects this mbuf, then just
5233 * adjust and return. Otherwise, rescan and truncate
5234 * after the remaining size.
5235 */
5236 len = -len;
5237 count = 0;
5238 for (;;) {
5239 count += m->m_len;
5240 if (m->m_next == (struct mbuf *)0)
5241 break;
5242 m = m->m_next;
5243 }
5244 if (m->m_len >= len) {
5245 m->m_len -= len;
5246 m = mp;
5247 if (m->m_flags & M_PKTHDR)
5248 m->m_pkthdr.len -= len;
5249 return;
5250 }
5251 count -= len;
5252 if (count < 0)
5253 count = 0;
5254 /*
5255 * Correct length for chain is "count".
5256 * Find the mbuf with last data, adjust its length,
5257 * and toss data from remaining mbufs on chain.
5258 */
5259 m = mp;
5260 if (m->m_flags & M_PKTHDR)
5261 m->m_pkthdr.len = count;
5262 for (; m; m = m->m_next) {
5263 if (m->m_len >= count) {
5264 m->m_len = count;
5265 break;
5266 }
5267 count -= m->m_len;
5268 }
5269 while ((m = m->m_next))
5270 m->m_len = 0;
5271 }
5272 }
5273
5274 /*
5275 * Rearrange an mbuf chain so that len bytes are contiguous
5276 * and in the data area of an mbuf (so that mtod and dtom
5277 * will work for a structure of size len). Returns the resulting
5278 * mbuf chain on success, frees it and returns null on failure.
5279 * If there is room, it will add up to max_protohdr-len extra bytes to the
5280 * contiguous region in an attempt to avoid being called next time.
5281 */
5282 int MPFail;
5283
5284 struct mbuf *
5285 m_pullup(struct mbuf *n, int len)
5286 {
5287 struct mbuf *m;
5288 int count;
5289 int space;
5290
5291 /*
5292 * If first mbuf has no cluster, and has room for len bytes
5293 * without shifting current data, pullup into it,
5294 * otherwise allocate a new mbuf to prepend to the chain.
5295 */
5296 if ((n->m_flags & M_EXT) == 0 &&
5297 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
5298 if (n->m_len >= len)
5299 return (n);
5300 m = n;
5301 n = n->m_next;
5302 len -= m->m_len;
5303 } else {
5304 if (len > MHLEN)
5305 goto bad;
5306 _MGET(m, M_DONTWAIT, n->m_type);
5307 if (m == 0)
5308 goto bad;
5309 m->m_len = 0;
5310 if (n->m_flags & M_PKTHDR) {
5311 M_COPY_PKTHDR(m, n);
5312 n->m_flags &= ~M_PKTHDR;
5313 }
5314 }
5315 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5316 do {
5317 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
5318 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
5319 (unsigned)count);
5320 len -= count;
5321 m->m_len += count;
5322 n->m_len -= count;
5323 space -= count;
5324 if (n->m_len)
5325 n->m_data += count;
5326 else
5327 n = m_free(n);
5328 } while (len > 0 && n);
5329 if (len > 0) {
5330 (void) m_free(m);
5331 goto bad;
5332 }
5333 m->m_next = n;
5334 return (m);
5335 bad:
5336 m_freem(n);
5337 MPFail++;
5338 return (0);
5339 }
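
/*
 * Sketch of the classic m_pullup() pattern used before casting the
 * front of a chain to a header structure; len must not exceed MHLEN.
 * `example_pullup_hdr' is hypothetical.
 */
#if 0
static struct mbuf *
example_pullup_hdr(struct mbuf *m, int hdrlen)
{
	if (m->m_len < hdrlen && (m = m_pullup(m, hdrlen)) == NULL)
		return (NULL);	/* chain already freed on failure */

	/* the first hdrlen bytes are now contiguous at MTOD(m, ...) */
	return (m);
}
#endif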
5340
5341 /*
5342 * Like m_pullup(), except a new mbuf is always allocated, and we allow
5343 * the amount of empty space before the data in the new mbuf to be specified
5344 * (in the event that the caller expects to prepend later).
5345 */
5346 __private_extern__ int MSFail = 0;
5347
5348 __private_extern__ struct mbuf *
5349 m_copyup(struct mbuf *n, int len, int dstoff)
5350 {
5351 struct mbuf *m;
5352 int count, space;
5353
5354 if (len > (MHLEN - dstoff))
5355 goto bad;
5356 MGET(m, M_DONTWAIT, n->m_type);
5357 if (m == NULL)
5358 goto bad;
5359 m->m_len = 0;
5360 if (n->m_flags & M_PKTHDR) {
5361 m_copy_pkthdr(m, n);
5362 n->m_flags &= ~M_PKTHDR;
5363 }
5364 m->m_data += dstoff;
5365 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
5366 do {
5367 count = min(min(max(len, max_protohdr), space), n->m_len);
5368 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
5369 (unsigned)count);
5370 len -= count;
5371 m->m_len += count;
5372 n->m_len -= count;
5373 space -= count;
5374 if (n->m_len)
5375 n->m_data += count;
5376 else
5377 n = m_free(n);
5378 } while (len > 0 && n);
5379 if (len > 0) {
5380 (void) m_free(m);
5381 goto bad;
5382 }
5383 m->m_next = n;
5384 return (m);
5385 bad:
5386 m_freem(n);
5387 MSFail++;
5388 return (NULL);
5389 }
5390
5391 /*
5392 * Partition an mbuf chain in two pieces, returning the tail --
5393 * all but the first len0 bytes. In case of failure, it returns NULL and
5394 * attempts to restore the chain to its original state.
5395 */
5396 struct mbuf *
5397 m_split(struct mbuf *m0, int len0, int wait)
5398 {
5399 return (m_split0(m0, len0, wait, 1));
5400 }
5401
5402 static struct mbuf *
5403 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
5404 {
5405 struct mbuf *m, *n;
5406 unsigned len = len0, remain;
5407
5408 for (m = m0; m && len > m->m_len; m = m->m_next)
5409 len -= m->m_len;
5410 if (m == NULL)
5411 return (NULL);
5412 remain = m->m_len - len;
5413 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
5414 _MGETHDR(n, wait, m0->m_type);
5415 if (n == NULL)
5416 return (NULL);
5417 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
5418 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
5419 m0->m_pkthdr.len = len0;
5420 if (m->m_flags & M_EXT)
5421 goto extpacket;
5422 if (remain > MHLEN) {
5423 /* m can't be the lead packet */
5424 MH_ALIGN(n, 0);
5425 n->m_next = m_split(m, len, wait);
5426 if (n->m_next == NULL) {
5427 (void) m_free(n);
5428 return (NULL);
5429 } else
5430 return (n);
5431 } else
5432 MH_ALIGN(n, remain);
5433 } else if (remain == 0) {
5434 n = m->m_next;
5435 m->m_next = NULL;
5436 return (n);
5437 } else {
5438 _MGET(n, wait, m->m_type);
5439 if (n == NULL)
5440 return (NULL);
5441 M_ALIGN(n, remain);
5442 }
5443 extpacket:
5444 if (m->m_flags & M_EXT) {
5445 n->m_flags |= M_EXT;
5446 n->m_ext = m->m_ext;
5447 m_incref(m);
5448 n->m_data = m->m_data + len;
5449 } else {
5450 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
5451 }
5452 n->m_len = remain;
5453 m->m_len = len;
5454 n->m_next = m->m_next;
5455 m->m_next = NULL;
5456 return (n);
5457 }
5458
5459 /*
5460 * Routine to copy from device local memory into mbufs.
5461 */
5462 struct mbuf *
5463 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
5464 void (*copy)(const void *, void *, size_t))
5465 {
5466 struct mbuf *m;
5467 struct mbuf *top = NULL, **mp = &top;
5468 int off = off0, len;
5469 char *cp;
5470 char *epkt;
5471
5472 cp = buf;
5473 epkt = cp + totlen;
5474 if (off) {
5475 /*
5476 * If 'off' is non-zero, packet is trailer-encapsulated,
5477 * so we have to skip the type and length fields.
5478 */
5479 cp += off + 2 * sizeof (u_int16_t);
5480 totlen -= 2 * sizeof (u_int16_t);
5481 }
5482 _MGETHDR(m, M_DONTWAIT, MT_DATA);
5483 if (m == NULL)
5484 return (NULL);
5485 m->m_pkthdr.rcvif = ifp;
5486 m->m_pkthdr.len = totlen;
5487 m->m_len = MHLEN;
5488
5489 while (totlen > 0) {
5490 if (top != NULL) {
5491 _MGET(m, M_DONTWAIT, MT_DATA);
5492 if (m == NULL) {
5493 m_freem(top);
5494 return (NULL);
5495 }
5496 m->m_len = MLEN;
5497 }
5498 len = MIN(totlen, epkt - cp);
5499 if (len >= MINCLSIZE) {
5500 MCLGET(m, M_DONTWAIT);
5501 if (m->m_flags & M_EXT) {
5502 m->m_len = len = MIN(len, m_maxsize(MC_CL));
5503 } else {
5504 /* give up when it's out of cluster mbufs */
5505 if (top != NULL)
5506 m_freem(top);
5507 m_freem(m);
5508 return (NULL);
5509 }
5510 } else {
5511 /*
5512 * Place initial small packet/header at end of mbuf.
5513 */
5514 if (len < m->m_len) {
5515 if (top == NULL &&
5516 len + max_linkhdr <= m->m_len)
5517 m->m_data += max_linkhdr;
5518 m->m_len = len;
5519 } else {
5520 len = m->m_len;
5521 }
5522 }
5523 if (copy)
5524 copy(cp, MTOD(m, caddr_t), (unsigned)len);
5525 else
5526 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
5527 cp += len;
5528 *mp = m;
5529 mp = &m->m_next;
5530 totlen -= len;
5531 if (cp == epkt)
5532 cp = buf;
5533 }
5534 return (top);
5535 }
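
/*
 * Editor's sketch (not part of the original source): how a driver might
 * wrap a contiguous receive buffer with m_devget(); the example_* name
 * and the calling context are hypothetical.
 */
#if 0
static struct mbuf *
example_wrap_rx_frame(struct ifnet *ifp, char *dma_buf, int frame_len)
{
	/*
	 * off0 == 0 means there is no trailer encapsulation to skip; a
	 * NULL copy function makes m_devget() fall back to bcopy().
	 */
	return (m_devget(dma_buf, frame_len, 0, ifp, NULL));
}
#endif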
5536
5537 #ifndef MBUF_GROWTH_NORMAL_THRESH
5538 #define MBUF_GROWTH_NORMAL_THRESH 25
5539 #endif
5540
5541 /*
5542 * Cluster freelist allocation check.
5543 */
5544 static int
5545 m_howmany(int num, size_t bufsize)
5546 {
5547 int i = 0, j = 0;
5548 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5549 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5550 u_int32_t sumclusters, freeclusters;
5551 u_int32_t percent_pool, percent_kmem;
5552 u_int32_t mb_growth, mb_growth_thresh;
5553
5554 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5555 bufsize == m_maxsize(MC_16KCL));
5556
5557 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5558
5559 /* Numbers in 2K cluster units */
5560 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5561 m_clusters = m_total(MC_CL);
5562 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5563 m_16kclusters = m_total(MC_16KCL);
5564 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5565
5566 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5567 m_clfree = m_infree(MC_CL);
5568 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5569 m_16kclfree = m_infree(MC_16KCL);
5570 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5571
5572 /* Bail if we've maxed out the mbuf memory map */
5573 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5574 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5575 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5576 return (0);
5577 }
5578
5579 if (bufsize == m_maxsize(MC_BIGCL)) {
5580 /* Under minimum */
5581 if (m_bigclusters < m_minlimit(MC_BIGCL))
5582 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5583
5584 percent_pool =
5585 ((sumclusters - freeclusters) * 100) / sumclusters;
5586 percent_kmem = (sumclusters * 100) / nclusters;
5587
5588 /*
5589		 * If a light/normal user, grow conservatively (only above 75%
5590		 * pool utilization); if a heavy user, grow aggressively (above 50%)
5591 */
5592 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5593 mb_growth = MB_GROWTH_NORMAL;
5594 else
5595 mb_growth = MB_GROWTH_AGGRESSIVE;
5596
5597 if (percent_kmem < 5) {
5598 /* For initial allocations */
5599 i = num;
5600 } else {
5601 /* Return if >= MBIGCL_LOWAT clusters available */
5602 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5603 m_total(MC_BIGCL) >=
5604 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5605 return (0);
5606
5607 /* Ensure at least num clusters are accessible */
5608 if (num >= m_infree(MC_BIGCL))
5609 i = num - m_infree(MC_BIGCL);
5610 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5611 j = num - (m_total(MC_BIGCL) -
5612 m_minlimit(MC_BIGCL));
5613
5614 i = MAX(i, j);
5615
5616 /*
5617 * Grow pool if percent_pool > 75 (normal growth)
5618 * or percent_pool > 50 (aggressive growth).
5619 */
5620 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5621 if (percent_pool > mb_growth_thresh)
5622 j = ((sumclusters + num) >> mb_growth) -
5623 freeclusters;
5624 i = MAX(i, j);
5625 }
5626
5627 /* Check to ensure we didn't go over limits */
5628 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5629 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5630 if ((i << 1) + sumclusters >= nclusters)
5631 i = (nclusters - sumclusters) >> 1;
5632 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5633 VERIFY(sumclusters + (i << 1) <= nclusters);
5634
5635 } else { /* 16K CL */
5636 VERIFY(njcl > 0);
5637 /* Ensure at least num clusters are available */
5638 if (num >= m_16kclfree)
5639 i = num - m_16kclfree;
5640
5641 /* Always grow 16KCL pool aggressively */
5642 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5643 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5644 i = MAX(i, j);
5645
5646 /* Check to ensure we don't go over limit */
5647 if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL))
5648 i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5649 }
5650 return (i);
5651 }
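
/*
 * Editor's note (worked example, inferred from the code above rather than
 * stated there): mb_growth_thresh = 100 - (100 / (1 << mb_growth)), so the
 * 75%/50% figures in the comment imply MB_GROWTH_NORMAL == 2 and
 * MB_GROWTH_AGGRESSIVE == 1 (both constants are defined elsewhere in this
 * file).  With normal growth the pool is therefore grown only once more
 * than 75% of the 2K-cluster-equivalents are in use, and then by roughly
 * ((sumclusters + num) >> 2) - freeclusters additional 2K units.
 */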
5652 /*
5653 * Return the number of bytes in the mbuf chain, m.
5654 */
5655 unsigned int
5656 m_length(struct mbuf *m)
5657 {
5658 struct mbuf *m0;
5659 unsigned int pktlen;
5660
5661 if (m->m_flags & M_PKTHDR)
5662 return (m->m_pkthdr.len);
5663
5664 pktlen = 0;
5665 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5666 pktlen += m0->m_len;
5667 return (pktlen);
5668 }
5669
5670 /*
5671 * Copy data from a buffer back into the indicated mbuf chain,
5672 * starting "off" bytes from the beginning, extending the mbuf
5673 * chain if necessary.
5674 */
5675 void
5676 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5677 {
5678 #if DEBUG
5679 struct mbuf *origm = m0;
5680 int error;
5681 #endif /* DEBUG */
5682
5683 if (m0 == NULL)
5684 return;
5685
5686 #if DEBUG
5687 error =
5688 #endif /* DEBUG */
5689 m_copyback0(&m0, off, len, cp,
5690 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5691
5692 #if DEBUG
5693 if (error != 0 || (m0 != NULL && origm != m0))
5694 panic("m_copyback");
5695 #endif /* DEBUG */
5696 }
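
/*
 * Editor's sketch (not part of the original source): writing into a chain
 * with m_copyback(); the example_* name is invented for illustration.
 */
#if 0
static void
example_patch_word(struct mbuf *m, int off, uint32_t word)
{
	/*
	 * Overwrites 4 bytes at 'off', growing the chain if it is shorter
	 * than off + sizeof (word).  m_copyback() returns void, so callers
	 * cannot observe an allocation failure here.
	 */
	m_copyback(m, off, sizeof (word), &word);
}
#endif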
5697
5698 struct mbuf *
5699 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5700 {
5701 int error;
5702
5703 /* don't support chain expansion */
5704 VERIFY(off + len <= m_length(m0));
5705
5706 error = m_copyback0(&m0, off, len, cp,
5707 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5708 if (error) {
5709 /*
5710 * no way to recover from partial success.
5711 * just free the chain.
5712 */
5713 m_freem(m0);
5714 return (NULL);
5715 }
5716 return (m0);
5717 }
5718
5719 /*
5720 * m_makewritable: ensure the specified range writable.
5721 */
5722 int
5723 m_makewritable(struct mbuf **mp, int off, int len, int how)
5724 {
5725 int error;
5726 #if DEBUG
5727 struct mbuf *n;
5728 int origlen, reslen;
5729
5730 origlen = m_length(*mp);
5731 #endif /* DEBUG */
5732
5733 #if 0 /* M_COPYALL is large enough */
5734 if (len == M_COPYALL)
5735 len = m_length(*mp) - off; /* XXX */
5736 #endif
5737
5738 error = m_copyback0(mp, off, len, NULL,
5739 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5740
5741 #if DEBUG
5742 reslen = 0;
5743 for (n = *mp; n; n = n->m_next)
5744 reslen += n->m_len;
5745 if (origlen != reslen)
5746 panic("m_makewritable: length changed");
5747 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5748 panic("m_makewritable: inconsist");
5749 #endif /* DEBUG */
5750
5751 return (error);
5752 }
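
/*
 * Editor's sketch (not part of the original source): typical use of
 * m_makewritable() before modifying a header in place; the example_* name
 * is hypothetical.
 */
#if 0
static int
example_make_header_writable(struct mbuf **mp, int hdrlen)
{
	/*
	 * On success the first hdrlen bytes of *mp can be written in place
	 * even if the chain originally shared a read-only cluster; the
	 * chain head may be replaced, hence the struct mbuf **.
	 */
	return (m_makewritable(mp, 0, hdrlen, M_DONTWAIT));
}
#endif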
5753
5754 static int
5755 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5756 int how)
5757 {
5758 int mlen;
5759 struct mbuf *m, *n;
5760 struct mbuf **mp;
5761 int totlen = 0;
5762 const char *cp = vp;
5763
5764 VERIFY(mp0 != NULL);
5765 VERIFY(*mp0 != NULL);
5766 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5767 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5768
5769 /*
5770 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5771 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5772 */
5773
5774 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5775
5776 mp = mp0;
5777 m = *mp;
5778 while (off > (mlen = m->m_len)) {
5779 off -= mlen;
5780 totlen += mlen;
5781 if (m->m_next == NULL) {
5782 int tspace;
5783 extend:
5784 if (!(flags & M_COPYBACK0_EXTEND))
5785 goto out;
5786
5787 /*
5788 * try to make some space at the end of "m".
5789 */
5790
5791 mlen = m->m_len;
5792 if (off + len >= MINCLSIZE &&
5793 !(m->m_flags & M_EXT) && m->m_len == 0) {
5794 MCLGET(m, how);
5795 }
5796 tspace = M_TRAILINGSPACE(m);
5797 if (tspace > 0) {
5798 tspace = MIN(tspace, off + len);
5799 VERIFY(tspace > 0);
5800 bzero(mtod(m, char *) + m->m_len,
5801 MIN(off, tspace));
5802 m->m_len += tspace;
5803 off += mlen;
5804 totlen -= mlen;
5805 continue;
5806 }
5807
5808 /*
5809 * need to allocate an mbuf.
5810 */
5811
5812 if (off + len >= MINCLSIZE) {
5813 n = m_getcl(how, m->m_type, 0);
5814 } else {
5815 n = _M_GET(how, m->m_type);
5816 }
5817 if (n == NULL) {
5818 goto out;
5819 }
5820 n->m_len = 0;
5821 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5822 bzero(mtod(n, char *), MIN(n->m_len, off));
5823 m->m_next = n;
5824 }
5825 mp = &m->m_next;
5826 m = m->m_next;
5827 }
5828 while (len > 0) {
5829 mlen = m->m_len - off;
5830 if (mlen != 0 && m_mclhasreference(m)) {
5831 char *datap;
5832 int eatlen;
5833
5834 /*
5835 * this mbuf is read-only.
5836 * allocate a new writable mbuf and try again.
5837 */
5838
5839 #if DIAGNOSTIC
5840 if (!(flags & M_COPYBACK0_COW))
5841 panic("m_copyback0: read-only");
5842 #endif /* DIAGNOSTIC */
5843
5844 /*
5845 * if we're going to write into the middle of
5846			 * an mbuf, split it first.
5847 */
5848 if (off > 0 && len < mlen) {
5849 n = m_split0(m, off, how, 0);
5850 if (n == NULL)
5851 goto enobufs;
5852 m->m_next = n;
5853 mp = &m->m_next;
5854 m = n;
5855 off = 0;
5856 continue;
5857 }
5858
5859 /*
5860 * XXX TODO coalesce into the trailingspace of
5861 * the previous mbuf when possible.
5862 */
5863
5864 /*
5865 * allocate a new mbuf. copy packet header if needed.
5866 */
5867 n = _M_GET(how, m->m_type);
5868 if (n == NULL)
5869 goto enobufs;
5870 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5871 M_COPY_PKTHDR(n, m);
5872 n->m_len = MHLEN;
5873 } else {
5874 if (len >= MINCLSIZE)
5875 MCLGET(n, M_DONTWAIT);
5876 n->m_len =
5877 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5878 }
5879 if (n->m_len > len)
5880 n->m_len = len;
5881
5882 /*
5883			 * free the region which has been overwritten,
5884			 * copying data from old mbufs if requested.
5885 */
5886 if (flags & M_COPYBACK0_PRESERVE)
5887 datap = mtod(n, char *);
5888 else
5889 datap = NULL;
5890 eatlen = n->m_len;
5891 VERIFY(off == 0 || eatlen >= mlen);
5892 if (off > 0) {
5893 VERIFY(len >= mlen);
5894 m->m_len = off;
5895 m->m_next = n;
5896 if (datap) {
5897 m_copydata(m, off, mlen, datap);
5898 datap += mlen;
5899 }
5900 eatlen -= mlen;
5901 mp = &m->m_next;
5902 m = m->m_next;
5903 }
5904 while (m != NULL && m_mclhasreference(m) &&
5905 n->m_type == m->m_type && eatlen > 0) {
5906 mlen = MIN(eatlen, m->m_len);
5907 if (datap) {
5908 m_copydata(m, 0, mlen, datap);
5909 datap += mlen;
5910 }
5911 m->m_data += mlen;
5912 m->m_len -= mlen;
5913 eatlen -= mlen;
5914 if (m->m_len == 0)
5915 *mp = m = m_free(m);
5916 }
5917 if (eatlen > 0)
5918 n->m_len -= eatlen;
5919 n->m_next = m;
5920 *mp = m = n;
5921 continue;
5922 }
5923 mlen = MIN(mlen, len);
5924 if (flags & M_COPYBACK0_COPYBACK) {
5925 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5926 cp += mlen;
5927 }
5928 len -= mlen;
5929 mlen += off;
5930 off = 0;
5931 totlen += mlen;
5932 if (len == 0)
5933 break;
5934 if (m->m_next == NULL) {
5935 goto extend;
5936 }
5937 mp = &m->m_next;
5938 m = m->m_next;
5939 }
5940 out:
5941 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5942 VERIFY(flags & M_COPYBACK0_EXTEND);
5943 m->m_pkthdr.len = totlen;
5944 }
5945
5946 return (0);
5947
5948 enobufs:
5949 return (ENOBUFS);
5950 }
5951
5952 uint64_t
5953 mcl_to_paddr(char *addr)
5954 {
5955 vm_offset_t base_phys;
5956
5957 if (!MBUF_IN_MAP(addr))
5958 return (0);
5959 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
5960
5961 if (base_phys == 0)
5962 return (0);
5963 return ((uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK)));
5964 }
5965
5966 /*
5967 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5968 * And really copy the thing. That way, we don't "precompute" checksums
5969 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5970 * small packets, don't dup into a cluster. That way received packets
5971 * don't take up too much room in the sockbuf (cf. sbspace()).
5972 */
5973 int MDFail;
5974
5975 struct mbuf *
5976 m_dup(struct mbuf *m, int how)
5977 {
5978 struct mbuf *n, **np;
5979 struct mbuf *top;
5980 int copyhdr = 0;
5981
5982 np = &top;
5983 top = NULL;
5984 if (m->m_flags & M_PKTHDR)
5985 copyhdr = 1;
5986
5987 /*
5988 * Quick check: if we have one mbuf and its data fits in an
5989 * mbuf with packet header, just copy and go.
5990 */
5991 if (m->m_next == NULL) {
5992 /* Then just move the data into an mbuf and be done... */
5993 if (copyhdr) {
5994 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5995 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5996 return (NULL);
5997 n->m_len = m->m_len;
5998 m_dup_pkthdr(n, m, how);
5999 bcopy(m->m_data, n->m_data, m->m_len);
6000 return (n);
6001 }
6002 } else if (m->m_len <= MLEN) {
6003 if ((n = _M_GET(how, m->m_type)) == NULL)
6004 return (NULL);
6005 bcopy(m->m_data, n->m_data, m->m_len);
6006 n->m_len = m->m_len;
6007 return (n);
6008 }
6009 }
6010 while (m != NULL) {
6011 #if BLUE_DEBUG
6012 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
6013 m->m_data);
6014 #endif
6015 if (copyhdr)
6016 n = _M_GETHDR(how, m->m_type);
6017 else
6018 n = _M_GET(how, m->m_type);
6019 if (n == NULL)
6020 goto nospace;
6021 if (m->m_flags & M_EXT) {
6022 if (m->m_len <= m_maxsize(MC_CL))
6023 MCLGET(n, how);
6024 else if (m->m_len <= m_maxsize(MC_BIGCL))
6025 n = m_mbigget(n, how);
6026 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
6027 n = m_m16kget(n, how);
6028 if (!(n->m_flags & M_EXT)) {
6029 (void) m_free(n);
6030 goto nospace;
6031 }
6032 }
6033 *np = n;
6034 if (copyhdr) {
6035 /* Don't use M_COPY_PKTHDR: preserve m_data */
6036 m_dup_pkthdr(n, m, how);
6037 copyhdr = 0;
6038 if (!(n->m_flags & M_EXT))
6039 n->m_data = n->m_pktdat;
6040 }
6041 n->m_len = m->m_len;
6042 /*
6043		 * Get the dup on the same boundary as the original.
6044		 * Assume that the two mbufs have the same offset to the data area
6045		 * (up to word boundaries).
6046 */
6047 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
6048 m = m->m_next;
6049 np = &n->m_next;
6050 #if BLUE_DEBUG
6051 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
6052 n->m_data);
6053 #endif
6054 }
6055
6056 if (top == NULL)
6057 MDFail++;
6058 return (top);
6059
6060 nospace:
6061 m_freem(top);
6062 MDFail++;
6063 return (NULL);
6064 }
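
/*
 * Editor's sketch (not part of the original source): contrast between
 * m_dup() and reference-sharing copies; the example_* name is hypothetical.
 */
#if 0
static struct mbuf *
example_deep_copy(struct mbuf *pkt)
{
	/*
	 * m_dup() copies the payload bytes, so the duplicate can be
	 * modified without affecting clusters shared with the original.
	 */
	return (m_dup(pkt, M_DONTWAIT));
}
#endif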
6065
6066 #define MBUF_MULTIPAGES(m) \
6067 (((m)->m_flags & M_EXT) && \
6068 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
6069 && (m)->m_len > PAGE_SIZE) || \
6070 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
6071 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
6072
6073 static struct mbuf *
6074 m_expand(struct mbuf *m, struct mbuf **last)
6075 {
6076 struct mbuf *top = NULL;
6077 struct mbuf **nm = &top;
6078 uintptr_t data0, data;
6079 unsigned int len0, len;
6080
6081 VERIFY(MBUF_MULTIPAGES(m));
6082 VERIFY(m->m_next == NULL);
6083 data0 = (uintptr_t)m->m_data;
6084 len0 = m->m_len;
6085 *last = top;
6086
6087 for (;;) {
6088 struct mbuf *n;
6089
6090 data = data0;
6091 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE)
6092 len = PAGE_SIZE;
6093 else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
6094 P2ROUNDUP(data, PAGE_SIZE) < (data + len0))
6095 len = P2ROUNDUP(data, PAGE_SIZE) - data;
6096 else
6097 len = len0;
6098
6099 VERIFY(len > 0);
6100 VERIFY(m->m_flags & M_EXT);
6101 m->m_data = (void *)data;
6102 m->m_len = len;
6103
6104 *nm = *last = m;
6105 nm = &m->m_next;
6106 m->m_next = NULL;
6107
6108 data0 += len;
6109 len0 -= len;
6110 if (len0 == 0)
6111 break;
6112
6113 n = _M_RETRY(M_DONTWAIT, MT_DATA);
6114 if (n == NULL) {
6115 m_freem(top);
6116 top = *last = NULL;
6117 break;
6118 }
6119
6120 n->m_ext = m->m_ext;
6121 m_incref(m);
6122 n->m_flags |= M_EXT;
6123 m = n;
6124 }
6125 return (top);
6126 }
6127
6128 struct mbuf *
6129 m_normalize(struct mbuf *m)
6130 {
6131 struct mbuf *top = NULL;
6132 struct mbuf **nm = &top;
6133 boolean_t expanded = FALSE;
6134
6135 while (m != NULL) {
6136 struct mbuf *n;
6137
6138 n = m->m_next;
6139 m->m_next = NULL;
6140
6141 /* Does the data cross one or more page boundaries? */
6142 if (MBUF_MULTIPAGES(m)) {
6143 struct mbuf *last;
6144 if ((m = m_expand(m, &last)) == NULL) {
6145 m_freem(n);
6146 m_freem(top);
6147 top = NULL;
6148 break;
6149 }
6150 *nm = m;
6151 nm = &last->m_next;
6152 expanded = TRUE;
6153 } else {
6154 *nm = m;
6155 nm = &m->m_next;
6156 }
6157 m = n;
6158 }
6159 if (expanded)
6160 atomic_add_32(&mb_normalized, 1);
6161 return (top);
6162 }
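
/*
 * Editor's note (worked example, assuming PAGE_SIZE == 4096): a
 * cluster-backed mbuf whose m_data is page-aligned and whose m_len is
 * 6000 satisfies MBUF_MULTIPAGES(), so m_normalize()/m_expand() rewrites
 * it as a 4096-byte mbuf followed by a 1904-byte mbuf that references the
 * same m_ext (the refcount is bumped via m_incref()).  If m_data is not
 * page-aligned, the first piece only runs up to the next page boundary.
 */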
6163
6164 /*
6165 * Append the specified data to the indicated mbuf chain,
6166 * extending the mbuf chain if the new data does not fit in
6167 * existing space.
6168 *
6169 * Return 1 if able to complete the job; otherwise 0.
6170 */
6171 int
6172 m_append(struct mbuf *m0, int len, caddr_t cp)
6173 {
6174 struct mbuf *m, *n;
6175 int remainder, space;
6176
6177 for (m = m0; m->m_next != NULL; m = m->m_next)
6178 ;
6179 remainder = len;
6180 space = M_TRAILINGSPACE(m);
6181 if (space > 0) {
6182 /*
6183 * Copy into available space.
6184 */
6185 if (space > remainder)
6186 space = remainder;
6187 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
6188 m->m_len += space;
6189 cp += space;
6190 remainder -= space;
6191 }
6192 while (remainder > 0) {
6193 /*
6194 * Allocate a new mbuf; could check space
6195 * and allocate a cluster instead.
6196 */
6197 n = m_get(M_WAITOK, m->m_type);
6198 if (n == NULL)
6199 break;
6200 n->m_len = min(MLEN, remainder);
6201 bcopy(cp, mtod(n, caddr_t), n->m_len);
6202 cp += n->m_len;
6203 remainder -= n->m_len;
6204 m->m_next = n;
6205 m = n;
6206 }
6207 if (m0->m_flags & M_PKTHDR)
6208 m0->m_pkthdr.len += len - remainder;
6209 return (remainder == 0);
6210 }
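
/*
 * Editor's sketch (not part of the original source): appending a small
 * trailer with m_append(); the example_* name is hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *pkt, caddr_t trailer, int len)
{
	/* m_append() returns 1 on success, 0 if it could not allocate */
	if (m_append(pkt, len, trailer) == 0)
		return (ENOBUFS);
	return (0);
}
#endif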
6211
6212 struct mbuf *
6213 m_last(struct mbuf *m)
6214 {
6215 while (m->m_next != NULL)
6216 m = m->m_next;
6217 return (m);
6218 }
6219
6220 unsigned int
6221 m_fixhdr(struct mbuf *m0)
6222 {
6223 u_int len;
6224
6225 VERIFY(m0->m_flags & M_PKTHDR);
6226
6227 len = m_length2(m0, NULL);
6228 m0->m_pkthdr.len = len;
6229 return (len);
6230 }
6231
6232 unsigned int
6233 m_length2(struct mbuf *m0, struct mbuf **last)
6234 {
6235 struct mbuf *m;
6236 u_int len;
6237
6238 len = 0;
6239 for (m = m0; m != NULL; m = m->m_next) {
6240 len += m->m_len;
6241 if (m->m_next == NULL)
6242 break;
6243 }
6244 if (last != NULL)
6245 *last = m;
6246 return (len);
6247 }
6248
6249 /*
6250 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
6251 * and clusters. If allocation fails and this cannot be completed, NULL will
6252 * be returned, but the passed in chain will be unchanged. Upon success,
6253 * the original chain will be freed, and the new chain will be returned.
6254 *
6255 * If a non-packet-header mbuf is passed in, the original mbuf (or chain) will
6256 * be returned unharmed.
6257 *
6258 * If offset is specified, the first mbuf in the chain will have a leading
6259 * space of the amount stated by the "off" parameter.
6260 *
6261 * This routine requires that the m_pkthdr.pkt_hdr field of the original
6262 * mbuf chain is cleared by the caller.
6263 */
6264 struct mbuf *
6265 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
6266 {
6267 struct mbuf *m_new = NULL, *m_final = NULL;
6268 int progress = 0, length, pktlen;
6269
6270 if (!(m0->m_flags & M_PKTHDR))
6271 return (m0);
6272
6273 VERIFY(off < MHLEN);
6274 m_fixhdr(m0); /* Needed sanity check */
6275
6276 pktlen = m0->m_pkthdr.len + off;
6277 if (pktlen > MHLEN)
6278 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
6279 else
6280 m_final = m_gethdr(how, MT_DATA);
6281
6282 if (m_final == NULL)
6283 goto nospace;
6284
6285 if (off > 0) {
6286 pktlen -= off;
6287 m_final->m_data += off;
6288 }
6289
6290 /*
6291 * Caller must have handled the contents pointed to by this
6292 * pointer before coming here, as otherwise it will point to
6293 * the original mbuf which will get freed upon success.
6294 */
6295 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
6296
6297 if (m_dup_pkthdr(m_final, m0, how) == 0)
6298 goto nospace;
6299
6300 m_new = m_final;
6301
6302 while (progress < pktlen) {
6303 length = pktlen - progress;
6304 if (length > MCLBYTES)
6305 length = MCLBYTES;
6306 length -= ((m_new == m_final) ? off : 0);
6307 if (length < 0)
6308 goto nospace;
6309
6310 if (m_new == NULL) {
6311 if (length > MLEN)
6312 m_new = m_getcl(how, MT_DATA, 0);
6313 else
6314 m_new = m_get(how, MT_DATA);
6315 if (m_new == NULL)
6316 goto nospace;
6317 }
6318
6319 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
6320 progress += length;
6321 m_new->m_len = length;
6322 if (m_new != m_final)
6323 m_cat(m_final, m_new);
6324 m_new = NULL;
6325 }
6326 m_freem(m0);
6327 m0 = m_final;
6328 return (m0);
6329 nospace:
6330 if (m_final)
6331 m_freem(m_final);
6332 return (NULL);
6333 }
6334
6335 struct mbuf *
6336 m_defrag(struct mbuf *m0, int how)
6337 {
6338 return (m_defrag_offset(m0, 0, how));
6339 }
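
/*
 * Editor's sketch (not part of the original source): compacting a chain
 * before handing it to hardware with a short scatter/gather list; the
 * example_* name and the DMA context are hypothetical.
 */
#if 0
static struct mbuf *
example_defrag_for_dma(struct mbuf *pkt)
{
	struct mbuf *d;

	d = m_defrag(pkt, M_DONTWAIT);
	if (d == NULL)
		return (pkt);	/* failure leaves the original untouched */
	return (d);		/* success freed the original chain */
}
#endif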
6340
6341 void
6342 m_mchtype(struct mbuf *m, int t)
6343 {
6344 mtype_stat_inc(t);
6345 mtype_stat_dec(m->m_type);
6346 (m)->m_type = t;
6347 }
6348
6349 void *
6350 m_mtod(struct mbuf *m)
6351 {
6352 return (MTOD(m, void *));
6353 }
6354
6355 struct mbuf *
6356 m_dtom(void *x)
6357 {
6358 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
6359 }
6360
6361 void
6362 m_mcheck(struct mbuf *m)
6363 {
6364 _MCHECK(m);
6365 }
6366
6367 /*
6368 * Return a pointer to mbuf/offset of location in mbuf chain.
6369 */
6370 struct mbuf *
6371 m_getptr(struct mbuf *m, int loc, int *off)
6372 {
6373
6374 while (loc >= 0) {
6375 /* Normal end of search. */
6376 if (m->m_len > loc) {
6377 *off = loc;
6378 return (m);
6379 } else {
6380 loc -= m->m_len;
6381 if (m->m_next == NULL) {
6382 if (loc == 0) {
6383 /* Point at the end of valid data. */
6384 *off = m->m_len;
6385 return (m);
6386 }
6387 return (NULL);
6388 }
6389 m = m->m_next;
6390 }
6391 }
6392 return (NULL);
6393 }
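
/*
 * Editor's sketch (not part of the original source): reading one byte at
 * an arbitrary chain offset via m_getptr(); the example_* name is
 * hypothetical.
 */
#if 0
static int
example_peek_byte(struct mbuf *chain, int loc, uint8_t *out)
{
	struct mbuf *m;
	int off;

	m = m_getptr(chain, loc, &off);
	/* off == m->m_len means 'loc' points just past the valid data */
	if (m == NULL || off >= m->m_len)
		return (ERANGE);
	*out = *(mtod(m, uint8_t *) + off);
	return (0);
}
#endif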
6394
6395 /*
6396 * Inform the corresponding mcache(s) that there's a waiter below.
6397 */
6398 static void
6399 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
6400 {
6401 mcache_waiter_inc(m_cache(class));
6402 if (comp) {
6403 if (class == MC_CL) {
6404 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6405 } else if (class == MC_BIGCL) {
6406 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6407 } else if (class == MC_16KCL) {
6408 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
6409 } else {
6410 mcache_waiter_inc(m_cache(MC_MBUF_CL));
6411 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
6412 }
6413 }
6414 }
6415
6416 /*
6417 * Inform the corresponding mcache(s) that there's no more waiter below.
6418 */
6419 static void
6420 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
6421 {
6422 mcache_waiter_dec(m_cache(class));
6423 if (comp) {
6424 if (class == MC_CL) {
6425 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6426 } else if (class == MC_BIGCL) {
6427 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6428 } else if (class == MC_16KCL) {
6429 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
6430 } else {
6431 mcache_waiter_dec(m_cache(MC_MBUF_CL));
6432 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
6433 }
6434 }
6435 }
6436
6437 /*
6438 * Called during slab (blocking and non-blocking) allocation. If there
6439 * is at least one waiter, and the time since the first waiter blocked
6440 * is greater than the watchdog timeout, panic the system.
6441 */
6442 static void
6443 mbuf_watchdog(void)
6444 {
6445 struct timeval now;
6446 unsigned int since;
6447
6448 if (mb_waiters == 0 || !mb_watchdog)
6449 return;
6450
6451 microuptime(&now);
6452 since = now.tv_sec - mb_wdtstart.tv_sec;
6453 if (since >= MB_WDT_MAXTIME) {
6454 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
6455 mb_waiters, since, mbuf_dump());
6456 /* NOTREACHED */
6457 }
6458 }
6459
6460 /*
6461 * Called during blocking allocation. Returns TRUE if one or more objects
6462 * are available at the per-CPU caches layer and that allocation should be
6463 * retried at that level.
6464 */
6465 static boolean_t
6466 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
6467 {
6468 boolean_t mcache_retry = FALSE;
6469
6470 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6471
6472 /* Check if there's anything at the cache layer */
6473 if (mbuf_cached_above(class, wait)) {
6474 mcache_retry = TRUE;
6475 goto done;
6476 }
6477
6478 /* Nothing? Then try hard to get it from somewhere */
6479 m_reclaim(class, num, (wait & MCR_COMP));
6480
6481 /* We tried hard and got something? */
6482 if (m_infree(class) > 0) {
6483 mbstat.m_wait++;
6484 goto done;
6485 } else if (mbuf_cached_above(class, wait)) {
6486 mbstat.m_wait++;
6487 mcache_retry = TRUE;
6488 goto done;
6489 } else if (wait & MCR_TRYHARD) {
6490 mcache_retry = TRUE;
6491 goto done;
6492 }
6493
6494 /*
6495 * There's really nothing for us right now; inform the
6496 * cache(s) that there is a waiter below and go to sleep.
6497 */
6498 mbuf_waiter_inc(class, (wait & MCR_COMP));
6499
6500 VERIFY(!(wait & MCR_NOSLEEP));
6501
6502 /*
6503 * If this is the first waiter, arm the watchdog timer. Otherwise
6504 * check if we need to panic the system due to watchdog timeout.
6505 */
6506 if (mb_waiters == 0)
6507 microuptime(&mb_wdtstart);
6508 else
6509 mbuf_watchdog();
6510
6511 mb_waiters++;
6512 m_region_expand(class) += m_total(class) + num;
6513 /* wake up the worker thread */
6514 if (class > MC_MBUF && mbuf_worker_ready &&
6515 mbuf_worker_needs_wakeup) {
6516 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
6517 mbuf_worker_needs_wakeup = FALSE;
6518 }
6519
6520 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
6521
6522 /* We are now up; stop getting notified until next round */
6523 mbuf_waiter_dec(class, (wait & MCR_COMP));
6524
6525 /* We waited and got something */
6526 if (m_infree(class) > 0) {
6527 mbstat.m_wait++;
6528 goto done;
6529 } else if (mbuf_cached_above(class, wait)) {
6530 mbstat.m_wait++;
6531 mcache_retry = TRUE;
6532 }
6533 done:
6534 return (mcache_retry);
6535 }
6536
6537 __attribute__((noreturn))
6538 static void
6539 mbuf_worker_thread(void)
6540 {
6541 int mbuf_expand;
6542
6543 while (1) {
6544 lck_mtx_lock(mbuf_mlock);
6545 mbuf_worker_run_cnt++;
6546 mbuf_expand = 0;
6547 if (m_region_expand(MC_CL) > 0) {
6548 int n;
6549 mb_expand_cl_cnt++;
6550			/* Adjust to current number of clusters in use */
6551 n = m_region_expand(MC_CL) -
6552 (m_total(MC_CL) - m_infree(MC_CL));
6553 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6554 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6555 if (n > 0) {
6556 mb_expand_cl_total += n;
6557 }
6558 m_region_expand(MC_CL) = 0;
6559
6560 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6561 mbuf_expand++;
6562 }
6563 if (m_region_expand(MC_BIGCL) > 0) {
6564 int n;
6565 mb_expand_bigcl_cnt++;
6566			/* Adjust to current number of 4 KB clusters in use */
6567 n = m_region_expand(MC_BIGCL) -
6568 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6569 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6570 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6571 if (n > 0) {
6572 mb_expand_bigcl_total += n;
6573 }
6574 m_region_expand(MC_BIGCL) = 0;
6575
6576 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6577 mbuf_expand++;
6578 }
6579 if (m_region_expand(MC_16KCL) > 0) {
6580 int n;
6581 mb_expand_16kcl_cnt++;
6582			/* Adjust to current number of 16 KB clusters in use */
6583 n = m_region_expand(MC_16KCL) -
6584 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6585 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6586 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6587 if (n > 0) {
6588 mb_expand_16kcl_total += n;
6589 }
6590 m_region_expand(MC_16KCL) = 0;
6591
6592 if (n > 0)
6593 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6594 }
6595
6596 /*
6597 * Because we can run out of memory before filling the mbuf
6598		 * map, we should not allocate more clusters than there are
6599 * mbufs -- otherwise we could have a large number of useless
6600 * clusters allocated.
6601 */
6602 if (mbuf_expand) {
6603 while (m_total(MC_MBUF) <
6604 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6605 mb_expand_cnt++;
6606 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6607 break;
6608 }
6609 }
6610
6611 mbuf_worker_needs_wakeup = TRUE;
6612 /*
6613 * If there's a deadlock and we're not sending / receiving
6614 * packets, net_uptime() won't be updated. Update it here
6615 * so we are sure it's correct.
6616 */
6617 net_update_uptime();
6618 mbuf_worker_last_runtime = net_uptime();
6619 assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
6620 THREAD_UNINT);
6621 lck_mtx_unlock(mbuf_mlock);
6622 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6623 }
6624 }
6625
6626 __attribute__((noreturn))
6627 static void
6628 mbuf_worker_thread_init(void)
6629 {
6630 mbuf_worker_ready++;
6631 mbuf_worker_thread();
6632 }
6633
6634 static mcl_slab_t *
6635 slab_get(void *buf)
6636 {
6637 mcl_slabg_t *slg;
6638 unsigned int ix, k;
6639
6640 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6641
6642 VERIFY(MBUF_IN_MAP(buf));
6643 ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
6644 VERIFY(ix < maxslabgrp);
6645
6646 if ((slg = slabstbl[ix]) == NULL) {
6647 /*
6648 * In the current implementation, we never shrink the slabs
6649 * table; if we attempt to reallocate a cluster group when
6650 * it's already allocated, panic since this is a sign of a
6651 * memory corruption (slabstbl[ix] got nullified).
6652 */
6653 ++slabgrp;
6654 VERIFY(ix < slabgrp);
6655 /*
6656		 * Slab expansion can only be done single-threaded; when
6657 * we get here, it must be as a result of m_clalloc() which
6658 * is serialized and therefore mb_clalloc_busy must be set.
6659 */
6660 VERIFY(mb_clalloc_busy);
6661 lck_mtx_unlock(mbuf_mlock);
6662
6663 /* This is a new buffer; create the slabs group for it */
6664 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6665 M_WAITOK | M_ZERO);
6666 MALLOC(slg->slg_slab, mcl_slab_t *, sizeof(mcl_slab_t) * NSLABSPMB,
6667 M_TEMP, M_WAITOK | M_ZERO);
6668 VERIFY(slg != NULL && slg->slg_slab != NULL);
6669
6670 lck_mtx_lock(mbuf_mlock);
6671 /*
6672 * No other thread could have gone into m_clalloc() after
6673 * we dropped the lock above, so verify that it's true.
6674 */
6675 VERIFY(mb_clalloc_busy);
6676
6677 slabstbl[ix] = slg;
6678
6679 /* Chain each slab in the group to its forward neighbor */
6680 for (k = 1; k < NSLABSPMB; k++)
6681 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6682 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6683
6684 /* And chain the last slab in the previous group to this */
6685 if (ix > 0) {
6686 VERIFY(slabstbl[ix - 1]->
6687 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6688 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6689 &slg->slg_slab[0];
6690 }
6691 }
6692
6693 ix = MTOPG(buf) % NSLABSPMB;
6694 VERIFY(ix < NSLABSPMB);
6695
6696 return (&slg->slg_slab[ix]);
6697 }
6698
6699 static void
6700 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6701 void *base, void *head, unsigned int len, int refcnt, int chunks)
6702 {
6703 sp->sl_class = class;
6704 sp->sl_flags = flags;
6705 sp->sl_base = base;
6706 sp->sl_head = head;
6707 sp->sl_len = len;
6708 sp->sl_refcnt = refcnt;
6709 sp->sl_chunks = chunks;
6710 slab_detach(sp);
6711 }
6712
6713 static void
6714 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6715 {
6716 VERIFY(slab_is_detached(sp));
6717 m_slab_cnt(class)++;
6718 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6719 sp->sl_flags &= ~SLF_DETACHED;
6720
6721 /*
6722	 * If a buffer spans multiple contiguous pages then clear the
6723	 * SLF_DETACHED flag on the additional slabs as well
6724 */
6725 if (class == MC_16KCL) {
6726 int k;
6727 for (k = 1; k < NSLABSP16KB; k++) {
6728 sp = sp->sl_next;
6729 /* Next slab must already be present */
6730 VERIFY(sp != NULL && slab_is_detached(sp));
6731 sp->sl_flags &= ~SLF_DETACHED;
6732 }
6733 }
6734 }
6735
6736 static void
6737 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6738 {
6739 int k;
6740 VERIFY(!slab_is_detached(sp));
6741 VERIFY(m_slab_cnt(class) > 0);
6742 m_slab_cnt(class)--;
6743 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6744 slab_detach(sp);
6745 if (class == MC_16KCL) {
6746 for (k = 1; k < NSLABSP16KB; k++) {
6747 sp = sp->sl_next;
6748 /* Next slab must already be present */
6749 VERIFY(sp != NULL);
6750 VERIFY(!slab_is_detached(sp));
6751 slab_detach(sp);
6752 }
6753 }
6754 }
6755
6756 static boolean_t
6757 slab_inrange(mcl_slab_t *sp, void *buf)
6758 {
6759 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6760 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6761 }
6762
6763 #undef panic
6764
6765 static void
6766 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6767 {
6768 int i;
6769 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6770 uintptr_t buf = (uintptr_t)sp->sl_base;
6771
6772 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6773 void *next = ((mcache_obj_t *)buf)->obj_next;
6774 if (next != addr)
6775 continue;
6776 if (!mclverify) {
6777 if (next != NULL && !MBUF_IN_MAP(next)) {
6778 mcache_t *cp = m_cache(sp->sl_class);
6779 panic("%s: %s buffer %p in slab %p modified "
6780 "after free at offset 0: %p out of range "
6781 "[%p-%p)\n", __func__, cp->mc_name,
6782 (void *)buf, sp, next, mbutl, embutl);
6783 /* NOTREACHED */
6784 }
6785 } else {
6786 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6787 (mcache_obj_t *)buf);
6788 mcl_audit_verify_nextptr(next, mca);
6789 }
6790 }
6791 }
6792
6793 static void
6794 slab_detach(mcl_slab_t *sp)
6795 {
6796 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6797 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6798 sp->sl_flags |= SLF_DETACHED;
6799 }
6800
6801 static boolean_t
6802 slab_is_detached(mcl_slab_t *sp)
6803 {
6804 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6805 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6806 (sp->sl_flags & SLF_DETACHED));
6807 }
6808
6809 static void
6810 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6811 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6812 {
6813 mcache_audit_t *mca, *mca_tail;
6814 mcache_obj_t *con = NULL;
6815 boolean_t save_contents = (con_list != NULL);
6816 unsigned int i, ix;
6817
6818 ASSERT(num <= NMBPG);
6819 ASSERT(con_list == NULL || con_size != 0);
6820
6821 ix = MTOPG(buf);
6822 VERIFY(ix < maxclaudit);
6823
6824 /* Make sure we haven't been here before */
6825 for (i = 0; i < num; i++)
6826 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6827
6828 mca = mca_tail = *mca_list;
6829 if (save_contents)
6830 con = *con_list;
6831
6832 for (i = 0; i < num; i++) {
6833 mcache_audit_t *next;
6834
6835 next = mca->mca_next;
6836 bzero(mca, sizeof (*mca));
6837 mca->mca_next = next;
6838 mclaudit[ix].cl_audit[i] = mca;
6839
6840 /* Attach the contents buffer if requested */
6841 if (save_contents) {
6842 mcl_saved_contents_t *msc =
6843 (mcl_saved_contents_t *)(void *)con;
6844
6845 VERIFY(msc != NULL);
6846 VERIFY(IS_P2ALIGNED(msc, sizeof (u_int64_t)));
6847 VERIFY(con_size == sizeof (*msc));
6848 mca->mca_contents_size = con_size;
6849 mca->mca_contents = msc;
6850 con = con->obj_next;
6851 bzero(mca->mca_contents, mca->mca_contents_size);
6852 }
6853
6854 mca_tail = mca;
6855 mca = mca->mca_next;
6856 }
6857
6858 if (save_contents)
6859 *con_list = con;
6860
6861 *mca_list = mca_tail->mca_next;
6862 mca_tail->mca_next = NULL;
6863 }
6864
6865 static void
6866 mcl_audit_free(void *buf, unsigned int num)
6867 {
6868 unsigned int i, ix;
6869 mcache_audit_t *mca, *mca_list;
6870
6871 ix = MTOPG(buf);
6872 VERIFY(ix < maxclaudit);
6873
6874 if (mclaudit[ix].cl_audit[0] != NULL) {
6875 mca_list = mclaudit[ix].cl_audit[0];
6876 for (i = 0; i < num; i++) {
6877 mca = mclaudit[ix].cl_audit[i];
6878 mclaudit[ix].cl_audit[i] = NULL;
6879 if (mca->mca_contents)
6880 mcache_free(mcl_audit_con_cache,
6881 mca->mca_contents);
6882 }
6883 mcache_free_ext(mcache_audit_cache,
6884 (mcache_obj_t *)mca_list);
6885 }
6886 }
6887
6888 /*
6889 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6890 * the corresponding audit structure for that buffer.
6891 */
6892 static mcache_audit_t *
6893 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
6894 {
6895 mcache_audit_t *mca = NULL;
6896 int ix = MTOPG(mobj), m_idx = 0;
6897 unsigned char *page_addr;
6898
6899 VERIFY(ix < maxclaudit);
6900 VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
6901
6902 page_addr = PGTOM(ix);
6903
6904 switch (class) {
6905 case MC_MBUF:
6906 /*
6907 * For the mbuf case, find the index of the page
6908 * used by the mbuf and use that index to locate the
6909 * base address of the page. Then find out the
6910 * mbuf index relative to the page base and use
6911 * it to locate the audit structure.
6912 */
6913 m_idx = MBPAGEIDX(page_addr, mobj);
6914 VERIFY(m_idx < (int)NMBPG);
6915 mca = mclaudit[ix].cl_audit[m_idx];
6916 break;
6917
6918 case MC_CL:
6919 /*
6920 * Same thing as above, but for 2KB clusters in a page.
6921 */
6922 m_idx = CLPAGEIDX(page_addr, mobj);
6923 VERIFY(m_idx < (int)NCLPG);
6924 mca = mclaudit[ix].cl_audit[m_idx];
6925 break;
6926
6927 case MC_BIGCL:
6928 m_idx = BCLPAGEIDX(page_addr, mobj);
6929 VERIFY(m_idx < (int)NBCLPG);
6930 mca = mclaudit[ix].cl_audit[m_idx];
6931 break;
6932 case MC_16KCL:
6933 /*
6934 * Same as above, but only return the first element.
6935 */
6936 mca = mclaudit[ix].cl_audit[0];
6937 break;
6938
6939 default:
6940 VERIFY(0);
6941 /* NOTREACHED */
6942 }
6943
6944 return (mca);
6945 }
6946
6947 static void
6948 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6949 boolean_t alloc)
6950 {
6951 struct mbuf *m = addr;
6952 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6953
6954 VERIFY(mca->mca_contents != NULL &&
6955 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6956
6957 if (mclverify)
6958 mcl_audit_verify_nextptr(next, mca);
6959
6960 if (!alloc) {
6961 /* Save constructed mbuf fields */
6962 mcl_audit_save_mbuf(m, mca);
6963 if (mclverify) {
6964 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6965 m_maxsize(MC_MBUF));
6966 }
6967 ((mcache_obj_t *)m)->obj_next = next;
6968 return;
6969 }
6970
6971 /* Check if the buffer has been corrupted while in freelist */
6972 if (mclverify) {
6973 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6974 }
6975 /* Restore constructed mbuf fields */
6976 mcl_audit_restore_mbuf(m, mca, composite);
6977 }
6978
6979 static void
6980 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6981 {
6982 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
6983
6984 if (composite) {
6985 struct mbuf *next = m->m_next;
6986 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
6987 MBUF_IS_COMPOSITE(ms));
6988 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6989 /*
6990		 * We could have hand-picked the mbuf fields and restored
6991		 * them individually, but that would be a maintenance
6992 * headache. Instead, restore everything that was saved;
6993 * the mbuf layer will recheck and reinitialize anyway.
6994 */
6995 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
6996 m->m_next = next;
6997 } else {
6998 /*
6999 * For a regular mbuf (no cluster attached) there's nothing
7000 * to restore other than the type field, which is expected
7001 * to be MT_FREE.
7002 */
7003 m->m_type = ms->m_type;
7004 }
7005 _MCHECK(m);
7006 }
7007
7008 static void
7009 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
7010 {
7011 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
7012 _MCHECK(m);
7013 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
7014 }
7015
7016 static void
7017 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
7018 boolean_t save_next)
7019 {
7020 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
7021
7022 if (!alloc) {
7023 if (mclverify) {
7024 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
7025 }
7026 if (save_next) {
7027 mcl_audit_verify_nextptr(next, mca);
7028 ((mcache_obj_t *)addr)->obj_next = next;
7029 }
7030 } else if (mclverify) {
7031 /* Check if the buffer has been corrupted while in freelist */
7032 mcl_audit_verify_nextptr(next, mca);
7033 mcache_audit_free_verify_set(mca, addr, 0, size);
7034 }
7035 }
7036
7037 static void
7038 mcl_audit_scratch(mcache_audit_t *mca)
7039 {
7040 void *stack[MCACHE_STACK_DEPTH + 1];
7041 mcl_scratch_audit_t *msa;
7042 struct timeval now;
7043
7044 VERIFY(mca->mca_contents != NULL);
7045 msa = MCA_SAVED_SCRATCH_PTR(mca);
7046
7047 msa->msa_pthread = msa->msa_thread;
7048 msa->msa_thread = current_thread();
7049 bcopy(msa->msa_stack, msa->msa_pstack, sizeof (msa->msa_pstack));
7050 msa->msa_pdepth = msa->msa_depth;
7051 bzero(stack, sizeof (stack));
7052 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
7053 bcopy(&stack[1], msa->msa_stack, sizeof (msa->msa_stack));
7054
7055 msa->msa_ptstamp = msa->msa_tstamp;
7056 microuptime(&now);
7057	/* tstamp is in ms relative to mb_start */
7058 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
7059 if ((now.tv_sec - mb_start.tv_sec) > 0)
7060 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
7061 }
7062
7063 static void
7064 mcl_audit_mcheck_panic(struct mbuf *m)
7065 {
7066 mcache_audit_t *mca;
7067
7068 MRANGE(m);
7069 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7070
7071 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
7072 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
7073 /* NOTREACHED */
7074 }
7075
7076 static void
7077 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
7078 {
7079 if (next != NULL && !MBUF_IN_MAP(next) &&
7080 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
7081 panic("mcl_audit: buffer %p modified after free at offset 0: "
7082 "%p out of range [%p-%p)\n%s\n",
7083 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
7084 /* NOTREACHED */
7085 }
7086 }
7087
7088 /* This function turns on mbuf leak detection */
7089 static void
7090 mleak_activate(void)
7091 {
7092 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
7093 PE_parse_boot_argn("mleak_sample_factor",
7094 &mleak_table.mleak_sample_factor,
7095 sizeof (mleak_table.mleak_sample_factor));
7096
7097 if (mleak_table.mleak_sample_factor == 0)
7098 mclfindleak = 0;
7099
7100 if (mclfindleak == 0)
7101 return;
7102
7103 vm_size_t alloc_size =
7104 mleak_alloc_buckets * sizeof (struct mallocation);
7105 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
7106
7107 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
7108 M_TEMP, M_WAITOK | M_ZERO);
7109 VERIFY(mleak_allocations != NULL);
7110
7111 MALLOC(mleak_traces, struct mtrace *, trace_size,
7112 M_TEMP, M_WAITOK | M_ZERO);
7113 VERIFY(mleak_traces != NULL);
7114
7115 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
7116 M_TEMP, M_WAITOK | M_ZERO);
7117 VERIFY(mleak_stat != NULL);
7118 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
7119 #ifdef __LP64__
7120 mleak_stat->ml_isaddr64 = 1;
7121 #endif /* __LP64__ */
7122 }
7123
7124 static void
7125 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
7126 {
7127 int temp;
7128
7129 if (mclfindleak == 0)
7130 return;
7131
7132 if (!alloc)
7133 return (mleak_free(addr));
7134
7135 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
7136
7137 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
7138 uintptr_t bt[MLEAK_STACK_DEPTH];
7139 int logged = backtrace(bt, MLEAK_STACK_DEPTH);
7140 mleak_log(bt, addr, logged, num);
7141 }
7142 }
7143
7144 /*
7145 * This function records the allocation in the mleak_allocations table
7146 * and the backtrace in the mleak_traces table.  If the allocation slot is
7147 * in use, replace the old allocation with the new one; if the trace slot is
7148 * in use, bail out (or increment the refcount if it is the same trace).
7149 */
7150 static boolean_t
7151 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
7152 {
7153 struct mallocation *allocation;
7154 struct mtrace *trace;
7155 uint32_t trace_index;
7156
7157	/* Quit if someone else is modifying the tables */
7158 if (!lck_mtx_try_lock_spin(mleak_lock)) {
7159 mleak_table.total_conflicts++;
7160 return (FALSE);
7161 }
7162
7163 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
7164 mleak_alloc_buckets)];
7165 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
7166 trace = &mleak_traces[trace_index];
7167
7168 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
7169 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
7170
7171 allocation->hitcount++;
7172 trace->hitcount++;
7173
7174 /*
7175 * If the allocation bucket we want is occupied
7176 * and the occupier has the same trace, just bail.
7177 */
7178 if (allocation->element != NULL &&
7179 trace_index == allocation->trace_index) {
7180 mleak_table.alloc_collisions++;
7181 lck_mtx_unlock(mleak_lock);
7182 return (TRUE);
7183 }
7184
7185 /*
7186 * Store the backtrace in the traces array;
7187	 * (allocs == 0 means the trace bucket is free.)
7188 */
7189 if (trace->allocs > 0 &&
7190 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
7191 /* Different, unique trace, but the same hash! Bail out. */
7192 trace->collisions++;
7193 mleak_table.trace_collisions++;
7194 lck_mtx_unlock(mleak_lock);
7195 return (TRUE);
7196 } else if (trace->allocs > 0) {
7197 /* Same trace, already added, so increment refcount */
7198 trace->allocs++;
7199 } else {
7200 /* Found an unused trace bucket, so record the trace here */
7201 if (trace->depth != 0) {
7202 /* this slot previously used but not currently in use */
7203 mleak_table.trace_overwrites++;
7204 }
7205 mleak_table.trace_recorded++;
7206 trace->allocs = 1;
7207 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
7208 trace->depth = depth;
7209 trace->collisions = 0;
7210 }
7211
7212 /* Step 2: Store the allocation record in the allocations array */
7213 if (allocation->element != NULL) {
7214 /*
7215 * Replace an existing allocation. No need to preserve
7216 * because only a subset of the allocations are being
7217 * recorded anyway.
7218 */
7219 mleak_table.alloc_collisions++;
7220 } else if (allocation->trace_index != 0) {
7221 mleak_table.alloc_overwrites++;
7222 }
7223 allocation->element = addr;
7224 allocation->trace_index = trace_index;
7225 allocation->count = num;
7226 mleak_table.alloc_recorded++;
7227 mleak_table.outstanding_allocs++;
7228
7229 lck_mtx_unlock(mleak_lock);
7230 return (TRUE);
7231 }
7232
7233 static void
7234 mleak_free(mcache_obj_t *addr)
7235 {
7236 while (addr != NULL) {
7237 struct mallocation *allocation = &mleak_allocations
7238 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
7239
7240 if (allocation->element == addr &&
7241 allocation->trace_index < mleak_trace_buckets) {
7242 lck_mtx_lock_spin(mleak_lock);
7243 if (allocation->element == addr &&
7244 allocation->trace_index < mleak_trace_buckets) {
7245 struct mtrace *trace;
7246 trace = &mleak_traces[allocation->trace_index];
7247 /* allocs = 0 means trace bucket is unused */
7248 if (trace->allocs > 0)
7249 trace->allocs--;
7250 if (trace->allocs == 0)
7251 trace->depth = 0;
7252 /* NULL element means alloc bucket is unused */
7253 allocation->element = NULL;
7254 mleak_table.outstanding_allocs--;
7255 }
7256 lck_mtx_unlock(mleak_lock);
7257 }
7258 addr = addr->obj_next;
7259 }
7260 }
7261
7262 static void
7263 mleak_sort_traces()
7264 {
7265 int i, j, k;
7266 struct mtrace *swap;
7267
7268 for(i = 0; i < MLEAK_NUM_TRACES; i++)
7269 mleak_top_trace[i] = NULL;
7270
7271 for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
7272 {
7273 if (mleak_traces[i].allocs <= 0)
7274 continue;
7275
7276 mleak_top_trace[j] = &mleak_traces[i];
7277 for (k = j; k > 0; k--) {
7278 if (mleak_top_trace[k]->allocs <=
7279 mleak_top_trace[k-1]->allocs)
7280 break;
7281
7282 swap = mleak_top_trace[k-1];
7283 mleak_top_trace[k-1] = mleak_top_trace[k];
7284 mleak_top_trace[k] = swap;
7285 }
7286 j++;
7287 }
7288
7289 j--;
7290 for(; i < mleak_trace_buckets; i++) {
7291 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
7292 continue;
7293
7294 mleak_top_trace[j] = &mleak_traces[i];
7295
7296 for (k = j; k > 0; k--) {
7297 if (mleak_top_trace[k]->allocs <=
7298 mleak_top_trace[k-1]->allocs)
7299 break;
7300
7301 swap = mleak_top_trace[k-1];
7302 mleak_top_trace[k-1] = mleak_top_trace[k];
7303 mleak_top_trace[k] = swap;
7304 }
7305 }
7306 }
7307
7308 static void
7309 mleak_update_stats()
7310 {
7311 mleak_trace_stat_t *mltr;
7312 int i;
7313
7314 VERIFY(mleak_stat != NULL);
7315 #ifdef __LP64__
7316 VERIFY(mleak_stat->ml_isaddr64);
7317 #else
7318 VERIFY(!mleak_stat->ml_isaddr64);
7319 #endif /* !__LP64__ */
7320 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
7321
7322 mleak_sort_traces();
7323
7324 mltr = &mleak_stat->ml_trace[0];
7325 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
7326 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
7327 int j;
7328
7329 if (mleak_top_trace[i] == NULL ||
7330 mleak_top_trace[i]->allocs == 0)
7331 continue;
7332
7333 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
7334 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
7335 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
7336 mltr->mltr_depth = mleak_top_trace[i]->depth;
7337
7338 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
7339 for (j = 0; j < mltr->mltr_depth; j++)
7340 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
7341
7342 mltr++;
7343 }
7344 }
7345
7346 static struct mbtypes {
7347 int mt_type;
7348 const char *mt_name;
7349 } mbtypes[] = {
7350 { MT_DATA, "data" },
7351 { MT_OOBDATA, "oob data" },
7352 { MT_CONTROL, "ancillary data" },
7353 { MT_HEADER, "packet headers" },
7354 { MT_SOCKET, "socket structures" },
7355 { MT_PCB, "protocol control blocks" },
7356 { MT_RTABLE, "routing table entries" },
7357 { MT_HTABLE, "IMP host table entries" },
7358 { MT_ATABLE, "address resolution tables" },
7359 { MT_FTABLE, "fragment reassembly queue headers" },
7360 { MT_SONAME, "socket names and addresses" },
7361 { MT_SOOPTS, "socket options" },
7362 { MT_RIGHTS, "access rights" },
7363 { MT_IFADDR, "interface addresses" },
7364 { MT_TAG, "packet tags" },
7365 { 0, NULL }
7366 };
7367
7368 #define MBUF_DUMP_BUF_CHK() { \
7369 clen -= k; \
7370 if (clen < 1) \
7371 goto done; \
7372 c += k; \
7373 }
7374
7375 static char *
7376 mbuf_dump(void)
7377 {
7378 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
7379 totreturned = 0;
7380 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
7381 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
7382 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
7383 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
7384 uint8_t seen[256];
7385 struct mbtypes *mp;
7386 mb_class_stat_t *sp;
7387 mleak_trace_stat_t *mltr;
7388 char *c = mbuf_dump_buf;
7389 int i, k, clen = MBUF_DUMP_BUF_SIZE;
7390
7391 mbuf_dump_buf[0] = '\0';
7392
7393 /* synchronize all statistics in the mbuf table */
7394 mbuf_stat_sync();
7395 mbuf_mtypes_sync(TRUE);
7396
7397 sp = &mb_stat->mbs_class[0];
7398 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
7399 u_int32_t mem;
7400
7401 if (m_class(i) == MC_MBUF) {
7402 m_mbufs = sp->mbcl_active;
7403 } else if (m_class(i) == MC_CL) {
7404 m_clfree = sp->mbcl_total - sp->mbcl_active;
7405 } else if (m_class(i) == MC_BIGCL) {
7406 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
7407 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
7408 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
7409 m_16kclusters = sp->mbcl_total;
7410 } else if (m_class(i) == MC_MBUF_CL) {
7411 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
7412 } else if (m_class(i) == MC_MBUF_BIGCL) {
7413 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
7414 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
7415 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
7416 }
7417
7418 mem = sp->mbcl_ctotal * sp->mbcl_size;
7419 totmem += mem;
7420 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
7421 sp->mbcl_size;
7422 totreturned += sp->mbcl_release_cnt;
7423
7424 }
7425
7426 /* adjust free counts to include composite caches */
7427 m_clfree += m_mbufclfree;
7428 m_bigclfree += m_mbufbigclfree;
7429 m_16kclfree += m_mbuf16kclfree;
7430
7431 totmbufs = 0;
7432 for (mp = mbtypes; mp->mt_name != NULL; mp++)
7433 totmbufs += mbstat.m_mtypes[mp->mt_type];
7434 if (totmbufs > m_mbufs)
7435 totmbufs = m_mbufs;
7436 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
7437 MBUF_DUMP_BUF_CHK();
7438
7439 bzero(&seen, sizeof (seen));
7440 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
7441 if (mbstat.m_mtypes[mp->mt_type] != 0) {
7442 seen[mp->mt_type] = 1;
7443 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
7444 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
7445 MBUF_DUMP_BUF_CHK();
7446 }
7447 }
7448 seen[MT_FREE] = 1;
7449 for (i = 0; i < nmbtypes; i++)
7450 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
7451 k = snprintf(c, clen, "\t%u mbufs allocated to "
7452 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
7453 MBUF_DUMP_BUF_CHK();
7454 }
7455 if ((m_mbufs - totmbufs) > 0) {
7456 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
7457 m_mbufs - totmbufs);
7458 MBUF_DUMP_BUF_CHK();
7459 }
7460 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
7461 "%u/%u mbuf 4KB clusters in use\n",
7462 (unsigned int)(mbstat.m_clusters - m_clfree),
7463 (unsigned int)mbstat.m_clusters,
7464 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
7465 (unsigned int)mbstat.m_bigclusters);
7466 MBUF_DUMP_BUF_CHK();
7467
7468 if (njcl > 0) {
7469 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
7470 m_16kclusters - m_16kclfree, m_16kclusters,
7471 njclbytes / 1024);
7472 MBUF_DUMP_BUF_CHK();
7473 }
7474 totused = totmem - totfree;
7475 if (totmem == 0) {
7476 totpct = 0;
7477 } else if (totused < (ULONG_MAX / 100)) {
7478 totpct = (totused * 100) / totmem;
7479 } else {
7480 u_long totmem1 = totmem / 100;
7481 u_long totused1 = totused / 100;
7482 totpct = (totused1 * 100) / totmem1;
7483 }
7484 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
7485 "in use)\n", totmem / 1024, totpct);
7486 MBUF_DUMP_BUF_CHK();
7487 k = snprintf(c, clen, "%lu KB returned to the system\n",
7488 totreturned / 1024);
7489 MBUF_DUMP_BUF_CHK();
7490
7491 net_update_uptime();
7492 k = snprintf(c, clen,
7493 "VM allocation failures: contiguous %u, normal %u, one page %u\n",
7494 mb_kmem_contig_failed, mb_kmem_failed, mb_kmem_one_failed);
7495 MBUF_DUMP_BUF_CHK();
7496 if (mb_kmem_contig_failed_ts || mb_kmem_failed_ts ||
7497 mb_kmem_one_failed_ts) {
7498 k = snprintf(c, clen,
7499 "VM allocation failure timestamps: contiguous %llu "
7500 "(size %llu), normal %llu (size %llu), one page %llu "
7501 "(now %llu)\n",
7502 mb_kmem_contig_failed_ts, mb_kmem_contig_failed_size,
7503 mb_kmem_failed_ts, mb_kmem_failed_size,
7504 mb_kmem_one_failed_ts, net_uptime());
7505 MBUF_DUMP_BUF_CHK();
7506 k = snprintf(c, clen,
7507 "VM return codes: ");
7508 MBUF_DUMP_BUF_CHK();
7509 for (i = 0;
7510 i < sizeof(mb_kmem_stats) / sizeof(mb_kmem_stats[0]);
7511 i++) {
7512 k = snprintf(c, clen, "%s: %u ", mb_kmem_stats_labels[i],
7513 mb_kmem_stats[i]);
7514 MBUF_DUMP_BUF_CHK();
7515 }
7516 k = snprintf(c, clen, "\n");
7517 MBUF_DUMP_BUF_CHK();
7518 }
7519 k = snprintf(c, clen,
7520 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
7521 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
7522 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
7523 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
7524 mb_expand_16kcl_total);
7525 MBUF_DUMP_BUF_CHK();
7526 if (mbuf_worker_last_runtime != 0) {
7527 k = snprintf(c, clen, "worker thread last run time: "
7528 "%llu (%llu seconds ago)\n",
7529 mbuf_worker_last_runtime,
7530 net_uptime() - mbuf_worker_last_runtime);
7531 MBUF_DUMP_BUF_CHK();
7532 }
7533
7534 /* mbuf leak detection statistics */
7535 mleak_update_stats();
7536
7537 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
7538 MBUF_DUMP_BUF_CHK();
7539 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
7540 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
7541 mleak_table.mleak_sample_factor);
7542 MBUF_DUMP_BUF_CHK();
7543 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
7544 mleak_table.outstanding_allocs);
7545 MBUF_DUMP_BUF_CHK();
7546 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
7547 mleak_table.alloc_recorded, mleak_table.trace_recorded);
7548 MBUF_DUMP_BUF_CHK();
7549 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
7550 mleak_table.alloc_collisions, mleak_table.trace_collisions);
7551 MBUF_DUMP_BUF_CHK();
7552 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
7553 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
7554 MBUF_DUMP_BUF_CHK();
7555 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
7556 mleak_table.total_conflicts);
7557 MBUF_DUMP_BUF_CHK();
7558
7559 k = snprintf(c, clen, "top %d outstanding traces:\n",
7560 mleak_stat->ml_cnt);
7561 MBUF_DUMP_BUF_CHK();
7562 for (i = 0; i < mleak_stat->ml_cnt; i++) {
7563 mltr = &mleak_stat->ml_trace[i];
7564 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
7565 "%llu hit(s), %llu collision(s)\n", (i + 1),
7566 mltr->mltr_allocs, mltr->mltr_hitcount,
7567 mltr->mltr_collisions);
7568 MBUF_DUMP_BUF_CHK();
7569 }
7570
7571 if (mleak_stat->ml_isaddr64)
7572 k = snprintf(c, clen, MB_LEAK_HDR_64);
7573 else
7574 k = snprintf(c, clen, MB_LEAK_HDR_32);
7575 MBUF_DUMP_BUF_CHK();
7576
7577 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
7578 int j;
7579 k = snprintf(c, clen, "%2d: ", (i + 1));
7580 MBUF_DUMP_BUF_CHK();
7581 for (j = 0; j < mleak_stat->ml_cnt; j++) {
7582 mltr = &mleak_stat->ml_trace[j];
7583 if (i < mltr->mltr_depth) {
7584 if (mleak_stat->ml_isaddr64) {
7585 k = snprintf(c, clen, "0x%0llx ",
7586 (uint64_t)VM_KERNEL_UNSLIDE(
7587 mltr->mltr_addr[i]));
7588 } else {
7589 k = snprintf(c, clen,
7590 "0x%08x ",
7591 (uint32_t)VM_KERNEL_UNSLIDE(
7592 mltr->mltr_addr[i]));
7593 }
7594 } else {
7595 if (mleak_stat->ml_isaddr64)
7596 k = snprintf(c, clen,
7597 MB_LEAK_SPACING_64);
7598 else
7599 k = snprintf(c, clen,
7600 MB_LEAK_SPACING_32);
7601 }
7602 MBUF_DUMP_BUF_CHK();
7603 }
7604 k = snprintf(c, clen, "\n");
7605 MBUF_DUMP_BUF_CHK();
7606 }
7607 done:
7608 return (mbuf_dump_buf);
7609 }
7610
7611 #undef MBUF_DUMP_BUF_CHK
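/*
 * Illustrative sketch (not compiled): mbuf_dump() above builds its report by
 * formatting one line at a time into a fixed-size buffer and invoking
 * MBUF_DUMP_BUF_CHK after each snprintf(). The macro itself is defined
 * earlier in this file; the generic shape of this bounded-append idiom is
 * assumed to be roughly the following, not a copy of the actual macro body.
 */
#if 0
	k = snprintf(c, clen, "...");
	if (k < 0 || k >= clen)		/* error, or no room left */
		goto done;
	c += k;				/* advance the write cursor */
	clen -= k;			/* shrink the remaining space */
#endif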
7612
7613 /*
7614 * Convert between a regular and a packet header mbuf. The caller chooses the
7615 * direction via 'hdr'; this routine sets or clears M_PKTHDR and does the rest.
7616 */
7617 int
7618 m_reinit(struct mbuf *m, int hdr)
7619 {
7620 int ret = 0;
7621
7622 if (hdr) {
7623 VERIFY(!(m->m_flags & M_PKTHDR));
7624 if (!(m->m_flags & M_EXT) &&
7625 (m->m_data != m->m_dat || m->m_len > 0)) {
7626 /*
7627 * If there's no external cluster attached and the
7628 * mbuf appears to contain user data, we cannot
7629 * safely convert this to a packet header mbuf,
7630 * as the packet header structure might overlap
7631 * with the data.
7632 */
7633 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
7634 "m_data %llx (expected %llx), "
7635 "m_len %d (expected 0)\n",
7636 __func__,
7637 (uint64_t)VM_KERNEL_ADDRPERM(m),
7638 (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
7639 (uint64_t)VM_KERNEL_ADDRPERM(m->m_dat), m->m_len);
7640 ret = EBUSY;
7641 } else {
7642 VERIFY((m->m_flags & M_EXT) || m->m_data == m->m_dat);
7643 m->m_flags |= M_PKTHDR;
7644 MBUF_INIT_PKTHDR(m);
7645 }
7646 } else {
7647 /* Check for scratch area overflow */
7648 m_redzone_verify(m);
7649 /* Free the aux data and tags if there is any */
7650 m_tag_delete_chain(m, NULL);
7651 m->m_flags &= ~M_PKTHDR;
7652 }
7653
7654 return (ret);
7655 }
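/*
 * Illustrative sketch (not compiled): a hedged example of promoting a fresh
 * plain mbuf to a packet header mbuf with m_reinit(). The variable name is
 * hypothetical; the mbuf must not already carry M_PKTHDR and must not hold
 * data in m_dat, otherwise m_reinit() returns EBUSY and leaves it untouched.
 */
#if 0
	struct mbuf *m = m_get(M_WAIT, MT_DATA);	/* plain mbuf, no pkthdr */

	if (m != NULL && m_reinit(m, 1) == 0) {
		/* m is now a packet header mbuf with an initialized pkthdr */
		m->m_pkthdr.len = 0;
	} else if (m != NULL) {
		/* conversion refused (EBUSY); just release the mbuf */
		m_free(m);
	}
#endif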
7656
7657 int
7658 m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
7659 {
7660 ASSERT(m->m_flags & M_EXT);
7661 return (atomic_test_set_32(&MEXT_PRIV(m), o, n));
7662 }
7663
7664 uint32_t
7665 m_ext_get_prop(struct mbuf *m)
7666 {
7667 ASSERT(m->m_flags & M_EXT);
7668 return (MEXT_PRIV(m));
7669 }
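/*
 * Illustrative sketch (not compiled): updating the external-buffer private
 * property with a compare-and-swap retry loop built on the two accessors
 * above. This assumes m_ext_set_prop() returns nonzero when the underlying
 * atomic_test_set_32() succeeds; the flag bit used here is hypothetical.
 */
#if 0
	uint32_t oldp, newp;

	VERIFY(m->m_flags & M_EXT);
	do {
		oldp = m_ext_get_prop(m);
		newp = oldp | 0x1;	/* hypothetical property bit */
	} while (!m_ext_set_prop(m, oldp, newp));
#endif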
7670
7671 int
7672 m_ext_paired_is_active(struct mbuf *m)
7673 {
7674 return (MBUF_IS_PAIRED(m) ? (MEXT_PREF(m) > MEXT_MINREF(m)) : 1);
7675 }
7676
7677 void
7678 m_ext_paired_activate(struct mbuf *m)
7679 {
7680 struct ext_ref *rfa;
7681 int hdr, type;
7682 caddr_t extbuf;
7683 m_ext_free_func_t extfree;
7684 u_int extsize;
7685
7686 VERIFY(MBUF_IS_PAIRED(m));
7687 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
7688 VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
7689
7690 hdr = (m->m_flags & M_PKTHDR);
7691 type = m->m_type;
7692 extbuf = m->m_ext.ext_buf;
7693 extfree = m_get_ext_free(m);
7694 extsize = m->m_ext.ext_size;
7695 rfa = m_get_rfa(m);
7696
7697 VERIFY(extbuf != NULL && rfa != NULL);
7698
7699 /*
7700 * Safe to reinitialize the packet header tags, since they
7701 * were already cleaned up at m_free() time. Similar to
7702 * what's done in m_clattach() for the cluster. Bump
7703 * up MEXT_PREF to indicate activation.
7704 */
7705 MBUF_INIT(m, hdr, type);
7706 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
7707 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
7708 }
7709
7710 void
7711 m_scratch_init(struct mbuf *m)
7712 {
7713 struct pkthdr *pkt = &m->m_pkthdr;
7714
7715 VERIFY(m->m_flags & M_PKTHDR);
7716
7717 /* See comments in <rdar://problem/14040693> */
7718 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7719 panic_plain("Invalid attempt to modify guarded module-private "
7720 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7721 /* NOTREACHED */
7722 }
7723
7724 bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
7725 }
7726
7727 /*
7728 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
7729 * xnu that intend to use the module-private area should directly
7730 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
7731 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
7732 * to handing it off to another module, respectively.
7733 */
7734 u_int32_t
7735 m_scratch_get(struct mbuf *m, u_int8_t **p)
7736 {
7737 struct pkthdr *pkt = &m->m_pkthdr;
7738
7739 VERIFY(m->m_flags & M_PKTHDR);
7740
7741 /* See comments in <rdar://problem/14040693> */
7742 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
7743 panic_plain("Invalid attempt to access guarded module-private "
7744 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
7745 /* NOTREACHED */
7746 }
7747
7748 if (mcltrace) {
7749 mcache_audit_t *mca;
7750
7751 lck_mtx_lock(mbuf_mlock);
7752 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
7753 if (mca->mca_uflags & MB_SCVALID)
7754 mcl_audit_scratch(mca);
7755 lck_mtx_unlock(mbuf_mlock);
7756 }
7757
7758 *p = (u_int8_t *)&pkt->pkt_mpriv;
7759 return (sizeof (pkt->pkt_mpriv));
7760 }
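/*
 * Illustrative sketch (not compiled): how an in-xnu client might use the
 * module-private area directly, as described in the comment above. The
 * area is marked guarded while this module owns the packet and unguarded
 * again before the mbuf is handed to another module; the byte written here
 * is purely hypothetical.
 */
#if 0
	struct pkthdr *pkt = &m->m_pkthdr;
	u_int8_t *mp = (u_int8_t *)&pkt->pkt_mpriv;

	pkt->pkt_flags |= PKTF_PRIV_GUARDED;		/* take ownership */
	bzero(&pkt->pkt_mpriv, sizeof (pkt->pkt_mpriv));
	mp[0] = 0x5a;					/* hypothetical marker */

	/* ... private state travels with the packet ... */

	pkt->pkt_flags &= ~PKTF_PRIV_GUARDED;		/* release before handoff */
#endif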
7761
7762 static void
7763 m_redzone_init(struct mbuf *m)
7764 {
7765 VERIFY(m->m_flags & M_PKTHDR);
7766 /*
7767 * Each mbuf has a unique red zone pattern, which is an XOR
7768 * of the red zone cookie and the address of the mbuf.
7769 */
7770 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7771 }
7772
7773 static void
7774 m_redzone_verify(struct mbuf *m)
7775 {
7776 u_int32_t mb_redzone;
7777
7778 VERIFY(m->m_flags & M_PKTHDR);
7779
7780 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
7781 if (m->m_pkthdr.redzone != mb_redzone) {
7782 panic("mbuf %p redzone violation with value 0x%x "
7783 "(instead of 0x%x, using cookie 0x%x)\n",
7784 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
7785 /* NOTREACHED */
7786 }
7787 }
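/*
 * Illustrative note: since XOR is its own inverse, the stored pattern can be
 * checked either by recomputing it (as m_redzone_verify() does above) or,
 * equivalently, by XOR-ing the stored value with the cookie and comparing
 * against the low 32 bits of the mbuf address:
 */
#if 0
	VERIFY((m->m_pkthdr.redzone ^ mb_redzone_cookie) ==
	    (u_int32_t)(uintptr_t)m);
#endif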
7788
7789 __private_extern__ inline void
7790 m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
7791 caddr_t ext_arg)
7792 {
7793 VERIFY(m->m_flags & M_EXT);
7794 if (rfa != NULL) {
7795 m->m_ext.ext_refflags =
7796 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
7797 if (ext_free != NULL) {
7798 rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
7799 mb_obscure_extfree;
7800 m->m_ext.ext_free = (m_ext_free_func_t)
7801 (((uintptr_t)ext_free) ^ rfa->ext_token);
7802 if (ext_arg != NULL) {
7803 m->m_ext.ext_arg =
7804 (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
7805 } else {
7806 m->m_ext.ext_arg = NULL;
7807 }
7808 } else {
7809 rfa->ext_token = 0;
7810 m->m_ext.ext_free = NULL;
7811 m->m_ext.ext_arg = NULL;
7812 }
7813 } else {
7814 /*
7815 * If we are going to lose the cookie in ext_token by
7816 * resetting the rfa, we should use the global cookie
7817 * to obscure the ext_free and ext_arg pointers.
7818 */
7819 if (ext_free != NULL) {
7820 m->m_ext.ext_free =
7821 (m_ext_free_func_t)((uintptr_t)ext_free ^
7822 mb_obscure_extfree);
7823 if (ext_arg != NULL) {
7824 m->m_ext.ext_arg =
7825 (caddr_t)((uintptr_t)ext_arg ^
7826 mb_obscure_extfree);
7827 } else {
7828 m->m_ext.ext_arg = NULL;
7829 }
7830 } else {
7831 m->m_ext.ext_free = NULL;
7832 m->m_ext.ext_arg = NULL;
7833 }
7834 m->m_ext.ext_refflags = NULL;
7835 }
7836 }
7837
7838 __private_extern__ inline struct ext_ref *
7839 m_get_rfa(struct mbuf *m)
7840 {
7841 if (m->m_ext.ext_refflags == NULL)
7842 return (NULL);
7843 else
7844 return ((struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref));
7845 }
7846
7847 __private_extern__ inline m_ext_free_func_t
7848 m_get_ext_free(struct mbuf *m)
7849 {
7850 struct ext_ref *rfa;
7851 if (m->m_ext.ext_free == NULL)
7852 return (NULL);
7853
7854 rfa = m_get_rfa(m);
7855 if (rfa == NULL)
7856 return ((m_ext_free_func_t)((uintptr_t)m->m_ext.ext_free ^ mb_obscure_extfree));
7857 else
7858 return ((m_ext_free_func_t)(((uintptr_t)m->m_ext.ext_free)
7859 ^ rfa->ext_token));
7860 }
7861
7862 __private_extern__ inline caddr_t
7863 m_get_ext_arg(struct mbuf *m)
7864 {
7865 struct ext_ref *rfa;
7866 if (m->m_ext.ext_arg == NULL)
7867 return (NULL);
7868
7869 rfa = m_get_rfa(m);
7870 if (rfa == NULL) {
7871 return ((caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree));
7872 } else {
7873 return ((caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
7874 rfa->ext_token));
7875 }
7876 }
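/*
 * Illustrative sketch (not compiled): the accessors above undo the obscuring
 * applied by m_set_ext(), so storing and reading back an external free
 * routine and its argument is a round trip. The mbuf is assumed to carry
 * M_EXT with a valid ext_ref; the routine and cookie names are hypothetical.
 */
#if 0
	m_ext_free_func_t my_free = my_driver_free;	/* hypothetical */
	caddr_t my_arg = (caddr_t)my_driver_cookie;	/* hypothetical */

	m_set_ext(m, m_get_rfa(m), my_free, my_arg);
	VERIFY(m_get_ext_free(m) == my_free);
	VERIFY(m_get_ext_arg(m) == my_arg);
#endif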
7877
7878 /*
7879 * Send a report of mbuf usage if the usage is at least 1/16th (~6%) of the
7880 * max limit, or if it has grown by at least 1/32nd (~3%) of the previous
7881 * peak since the last report.
7882 *
7883 * The thresholds are powers of two so they reduce to simple shifts below.
7884 */
7885 static boolean_t
7886 mbuf_report_usage(mbuf_class_t cl)
7887 {
7888 /* if a report is already in progress, nothing to do */
7889 if (mb_peak_newreport)
7890 return (TRUE);
7891
7892 if (m_total(cl) > m_peak(cl) &&
7893 m_total(cl) >= (m_maxlimit(cl) >> 4) &&
7894 (m_total(cl) - m_peak(cl)) >= (m_peak(cl) >> 5))
7895 return (TRUE);
7896 return (FALSE);
7897 }
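/*
 * Worked example of the thresholds above (numbers hypothetical): with
 * m_maxlimit(cl) == 32768, a report is requested once m_total(cl) exceeds
 * the previous peak, has reached at least 32768 >> 4 == 2048 objects
 * (6.25% of the limit), and has grown by at least m_peak(cl) >> 5, i.e.
 * 1/32nd (3.125%) of the previous peak.
 */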
7898
7899 __private_extern__ void
7900 mbuf_report_peak_usage(void)
7901 {
7902 int i = 0;
7903 u_int64_t uptime;
7904 struct nstat_sysinfo_data ns_data;
7905 uint32_t memreleased = 0;
7906 static uint32_t prevmemreleased;
7907
7908 uptime = net_uptime();
7909 lck_mtx_lock(mbuf_mlock);
7910
7911 /* Generate an initial report after 1 week of uptime */
7912 if (!mb_peak_firstreport &&
7913 uptime > MBUF_PEAK_FIRST_REPORT_THRESHOLD) {
7914 mb_peak_newreport = TRUE;
7915 mb_peak_firstreport = TRUE;
7916 }
7917
7918 if (!mb_peak_newreport) {
7919 lck_mtx_unlock(mbuf_mlock);
7920 return;
7921 }
7922
7923 /*
7924 * Since a report is being generated before 1 week of uptime,
7925 * there is no need to force another one later.
7926 */
7927 if (uptime < MBUF_PEAK_FIRST_REPORT_THRESHOLD)
7928 mb_peak_firstreport = TRUE;
7929
7930 for (i = 0; i < NELEM(mbuf_table); i++) {
7931 m_peak(m_class(i)) = m_total(m_class(i));
7932 memreleased += m_release_cnt(i);
7933 }
7934 memreleased = memreleased - prevmemreleased;
7935 prevmemreleased = memreleased;
7936 mb_peak_newreport = FALSE;
7937 lck_mtx_unlock(mbuf_mlock);
7938
7939 bzero(&ns_data, sizeof(ns_data));
7940 ns_data.flags = NSTAT_SYSINFO_MBUF_STATS;
7941 ns_data.u.mb_stats.total_256b = m_peak(MC_MBUF);
7942 ns_data.u.mb_stats.total_2kb = m_peak(MC_CL);
7943 ns_data.u.mb_stats.total_4kb = m_peak(MC_BIGCL);
7944 ns_data.u.mb_stats.total_16kb = m_peak(MC_16KCL);
7945 ns_data.u.mb_stats.sbmb_total = total_sbmb_cnt_peak;
7946 ns_data.u.mb_stats.sb_atmbuflimit = sbmb_limreached;
7947 ns_data.u.mb_stats.draincnt = mbstat.m_drain;
7948 ns_data.u.mb_stats.memreleased = memreleased;
7949 ns_data.u.mb_stats.sbmb_floor = total_sbmb_cnt_floor;
7950
7951 nstat_sysinfo_send_data(&ns_data);
7952
7953 /*
7954 * Reset the floor whenever we report a new
7955 * peak to track the trend (increased peak usage
7956 * is not a leak if mbufs get released
7957 * between reports and the floor stays low)
7958 */
7959 total_sbmb_cnt_floor = total_sbmb_cnt_peak;
7960 }
7961
7962 /*
7963 * Called by the VM when there's memory pressure.
7964 */
7965 __private_extern__ void
7966 m_drain(void)
7967 {
7968 mbuf_class_t mc;
7969 mcl_slab_t *sp, *sp_tmp, *nsp;
7970 unsigned int num, k, interval, released = 0;
7971 unsigned long total_mem = 0, use_mem = 0;
7972 boolean_t ret, purge_caches = FALSE;
7973 ppnum_t offset;
7974 mcache_obj_t *obj;
7975 unsigned long per;
7976 static uint64_t last_drain = 0;
7977 static unsigned char scratch[32];
7978 static ppnum_t scratch_pa = 0;
7979
7980 if (mb_drain_maxint == 0 || mb_waiters)
7981 return;
7982 if (scratch_pa == 0) {
7983 bzero(scratch, sizeof(scratch));
7984 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
7985 VERIFY(scratch_pa);
7986 } else if (mclverify) {
7987 /*
7988 * Panic if a driver wrote to our scratch memory.
7989 */
7990 for (k = 0; k < sizeof(scratch); k++)
7991 if (scratch[k])
7992 panic("suspect DMA to freed address");
7993 }
7994 /*
7995 * Don't free memory too often as that could cause excessive
7996 * waiting times for mbufs. Purge the caches as well if the last
7997 * drain was within five drain intervals (mb_drain_maxint * 5).
7998 */
7999 lck_mtx_lock(mbuf_mlock);
8000 if (last_drain == 0) {
8001 last_drain = net_uptime();
8002 lck_mtx_unlock(mbuf_mlock);
8003 return;
8004 }
8005 interval = net_uptime() - last_drain;
8006 if (interval <= mb_drain_maxint) {
8007 lck_mtx_unlock(mbuf_mlock);
8008 return;
8009 }
8010 if (interval <= mb_drain_maxint * 5)
8011 purge_caches = TRUE;
8012 last_drain = net_uptime();
8013 /*
8014 * Don't free any memory if we're using 60% or more.
8015 */
8016 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8017 total_mem += m_total(mc) * m_maxsize(mc);
8018 use_mem += m_active(mc) * m_maxsize(mc);
8019 }
8020 per = (use_mem * 100) / total_mem;
8021 if (per >= 60) {
8022 lck_mtx_unlock(mbuf_mlock);
8023 return;
8024 }
8025 /*
8026 * Purge all the caches. This effectively disables
8027 * caching for a few seconds, but the mbuf worker thread will
8028 * re-enable them.
8029 */
8030 if (purge_caches == TRUE)
8031 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8032 if (m_total(mc) < m_avgtotal(mc))
8033 continue;
8034 lck_mtx_unlock(mbuf_mlock);
8035 ret = mcache_purge_cache(m_cache(mc), FALSE);
8036 lck_mtx_lock(mbuf_mlock);
8037 if (ret == TRUE)
8038 m_purge_cnt(mc)++;
8039 }
8040 /*
8041 * Move the objects from the composite class freelist to
8042 * the rudimentary slabs list, but keep at least 10% of the average
8043 * total in the freelist.
8044 */
8045 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
8046 while (m_cobjlist(mc) &&
8047 m_total(mc) < m_avgtotal(mc) &&
8048 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
8049 obj = m_cobjlist(mc);
8050 m_cobjlist(mc) = obj->obj_next;
8051 obj->obj_next = NULL;
8052 num = cslab_free(mc, obj, 1);
8053 VERIFY(num == 1);
8054 m_free_cnt(mc)++;
8055 m_infree(mc)--;
8056 /* cslab_free() handles m_total */
8057 }
8058 }
8059 /*
8060 * Free unused slabs while a class stays above its average total and its
8061 * free count stays above ~10% of that average (plus the class minimum).
8062 *
8063 * We walk the list backwards in an attempt to reduce fragmentation.
8064 */
8065 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
8066 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
8067 /*
8068 * Process only unused slabs occupying memory.
8069 */
8070 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
8071 sp->sl_base == NULL)
8072 continue;
8073 if (m_total(mc) < m_avgtotal(mc) ||
8074 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc))
8075 break;
8076 slab_remove(sp, mc);
8077 switch (mc) {
8078 case MC_MBUF:
8079 m_infree(mc) -= NMBPG;
8080 m_total(mc) -= NMBPG;
8081 if (mclaudit != NULL)
8082 mcl_audit_free(sp->sl_base, NMBPG);
8083 break;
8084 case MC_CL:
8085 m_infree(mc) -= NCLPG;
8086 m_total(mc) -= NCLPG;
8087 if (mclaudit != NULL)
8088 mcl_audit_free(sp->sl_base, NMBPG);
8089 break;
8090 case MC_BIGCL:
8091 {
8092 m_infree(mc) -= NBCLPG;
8093 m_total(mc) -= NBCLPG;
8094 if (mclaudit != NULL)
8095 mcl_audit_free(sp->sl_base, NMBPG);
8096 break;
8097 }
8098 case MC_16KCL:
8099 m_infree(mc)--;
8100 m_total(mc)--;
8101 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
8102 nsp = nsp->sl_next;
8103 VERIFY(nsp->sl_refcnt == 0 &&
8104 nsp->sl_base != NULL &&
8105 nsp->sl_len == 0);
8106 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
8107 0);
8108 nsp->sl_flags = 0;
8109 }
8110 if (mclaudit != NULL) {
8111 if (sp->sl_len == PAGE_SIZE) {
8112 mcl_audit_free(sp->sl_base,
8113 NMBPG);
8114 } else {
8115 mcl_audit_free(sp->sl_base, 1);
8116 }
8117 }
8118 break;
8119 default:
8120 /*
8121 * The composite classes have their own
8122 * freelist (m_cobjlist), so we only
8123 * process rudimentary classes here.
8124 */
8125 VERIFY(0);
8126 }
8127 m_release_cnt(mc) += m_size(mc);
8128 released += m_size(mc);
8129 VERIFY(sp->sl_base != NULL &&
8130 sp->sl_len >= PAGE_SIZE);
8131 offset = MTOPG(sp->sl_base);
8132 /*
8133 * Make sure the IOMapper points to a valid, but
8134 * bogus, address. This should prevent further DMA
8135 * accesses to freed memory.
8136 */
8137 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
8138 mcl_paddr[offset] = 0;
8139 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
8140 sp->sl_len);
8141 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
8142 sp->sl_flags = 0;
8143 }
8144 }
8145 mbstat.m_drain++;
8146 mbstat.m_bigclusters = m_total(MC_BIGCL);
8147 mbstat.m_clusters = m_total(MC_CL);
8148 mbstat.m_mbufs = m_total(MC_MBUF);
8149 mbuf_stat_sync();
8150 mbuf_mtypes_sync(TRUE);
8151 lck_mtx_unlock(mbuf_mlock);
8152 }
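/*
 * Worked example of the utilization check in m_drain() above (numbers
 * hypothetical): with total_mem == 4 MB committed across all classes and
 * use_mem == 2.5 MB active, per == (2621440 * 100) / 4194304 == 62, so the
 * routine returns without freeing anything; only below 60% utilization does
 * it go on to purge the caches and release unused slabs to the system.
 */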
8153
8154 static int
8155 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
8156 {
8157 #pragma unused(arg1, arg2)
8158 int val = 0, err;
8159
8160 err = sysctl_handle_int(oidp, &val, 0, req);
8161 if (err != 0 || req->newptr == USER_ADDR_NULL)
8162 return (err);
8163 if (val)
8164 m_drain();
8165
8166 return (err);
8167 }
8168
8169 #if DEBUG || DEVELOPMENT
8170
8171 static int mbtest_val;
8172 static int mbtest_running;
8173
8174 static void mbtest_thread(__unused void *arg)
8175 {
8176 int i;
8177
8178 printf("%s thread starting\n", __func__);
8179
8180 for (i = 0; i < 1000; i++) {
8181 unsigned int needed = 100000;
8182 struct mbuf *m1, *m2, *m3;
8183
8184 if (njcl > 0) {
8185 needed = 100000;
8186 m3 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, M16KCLBYTES);
8187 m_freem_list(m3);
8188 }
8189
8190 needed = 100000;
8191 m2 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MBIGCLBYTES);
8192 m_freem_list(m2);
8193
8194 m1 = m_getpackets_internal(&needed, 0, M_DONTWAIT, 0, MCLBYTES);
8195 m_freem_list(m1);
8196 }
8197
8198 printf("%s thread ending\n", __func__);
8199
8200 OSDecrementAtomic(&mbtest_running);
8201 wakeup_one((caddr_t)&mbtest_running);
8202 }
8203
8204 static void sysctl_mbtest(void)
8205 {
8206 /* We launch three threads - wait for all of them */
8207 OSIncrementAtomic(&mbtest_running);
8208 OSIncrementAtomic(&mbtest_running);
8209 OSIncrementAtomic(&mbtest_running);
8210
8211 thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8212 thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8213 thread_call_func_delayed((thread_call_func_t)mbtest_thread, NULL, 10);
8214
8215 while (mbtest_running) {
8216 msleep((caddr_t)&mbtest_running, NULL, PUSER, "mbtest_running", NULL);
8217 }
8218 }
8219
8220 static int
8221 mbtest SYSCTL_HANDLER_ARGS
8222 {
8223 #pragma unused(arg1, arg2)
8224 int error = 0, val, oldval = mbtest_val;
8225
8226 val = oldval;
8227 error = sysctl_handle_int(oidp, &val, 0, req);
8228 if (error || !req->newptr)
8229 return (error);
8230
8231 if (val != oldval)
8232 sysctl_mbtest();
8233
8234 mbtest_val = val;
8235
8236 return (error);
8237 }
8238 #endif
8239
8240 SYSCTL_DECL(_kern_ipc);
8241 #if DEBUG || DEVELOPMENT
8242 SYSCTL_PROC(_kern_ipc, OID_AUTO, mbtest,
8243 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &mbtest_val, 0, &mbtest, "I",
8244 "Toggle to test mbufs");
8245 #endif
8246 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
8247 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8248 0, 0, mbstat_sysctl, "S,mbstat", "");
8249 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
8250 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8251 0, 0, mb_stat_sysctl, "S,mb_stat", "");
8252 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
8253 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8254 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
8255 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
8256 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
8257 0, 0, mleak_table_sysctl, "S,mleak_table", "");
8258 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
8259 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
8260 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
8261 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
8262 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
8263 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
8264 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
8265 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
8266 m_drain_force_sysctl, "I",
8267 "Forces the mbuf garbage collection to run");
8268 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
8269 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
8270 "Minimum time interval between garbage collection");