1 /*
2 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <kern/kern_types.h>
83 #include <kern/simple_lock.h>
84 #include <kern/queue.h>
85 #include <kern/sched_prim.h>
86 #include <kern/cpu_number.h>
87 #include <kern/zalloc.h>
88
89 #include <libkern/OSAtomic.h>
90 #include <libkern/libkern.h>
91
92 #include <IOKit/IOMapper.h>
93
94 #include <machine/limits.h>
95 #include <machine/machine_routines.h>
96
97 #if CONFIG_MACF_NET
98 #include <security/mac_framework.h>
99 #endif /* CONFIG_MACF_NET */
100
101 #include <sys/mcache.h>
102
103 /*
104 * MBUF IMPLEMENTATION NOTES.
105 *
106 * There is a total of 5 per-CPU caches:
107 *
108 * MC_MBUF:
109 * This is a cache of rudimentary objects of MSIZE in size; each
110 * object represents an mbuf structure. This cache preserves only
111 * the m_type field of the mbuf during its transactions.
112 *
113 * MC_CL:
114 * This is a cache of rudimentary objects of MCLBYTES in size; each
115 * object represents an mcluster structure. This cache does not
116 * preserve the contents of the objects during its transactions.
117 *
118 * MC_BIGCL:
119 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
120 * object represents an mbigcluster structure. This cache does not
121 * preserve the contents of the objects during its transactions.
122 *
123 * MC_MBUF_CL:
124 * This is a cache of mbufs each having a cluster attached to it.
125 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
126 * fields of the mbuf related to the external cluster are preserved
127 * during transactions.
128 *
129 * MC_MBUF_BIGCL:
130 * This is a cache of mbufs each having a big cluster attached to it.
131 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
132 * fields of the mbuf related to the external cluster are preserved
133 * during transactions.
134 *
135 * OBJECT ALLOCATION:
136 *
137 * Allocation requests are handled first at the per-CPU (mcache) layer
138 * before falling back to the slab layer. Performance is optimal when
139 * the request is satisfied at the CPU layer because global data/lock
140 * never gets accessed. When the slab layer is entered for allocation,
141 * the slab freelist will be checked first for available objects before
142 * the VM backing store is invoked. Slab layer operations are serialized
143 * for all of the caches as the mbuf global lock is held most of the time.
144 * Allocation paths are different depending on the class of objects:
145 *
146 * a. Rudimentary object:
147 *
148 * { m_get_common(), m_clattach(), m_mclget(),
149 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
150 * composite object allocation }
151 * | ^
152 * | |
153 * | +-----------------------+
154 * v |
155 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
156 * | ^
157 * v |
158 * [CPU cache] -------> (found?) -------+
159 * | |
160 * v |
161 * mbuf_slab_alloc() |
162 * | |
163 * v |
164 * +---------> [freelist] -------> (found?) -------+
165 * | |
166 * | v
167 * | m_clalloc()
168 * | |
169 * | v
170 * +---<<---- kmem_mb_alloc()
171 *
172 * b. Composite object:
173 *
174 * { m_getpackets_internal(), m_allocpacket_internal() }
175 * | ^
176 * | |
177 * | +------ (done) ---------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_cslab_alloc() |
186 * | |
187 * v |
188 * [freelist] -------> (found?) -------+
189 * | |
190 * v |
191 * (rudimentary object) |
192 * mcache_alloc/mcache_alloc_ext() ------>>-----+
193 *
194 * Auditing notes: If auditing is enabled, buffers will be subjected to
195 * integrity checks by the audit routine. This is done by verifying their
196 * contents against DEADBEEF (free) pattern before returning them to caller.
197 * As part of this step, the routine will also record the transaction and
198 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
199 * also restore any constructed data structure fields if necessary.
200 *
201 * OBJECT DEALLOCATION:
202 *
203 * Freeing an object simply involves placing it into the CPU cache; this
204 * pollutes the cache to benefit subsequent allocations. The slab layer
205 * will only be entered if the object is to be purged out of the cache.
206 * During normal operations, this happens only when the CPU layer resizes
207 * its bucket while it's adjusting to the allocation load. Deallocation
208 * paths are different depending on the class of objects:
209 *
210 * a. Rudimentary object:
211 *
212 * { m_free(), m_freem_list(), composite object deallocation }
213 * | ^
214 * | |
215 * | +------ (done) ---------+
216 * v |
217 * mcache_free/mcache_free_ext() |
218 * | |
219 * v |
220 * mbuf_slab_audit() |
221 * | |
222 * v |
223 * [CPU cache] ---> (not purging?) -----+
224 * | |
225 * v |
226 * mbuf_slab_free() |
227 * | |
228 * v |
229 * [freelist] ----------->>------------+
230 * (objects never get purged to VM)
231 *
232 * b. Composite object:
233 *
234 * { m_free(), m_freem_list() }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_cslab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_cslab_free() |
249 * | |
250 * v |
251 * [freelist] ---> (not purging?) -----+
252 * | |
253 * v |
254 * (rudimentary object) |
255 * mcache_free/mcache_free_ext() ------->>------+
256 *
257 * Auditing notes: If auditing is enabled, the audit routine will save
258 * any constructed data structure fields (if necessary) before filling the
259 * contents of the buffers with DEADBEEF (free) pattern and recording the
260 * transaction. Buffers that are freed (whether at CPU or slab layer) are
261 * expected to contain the free pattern.
262 *
263 * DEBUGGING:
264 *
265 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
266 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
267 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
268 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
269 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
270 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
271 *
272 * Each object is associated with exactly one mcache_audit_t structure that
273 * contains the information related to its last buffer transaction. Given
274 * an address of an object, the audit structure can be retrieved by finding
275 * the position of the object relative to the base address of the cluster:
276 *
277 * +------------+ +=============+
278 * | mbuf addr | | mclaudit[i] |
279 * +------------+ +=============+
280 * | | cl_audit[0] |
281 * i = MTOBG(addr) +-------------+
282 * | +-----> | cl_audit[1] | -----> mcache_audit_t
283 * b = BGTOM(i) | +-------------+
284 * | | | ... |
285 * x = MCLIDX(b, addr) | +-------------+
286 * | | | cl_audit[7] |
287 * +-----------------+ +-------------+
288 * (e.g. x == 1)
289 *
290 * The mclaudit[] array is allocated at initialization time, but its contents
291 * get populated when the corresponding cluster is created. Because a page
292 * can be turned into NMBPBG mbufs, we preserve enough space for the
293 * mbufs so that there is a 1-to-1 mapping between them. A page that never
294 * gets (or has not yet been) turned into mbufs will use only cl_audit[0] with the
295 * remaining entries unused. For a 16KB cluster, only one entry from the first
296 * page is allocated and used for the entire object.
297 */
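/*
 * Illustrative sketch (not part of the original source): putting the
 * diagram above into code, the audit structure for an mbuf at address
 * `addr' would be located roughly as follows, using the MTOBG(),
 * BGTOM() and MCLIDX() macros defined later in this file; the local
 * variable names here are hypothetical.
 *
 *	i = MTOBG(addr);		(index of the 4KB page)
 *	b = BGTOM(i);			(base address of that page)
 *	x = MCLIDX(b, addr);		(mbuf slot within the page)
 *	mca = mclaudit[i].cl_audit[x];
 *
 * For cluster objects (as opposed to mbufs) only cl_audit[0] is used;
 * see mcl_audit_buf2mca() for the actual implementation.
 */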
298
299 /* TODO: should be in header file */
300 /* kernel translator */
301 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
302 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
303 extern vm_map_t mb_map; /* special map */
304
305 /* Global lock */
306 decl_lck_mtx_data(static, mbuf_mlock_data);
307 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
308 static lck_attr_t *mbuf_mlock_attr;
309 static lck_grp_t *mbuf_mlock_grp;
310 static lck_grp_attr_t *mbuf_mlock_grp_attr;
311
312 /* Back-end (common) layer */
313 static void *mbuf_worker_run; /* wait channel for worker thread */
314 static int mbuf_worker_ready; /* worker thread is runnable */
315 static int mbuf_expand_mcl; /* number of cluster creation requests */
316 static int mbuf_expand_big; /* number of big cluster creation requests */
317 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
318 static int ncpu; /* number of CPUs */
319 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
320 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
321 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
322 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
323 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
324 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
325 static unsigned int mb_normalized; /* number of packets "normalized" */
326
327 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
328 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
329
330 typedef enum {
331 MC_MBUF = 0, /* Regular mbuf */
332 MC_CL, /* Cluster */
333 MC_BIGCL, /* Large (4KB) cluster */
334 MC_16KCL, /* Jumbo (16KB) cluster */
335 MC_MBUF_CL, /* mbuf + cluster */
336 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
337 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
338 } mbuf_class_t;
339
340 #define MBUF_CLASS_MIN MC_MBUF
341 #define MBUF_CLASS_MAX MC_MBUF_16KCL
342 #define MBUF_CLASS_LAST MC_16KCL
343 #define MBUF_CLASS_VALID(c) \
344 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
345 #define MBUF_CLASS_COMPOSITE(c) \
346 ((int)(c) > MBUF_CLASS_LAST)
347
348
349 /*
350 * mbuf specific mcache allocation request flags.
351 */
352 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
353
354 /*
355 * Per-cluster slab structure.
356 *
357 * A slab is a cluster control structure that contains one or more object
358 * chunks; the available chunks are chained in the slab's freelist (sl_head).
359 * Each time a chunk is taken out of the slab, the slab's reference count
360 * gets incremented. When all chunks have been taken out, the empty slab
361 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
362 * returned to a slab causes the slab's reference count to be decremented;
363 * it also causes the slab to be reinserted back into the class's slab
364 * list, if it is not already there.
365 *
366 * Compartmentalizing the object chunks into slabs allows us to easily
367 * merge one or more slabs together when the adjacent slabs are idle, as
368 * well as to convert or move a slab from one class to another; e.g. the
369 * mbuf cluster slab can be converted to a regular cluster slab when all
370 * mbufs in the slab have been freed.
371 *
372 * A slab may also span across multiple clusters for chunks larger than
373 * a cluster's size. In this case, only the slab of the first cluster is
374 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
375 * that they are part of the larger slab.
376 *
377 * Each slab controls a page of memory.
378 */
379 typedef struct mcl_slab {
380 struct mcl_slab *sl_next; /* neighboring slab */
381 u_int8_t sl_class; /* controlling mbuf class */
382 int8_t sl_refcnt; /* outstanding allocations */
383 int8_t sl_chunks; /* chunks (bufs) in this slab */
384 u_int16_t sl_flags; /* slab flags (see below) */
385 u_int16_t sl_len; /* slab length */
386 void *sl_base; /* base of allocated memory */
387 void *sl_head; /* first free buffer */
388 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
389 } mcl_slab_t;
390
391 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
392 #define SLF_PARTIAL 0x0002 /* part of another slab */
393 #define SLF_DETACHED 0x0004 /* not in slab freelist */
394
395 /*
396 * The array of slabs is broken into groups of arrays per 1MB of kernel
397 * memory to reduce the footprint. Each group is allocated on demand
398 * whenever a new piece of memory mapped in from the VM crosses the 1MB
399 * boundary.
400 */
401 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
402
403 typedef struct mcl_slabg {
404 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
405 } mcl_slabg_t;
406
407 /*
408 * Number of slabs needed to control a 16KB cluster object.
409 */
410 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
411
412 /*
413 * Per-cluster audit structure.
414 */
415 typedef struct {
416 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
417 } mcl_audit_t;
418
419 /*
420 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
421 * and m_ext structures. If auditing is enabled, we allocate a shadow
422 * mbuf structure of this size inside each audit structure, and the
423 * contents of the real mbuf get copied into it when the mbuf is freed.
424 * This allows us to pattern-fill the mbuf for integrity check, and to
425 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
426 * Note that we don't save the contents of clusters when they are freed;
427 * we simply pattern-fill them.
428 */
429 #define AUDIT_CONTENTS_SIZE ((MSIZE - MHLEN) + sizeof (_m_ext_t))
430
431 /*
432 * mbuf specific mcache audit flags
433 */
434 #define MB_INUSE 0x01 /* object has not been returned to slab */
435 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
436 #define MB_SCVALID 0x04 /* object has valid saved contents */
437
438 /*
439 * Each of the following two arrays holds up to nmbclusters elements.
440 */
441 static mcl_audit_t *mclaudit; /* array of cluster audit information */
442 static unsigned int maxclaudit; /* max # of entries in audit table */
443 static mcl_slabg_t **slabstbl; /* cluster slabs table */
444 static unsigned int maxslabgrp; /* max # of entries in slabs table */
445 static unsigned int slabgrp; /* # of entries in slabs table */
446
447 /* Globals */
448 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
449 int njcl; /* # of clusters for jumbo sizes */
450 int njclbytes; /* size of a jumbo cluster */
451 union mbigcluster *mbutl; /* first mapped cluster address */
452 union mbigcluster *embutl; /* ending virtual address of mclusters */
453 int _max_linkhdr; /* largest link-level header */
454 int _max_protohdr; /* largest protocol header */
455 int max_hdr; /* largest link+protocol header */
456 int max_datalen; /* MHLEN - max_hdr */
457
458 static boolean_t mclverify; /* debug: pattern-checking */
459 static boolean_t mcltrace; /* debug: stack tracing */
460 static boolean_t mclfindleak; /* debug: leak detection */
461 static boolean_t mclexpleak; /* debug: expose leak info to user space */
462
463 /* mbuf leak detection variables */
464 static struct mleak_table mleak_table;
465 static mleak_stat_t *mleak_stat;
466
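/*
 * MLEAK_STAT_SIZE(n) evaluates to the byte offset of ml_trace[n] within
 * an mleak_stat_t, i.e. the size of a buffer holding leak statistics
 * for n traces (see mleak_top_trace_sysctl()).  The MB_STAT_SIZE(),
 * OMB_STAT_SIZE() and MBUF_MTYPES_SIZE() macros below use the same
 * offset-of idiom.
 */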
467 #define MLEAK_STAT_SIZE(n) \
468 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
469
470 struct mallocation {
471 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
472 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
473 u_int32_t count; /* How many objects were requested */
474 u_int64_t hitcount; /* for determining hash effectiveness */
475 };
476
477 struct mtrace {
478 u_int64_t collisions;
479 u_int64_t hitcount;
480 u_int64_t allocs;
481 u_int64_t depth;
482 uintptr_t addr[MLEAK_STACK_DEPTH];
483 };
484
485 /* Size must be a power of two for the zhash to be able to just mask off bits */
486 #define MLEAK_ALLOCATION_MAP_NUM 512
487 #define MLEAK_TRACE_MAP_NUM 256
488
489 /*
490 * Sample factor for how often to record a trace. This can be overridden
491 * by the boot-arg mleak_sample_factor.
492 */
493 #define MLEAK_SAMPLE_FACTOR 500
494
495 /*
496 * Number of top leakers recorded.
497 */
498 #define MLEAK_NUM_TRACES 5
499
500 #define MB_LEAK_SPACING_64 " "
501 #define MB_LEAK_SPACING_32 " "
502
503
504 #define MB_LEAK_HDR_32 "\n\
505 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
506 ---------- ---------- ---------- ---------- ---------- \n\
507 "
508
509 #define MB_LEAK_HDR_64 "\n\
510 trace [1] trace [2] trace [3] \
511 trace [4] trace [5] \n\
512 ------------------ ------------------ ------------------ \
513 ------------------ ------------------ \n\
514 "
515
516 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
517 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
518
519 /* Hashmaps of allocations and their corresponding traces */
520 static struct mallocation *mleak_allocations;
521 static struct mtrace *mleak_traces;
522 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
523
524 /* Lock to protect mleak tables from concurrent modification */
525 decl_lck_mtx_data(static, mleak_lock_data);
526 static lck_mtx_t *mleak_lock = &mleak_lock_data;
527 static lck_attr_t *mleak_lock_attr;
528 static lck_grp_t *mleak_lock_grp;
529 static lck_grp_attr_t *mleak_lock_grp_attr;
530
531 extern u_int32_t high_sb_max;
532
533 /* TODO: should be in header file */
534 int do_reclaim = 0;
535
536 /* The minimum number of objects that are allocated, to start. */
537 #define MINCL 32
538 #define MINBIGCL (MINCL >> 1)
539 #define MIN16KCL (MINCL >> 2)
540
541 /* Low watermarks (only map in pages once free counts go below) */
542 #define MBIGCL_LOWAT MINBIGCL
543 #define M16KCL_LOWAT MIN16KCL
544
545 typedef struct {
546 mbuf_class_t mtbl_class; /* class type */
547 mcache_t *mtbl_cache; /* mcache for this buffer class */
548 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
549 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
550 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
551 u_int32_t mtbl_maxsize; /* maximum buffer size */
552 int mtbl_minlimit; /* minimum allowed */
553 int mtbl_maxlimit; /* maximum allowed */
554 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
555 } mbuf_table_t;
556
557 #define m_class(c) mbuf_table[c].mtbl_class
558 #define m_cache(c) mbuf_table[c].mtbl_cache
559 #define m_slablist(c) mbuf_table[c].mtbl_slablist
560 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
561 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
562 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
563 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
564 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
565 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
566 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
567 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
568 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
569 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
570 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
571 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
572 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
573 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
574 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
575 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
576 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
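/*
 * For example, m_total(MC_CL) expands to
 * mbuf_table[MC_CL].mtbl_stats->mbcl_total, i.e. the total number of
 * 2KB cluster objects currently known to the allocator.
 */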
577
578 static mbuf_table_t mbuf_table[] = {
579 /*
580 * The caches for mbufs, regular clusters and big clusters.
581 */
582 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
583 NULL, NULL, 0, 0, 0, 0 },
584 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
585 NULL, NULL, 0, 0, 0, 0 },
586 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
587 NULL, NULL, 0, 0, 0, 0 },
588 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
589 NULL, NULL, 0, 0, 0, 0 },
590 /*
591 * The following are special caches; they serve as intermediate
592 * caches backed by the above rudimentary caches. Each object
593 * in the cache is an mbuf with a cluster attached to it. Unlike
594 * the above caches, these intermediate caches do not directly
595 * deal with the slab structures; instead, the constructed
596 * cached elements are simply stored in the freelists.
597 */
598 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
599 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
600 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
601 };
602
603 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
604
605 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
606 static int mb_waiters; /* number of waiters */
607
608 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
609 static struct timeval mb_wdtstart; /* watchdog start timestamp */
610 static char *mbuf_dump_buf;
611
612 #define MBUF_DUMP_BUF_SIZE 2048
613
614 /*
615 * mbuf watchdog is enabled by default on embedded platforms. It is
616 * also toggleable via the kern.ipc.mb_watchdog sysctl.
617 */
618 #if CONFIG_EMBEDDED
619 static unsigned int mb_watchdog = 1;
620 #else
621 static unsigned int mb_watchdog = 0;
622 #endif /* CONFIG_EMBEDDED */
623
624 /* The following are used to serialize m_clalloc() */
625 static boolean_t mb_clalloc_busy;
626 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
627 static int mb_clalloc_waiters;
628
629 static void mbuf_mtypes_sync(boolean_t);
630 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
631 static void mbuf_stat_sync(void);
632 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
633 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
634 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
635 static char *mbuf_dump(void);
636 static void mbuf_table_init(void);
637 static inline void m_incref(struct mbuf *);
638 static inline u_int32_t m_decref(struct mbuf *);
639 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
640 static void mbuf_worker_thread_init(void);
641 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
642 static void slab_free(mbuf_class_t, mcache_obj_t *);
643 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
644 unsigned int, int);
645 static void mbuf_slab_free(void *, mcache_obj_t *, int);
646 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
647 static void mbuf_slab_notify(void *, u_int32_t);
648 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
649 unsigned int);
650 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
651 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
652 unsigned int, int);
653 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
654 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
655 static int freelist_populate(mbuf_class_t, unsigned int, int);
656 static void freelist_init(mbuf_class_t);
657 static boolean_t mbuf_cached_above(mbuf_class_t, int);
658 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
659 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
660 static int m_howmany(int, size_t);
661 static void mbuf_worker_thread(void);
662 static void mbuf_watchdog(void);
663 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
664
665 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
666 size_t, unsigned int);
667 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
668 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
669 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
670 boolean_t);
671 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
672 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
673 static void mcl_audit_mcheck_panic(struct mbuf *);
674 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
675
676 static void mleak_activate(void);
677 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
678 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
679 static void mleak_free(mcache_obj_t *);
680 static void mleak_sort_traces(void);
681 static void mleak_update_stats(void);
682
683 static mcl_slab_t *slab_get(void *);
684 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
685 void *, void *, unsigned int, int, int);
686 static void slab_insert(mcl_slab_t *, mbuf_class_t);
687 static void slab_remove(mcl_slab_t *, mbuf_class_t);
688 static boolean_t slab_inrange(mcl_slab_t *, void *);
689 static void slab_nextptr_panic(mcl_slab_t *, void *);
690 static void slab_detach(mcl_slab_t *);
691 static boolean_t slab_is_detached(mcl_slab_t *);
692
693 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
694 static struct mbuf *m_split0(struct mbuf *, int, int, int);
695
696 /* flags for m_copyback0 */
697 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
698 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
699 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
700 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
701
702 /*
703 * This flag is set for all mbufs that come out of and into the composite
704 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
705 * are marked with such a flag have clusters attached to them, and will be
706 * treated differently when they are freed; instead of being placed back
707 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
708 * are placed back into the appropriate composite cache's freelist, and the
709 * actual freeing is deferred until the composite objects are purged. At
710 * such a time, this flag will be cleared from the mbufs and the objects
711 * will be freed into their own separate freelists.
712 */
713 #define EXTF_COMPOSITE 0x1
714
715 /*
716 * This flag indicates that the external cluster is read-only, i.e. it is
717 * or was referred to by more than one mbuf. Once set, this flag is never
718 * cleared.
719 */
720 #define EXTF_READONLY 0x2
721 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
722
723 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
724 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
725 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
726 #define MBUF_IS_COMPOSITE(m) \
727 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
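/*
 * Sketch (simplified; see the actual m_free()/m_freem_list() logic):
 * a freed composite mbuf is returned whole to its intermediate cache
 * rather than being torn apart, along the lines of
 *
 *	if (MBUF_IS_COMPOSITE(m))
 *		mcache_free(m_cache(MC_MBUF_CL), m);
 *
 * where MC_MBUF_CL stands in for whichever composite class the object
 * belongs to.
 */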
728
729 /*
730 * Macros used to verify the integrity of the mbuf.
731 */
732 #define _MCHECK(m) { \
733 if ((m)->m_type != MT_FREE) { \
734 if (mclaudit == NULL) \
735 panic("MCHECK: m_type=%d m=%p", \
736 (u_int16_t)(m)->m_type, m); \
737 else \
738 mcl_audit_mcheck_panic(m); \
739 } \
740 }
741
742 #define MBUF_IN_MAP(addr) \
743 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
744
745 #define MRANGE(addr) { \
746 if (!MBUF_IN_MAP(addr)) \
747 panic("MRANGE: address out of range 0x%p", addr); \
748 }
749
750 /*
751 * Macro version of mtod.
752 */
753 #define MTOD(m, t) ((t)((m)->m_data))
754
755 /*
756 * Macros to obtain (4KB) cluster index and base cluster address.
757 */
758
759 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
760 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
761
762 /*
763 * Macro to find the mbuf index relative to a base.
764 */
765 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
766
767 /*
768 * Same thing for 2KB cluster index.
769 */
770 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
771
772 /*
773 * Macros used during mbuf and cluster initialization.
774 */
775 #define MBUF_INIT(m, pkthdr, type) { \
776 _MCHECK(m); \
777 (m)->m_next = (m)->m_nextpkt = NULL; \
778 (m)->m_len = 0; \
779 (m)->m_type = type; \
780 if ((pkthdr) == 0) { \
781 (m)->m_data = (m)->m_dat; \
782 (m)->m_flags = 0; \
783 } else { \
784 (m)->m_data = (m)->m_pktdat; \
785 (m)->m_flags = M_PKTHDR; \
786 (m)->m_pkthdr.rcvif = NULL; \
787 (m)->m_pkthdr.len = 0; \
788 (m)->m_pkthdr.header = NULL; \
789 (m)->m_pkthdr.csum_flags = 0; \
790 (m)->m_pkthdr.csum_data = 0; \
791 (m)->m_pkthdr.tso_segsz = 0; \
792 (m)->m_pkthdr.vlan_tag = 0; \
793 (m)->m_pkthdr.socket_id = 0; \
794 (m)->m_pkthdr.vt_nrecs = 0; \
795 (m)->m_pkthdr.aux_flags = 0; \
796 m_tag_init(m); \
797 m_service_class_init(m); \
798 } \
799 }
800
801 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
802 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
803 (m)->m_flags |= M_EXT; \
804 (m)->m_ext.ext_size = (size); \
805 (m)->m_ext.ext_free = (free); \
806 (m)->m_ext.ext_arg = (arg); \
807 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
808 &(m)->m_ext.ext_refs; \
809 MEXT_RFA(m) = (rfa); \
810 MEXT_REF(m) = (ref); \
811 MEXT_FLAGS(m) = (flag); \
812 }
813
814 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
815 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
816
817 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
818 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
819
820 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
821 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
822
823 /*
824 * Macro to convert BSD malloc sleep flag to mcache's
825 */
826 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
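/*
 * Callers passing M_DONTWAIT get MCR_NOSLEEP; everything else maps to
 * MCR_SLEEP.  As an illustrative sketch (hypothetical locals; see
 * m_mclget() later in this file for the real code), attaching a 2KB
 * cluster using the macros above would look roughly like:
 *
 *	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) != NULL &&
 *	    (m->m_ext.ext_buf = m_mclalloc(wait)) != NULL)
 *		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
 */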
827
828 /*
829 * The structure that holds all mbuf class statistics exportable via sysctl.
830 * Similar to mbstat structure, the mb_stat structure is protected by the
831 * global mbuf lock. It contains additional information about the classes
832 * that allows for a more accurate view of the state of the allocator.
833 */
834 struct mb_stat *mb_stat;
835 struct omb_stat *omb_stat; /* For backwards compatibility */
836
837 #define MB_STAT_SIZE(n) \
838 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
839 #define OMB_STAT_SIZE(n) \
840 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
841
842 /*
843 * The legacy structure holding all of the mbuf allocation statistics.
844 * The actual statistics used by the kernel are stored in the mbuf_table
845 * instead, and are updated atomically while the global mbuf lock is held.
846 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
847 * Unlike before, the kernel no longer relies on the contents of mbstat for
848 * its operations (e.g. cluster expansion) because the structure is exposed
849 * to the outside and could possibly be modified, therefore making it unsafe.
850 * With the exception of the mbstat.m_mtypes array (see below), all of the
851 * statistics are updated as they change.
852 */
853 struct mbstat mbstat;
854
855 #define MBSTAT_MTYPES_MAX \
856 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
857
858 /*
859 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
860 * atomically and stored in a per-CPU structure which is lock-free; this is
861 * done in order to avoid writing to the global mbstat data structure which
862 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
863 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
864 * array and returned to the application. Any updates for types greater than
865 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
866 * performance but is okay since the kernel uses only up to MT_MAX-1 while
867 * anything beyond that (up to type 255) is considered a corner case.
868 */
869 typedef struct {
870 unsigned int cpu_mtypes[MT_MAX];
871 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
872
873 typedef struct {
874 mtypes_cpu_t mbs_cpu[1];
875 } mbuf_mtypes_t;
876
877 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
878
879 #define MBUF_MTYPES_SIZE(n) \
880 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
881
882 #define MTYPES_CPU(p) \
883 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
884
885 #define mtype_stat_add(type, n) { \
886 if ((unsigned)(type) < MT_MAX) { \
887 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
888 atomic_add_32(&mbs->cpu_mtypes[type], n); \
889 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
890 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
891 } \
892 }
893
894 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
895 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
896 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
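/*
 * Typical usage (sketch): when an mbuf changes type the counters are
 * adjusted pairwise, e.g. on allocation of a data mbuf:
 *
 *	mtype_stat_inc(MT_DATA);
 *	mtype_stat_dec(MT_FREE);
 */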
897
898 static void
899 mbuf_mtypes_sync(boolean_t locked)
900 {
901 int m, n;
902 mtypes_cpu_t mtc;
903
904 if (locked)
905 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
906
907 bzero(&mtc, sizeof (mtc));
908 for (m = 0; m < ncpu; m++) {
909 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
910 mtypes_cpu_t temp;
911
912 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
913 sizeof (temp.cpu_mtypes));
914
915 for (n = 0; n < MT_MAX; n++)
916 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
917 }
918 if (!locked)
919 lck_mtx_lock(mbuf_mlock);
920 for (n = 0; n < MT_MAX; n++)
921 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
922 if (!locked)
923 lck_mtx_unlock(mbuf_mlock);
924 }
925
926 static int
927 mbstat_sysctl SYSCTL_HANDLER_ARGS
928 {
929 #pragma unused(oidp, arg1, arg2)
930 mbuf_mtypes_sync(FALSE);
931
932 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
933 }
934
935 static void
936 mbuf_stat_sync(void)
937 {
938 mb_class_stat_t *sp;
939 mcache_cpu_t *ccp;
940 mcache_t *cp;
941 int k, m, bktsize;
942
943 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
944
945 for (k = 0; k < NELEM(mbuf_table); k++) {
946 cp = m_cache(k);
947 ccp = &cp->mc_cpu[0];
948 bktsize = ccp->cc_bktsize;
949 sp = mbuf_table[k].mtbl_stats;
950
951 if (cp->mc_flags & MCF_NOCPUCACHE)
952 sp->mbcl_mc_state = MCS_DISABLED;
953 else if (cp->mc_purge_cnt > 0)
954 sp->mbcl_mc_state = MCS_PURGING;
955 else if (bktsize == 0)
956 sp->mbcl_mc_state = MCS_OFFLINE;
957 else
958 sp->mbcl_mc_state = MCS_ONLINE;
959
960 sp->mbcl_mc_cached = 0;
961 for (m = 0; m < ncpu; m++) {
962 ccp = &cp->mc_cpu[m];
963 if (ccp->cc_objs > 0)
964 sp->mbcl_mc_cached += ccp->cc_objs;
965 if (ccp->cc_pobjs > 0)
966 sp->mbcl_mc_cached += ccp->cc_pobjs;
967 }
968 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
969 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
970 sp->mbcl_infree;
971
972 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
973 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
974 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
975
976 /* Calculate total count specific to each class */
977 sp->mbcl_ctotal = sp->mbcl_total;
978 switch (m_class(k)) {
979 case MC_MBUF:
980 /* Deduct mbufs used in composite caches */
981 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
982 m_total(MC_MBUF_BIGCL));
983 break;
984
985 case MC_CL:
986 /* Deduct clusters used in composite cache */
987 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
988 break;
989
990 case MC_BIGCL:
991 /* Deduct clusters used in composite cache */
992 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
993 break;
994
995 case MC_16KCL:
996 /* Deduct clusters used in composite cache */
997 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
998 break;
999
1000 default:
1001 break;
1002 }
1003 }
1004 }
1005
1006 static int
1007 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1008 {
1009 #pragma unused(oidp, arg1, arg2)
1010 void *statp;
1011 int k, statsz, proc64 = proc_is64bit(req->p);
1012
1013 lck_mtx_lock(mbuf_mlock);
1014 mbuf_stat_sync();
1015
1016 if (!proc64) {
1017 struct omb_class_stat *oc;
1018 struct mb_class_stat *c;
1019
1020 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1021 oc = &omb_stat->mbs_class[0];
1022 c = &mb_stat->mbs_class[0];
1023 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1024 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1025 "%s", c->mbcl_cname);
1026 oc->mbcl_size = c->mbcl_size;
1027 oc->mbcl_total = c->mbcl_total;
1028 oc->mbcl_active = c->mbcl_active;
1029 oc->mbcl_infree = c->mbcl_infree;
1030 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1031 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1032 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1033 oc->mbcl_notified = c->mbcl_notified;
1034 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1035 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1036 oc->mbcl_ctotal = c->mbcl_ctotal;
1037 oc->mbcl_mc_state = c->mbcl_mc_state;
1038 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1039 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1040 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1041 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1042 }
1043 statp = omb_stat;
1044 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1045 } else {
1046 statp = mb_stat;
1047 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1048 }
1049
1050 lck_mtx_unlock(mbuf_mlock);
1051
1052 return (SYSCTL_OUT(req, statp, statsz));
1053 }
1054
1055 static int
1056 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1057 {
1058 #pragma unused(oidp, arg1, arg2)
1059 int i;
1060
1061 /* Ensure leak tracing turned on */
1062 if (!mclfindleak || !mclexpleak)
1063 return (ENXIO);
1064
1065 lck_mtx_lock(mleak_lock);
1066 mleak_update_stats();
1067 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1068 lck_mtx_unlock(mleak_lock);
1069
1070 return (i);
1071 }
1072
1073 static int
1074 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1075 {
1076 #pragma unused(oidp, arg1, arg2)
1077 int i = 0;
1078
1079 /* Ensure leak tracing turned on */
1080 if (!mclfindleak || !mclexpleak)
1081 return (ENXIO);
1082
1083 lck_mtx_lock(mleak_lock);
1084 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1085 lck_mtx_unlock(mleak_lock);
1086
1087 return (i);
1088 }
1089
1090 static inline void
1091 m_incref(struct mbuf *m)
1092 {
1093 UInt32 old, new;
1094 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1095
1096 do {
1097 old = *addr;
1098 new = old + 1;
1099 ASSERT(new != 0);
1100 } while (!OSCompareAndSwap(old, new, addr));
1101
1102 /*
1103 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1104 * we don't clear the flag when the refcount goes back to 1
1105 * to simplify code calling m_mclhasreference().
1106 */
1107 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1108 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1109 }
1110
1111 static inline u_int32_t
1112 m_decref(struct mbuf *m)
1113 {
1114 UInt32 old, new;
1115 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1116
1117 do {
1118 old = *addr;
1119 new = old - 1;
1120 ASSERT(old != 0);
1121 } while (!OSCompareAndSwap(old, new, addr));
1122
1123 return (new);
1124 }
1125
1126 static void
1127 mbuf_table_init(void)
1128 {
1129 unsigned int b, c, s;
1130 int m;
1131
1132 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1133 M_TEMP, M_WAITOK | M_ZERO);
1134 VERIFY(omb_stat != NULL);
1135
1136 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1137 M_TEMP, M_WAITOK | M_ZERO);
1138 VERIFY(mb_stat != NULL);
1139
1140 mb_stat->mbs_cnt = NELEM(mbuf_table);
1141 for (m = 0; m < NELEM(mbuf_table); m++)
1142 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1143
1144 #if CONFIG_MBUF_JUMBO
1145 /*
1146 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1147 * this only on platforms where jumbo cluster pool is enabled.
1148 */
1149 njcl = nmbclusters / 3;
1150 njclbytes = M16KCLBYTES;
1151 #endif /* CONFIG_MBUF_JUMBO */
1152
1153 /*
1154 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1155 * a multiple of 4KB clusters.
1156 */
1157 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1158 if (njcl > 0) {
1159 /*
1160 * Each jumbo cluster takes 8 2KB clusters, so make
1161 * sure that the pool size is evenly divisible by 8;
1162 * njcl is in 2KB unit, hence treated as such.
1163 */
1164 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1165
1166 /* Update nclusters with rounded down value of njcl */
1167 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1168 }
1169
1170 /*
1171 * njcl is valid only on platforms with 16KB jumbo clusters, where
1172 * it is configured to 1/3 of the pool size. On these platforms,
1173 * the remainder is used for 2KB and 4KB clusters. On platforms
1174 * without 16KB jumbo clusters, the entire pool is used for both
1175 * 2KB and 4KB clusters. A 4KB cluster can either be split into
1176 * 16 mbufs, or into 2 2KB clusters.
1177 *
1178 * +---+---+------------ ... -----------+------- ... -------+
1179 * | c | b | s | njcl |
1180 * +---+---+------------ ... -----------+------- ... -------+
1181 *
1182 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1183 * clusters (1/64th each.)
1184 */
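	/*
	 * Worked example (illustrative figures only): with a 64 MB pool,
	 * i.e. nmbclusters == 32768 and no jumbo pool, nclusters == 32768,
	 * giving c == 512 (1 MB of pure 2KB clusters), b == 256 (1 MB of
	 * pure 4KB clusters) and s == 31744 2KB-units for the general pool.
	 */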
1185 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1186 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1187 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
1188
1189 /*
1190 * 1/64th (c) is reserved for 2KB clusters.
1191 */
1192 m_minlimit(MC_CL) = c;
1193 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1194 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1195 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1196
1197 /*
1198 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1199 * It cannot be turned into 2KB clusters or mbufs.
1200 */
1201 m_minlimit(MC_BIGCL) = b;
1202 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1203 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1204 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1205
1206 /*
1207 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1208 */
1209 m_minlimit(MC_MBUF) = 0;
1210 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1211 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1212 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1213
1214 /*
1215 * Set limits for the composite classes.
1216 */
1217 m_minlimit(MC_MBUF_CL) = 0;
1218 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1219 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1220 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1221 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1222
1223 m_minlimit(MC_MBUF_BIGCL) = 0;
1224 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1225 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1226 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1227 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1228
1229 /*
1230 * And for jumbo classes.
1231 */
1232 m_minlimit(MC_16KCL) = 0;
1233 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1234 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1235 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1236
1237 m_minlimit(MC_MBUF_16KCL) = 0;
1238 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1239 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1240 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1241 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1242
1243 /*
1244 * Initialize the legacy mbstat structure.
1245 */
1246 bzero(&mbstat, sizeof (mbstat));
1247 mbstat.m_msize = m_maxsize(MC_MBUF);
1248 mbstat.m_mclbytes = m_maxsize(MC_CL);
1249 mbstat.m_minclsize = MINCLSIZE;
1250 mbstat.m_mlen = MLEN;
1251 mbstat.m_mhlen = MHLEN;
1252 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1253 }
1254
1255 #if defined(__LP64__)
1256 typedef struct ncl_tbl {
1257 uint64_t nt_maxmem; /* memory (sane) size */
1258 uint32_t nt_mbpool; /* mbuf pool size */
1259 } ncl_tbl_t;
1260
1261 /* Non-server */
1262 static ncl_tbl_t ncl_table[] = {
1263 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1264 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1265 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1266 { 0, 0 }
1267 };
1268
1269 /* Server */
1270 static ncl_tbl_t ncl_table_srv[] = {
1271 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1272 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1273 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1274 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1275 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1276 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1277 { 0, 0 }
1278 };
1279 #endif /* __LP64__ */
1280
1281 __private_extern__ unsigned int
1282 mbuf_default_ncl(int server, uint64_t mem)
1283 {
1284 #if !defined(__LP64__)
1285 #pragma unused(server)
1286 unsigned int n;
1287 /*
1288 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1289 */
1290 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1291 n = 32768;
1292 #else
1293 unsigned int n, i;
1294 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1295 /*
1296 * 64-bit kernel (mbuf pool size based on table).
1297 */
1298 n = tbl[0].nt_mbpool;
1299 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1300 if (mem < tbl[i].nt_maxmem)
1301 break;
1302 n = tbl[i].nt_mbpool;
1303 }
1304 n >>= MCLSHIFT;
1305 #endif /* !__LP64__ */
1306 return (n);
1307 }
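/*
 * For example (illustrative, assuming the usual 2KB MCLBYTES): on a
 * 64-bit non-server system with 8 GB of memory, the table walk above
 * settles on a 96 MB mbuf pool, so mbuf_default_ncl() returns
 * 96 MB / 2 KB == 49152 clusters.
 */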
1308
1309 __private_extern__ void
1310 mbinit(void)
1311 {
1312 unsigned int m;
1313 unsigned int initmcl = 0;
1314 void *buf;
1315 thread_t thread = THREAD_NULL;
1316
1317 /*
1318 * These MBUF_ values must be equal to their private counterparts.
1319 */
1320 _CASSERT(MBUF_EXT == M_EXT);
1321 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1322 _CASSERT(MBUF_EOR == M_EOR);
1323 _CASSERT(MBUF_LOOP == M_LOOP);
1324 _CASSERT(MBUF_BCAST == M_BCAST);
1325 _CASSERT(MBUF_MCAST == M_MCAST);
1326 _CASSERT(MBUF_FRAG == M_FRAG);
1327 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1328 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1329 _CASSERT(MBUF_PROMISC == M_PROMISC);
1330 _CASSERT(MBUF_HASFCS == M_HASFCS);
1331
1332 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1333 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1334 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1335 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1336 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1337 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1338 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1339 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1340 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1341 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1342 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1343 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1344 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1345 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1346 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1347
1348 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1349 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1350 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16);
1351 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1352 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1353 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1354 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1355 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1356 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1357 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1358 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1359 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1360 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1361
1362 _CASSERT(MBUF_WAITOK == M_WAIT);
1363 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1364 _CASSERT(MBUF_COPYALL == M_COPYALL);
1365
1366 _CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR);
1367 _CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR);
1368
1369 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1370 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1371 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1372 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1373 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1374 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1375 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1376 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1377 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1378 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1379
1380 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1381 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1382 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1383 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1384
1385 if (nmbclusters == 0)
1386 nmbclusters = NMBCLUSTERS;
1387
1388 /* This should be a sane (at least even) value by now */
1389 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1390
1391 /* Setup the mbuf table */
1392 mbuf_table_init();
1393
1394 /* Global lock for common layer */
1395 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1396 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1397 mbuf_mlock_attr = lck_attr_alloc_init();
1398 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1399
1400 /*
1401 * Allocate cluster slabs table:
1402 *
1403 * maxslabgrp = (N * 2048) / (1024 * 1024)
1404 *
1405 * Where N is nmbclusters rounded up to the nearest 512. This yields
1406 * mcl_slabg_t units, each one representing 1 MB of memory.
1407 */
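	/*
	 * For illustration: with nmbclusters == 32768 (a 64 MB pool), the
	 * expression below yields maxslabgrp == 64, i.e. one slab group
	 * per MB of cluster memory.
	 */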
1408 maxslabgrp =
1409 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1410 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1411 M_TEMP, M_WAITOK | M_ZERO);
1412 VERIFY(slabstbl != NULL);
1413
1414 /*
1415 * Allocate audit structures, if needed:
1416 *
1417 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1418 *
1419 * This yields mcl_audit_t units, each one representing a page.
1420 */
1421 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1422 mbuf_debug |= mcache_getflags();
1423 if (mbuf_debug & MCF_DEBUG) {
1424 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1425 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1426 M_TEMP, M_WAITOK | M_ZERO);
1427 VERIFY(mclaudit != NULL);
1428
1429 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1430 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1431 VERIFY(mcl_audit_con_cache != NULL);
1432 }
1433 mclverify = (mbuf_debug & MCF_VERIFY);
1434 mcltrace = (mbuf_debug & MCF_TRACE);
1435 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1436 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1437
1438 /* Enable mbuf leak logging, with a lock to protect the tables */
1439
1440 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1441 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1442 mleak_lock_attr = lck_attr_alloc_init();
1443 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1444
1445 mleak_activate();
1446
1447 /* Calculate the number of pages assigned to the cluster pool */
1448 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1449 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1450 M_TEMP, M_WAITOK);
1451 VERIFY(mcl_paddr != NULL);
1452
1453 /* Register with the I/O Bus mapper */
1454 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1455 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1456
1457 embutl = (union mbigcluster *)
1458 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1459 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1460
1461 /* Prime up the freelist */
1462 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1463 if (initmcl != 0) {
1464 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1465 if (initmcl > m_maxlimit(MC_BIGCL))
1466 initmcl = m_maxlimit(MC_BIGCL);
1467 }
1468 if (initmcl < m_minlimit(MC_BIGCL))
1469 initmcl = m_minlimit(MC_BIGCL);
1470
1471 lck_mtx_lock(mbuf_mlock);
1472
1473 /*
1474 * For classes with non-zero minimum limits, populate their freelists
1475 * so that m_total(class) is at least m_minlimit(class).
1476 */
1477 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1478 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1479 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1480 freelist_init(m_class(MC_CL));
1481
1482 for (m = 0; m < NELEM(mbuf_table); m++) {
1483 /* Make sure we didn't miss any */
1484 VERIFY(m_minlimit(m_class(m)) == 0 ||
1485 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1486 }
1487
1488 lck_mtx_unlock(mbuf_mlock);
1489
1490 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1491 NULL, &thread);
1492 thread_deallocate(thread);
1493
1494 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1495 0, 0, MCR_SLEEP);
1496
1497 /* Create the cache for each class */
1498 for (m = 0; m < NELEM(mbuf_table); m++) {
1499 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1500 u_int32_t flags;
1501
1502 flags = mbuf_debug;
1503 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1504 m_class(m) == MC_MBUF_16KCL) {
1505 allocfunc = mbuf_cslab_alloc;
1506 freefunc = mbuf_cslab_free;
1507 auditfunc = mbuf_cslab_audit;
1508 logfunc = mleak_logger;
1509 } else {
1510 allocfunc = mbuf_slab_alloc;
1511 freefunc = mbuf_slab_free;
1512 auditfunc = mbuf_slab_audit;
1513 logfunc = mleak_logger;
1514 }
1515
1516 /*
1517 * Disable per-CPU caches for jumbo classes if there
1518 * is no jumbo cluster pool available in the system.
1519 * The cache itself is still created (but will never
1520 * be populated) since it simplifies the code.
1521 */
1522 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1523 njcl == 0)
1524 flags |= MCF_NOCPUCACHE;
1525
1526 if (!mclfindleak)
1527 flags |= MCF_NOLEAKLOG;
1528
1529 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1530 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1531 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1532 }
1533
1534 /*
1535 * Allocate the structure for per-CPU statistics, aligned on a
1536 * CPU cache line boundary; this code assumes that we never
1537 * uninitialize this framework, since the original address
1538 * before alignment is not saved.
1539 */
1540 ncpu = ml_get_max_cpus();
1541 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1542 M_TEMP, M_WAITOK);
1543 VERIFY(buf != NULL);
1544
1545 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1546 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
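
/*
 * Editorial sketch (not from the original source): P2ROUNDUP() rounds
 * the buffer up to the next CPU_CACHE_SIZE boundary, which is why
 * CPU_CACHE_SIZE extra bytes were requested in the MALLOC above.
 * Assuming a hypothetical 64-byte cache line:
 *
 *	buf     = 0x1010                        (from MALLOC)
 *	aligned = P2ROUNDUP(0x1010, 64) = 0x1040
 *	slack   = 0x1040 - 0x1010       = 0x30  (always < 64)
 *
 * The pre-alignment address is discarded, hence the caveat above about
 * never uninitializing this framework.
 */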
1547
1548 /*
1549 * Set the max limit on sb_max to be 1/16th of the size of the
1550 * memory allocated for mbuf clusters.
1551 */
1552 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1553 if (high_sb_max < sb_max) {
1554 /* sb_max is too large for this configuration, scale it down */
1555 if (high_sb_max > (1 << MBSHIFT)) {
1556 /* We have at least 16 MB of mbuf pool */
1557 sb_max = high_sb_max;
1558 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1559 /*
1560 * If we have more than 1 MB of mbuf pool, cap the size of
1561 * the max socket buffer at 1 MB.
1562 */
1563 sb_max = high_sb_max = (1 << MBSHIFT);
1564 } else {
1565 sb_max = high_sb_max;
1566 }
1567 }
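
/*
 * Editorial sketch (illustrative values only, not from the original
 * source): with nmbclusters = 32768 and MCLBYTES = 2048 the cluster
 * pool is 64 MB, so
 *
 *	high_sb_max = nmbclusters << (MCLSHIFT - 4)
 *	            = 32768 << 7
 *	            = 4 MB                 (i.e. 64 MB / 16)
 *
 * Since 4 MB exceeds 1 MB (1 << MBSHIFT), an sb_max configured larger
 * than 4 MB would be scaled down to 4 MB by the block above.
 */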
1568
1569 /* allocate space for mbuf_dump_buf */
1570 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1571 VERIFY(mbuf_dump_buf != NULL);
1572
1573 printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1574 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1575 (nclusters << MCLSHIFT) >> MBSHIFT,
1576 (njcl << MCLSHIFT) >> MBSHIFT);
1577 }
1578
1579 /*
1580 * Obtain a slab of object(s) from the class's freelist.
1581 */
1582 static mcache_obj_t *
1583 slab_alloc(mbuf_class_t class, int wait)
1584 {
1585 mcl_slab_t *sp;
1586 mcache_obj_t *buf;
1587
1588 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1589
1590 VERIFY(class != MC_16KCL || njcl > 0);
1591
1592 /* This should always be NULL for us */
1593 VERIFY(m_cobjlist(class) == NULL);
1594
1595 /*
1596 * Treat composite objects as having a longer lifespan by using
1597 * a slab from the reverse direction, in the hope that this
1598 * reduces the probability of fragmentation for slabs that hold
1599 * more than one buffer chunk (e.g. mbuf slabs). For other
1600 * slabs, this probably doesn't make much of a difference.
1601 */
1602 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1603 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1604 else
1605 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1606
1607 if (sp == NULL) {
1608 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1609 /* The slab list for this class is empty */
1610 return (NULL);
1611 }
1612
1613 VERIFY(m_infree(class) > 0);
1614 VERIFY(!slab_is_detached(sp));
1615 VERIFY(sp->sl_class == class &&
1616 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1617 buf = sp->sl_head;
1618 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1619
1620 if (class == MC_MBUF) {
1621 sp->sl_head = buf->obj_next;
1622 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1623 } else if (class == MC_CL) {
1624 sp->sl_head = buf->obj_next;
1625 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1626 } else {
1627 sp->sl_head = NULL;
1628 }
1629 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1630 slab_nextptr_panic(sp, sp->sl_head);
1631 /* In case sl_head is in the map but not in the slab */
1632 VERIFY(slab_inrange(sp, sp->sl_head));
1633 /* NOTREACHED */
1634 }
1635
1636 /* Increment slab reference */
1637 sp->sl_refcnt++;
1638
1639 if (mclaudit != NULL) {
1640 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1641 mca->mca_uflags = 0;
1642 /* Save contents on mbuf objects only */
1643 if (class == MC_MBUF)
1644 mca->mca_uflags |= MB_SCVALID;
1645 }
1646
1647 if (class == MC_CL) {
1648 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1649 /*
1650 * A 2K cluster slab can have at most NCLPBG references.
1651 */
1652 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1653 sp->sl_chunks == NCLPBG &&
1654 sp->sl_len == m_maxsize(MC_BIGCL));
1655 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1656 } else if (class == MC_BIGCL) {
1657 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1658 m_infree(MC_MBUF_BIGCL);
1659 /*
1660 * A 4K cluster slab can have at most 1 reference.
1661 */
1662 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1663 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1664 } else if (class == MC_16KCL) {
1665 mcl_slab_t *nsp;
1666 int k;
1667
1668 --m_infree(MC_16KCL);
1669 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1670 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1671 /*
1672 * Increment the 2nd through Nth slab references, where N is
1673 * NSLABSP16KB.  A 16KB big cluster spans NSLABSP16KB slabs,
1674 * each having at most 1 reference.
1675 */
1676 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1677 nsp = nsp->sl_next;
1678 /* Next slab must already be present */
1679 VERIFY(nsp != NULL);
1680 nsp->sl_refcnt++;
1681 VERIFY(!slab_is_detached(nsp));
1682 VERIFY(nsp->sl_class == MC_16KCL &&
1683 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1684 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1685 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1686 nsp->sl_head == NULL);
1687 }
1688 } else {
1689 VERIFY(class == MC_MBUF);
1690 --m_infree(MC_MBUF);
1691 /*
1692 * If auditing is turned on, this check is
1693 * deferred until later in mbuf_slab_audit().
1694 */
1695 if (mclaudit == NULL)
1696 _MCHECK((struct mbuf *)buf);
1697 /*
1698 * Since we have incremented the reference count above,
1699 * an mbuf slab (formerly a 4KB cluster slab that was cut
1700 * up into mbufs) must have a reference count between 1
1701 * and NMBPBG at this point.
1702 */
1703 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1704 sp->sl_chunks == NMBPBG &&
1705 sp->sl_len == m_maxsize(MC_BIGCL));
1706 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1707 }
1708
1709 /* If empty, remove this slab from the class's freelist */
1710 if (sp->sl_head == NULL) {
1711 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1712 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1713 slab_remove(sp, class);
1714 }
1715
1716 return (buf);
1717 }
1718
1719 /*
1720 * Place a slab of object(s) back into a class's slab list.
1721 */
1722 static void
1723 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1724 {
1725 mcl_slab_t *sp;
1726
1727 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1728
1729 VERIFY(class != MC_16KCL || njcl > 0);
1730 VERIFY(buf->obj_next == NULL);
1731 sp = slab_get(buf);
1732 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1733 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1734
1735 /* Decrement slab reference */
1736 sp->sl_refcnt--;
1737
1738 if (class == MC_CL) {
1739 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1740 /*
1741 * A slab that has been split into 2KB clusters can have
1742 * at most 1 outstanding reference at this point.
1743 */
1744 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1745 sp->sl_chunks == NCLPBG &&
1746 sp->sl_len == m_maxsize(MC_BIGCL));
1747 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1748 (slab_is_detached(sp) && sp->sl_head == NULL));
1749 } else if (class == MC_BIGCL) {
1750 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1751 /*
1752 * A 4KB cluster slab can have at most 1 reference
1753 * which must be 0 at this point.
1754 */
1755 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1756 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1757 VERIFY(slab_is_detached(sp));
1758 } else if (class == MC_16KCL) {
1759 mcl_slab_t *nsp;
1760 int k;
1761 /*
1762 * A 16KB cluster spans NSLABSP16KB slabs, all of which
1763 * must now have a reference count of 0.
1764 */
1765 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1766 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1767 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1768 VERIFY(slab_is_detached(sp));
1769 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1770 nsp = nsp->sl_next;
1771 /* Next slab must already be present */
1772 VERIFY(nsp != NULL);
1773 nsp->sl_refcnt--;
1774 VERIFY(slab_is_detached(nsp));
1775 VERIFY(nsp->sl_class == MC_16KCL &&
1776 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1777 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1778 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1779 nsp->sl_head == NULL);
1780 }
1781 } else {
1782 /*
1783 * A slab that has been split into mbufs can have at most NMBPBG
1784 * references. Since we have decremented one reference
1785 * above, it must now be between 0 and NMBPBG-1.
1786 */
1787 VERIFY(class == MC_MBUF);
1788 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1789 sp->sl_chunks == NMBPBG &&
1790 sp->sl_len == m_maxsize(MC_BIGCL));
1791 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1792 (slab_is_detached(sp) && sp->sl_head == NULL));
1793 }
1794
1795 /*
1796 * When auditing is enabled, ensure that the buffer still
1797 * contains the free pattern; otherwise it was corrupted
1798 * while in the CPU cache layer.
1799 */
1800 if (mclaudit != NULL) {
1801 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1802 if (mclverify) {
1803 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1804 }
1805 mca->mca_uflags &= ~MB_SCVALID;
1806 }
1807
1808 if (class == MC_CL) {
1809 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1810 buf->obj_next = sp->sl_head;
1811 } else if (class == MC_BIGCL) {
1812 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1813 m_infree(MC_MBUF_BIGCL);
1814 } else if (class == MC_16KCL) {
1815 ++m_infree(MC_16KCL);
1816 } else {
1817 ++m_infree(MC_MBUF);
1818 buf->obj_next = sp->sl_head;
1819 }
1820 sp->sl_head = buf;
1821
1822 /*
1823 * If a slab has been split into either 2KB clusters or mbufs and
1824 * is now fully free, turn it back into one that holds a single
1825 * 4KB cluster.
1826 */
1827 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1828 m_total(class) > m_minlimit(class) &&
1829 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1830 int i = NMBPBG;
1831
1832 m_total(MC_BIGCL)++;
1833 mbstat.m_bigclusters = m_total(MC_BIGCL);
1834 m_total(MC_MBUF) -= NMBPBG;
1835 mbstat.m_mbufs = m_total(MC_MBUF);
1836 m_infree(MC_MBUF) -= NMBPBG;
1837 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1838
1839 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1840 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1841
1842 while (i--) {
1843 struct mbuf *m = sp->sl_head;
1844 VERIFY(m != NULL);
1845 sp->sl_head = m->m_next;
1846 m->m_next = NULL;
1847 }
1848 VERIFY(sp->sl_head == NULL);
1849
1850 /* Remove the slab from the mbuf class's slab list */
1851 slab_remove(sp, class);
1852
1853 /* Reinitialize it as a 4KB cluster slab */
1854 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1855 sp->sl_len, 0, 1);
1856
1857 if (mclverify) {
1858 mcache_set_pattern(MCACHE_FREE_PATTERN,
1859 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1860 }
1861 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1862 m_infree(MC_MBUF_BIGCL);
1863
1864 VERIFY(slab_is_detached(sp));
1865 /* And finally switch class */
1866 class = MC_BIGCL;
1867 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1868 m_total(class) > m_minlimit(class) &&
1869 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1870 int i = NCLPBG;
1871
1872 m_total(MC_BIGCL)++;
1873 mbstat.m_bigclusters = m_total(MC_BIGCL);
1874 m_total(MC_CL) -= NCLPBG;
1875 mbstat.m_clusters = m_total(MC_CL);
1876 m_infree(MC_CL) -= NCLPBG;
1877 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1878 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1879
1880 while (i--) {
1881 union mcluster *c = sp->sl_head;
1882 VERIFY(c != NULL);
1883 sp->sl_head = c->mcl_next;
1884 c->mcl_next = NULL;
1885 }
1886 VERIFY(sp->sl_head == NULL);
1887
1888 /* Remove the slab from the 2KB cluster class's slab list */
1889 slab_remove(sp, class);
1890
1891 /* Reinitialize it as a 4KB cluster slab */
1892 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1893 sp->sl_len, 0, 1);
1894
1895 if (mclverify) {
1896 mcache_set_pattern(MCACHE_FREE_PATTERN,
1897 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1898 }
1899 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1900 m_infree(MC_MBUF_BIGCL);
1901
1902 VERIFY(slab_is_detached(sp));
1903 /* And finally switch class */
1904 class = MC_BIGCL;
1905 }
1906
1907 /* Reinsert the slab to the class's slab list */
1908 if (slab_is_detached(sp))
1909 slab_insert(sp, class);
1910 }
1911
1912 /*
1913 * Common allocator for rudimentary objects called by the CPU cache layer
1914 * during an allocation request whenever there is no available element in the
1915 * bucket layer. It returns one or more elements from the appropriate global
1916 * freelist. If the freelist is empty, it will attempt to populate it and
1917 * retry the allocation.
1918 */
1919 static unsigned int
1920 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1921 {
1922 mbuf_class_t class = (mbuf_class_t)arg;
1923 unsigned int need = num;
1924 mcache_obj_t **list = *plist;
1925
1926 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1927 ASSERT(need > 0);
1928
1929 lck_mtx_lock(mbuf_mlock);
1930
1931 for (;;) {
1932 if ((*list = slab_alloc(class, wait)) != NULL) {
1933 (*list)->obj_next = NULL;
1934 list = *plist = &(*list)->obj_next;
1935
1936 if (--need == 0) {
1937 /*
1938 * If the number of elements in the freelist has
1939 * dropped below the low watermark (1/32 of the class
1940 * total), asynchronously populate the freelist now
1941 * rather than doing it later when we run out.
1942 */
1943 if (!mbuf_cached_above(class, wait) &&
1944 m_infree(class) < m_total(class) >> 5) {
1945 (void) freelist_populate(class, 1,
1946 M_DONTWAIT);
1947 }
1948 break;
1949 }
1950 } else {
1951 VERIFY(m_infree(class) == 0 || class == MC_CL);
1952
1953 (void) freelist_populate(class, 1,
1954 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1955
1956 if (m_infree(class) > 0)
1957 continue;
1958
1959 /* Check if there's anything at the cache layer */
1960 if (mbuf_cached_above(class, wait))
1961 break;
1962
1963 /* watchdog checkpoint */
1964 mbuf_watchdog();
1965
1966 /* We have nothing and cannot block; give up */
1967 if (wait & MCR_NOSLEEP) {
1968 if (!(wait & MCR_TRYHARD)) {
1969 m_fail_cnt(class)++;
1970 mbstat.m_drops++;
1971 break;
1972 }
1973 }
1974
1975 /*
1976 * If the freelist is still empty and the caller is
1977 * willing to be blocked, sleep on the wait channel
1978 * until an element is available. Otherwise, if
1979 * MCR_TRYHARD is set, do our best to satisfy the
1980 * request without having to go to sleep.
1981 */
1982 if (mbuf_worker_ready &&
1983 mbuf_sleep(class, need, wait))
1984 break;
1985
1986 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1987 }
1988 }
1989
1990 m_alloc_cnt(class) += num - need;
1991 lck_mtx_unlock(mbuf_mlock);
1992
1993 return (num - need);
1994 }
1995
1996 /*
1997 * Common de-allocator for rudimentary objects called by the CPU cache
1998 * layer when one or more elements need to be returned to the appropriate
1999 * global freelist.
2000 */
2001 static void
2002 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2003 {
2004 mbuf_class_t class = (mbuf_class_t)arg;
2005 mcache_obj_t *nlist;
2006 unsigned int num = 0;
2007 int w;
2008
2009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2010
2011 lck_mtx_lock(mbuf_mlock);
2012
2013 for (;;) {
2014 nlist = list->obj_next;
2015 list->obj_next = NULL;
2016 slab_free(class, list);
2017 ++num;
2018 if ((list = nlist) == NULL)
2019 break;
2020 }
2021 m_free_cnt(class) += num;
2022
2023 if ((w = mb_waiters) > 0)
2024 mb_waiters = 0;
2025
2026 lck_mtx_unlock(mbuf_mlock);
2027
2028 if (w != 0)
2029 wakeup(mb_waitchan);
2030 }
2031
2032 /*
2033 * Common auditor for rudimentary objects called by the CPU cache layer
2034 * during an allocation or free request. For the former, this is called
2035 * after the objects are obtained from either the bucket or slab layer
2036 * and before they are returned to the caller. For the latter, this is
2037 * called immediately during free and before placing the objects into
2038 * the bucket or slab layer.
2039 */
2040 static void
2041 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2042 {
2043 mbuf_class_t class = (mbuf_class_t)arg;
2044 mcache_audit_t *mca;
2045
2046 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2047
2048 while (list != NULL) {
2049 lck_mtx_lock(mbuf_mlock);
2050 mca = mcl_audit_buf2mca(class, list);
2051
2052 /* Do the sanity checks */
2053 if (class == MC_MBUF) {
2054 mcl_audit_mbuf(mca, list, FALSE, alloc);
2055 ASSERT(mca->mca_uflags & MB_SCVALID);
2056 } else {
2057 mcl_audit_cluster(mca, list, m_maxsize(class),
2058 alloc, TRUE);
2059 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2060 }
2061 /* Record this transaction */
2062 if (mcltrace)
2063 mcache_buffer_log(mca, list, m_cache(class));
2064
2065 if (alloc)
2066 mca->mca_uflags |= MB_INUSE;
2067 else
2068 mca->mca_uflags &= ~MB_INUSE;
2069 /* Unpair the object (unconditionally) */
2070 mca->mca_uptr = NULL;
2071 lck_mtx_unlock(mbuf_mlock);
2072
2073 list = list->obj_next;
2074 }
2075 }
2076
2077 /*
2078 * Common notify routine for all caches. It is called by mcache when
2079 * one or more objects get freed. We use this indication to trigger
2080 * the wakeup of any sleeping threads so that they can retry their
2081 * allocation requests.
2082 */
2083 static void
2084 mbuf_slab_notify(void *arg, u_int32_t reason)
2085 {
2086 mbuf_class_t class = (mbuf_class_t)arg;
2087 int w;
2088
2089 ASSERT(MBUF_CLASS_VALID(class));
2090
2091 if (reason != MCN_RETRYALLOC)
2092 return;
2093
2094 lck_mtx_lock(mbuf_mlock);
2095 if ((w = mb_waiters) > 0) {
2096 m_notified(class)++;
2097 mb_waiters = 0;
2098 }
2099 lck_mtx_unlock(mbuf_mlock);
2100
2101 if (w != 0)
2102 wakeup(mb_waitchan);
2103 }
2104
2105 /*
2106 * Obtain object(s) from the composite class's freelist.
2107 */
2108 static unsigned int
2109 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2110 {
2111 unsigned int need = num;
2112 mcl_slab_t *sp, *clsp, *nsp;
2113 struct mbuf *m;
2114 mcache_obj_t **list = *plist;
2115 void *cl;
2116
2117 VERIFY(need > 0);
2118 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2119 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2120
2121 /* Get what we can from the freelist */
2122 while ((*list = m_cobjlist(class)) != NULL) {
2123 MRANGE(*list);
2124
2125 m = (struct mbuf *)*list;
2126 sp = slab_get(m);
2127 cl = m->m_ext.ext_buf;
2128 clsp = slab_get(cl);
2129 VERIFY(m->m_flags == M_EXT && cl != NULL);
2130 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2131
2132 if (class == MC_MBUF_CL) {
2133 VERIFY(clsp->sl_refcnt >= 1 &&
2134 clsp->sl_refcnt <= NCLPBG);
2135 } else {
2136 VERIFY(clsp->sl_refcnt == 1);
2137 }
2138
2139 if (class == MC_MBUF_16KCL) {
2140 int k;
2141 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2142 nsp = nsp->sl_next;
2143 /* Next slab must already be present */
2144 VERIFY(nsp != NULL);
2145 VERIFY(nsp->sl_refcnt == 1);
2146 }
2147 }
2148
2149 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2150 !MBUF_IN_MAP(m_cobjlist(class))) {
2151 slab_nextptr_panic(sp, m_cobjlist(class));
2152 /* NOTREACHED */
2153 }
2154 (*list)->obj_next = NULL;
2155 list = *plist = &(*list)->obj_next;
2156
2157 if (--need == 0)
2158 break;
2159 }
2160 m_infree(class) -= (num - need);
2161
2162 return (num - need);
2163 }
2164
2165 /*
2166 * Place object(s) back into a composite class's freelist.
2167 */
2168 static unsigned int
2169 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2170 {
2171 mcache_obj_t *o, *tail;
2172 unsigned int num = 0;
2173 struct mbuf *m, *ms;
2174 mcache_audit_t *mca = NULL;
2175 mcache_obj_t *ref_list = NULL;
2176 mcl_slab_t *clsp, *nsp;
2177 void *cl;
2178 mbuf_class_t cl_class;
2179
2180 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2181 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2182 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2183
2184 if (class == MC_MBUF_CL) {
2185 cl_class = MC_CL;
2186 } else if (class == MC_MBUF_BIGCL) {
2187 cl_class = MC_BIGCL;
2188 } else {
2189 VERIFY(class == MC_MBUF_16KCL);
2190 cl_class = MC_16KCL;
2191 }
2192
2193 o = tail = list;
2194
2195 while ((m = ms = (struct mbuf *)o) != NULL) {
2196 mcache_obj_t *rfa, *nexto = o->obj_next;
2197
2198 /* Do the mbuf sanity checks */
2199 if (mclaudit != NULL) {
2200 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2201 if (mclverify) {
2202 mcache_audit_free_verify(mca, m, 0,
2203 m_maxsize(MC_MBUF));
2204 }
2205 ms = (struct mbuf *)mca->mca_contents;
2206 }
2207
2208 /* Do the cluster sanity checks */
2209 cl = ms->m_ext.ext_buf;
2210 clsp = slab_get(cl);
2211 if (mclverify) {
2212 size_t size = m_maxsize(cl_class);
2213 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2214 (mcache_obj_t *)cl), cl, 0, size);
2215 }
2216 VERIFY(ms->m_type == MT_FREE);
2217 VERIFY(ms->m_flags == M_EXT);
2218 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2219 if (cl_class == MC_CL) {
2220 VERIFY(clsp->sl_refcnt >= 1 &&
2221 clsp->sl_refcnt <= NCLPBG);
2222 } else {
2223 VERIFY(clsp->sl_refcnt == 1);
2224 }
2225 if (cl_class == MC_16KCL) {
2226 int k;
2227 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2228 nsp = nsp->sl_next;
2229 /* Next slab must already be present */
2230 VERIFY(nsp != NULL);
2231 VERIFY(nsp->sl_refcnt == 1);
2232 }
2233 }
2234
2235 /*
2236 * If we're asked to purge, restore the actual mbuf using
2237 * contents of the shadow structure (if auditing is enabled)
2238 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2239 * about to free it and the attached cluster into their caches.
2240 */
2241 if (purged) {
2242 /* Restore constructed mbuf fields */
2243 if (mclaudit != NULL)
2244 mcl_audit_restore_mbuf(m, mca, TRUE);
2245
2246 MEXT_REF(m) = 0;
2247 MEXT_FLAGS(m) = 0;
2248
2249 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2250 rfa->obj_next = ref_list;
2251 ref_list = rfa;
2252 MEXT_RFA(m) = NULL;
2253
2254 m->m_type = MT_FREE;
2255 m->m_flags = m->m_len = 0;
2256 m->m_next = m->m_nextpkt = NULL;
2257
2258 /* Save mbuf fields and make auditing happy */
2259 if (mclaudit != NULL)
2260 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2261
2262 VERIFY(m_total(class) > 0);
2263 m_total(class)--;
2264
2265 /* Free the mbuf */
2266 o->obj_next = NULL;
2267 slab_free(MC_MBUF, o);
2268
2269 /* And free the cluster */
2270 ((mcache_obj_t *)cl)->obj_next = NULL;
2271 if (class == MC_MBUF_CL)
2272 slab_free(MC_CL, cl);
2273 else if (class == MC_MBUF_BIGCL)
2274 slab_free(MC_BIGCL, cl);
2275 else
2276 slab_free(MC_16KCL, cl);
2277 }
2278
2279 ++num;
2280 tail = o;
2281 o = nexto;
2282 }
2283
2284 if (!purged) {
2285 tail->obj_next = m_cobjlist(class);
2286 m_cobjlist(class) = list;
2287 m_infree(class) += num;
2288 } else if (ref_list != NULL) {
2289 mcache_free_ext(ref_cache, ref_list);
2290 }
2291
2292 return (num);
2293 }
2294
2295 /*
2296 * Common allocator for composite objects called by the CPU cache layer
2297 * during an allocation request whenever there is no available element in
2298 * the bucket layer. It returns one or more composite elements from the
2299 * appropriate global freelist. If the freelist is empty, it will attempt
2300 * to obtain the rudimentary objects from their caches and construct them
2301 * into composite mbuf + cluster objects.
2302 */
2303 static unsigned int
2304 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2305 int wait)
2306 {
2307 mbuf_class_t class = (mbuf_class_t)arg;
2308 mbuf_class_t cl_class = 0;
2309 unsigned int num = 0, cnum = 0, want = needed;
2310 mcache_obj_t *ref_list = NULL;
2311 mcache_obj_t *mp_list = NULL;
2312 mcache_obj_t *clp_list = NULL;
2313 mcache_obj_t **list;
2314 struct ext_ref *rfa;
2315 struct mbuf *m;
2316 void *cl;
2317
2318 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2319 ASSERT(needed > 0);
2320
2321 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2322
2323 /* There should not be any slab for this class */
2324 VERIFY(m_slab_cnt(class) == 0 &&
2325 m_slablist(class).tqh_first == NULL &&
2326 m_slablist(class).tqh_last == NULL);
2327
2328 lck_mtx_lock(mbuf_mlock);
2329
2330 /* Try using the freelist first */
2331 num = cslab_alloc(class, plist, needed);
2332 list = *plist;
2333 if (num == needed) {
2334 m_alloc_cnt(class) += num;
2335 lck_mtx_unlock(mbuf_mlock);
2336 return (needed);
2337 }
2338
2339 lck_mtx_unlock(mbuf_mlock);
2340
2341 /*
2342 * We could not satisfy the request using the freelist alone;
2343 * allocate from the appropriate rudimentary caches and use
2344 * whatever we can get to construct the composite objects.
2345 */
2346 needed -= num;
2347
2348 /*
2349 * Mark these allocation requests as coming from a composite cache.
2350 * Also, if the caller is willing to be blocked, mark the request
2351 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2352 * slab layer waiting for the individual object when one or more
2353 * of the already-constructed composite objects are available.
2354 */
2355 wait |= MCR_COMP;
2356 if (!(wait & MCR_NOSLEEP))
2357 wait |= MCR_FAILOK;
2358
2359 /* allocate mbufs */
2360 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2361 if (needed == 0) {
2362 ASSERT(mp_list == NULL);
2363 goto fail;
2364 }
2365
2366 /* allocate clusters */
2367 if (class == MC_MBUF_CL) {
2368 cl_class = MC_CL;
2369 } else if (class == MC_MBUF_BIGCL) {
2370 cl_class = MC_BIGCL;
2371 } else {
2372 VERIFY(class == MC_MBUF_16KCL);
2373 cl_class = MC_16KCL;
2374 }
2375 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2376 if (needed == 0) {
2377 ASSERT(clp_list == NULL);
2378 goto fail;
2379 }
2380
2381 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2382 if (needed == 0) {
2383 ASSERT(ref_list == NULL);
2384 goto fail;
2385 }
2386
2387 /*
2388 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2389 * leftovers will get freed before we return to the caller.
2390 */
2391 for (cnum = 0; cnum < needed; cnum++) {
2392 struct mbuf *ms;
2393
2394 m = ms = (struct mbuf *)mp_list;
2395 mp_list = mp_list->obj_next;
2396
2397 cl = clp_list;
2398 clp_list = clp_list->obj_next;
2399 ((mcache_obj_t *)cl)->obj_next = NULL;
2400
2401 rfa = (struct ext_ref *)ref_list;
2402 ref_list = ref_list->obj_next;
2403 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2404
2405 /*
2406 * If auditing is enabled, construct the shadow mbuf
2407 * in the audit structure instead of in the actual one.
2408 * mbuf_cslab_audit() will take care of restoring the
2409 * contents after the integrity check.
2410 */
2411 if (mclaudit != NULL) {
2412 mcache_audit_t *mca, *cl_mca;
2413
2414 lck_mtx_lock(mbuf_mlock);
2415 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2416 ms = ((struct mbuf *)mca->mca_contents);
2417 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2418
2419 /*
2420 * Pair them up. Note that this is done at the time
2421 * the mbuf+cluster objects are constructed. This
2422 * information should be treated as a "best effort"
2423 * debugging hint, since more than one mbuf can refer
2424 * to a cluster. In that case, the cluster might not
2425 * be freed along with the mbuf it was paired with.
2426 */
2427 mca->mca_uptr = cl_mca;
2428 cl_mca->mca_uptr = mca;
2429
2430 ASSERT(mca->mca_uflags & MB_SCVALID);
2431 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2432 lck_mtx_unlock(mbuf_mlock);
2433
2434 /* Technically, they are in the freelist */
2435 if (mclverify) {
2436 size_t size;
2437
2438 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2439 m_maxsize(MC_MBUF));
2440
2441 if (class == MC_MBUF_CL)
2442 size = m_maxsize(MC_CL);
2443 else if (class == MC_MBUF_BIGCL)
2444 size = m_maxsize(MC_BIGCL);
2445 else
2446 size = m_maxsize(MC_16KCL);
2447
2448 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2449 size);
2450 }
2451 }
2452
2453 MBUF_INIT(ms, 0, MT_FREE);
2454 if (class == MC_MBUF_16KCL) {
2455 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2456 } else if (class == MC_MBUF_BIGCL) {
2457 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2458 } else {
2459 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2460 }
2461 VERIFY(ms->m_flags == M_EXT);
2462 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2463
2464 *list = (mcache_obj_t *)m;
2465 (*list)->obj_next = NULL;
2466 list = *plist = &(*list)->obj_next;
2467 }
2468
2469 fail:
2470 /*
2471 * Free up what's left of the above.
2472 */
2473 if (mp_list != NULL)
2474 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2475 if (clp_list != NULL)
2476 mcache_free_ext(m_cache(cl_class), clp_list);
2477 if (ref_list != NULL)
2478 mcache_free_ext(ref_cache, ref_list);
2479
2480 lck_mtx_lock(mbuf_mlock);
2481 if (num > 0 || cnum > 0) {
2482 m_total(class) += cnum;
2483 VERIFY(m_total(class) <= m_maxlimit(class));
2484 m_alloc_cnt(class) += num + cnum;
2485 }
2486 if ((num + cnum) < want)
2487 m_fail_cnt(class) += (want - (num + cnum));
2488 lck_mtx_unlock(mbuf_mlock);
2489
2490 return (num + cnum);
2491 }
2492
2493 /*
2494 * Common de-allocator for composite objects called by the CPU cache
2495 * layer when one or more elements need to be returned to the appropriate
2496 * global freelist.
2497 */
2498 static void
2499 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2500 {
2501 mbuf_class_t class = (mbuf_class_t)arg;
2502 unsigned int num;
2503 int w;
2504
2505 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2506
2507 lck_mtx_lock(mbuf_mlock);
2508
2509 num = cslab_free(class, list, purged);
2510 m_free_cnt(class) += num;
2511
2512 if ((w = mb_waiters) > 0)
2513 mb_waiters = 0;
2514
2515 lck_mtx_unlock(mbuf_mlock);
2516
2517 if (w != 0)
2518 wakeup(mb_waitchan);
2519 }
2520
2521 /*
2522 * Common auditor for composite objects called by the CPU cache layer
2523 * during an allocation or free request. For the former, this is called
2524 * after the objects are obtained from either the bucket or slab layer
2525 * and before they are returned to the caller. For the latter, this is
2526 * called immediately during free and before placing the objects into
2527 * the bucket or slab layer.
2528 */
2529 static void
2530 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2531 {
2532 mbuf_class_t class = (mbuf_class_t)arg;
2533 mcache_audit_t *mca;
2534 struct mbuf *m, *ms;
2535 mcl_slab_t *clsp, *nsp;
2536 size_t size;
2537 void *cl;
2538
2539 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2540
2541 while ((m = ms = (struct mbuf *)list) != NULL) {
2542 lck_mtx_lock(mbuf_mlock);
2543 /* Do the mbuf sanity checks and record its transaction */
2544 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2545 mcl_audit_mbuf(mca, m, TRUE, alloc);
2546 if (mcltrace)
2547 mcache_buffer_log(mca, m, m_cache(class));
2548
2549 if (alloc)
2550 mca->mca_uflags |= MB_COMP_INUSE;
2551 else
2552 mca->mca_uflags &= ~MB_COMP_INUSE;
2553
2554 /*
2555 * Use the shadow mbuf in the audit structure if we are
2556 * freeing, since the contents of the actual mbuf have been
2557 * pattern-filled by the above call to mcl_audit_mbuf().
2558 */
2559 if (!alloc && mclverify)
2560 ms = (struct mbuf *)mca->mca_contents;
2561
2562 /* Do the cluster sanity checks and record its transaction */
2563 cl = ms->m_ext.ext_buf;
2564 clsp = slab_get(cl);
2565 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2566 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2567 if (class == MC_MBUF_CL)
2568 VERIFY(clsp->sl_refcnt >= 1 &&
2569 clsp->sl_refcnt <= NCLPBG);
2570 else
2571 VERIFY(clsp->sl_refcnt == 1);
2572
2573 if (class == MC_MBUF_16KCL) {
2574 int k;
2575 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2576 nsp = nsp->sl_next;
2577 /* Next slab must already be present */
2578 VERIFY(nsp != NULL);
2579 VERIFY(nsp->sl_refcnt == 1);
2580 }
2581 }
2582
2583 mca = mcl_audit_buf2mca(MC_CL, cl);
2584 if (class == MC_MBUF_CL)
2585 size = m_maxsize(MC_CL);
2586 else if (class == MC_MBUF_BIGCL)
2587 size = m_maxsize(MC_BIGCL);
2588 else
2589 size = m_maxsize(MC_16KCL);
2590 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2591 if (mcltrace)
2592 mcache_buffer_log(mca, cl, m_cache(class));
2593
2594 if (alloc)
2595 mca->mca_uflags |= MB_COMP_INUSE;
2596 else
2597 mca->mca_uflags &= ~MB_COMP_INUSE;
2598 lck_mtx_unlock(mbuf_mlock);
2599
2600 list = list->obj_next;
2601 }
2602 }
2603
2604 /*
2605 * Allocate some number of mbuf clusters and place on cluster freelist.
2606 */
2607 static int
2608 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2609 {
2610 int i;
2611 vm_size_t size = 0;
2612 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2613 vm_offset_t page = 0;
2614 mcache_audit_t *mca_list = NULL;
2615 mcache_obj_t *con_list = NULL;
2616 mcl_slab_t *sp;
2617
2618 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2619 bufsize == m_maxsize(MC_16KCL));
2620
2621 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2622
2623 /*
2624 * Multiple threads may attempt to populate the cluster map one
2625 * after another. Since we drop the lock below prior to acquiring
2626 * the physical page(s), our view of the cluster map may no longer
2627 * be accurate, and we could end up over-committing the pages beyond
2628 * the maximum allowed for each class. To prevent that, this entire
2629 * operation (including the page mapping) is serialized.
2630 */
2631 while (mb_clalloc_busy) {
2632 mb_clalloc_waiters++;
2633 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2634 (PZERO-1), "m_clalloc", NULL);
2635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2636 }
2637
2638 /* We are busy now; tell everyone else to go away */
2639 mb_clalloc_busy = TRUE;
2640
2641 /*
2642 * Honor the caller's wish to block or not block. We have a way
2643 * to grow the pool asynchronously using the mbuf worker thread.
2644 */
2645 i = m_howmany(num, bufsize);
2646 if (i == 0 || (wait & M_DONTWAIT))
2647 goto out;
2648
2649 lck_mtx_unlock(mbuf_mlock);
2650
2651 size = round_page(i * bufsize);
2652 page = kmem_mb_alloc(mb_map, size, large_buffer);
2653
2654 /*
2655 * If we did ask for "n" 16KB physically contiguous chunks
2656 * and didn't get them, then try again without this
2657 * restriction.
2658 */
2659 if (large_buffer && page == 0)
2660 page = kmem_mb_alloc(mb_map, size, 0);
2661
2662 if (page == 0) {
2663 if (bufsize == m_maxsize(MC_BIGCL)) {
2664 /* The allocation failed; fall back to a single 4KB page */
2665 size = NBPG;
2666 page = kmem_mb_alloc(mb_map, size, 0);
2667 }
2668
2669 if (page == 0) {
2670 lck_mtx_lock(mbuf_mlock);
2671 goto out;
2672 }
2673 }
2674
2675 VERIFY(IS_P2ALIGNED(page, NBPG));
2676 numpages = size / NBPG;
2677
2678 /* If auditing is enabled, allocate the audit structures now */
2679 if (mclaudit != NULL) {
2680 int needed;
2681
2682 /*
2683 * Yes, I realize this is a waste of memory for clusters
2684 * that never get transformed into mbufs, as we may end
2685 * up with NMBPBG-1 unused audit structures per cluster.
2686 * But doing so tremendously simplifies the allocation
2687 * strategy, since at this point we are not holding the
2688 * mbuf lock and the caller is okay to be blocked.
2689 */
2690 if (bufsize == m_maxsize(MC_BIGCL)) {
2691 needed = numpages * NMBPBG;
2692
2693 i = mcache_alloc_ext(mcl_audit_con_cache,
2694 &con_list, needed, MCR_SLEEP);
2695
2696 VERIFY(con_list != NULL && i == needed);
2697 } else {
2698 needed = numpages / NSLABSP16KB;
2699 }
2700
2701 i = mcache_alloc_ext(mcache_audit_cache,
2702 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2703
2704 VERIFY(mca_list != NULL && i == needed);
2705 }
2706
2707 lck_mtx_lock(mbuf_mlock);
2708
2709 for (i = 0; i < numpages; i++, page += NBPG) {
2710 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2711 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2712 (vm_offset_t)page);
2713
2714 /*
2715 * If no mapper is available, the following call is a no-op
2716 * and returns the input page; if there is a mapper, the
2717 * appropriate I/O page is returned.
2718 */
2719 VERIFY(offset < mcl_pages);
2720 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2721 mcl_paddr[offset] = new_page << PGSHIFT;
2722
2723 /* Pattern-fill this fresh page */
2724 if (mclverify) {
2725 mcache_set_pattern(MCACHE_FREE_PATTERN,
2726 (caddr_t)page, NBPG);
2727 }
2728 if (bufsize == m_maxsize(MC_BIGCL)) {
2729 union mbigcluster *mbc = (union mbigcluster *)page;
2730
2731 /* One for the entire page */
2732 sp = slab_get(mbc);
2733 if (mclaudit != NULL) {
2734 mcl_audit_init(mbc, &mca_list, &con_list,
2735 AUDIT_CONTENTS_SIZE, NMBPBG);
2736 }
2737 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2738 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2739 mbc, mbc, bufsize, 0, 1);
2740
2741 /* Insert this slab */
2742 slab_insert(sp, MC_BIGCL);
2743
2744 /* Update stats now since slab_get() drops the lock */
2745 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2746 m_infree(MC_MBUF_BIGCL);
2747 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2748 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2749 } else if ((i % NSLABSP16KB) == 0) {
2750 union m16kcluster *m16kcl = (union m16kcluster *)page;
2751 mcl_slab_t *nsp;
2752 int k;
2753
2754 VERIFY(njcl > 0);
2755 /* One for the entire 16KB */
2756 sp = slab_get(m16kcl);
2757 if (mclaudit != NULL)
2758 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2759
2760 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2761 slab_init(sp, MC_16KCL, SLF_MAPPED,
2762 m16kcl, m16kcl, bufsize, 0, 1);
2763
2764 /*
2765 * 2nd-Nth page's slab is part of the first one,
2766 * where N is NSLABSP16KB.
2767 */
2768 for (k = 1; k < NSLABSP16KB; k++) {
2769 nsp = slab_get(((union mbigcluster *)page) + k);
2770 VERIFY(nsp->sl_refcnt == 0 &&
2771 nsp->sl_flags == 0);
2772 slab_init(nsp, MC_16KCL,
2773 SLF_MAPPED | SLF_PARTIAL,
2774 m16kcl, NULL, 0, 0, 0);
2775 }
2776
2777 /* Insert this slab */
2778 slab_insert(sp, MC_16KCL);
2779
2780 /* Update stats now since slab_get() drops the lock */
2781 m_infree(MC_16KCL)++;
2782 m_total(MC_16KCL)++;
2783 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2784 }
2785 }
2786 VERIFY(mca_list == NULL && con_list == NULL);
2787
2788 /* We're done; let others enter */
2789 mb_clalloc_busy = FALSE;
2790 if (mb_clalloc_waiters > 0) {
2791 mb_clalloc_waiters = 0;
2792 wakeup(mb_clalloc_waitchan);
2793 }
2794
2795 if (bufsize == m_maxsize(MC_BIGCL))
2796 return (numpages);
2797
2798 VERIFY(bufsize == m_maxsize(MC_16KCL));
2799 return (numpages / NSLABSP16KB);
2800
2801 out:
2802 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2803
2804 /* We're done; let others enter */
2805 mb_clalloc_busy = FALSE;
2806 if (mb_clalloc_waiters > 0) {
2807 mb_clalloc_waiters = 0;
2808 wakeup(mb_clalloc_waitchan);
2809 }
2810
2811 /*
2812 * When non-blocking, we kick the worker thread if we have to grow the
2813 * pool or if the number of free clusters is less than requested.
2814 */
2815 if (bufsize == m_maxsize(MC_BIGCL)) {
2816 if (i > 0) {
2817 /*
2818 * Remember total number of 4KB clusters needed
2819 * at this time.
2820 */
2821 i += m_total(MC_BIGCL);
2822 if (i > mbuf_expand_big) {
2823 mbuf_expand_big = i;
2824 if (mbuf_worker_ready)
2825 wakeup((caddr_t)&mbuf_worker_run);
2826 }
2827 }
2828
2829 if (m_infree(MC_BIGCL) >= num)
2830 return (1);
2831 } else {
2832 if (i > 0) {
2833 /*
2834 * Remember total number of 16KB clusters needed
2835 * at this time.
2836 */
2837 i += m_total(MC_16KCL);
2838 if (i > mbuf_expand_16k) {
2839 mbuf_expand_16k = i;
2840 if (mbuf_worker_ready)
2841 wakeup((caddr_t)&mbuf_worker_run);
2842 }
2843 }
2844
2845 if (m_infree(MC_16KCL) >= num)
2846 return (1);
2847 }
2848 return (0);
2849 }
2850
2851 /*
2852 * Populate the global freelist of the corresponding buffer class.
2853 */
2854 static int
2855 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2856 {
2857 mcache_obj_t *o = NULL;
2858 int i, numpages = 0, count;
2859
2860 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2861 class == MC_16KCL);
2862
2863 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2864
2865 switch (class) {
2866 case MC_MBUF:
2867 case MC_CL:
2868 case MC_BIGCL:
2869 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2870 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2871
2872 /* Respect the 4KB cluster minimum limit */
2873 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2874 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2875 if (class != MC_BIGCL || (wait & MCR_COMP))
2876 return (0);
2877 }
2878 if (class == MC_BIGCL)
2879 return (i != 0);
2880 break;
2881
2882 case MC_16KCL:
2883 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2884 /* NOTREACHED */
2885
2886 default:
2887 VERIFY(0);
2888 /* NOTREACHED */
2889 }
2890
2891 VERIFY(class == MC_MBUF || class == MC_CL);
2892
2893 /* how many objects will we cut the page into? */
2894 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
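
/*
 * Editorial note (the figures assume the usual 4KB big cluster with a
 * 256-byte MSIZE and 2KB MCLBYTES; check the local definitions):
 *
 *	NMBPBG = 4096 / 256  = 16 mbufs per 4KB slab
 *	NCLPBG = 4096 / 2048 = 2  clusters per 4KB slab
 *
 * so each 4KB cluster taken off the MC_BIGCL freelist below is carved
 * into either 16 mbufs or 2 clusters.
 */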
2895
2896 for (count = 0; count < numpages; count++) {
2897
2898 /* respect totals, minlimit, maxlimit */
2899 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2900 m_total(class) >= m_maxlimit(class))
2901 break;
2902
2903 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2904 break;
2905
2906 struct mbuf *m = (struct mbuf *)o;
2907 union mcluster *c = (union mcluster *)o;
2908 mcl_slab_t *sp = slab_get(o);
2909 mcache_audit_t *mca = NULL;
2910
2911 VERIFY(slab_is_detached(sp) &&
2912 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2913
2914 /*
2915 * Make sure that the cluster is unmolested
2916 * while in freelist
2917 */
2918 if (mclverify) {
2919 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2920 mcache_audit_free_verify(mca, o, 0,
2921 m_maxsize(MC_BIGCL));
2922 }
2923
2924 /* Reinitialize it as an mbuf or 2K slab */
2925 slab_init(sp, class, sp->sl_flags,
2926 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2927
2928 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2929 VERIFY(sp->sl_head == NULL);
2930
2931 VERIFY(m_total(MC_BIGCL) > 0);
2932 m_total(MC_BIGCL)--;
2933 mbstat.m_bigclusters = m_total(MC_BIGCL);
2934
2935 m_total(class) += numobj;
2936 m_infree(class) += numobj;
2937
2938 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2939 VERIFY(m_total(class) <= m_maxlimit(class));
2940
2941 i = numobj;
2942 if (class == MC_MBUF) {
2943 mbstat.m_mbufs = m_total(MC_MBUF);
2944 mtype_stat_add(MT_FREE, NMBPBG);
2945 while (i--) {
2946 /*
2947 * If auditing is enabled, construct the
2948 * shadow mbuf in the audit structure
2949 * instead of the actual one.
2950 * mbuf_slab_audit() will take care of
2951 * restoring the contents after the
2952 * integrity check.
2953 */
2954 if (mclaudit != NULL) {
2955 struct mbuf *ms;
2956 mca = mcl_audit_buf2mca(MC_MBUF,
2957 (mcache_obj_t *)m);
2958 ms = ((struct mbuf *)
2959 mca->mca_contents);
2960 ms->m_type = MT_FREE;
2961 } else {
2962 m->m_type = MT_FREE;
2963 }
2964 m->m_next = sp->sl_head;
2965 sp->sl_head = (void *)m++;
2966 }
2967 } else { /* MC_CL */
2968 mbstat.m_clfree =
2969 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2970 mbstat.m_clusters = m_total(MC_CL);
2971 while (i--) {
2972 c->mcl_next = sp->sl_head;
2973 sp->sl_head = (void *)c++;
2974 }
2975 }
2976
2977 /* Insert into the mbuf or 2k slab list */
2978 slab_insert(sp, class);
2979
2980 if ((i = mb_waiters) > 0)
2981 mb_waiters = 0;
2982 if (i != 0)
2983 wakeup(mb_waitchan);
2984 }
2985 return (count != 0);
2986 }
2987
2988 /*
2989 * Initialize the given class's freelist to hold m_minlimit() objects.
2990 */
2991 static void
2992 freelist_init(mbuf_class_t class)
2993 {
2994 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2995
2996 VERIFY(class == MC_CL || class == MC_BIGCL);
2997 VERIFY(m_total(class) == 0);
2998 VERIFY(m_minlimit(class) > 0);
2999
3000 while (m_total(class) < m_minlimit(class))
3001 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3002
3003 VERIFY(m_total(class) >= m_minlimit(class));
3004 }
3005
3006 /*
3007 * (Inaccurately) check if it might be worth a trip back to the
3008 * mcache layer due to the availability of objects there. We'll
3009 * end up back here if there's nothing up there.
3010 */
3011 static boolean_t
3012 mbuf_cached_above(mbuf_class_t class, int wait)
3013 {
3014 switch (class) {
3015 case MC_MBUF:
3016 if (wait & MCR_COMP)
3017 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3018 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3019 break;
3020
3021 case MC_CL:
3022 if (wait & MCR_COMP)
3023 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3024 break;
3025
3026 case MC_BIGCL:
3027 if (wait & MCR_COMP)
3028 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3029 break;
3030
3031 case MC_16KCL:
3032 if (wait & MCR_COMP)
3033 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3034 break;
3035
3036 case MC_MBUF_CL:
3037 case MC_MBUF_BIGCL:
3038 case MC_MBUF_16KCL:
3039 break;
3040
3041 default:
3042 VERIFY(0);
3043 /* NOTREACHED */
3044 }
3045
3046 return (!mcache_bkt_isempty(m_cache(class)));
3047 }
3048
3049 /*
3050 * If possible, convert constructed objects to raw ones.
3051 */
3052 static boolean_t
3053 mbuf_steal(mbuf_class_t class, unsigned int num)
3054 {
3055 mcache_obj_t *top = NULL;
3056 mcache_obj_t **list = &top;
3057 unsigned int tot = 0;
3058
3059 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3060
3061 switch (class) {
3062 case MC_MBUF:
3063 case MC_CL:
3064 case MC_BIGCL:
3065 case MC_16KCL:
3066 return (FALSE);
3067
3068 case MC_MBUF_CL:
3069 case MC_MBUF_BIGCL:
3070 case MC_MBUF_16KCL:
3071 /* Get the required number of constructed objects if possible */
3072 if (m_infree(class) > m_minlimit(class)) {
3073 tot = cslab_alloc(class, &list,
3074 MIN(num, m_infree(class)));
3075 }
3076
3077 /* And destroy them to get back the raw objects */
3078 if (top != NULL)
3079 (void) cslab_free(class, top, 1);
3080 break;
3081
3082 default:
3083 VERIFY(0);
3084 /* NOTREACHED */
3085 }
3086
3087 return (tot == num);
3088 }
3089
3090 static void
3091 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3092 {
3093 int m, bmap = 0;
3094
3095 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3096
3097 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3098 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3099 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3100
3101 /*
3102 * This logic can be made smarter; for now, simply mark
3103 * all other related classes as potential victims.
3104 */
3105 switch (class) {
3106 case MC_MBUF:
3107 m_wantpurge(MC_CL)++;
3108 m_wantpurge(MC_BIGCL)++;
3109 m_wantpurge(MC_MBUF_CL)++;
3110 m_wantpurge(MC_MBUF_BIGCL)++;
3111 break;
3112
3113 case MC_CL:
3114 m_wantpurge(MC_MBUF)++;
3115 m_wantpurge(MC_BIGCL)++;
3116 m_wantpurge(MC_MBUF_BIGCL)++;
3117 if (!comp)
3118 m_wantpurge(MC_MBUF_CL)++;
3119 break;
3120
3121 case MC_BIGCL:
3122 m_wantpurge(MC_MBUF)++;
3123 m_wantpurge(MC_CL)++;
3124 m_wantpurge(MC_MBUF_CL)++;
3125 if (!comp)
3126 m_wantpurge(MC_MBUF_BIGCL)++;
3127 break;
3128
3129 case MC_16KCL:
3130 if (!comp)
3131 m_wantpurge(MC_MBUF_16KCL)++;
3132 break;
3133
3134 default:
3135 VERIFY(0);
3136 /* NOTREACHED */
3137 }
3138
3139 /*
3140 * Run through each marked class and check if we really need to
3141 * purge (and therefore temporarily disable) the per-CPU cache
3142 * layer used by the class. If so, remember the classes, since
3143 * we are going to drop the lock below prior to purging.
3144 */
3145 for (m = 0; m < NELEM(mbuf_table); m++) {
3146 if (m_wantpurge(m) > 0) {
3147 m_wantpurge(m) = 0;
3148 /*
3149 * Try hard to steal the required number of objects
3150 * from the freelists of other mbuf classes. Only
3151 * purge and disable the per-CPU cache layer when
3152 * we don't have enough; it's the last resort.
3153 */
3154 if (!mbuf_steal(m, num))
3155 bmap |= (1 << m);
3156 }
3157 }
3158
3159 lck_mtx_unlock(mbuf_mlock);
3160
3161 if (bmap != 0) {
3162 /* drain is performed in pfslowtimo(), to avoid deadlocks */
3163 do_reclaim = 1;
3164
3165 /* Sigh; we have no other choices but to ask mcache to purge */
3166 for (m = 0; m < NELEM(mbuf_table); m++) {
3167 if ((bmap & (1 << m)) &&
3168 mcache_purge_cache(m_cache(m))) {
3169 lck_mtx_lock(mbuf_mlock);
3170 m_purge_cnt(m)++;
3171 mbstat.m_drain++;
3172 lck_mtx_unlock(mbuf_mlock);
3173 }
3174 }
3175 } else {
3176 /*
3177 * Request mcache to reap extra elements from all of its caches;
3178 * note that all reaps are serialized and happen only at a fixed
3179 * interval.
3180 */
3181 mcache_reap();
3182 }
3183 lck_mtx_lock(mbuf_mlock);
3184 }
3185
3186 static inline struct mbuf *
3187 m_get_common(int wait, short type, int hdr)
3188 {
3189 struct mbuf *m;
3190 int mcflags = MSLEEPF(wait);
3191
3192 /* Is this due to a non-blocking retry? If so, then try harder */
3193 if (mcflags & MCR_NOSLEEP)
3194 mcflags |= MCR_TRYHARD;
3195
3196 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3197 if (m != NULL) {
3198 MBUF_INIT(m, hdr, type);
3199 mtype_stat_inc(type);
3200 mtype_stat_dec(MT_FREE);
3201 #if CONFIG_MACF_NET
3202 if (hdr && mac_init_mbuf(m, wait) != 0) {
3203 m_free(m);
3204 return (NULL);
3205 }
3206 #endif /* MAC_NET */
3207 }
3208 return (m);
3209 }
3210
3211 /*
3212 * Space allocation routines; these are also available as macros
3213 * for critical paths.
3214 */
3215 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3216 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3217 #define _M_RETRY(wait, type) _M_GET(wait, type)
3218 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3219 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3220 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
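
/*
 * Editorial usage sketch (illustrative only, not part of this file):
 *
 *	struct mbuf *m = m_gethdr(M_WAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...fill in the packet, then hand it off or m_freem(m)...
 *
 * M_WAIT callers may block in mbuf_sleep() until the worker thread
 * replenishes the pool; M_DONTWAIT callers simply get NULL on shortage.
 */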
3221
3222 struct mbuf *
3223 m_get(int wait, int type)
3224 {
3225 return (_M_GET(wait, type));
3226 }
3227
3228 struct mbuf *
3229 m_gethdr(int wait, int type)
3230 {
3231 return (_M_GETHDR(wait, type));
3232 }
3233
3234 struct mbuf *
3235 m_retry(int wait, int type)
3236 {
3237 return (_M_RETRY(wait, type));
3238 }
3239
3240 struct mbuf *
3241 m_retryhdr(int wait, int type)
3242 {
3243 return (_M_RETRYHDR(wait, type));
3244 }
3245
3246 struct mbuf *
3247 m_getclr(int wait, int type)
3248 {
3249 struct mbuf *m;
3250
3251 _MGET(m, wait, type);
3252 if (m != NULL)
3253 bzero(MTOD(m, caddr_t), MLEN);
3254 return (m);
3255 }
3256
3257 struct mbuf *
3258 m_free(struct mbuf *m)
3259 {
3260 struct mbuf *n = m->m_next;
3261
3262 if (m->m_type == MT_FREE)
3263 panic("m_free: freeing an already freed mbuf");
3264
3265 /* Free the aux data and tags if there are any */
3266 if (m->m_flags & M_PKTHDR) {
3267 m_tag_delete_chain(m, NULL);
3268 }
3269
3270 if (m->m_flags & M_EXT) {
3271 u_int32_t refcnt;
3272 u_int32_t composite;
3273
3274 refcnt = m_decref(m);
3275 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3276 if (refcnt == 0 && !composite) {
3277 if (m->m_ext.ext_free == NULL) {
3278 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3279 } else if (m->m_ext.ext_free == m_bigfree) {
3280 mcache_free(m_cache(MC_BIGCL),
3281 m->m_ext.ext_buf);
3282 } else if (m->m_ext.ext_free == m_16kfree) {
3283 mcache_free(m_cache(MC_16KCL),
3284 m->m_ext.ext_buf);
3285 } else {
3286 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3287 m->m_ext.ext_size, m->m_ext.ext_arg);
3288 }
3289 mcache_free(ref_cache, MEXT_RFA(m));
3290 MEXT_RFA(m) = NULL;
3291 } else if (refcnt == 0 && composite) {
3292 VERIFY(m->m_type != MT_FREE);
3293
3294 mtype_stat_dec(m->m_type);
3295 mtype_stat_inc(MT_FREE);
3296
3297 m->m_type = MT_FREE;
3298 m->m_flags = M_EXT;
3299 m->m_len = 0;
3300 m->m_next = m->m_nextpkt = NULL;
3301
3302 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3303
3304 /* "Free" into the intermediate cache */
3305 if (m->m_ext.ext_free == NULL) {
3306 mcache_free(m_cache(MC_MBUF_CL), m);
3307 } else if (m->m_ext.ext_free == m_bigfree) {
3308 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3309 } else {
3310 VERIFY(m->m_ext.ext_free == m_16kfree);
3311 mcache_free(m_cache(MC_MBUF_16KCL), m);
3312 }
3313 return (n);
3314 }
3315 }
3316
3317 if (m->m_type != MT_FREE) {
3318 mtype_stat_dec(m->m_type);
3319 mtype_stat_inc(MT_FREE);
3320 }
3321
3322 m->m_type = MT_FREE;
3323 m->m_flags = m->m_len = 0;
3324 m->m_next = m->m_nextpkt = NULL;
3325
3326 mcache_free(m_cache(MC_MBUF), m);
3327
3328 return (n);
3329 }
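
/*
 * Editorial usage sketch: m_free() releases a single mbuf and returns
 * its successor, so an entire chain can be drained with
 *
 *	while (m != NULL)
 *		m = m_free(m);
 *
 * which is essentially what m_freem() does.
 */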
3330
3331 __private_extern__ struct mbuf *
3332 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3333 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3334 int wait)
3335 {
3336 struct ext_ref *rfa = NULL;
3337
3338 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3339 return (NULL);
3340
3341 if (m->m_flags & M_EXT) {
3342 u_int32_t refcnt;
3343 u_int32_t composite;
3344
3345 refcnt = m_decref(m);
3346 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3347 if (refcnt == 0 && !composite) {
3348 if (m->m_ext.ext_free == NULL) {
3349 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3350 } else if (m->m_ext.ext_free == m_bigfree) {
3351 mcache_free(m_cache(MC_BIGCL),
3352 m->m_ext.ext_buf);
3353 } else if (m->m_ext.ext_free == m_16kfree) {
3354 mcache_free(m_cache(MC_16KCL),
3355 m->m_ext.ext_buf);
3356 } else {
3357 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3358 m->m_ext.ext_size, m->m_ext.ext_arg);
3359 }
3360 /* Re-use the reference structure */
3361 rfa = MEXT_RFA(m);
3362 } else if (refcnt == 0 && composite) {
3363 VERIFY(m->m_type != MT_FREE);
3364
3365 mtype_stat_dec(m->m_type);
3366 mtype_stat_inc(MT_FREE);
3367
3368 m->m_type = MT_FREE;
3369 m->m_flags = M_EXT;
3370 m->m_len = 0;
3371 m->m_next = m->m_nextpkt = NULL;
3372
3373 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3374
3375 /* "Free" into the intermediate cache */
3376 if (m->m_ext.ext_free == NULL) {
3377 mcache_free(m_cache(MC_MBUF_CL), m);
3378 } else if (m->m_ext.ext_free == m_bigfree) {
3379 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3380 } else {
3381 VERIFY(m->m_ext.ext_free == m_16kfree);
3382 mcache_free(m_cache(MC_MBUF_16KCL), m);
3383 }
3384 /*
3385 * Allocate a new mbuf, since we didn't divorce
3386 * the composite mbuf + cluster pair above.
3387 */
3388 if ((m = _M_GETHDR(wait, type)) == NULL)
3389 return (NULL);
3390 }
3391 }
3392
3393 if (rfa == NULL &&
3394 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3395 m_free(m);
3396 return (NULL);
3397 }
3398
3399 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3400
3401 return (m);
3402 }
3403
3404 /*
3405 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3406 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3407 */
3408 struct mbuf *
3409 m_getcl(int wait, int type, int flags)
3410 {
3411 struct mbuf *m;
3412 int mcflags = MSLEEPF(wait);
3413 int hdr = (flags & M_PKTHDR);
3414
3415 /* Is this due to a non-blocking retry? If so, then try harder */
3416 if (mcflags & MCR_NOSLEEP)
3417 mcflags |= MCR_TRYHARD;
3418
3419 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3420 if (m != NULL) {
3421 u_int32_t flag;
3422 struct ext_ref *rfa;
3423 void *cl;
3424
3425 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3426 cl = m->m_ext.ext_buf;
3427 rfa = MEXT_RFA(m);
3428
3429 ASSERT(cl != NULL && rfa != NULL);
3430 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3431
3432 flag = MEXT_FLAGS(m);
3433
3434 MBUF_INIT(m, hdr, type);
3435 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3436
3437 mtype_stat_inc(type);
3438 mtype_stat_dec(MT_FREE);
3439 #if CONFIG_MACF_NET
3440 if (hdr && mac_init_mbuf(m, wait) != 0) {
3441 m_freem(m);
3442 return (NULL);
3443 }
3444 #endif /* MAC_NET */
3445 }
3446 return (m);
3447 }
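
/*
 * Editorial usage sketch (illustrative only):
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *
 * Because the mbuf and its 2KB cluster come out of the MC_MBUF_CL
 * composite cache as a single object, this is cheaper on hot paths
 * than allocating the mbuf and the cluster separately.
 */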
3448
3449 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3450 struct mbuf *
3451 m_mclget(struct mbuf *m, int wait)
3452 {
3453 struct ext_ref *rfa;
3454
3455 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3456 return (m);
3457
3458 m->m_ext.ext_buf = m_mclalloc(wait);
3459 if (m->m_ext.ext_buf != NULL) {
3460 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3461 } else {
3462 mcache_free(ref_cache, rfa);
3463 }
3464 return (m);
3465 }
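
/*
 * Editorial note: m_mclget() returns the mbuf even when no cluster
 * could be attached, so callers check M_EXT to tell success from
 * failure, e.g.
 *
 *	m = m_mclget(m, M_DONTWAIT);
 *	if (!(m->m_flags & M_EXT)) {
 *		m_free(m);
 *		return (ENOBUFS);
 *	}
 */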
3466
3467 /* Allocate an mbuf cluster */
3468 caddr_t
3469 m_mclalloc(int wait)
3470 {
3471 int mcflags = MSLEEPF(wait);
3472
3473 /* Is this due to a non-blocking retry? If so, then try harder */
3474 if (mcflags & MCR_NOSLEEP)
3475 mcflags |= MCR_TRYHARD;
3476
3477 return (mcache_alloc(m_cache(MC_CL), mcflags));
3478 }
3479
3480 /* Free an mbuf cluster */
3481 void
3482 m_mclfree(caddr_t p)
3483 {
3484 mcache_free(m_cache(MC_CL), p);
3485 }
3486
3487 /*
3488 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
3489 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3490 */
3491 int
3492 m_mclhasreference(struct mbuf *m)
3493 {
3494 if (!(m->m_flags & M_EXT))
3495 return (0);
3496
3497 ASSERT(MEXT_RFA(m) != NULL);
3498
3499 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3500 }
3501
3502 __private_extern__ caddr_t
3503 m_bigalloc(int wait)
3504 {
3505 int mcflags = MSLEEPF(wait);
3506
3507 /* Is this due to a non-blocking retry? If so, then try harder */
3508 if (mcflags & MCR_NOSLEEP)
3509 mcflags |= MCR_TRYHARD;
3510
3511 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3512 }
3513
3514 __private_extern__ void
3515 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3516 {
3517 mcache_free(m_cache(MC_BIGCL), p);
3518 }
3519
3520 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3521 __private_extern__ struct mbuf *
3522 m_mbigget(struct mbuf *m, int wait)
3523 {
3524 struct ext_ref *rfa;
3525
3526 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3527 return (m);
3528
3529 m->m_ext.ext_buf = m_bigalloc(wait);
3530 if (m->m_ext.ext_buf != NULL) {
3531 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3532 } else {
3533 mcache_free(ref_cache, rfa);
3534 }
3535 return (m);
3536 }
3537
3538 __private_extern__ caddr_t
3539 m_16kalloc(int wait)
3540 {
3541 int mcflags = MSLEEPF(wait);
3542
3543 /* Is this due to a non-blocking retry? If so, then try harder */
3544 if (mcflags & MCR_NOSLEEP)
3545 mcflags |= MCR_TRYHARD;
3546
3547 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3548 }
3549
3550 __private_extern__ void
3551 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3552 {
3553 mcache_free(m_cache(MC_16KCL), p);
3554 }
3555
3556 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3557 __private_extern__ struct mbuf *
3558 m_m16kget(struct mbuf *m, int wait)
3559 {
3560 struct ext_ref *rfa;
3561
3562 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3563 return (m);
3564
3565 m->m_ext.ext_buf = m_16kalloc(wait);
3566 if (m->m_ext.ext_buf != NULL) {
3567 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3568 } else {
3569 mcache_free(ref_cache, rfa);
3570 }
3571 return (m);
3572 }
3573
3574 /*
3575 * "Move" mbuf pkthdr from "from" to "to".
3576 * "from" must have M_PKTHDR set, and "to" must be empty.
3577 */
3578 void
3579 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3580 {
3581 /* We will be taking over the tags of 'to' */
3582 if (to->m_flags & M_PKTHDR)
3583 m_tag_delete_chain(to, NULL);
3584 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3585 m_tag_init(from); /* purge tags from src */
3586 m_service_class_init(from); /* reset svc class from src */
3587 from->m_pkthdr.aux_flags = 0; /* clear aux flags from src */
3588 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3589 if ((to->m_flags & M_EXT) == 0)
3590 to->m_data = to->m_pktdat;
3591 }
3592
3593 /*
3594 * Duplicate "from"'s mbuf pkthdr in "to".
3595 * "from" must have M_PKTHDR set, and "to" must be empty.
3596 * In particular, this does a deep copy of the packet tags.
3597 */
3598 static int
3599 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3600 {
3601 if (to->m_flags & M_PKTHDR)
3602 m_tag_delete_chain(to, NULL);
3603 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3604 if ((to->m_flags & M_EXT) == 0)
3605 to->m_data = to->m_pktdat;
3606 to->m_pkthdr = from->m_pkthdr;
3607 m_tag_init(to);
3608 return (m_tag_copy_chain(to, from, how));
3609 }
3610
3611 void
3612 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3613 {
3614 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3615 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3616 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3617 }
3618
3619 /*
3620 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3621  * if wantall is not set, return whatever number was available. Set up the
3622 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3623 * are chained on the m_nextpkt field. Any packets requested beyond this
3624 * are chained onto the last packet header's m_next field. The size of
3625 * the cluster is controlled by the parameter bufsize.
3626 */
3627 __private_extern__ struct mbuf *
3628 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3629 int wait, int wantall, size_t bufsize)
3630 {
3631 struct mbuf *m;
3632 struct mbuf **np, *top;
3633 unsigned int pnum, needed = *num_needed;
3634 mcache_obj_t *mp_list = NULL;
3635 int mcflags = MSLEEPF(wait);
3636 u_int32_t flag;
3637 struct ext_ref *rfa;
3638 mcache_t *cp;
3639 void *cl;
3640
3641 ASSERT(bufsize == m_maxsize(MC_CL) ||
3642 bufsize == m_maxsize(MC_BIGCL) ||
3643 bufsize == m_maxsize(MC_16KCL));
3644
3645 /*
3646 * Caller must first check for njcl because this
3647 * routine is internal and not exposed/used via KPI.
3648 */
3649 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3650
3651 top = NULL;
3652 np = &top;
3653 pnum = 0;
3654
3655 /*
3656 * The caller doesn't want all the requested buffers; only some.
3657 * Try hard to get what we can, but don't block. This effectively
3658 * overrides MCR_SLEEP, since this thread will not go to sleep
3659 * if we can't get all the buffers.
3660 */
3661 if (!wantall || (mcflags & MCR_NOSLEEP))
3662 mcflags |= MCR_TRYHARD;
3663
3664 /* Allocate the composite mbuf + cluster elements from the cache */
3665 if (bufsize == m_maxsize(MC_CL))
3666 cp = m_cache(MC_MBUF_CL);
3667 else if (bufsize == m_maxsize(MC_BIGCL))
3668 cp = m_cache(MC_MBUF_BIGCL);
3669 else
3670 cp = m_cache(MC_MBUF_16KCL);
3671 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3672
3673 for (pnum = 0; pnum < needed; pnum++) {
3674 m = (struct mbuf *)mp_list;
3675 mp_list = mp_list->obj_next;
3676
3677 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3678 cl = m->m_ext.ext_buf;
3679 rfa = MEXT_RFA(m);
3680
3681 ASSERT(cl != NULL && rfa != NULL);
3682 VERIFY(MBUF_IS_COMPOSITE(m));
3683
3684 flag = MEXT_FLAGS(m);
3685
3686 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3687 if (bufsize == m_maxsize(MC_16KCL)) {
3688 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3689 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3690 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3691 } else {
3692 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3693 }
3694
3695 if (num_with_pkthdrs > 0) {
3696 --num_with_pkthdrs;
3697 #if CONFIG_MACF_NET
3698 if (mac_mbuf_label_init(m, wait) != 0) {
3699 m_freem(m);
3700 break;
3701 }
3702 #endif /* MAC_NET */
3703 }
3704
3705 *np = m;
3706 if (num_with_pkthdrs > 0)
3707 np = &m->m_nextpkt;
3708 else
3709 np = &m->m_next;
3710 }
3711 ASSERT(pnum != *num_needed || mp_list == NULL);
3712 if (mp_list != NULL)
3713 mcache_free_ext(cp, mp_list);
3714
3715 if (pnum > 0) {
3716 mtype_stat_add(MT_DATA, pnum);
3717 mtype_stat_sub(MT_FREE, pnum);
3718 }
3719
3720 if (wantall && (pnum != *num_needed)) {
3721 if (top != NULL)
3722 m_freem_list(top);
3723 return (NULL);
3724 }
3725
3726 if (pnum > *num_needed) {
3727 printf("%s: File a radar related to <rdar://10146739>. \
3728 needed = %u, pnum = %u, num_needed = %u \n",
3729 __func__, needed, pnum, *num_needed);
3730 }
3731
3732 *num_needed = pnum;
3733 return (top);
3734 }
3735
3736 /*
3737  * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3738  * wantall is not set, return whatever number was available. The size of
3739  * each mbuf in the list is controlled by the parameter packetlen. Each
3740  * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3741  * in the chain is called a segment. If maxsegments is not null and the
3742  * value pointed to is not zero, this specifies the maximum number of
3743  * segments for a chain of mbufs. If maxsegments is null or the value
3744  * pointed to is zero, the caller does not have any restriction on the
3745  * number of segments. The actual number of segments of an mbuf chain
3746  * is returned in the value pointed to by maxsegments.
3747 */
3748 __private_extern__ struct mbuf *
3749 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3750 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3751 {
3752 struct mbuf **np, *top, *first = NULL;
3753 size_t bufsize, r_bufsize;
3754 unsigned int num = 0;
3755 unsigned int nsegs = 0;
3756 unsigned int needed, resid;
3757 int mcflags = MSLEEPF(wait);
3758 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3759 mcache_t *cp = NULL, *rcp = NULL;
3760
3761 if (*numlist == 0)
3762 return (NULL);
3763
3764 top = NULL;
3765 np = &top;
3766
3767 if (wantsize == 0) {
3768 if (packetlen <= MINCLSIZE) {
3769 bufsize = packetlen;
3770 } else if (packetlen > m_maxsize(MC_CL)) {
3771 /* Use 4KB if jumbo cluster pool isn't available */
3772 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3773 bufsize = m_maxsize(MC_BIGCL);
3774 else
3775 bufsize = m_maxsize(MC_16KCL);
3776 } else {
3777 bufsize = m_maxsize(MC_CL);
3778 }
3779 } else if (wantsize == m_maxsize(MC_CL) ||
3780 wantsize == m_maxsize(MC_BIGCL) ||
3781 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3782 bufsize = wantsize;
3783 } else {
3784 return (NULL);
3785 }
3786
3787 if (bufsize <= MHLEN) {
3788 nsegs = 1;
3789 } else if (bufsize <= MINCLSIZE) {
3790 if (maxsegments != NULL && *maxsegments == 1) {
3791 bufsize = m_maxsize(MC_CL);
3792 nsegs = 1;
3793 } else {
3794 nsegs = 2;
3795 }
3796 } else if (bufsize == m_maxsize(MC_16KCL)) {
3797 VERIFY(njcl > 0);
3798 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3799 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3800 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3801 } else {
3802 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3803 }
3804 if (maxsegments != NULL) {
3805 if (*maxsegments && nsegs > *maxsegments) {
3806 *maxsegments = nsegs;
3807 return (NULL);
3808 }
3809 *maxsegments = nsegs;
3810 }
3811
3812 /*
3813 * The caller doesn't want all the requested buffers; only some.
3814 * Try hard to get what we can, but don't block. This effectively
3815 * overrides MCR_SLEEP, since this thread will not go to sleep
3816 * if we can't get all the buffers.
3817 */
3818 if (!wantall || (mcflags & MCR_NOSLEEP))
3819 mcflags |= MCR_TRYHARD;
3820
3821 /*
3822 * Simple case where all elements in the lists/chains are mbufs.
3823 * Unless bufsize is greater than MHLEN, each segment chain is made
3824 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3825 * of 2 mbufs; the second one is used for the residual data, i.e.
3826 * the remaining data that cannot fit into the first mbuf.
3827 */
3828 if (bufsize <= MINCLSIZE) {
3829 /* Allocate the elements in one shot from the mbuf cache */
3830 ASSERT(bufsize <= MHLEN || nsegs == 2);
3831 cp = m_cache(MC_MBUF);
3832 needed = mcache_alloc_ext(cp, &mp_list,
3833 (*numlist) * nsegs, mcflags);
3834
3835 /*
3836 * The number of elements must be even if we are to use an
3837 * mbuf (instead of a cluster) to store the residual data.
3838 * If we couldn't allocate the requested number of mbufs,
3839 * trim the number down (if it's odd) in order to avoid
3840 * creating a partial segment chain.
3841 */
3842 if (bufsize > MHLEN && (needed & 0x1))
3843 needed--;
3844
3845 while (num < needed) {
3846 struct mbuf *m;
3847
3848 m = (struct mbuf *)mp_list;
3849 mp_list = mp_list->obj_next;
3850 ASSERT(m != NULL);
3851
3852 MBUF_INIT(m, 1, MT_DATA);
3853 #if CONFIG_MACF_NET
3854 if (mac_init_mbuf(m, wait) != 0) {
3855 m_free(m);
3856 break;
3857 }
3858 #endif /* MAC_NET */
3859 num++;
3860 if (bufsize > MHLEN) {
3861 /* A second mbuf for this segment chain */
3862 m->m_next = (struct mbuf *)mp_list;
3863 mp_list = mp_list->obj_next;
3864 ASSERT(m->m_next != NULL);
3865
3866 MBUF_INIT(m->m_next, 0, MT_DATA);
3867 num++;
3868 }
3869 *np = m;
3870 np = &m->m_nextpkt;
3871 }
3872 ASSERT(num != *numlist || mp_list == NULL);
3873
3874 if (num > 0) {
3875 mtype_stat_add(MT_DATA, num);
3876 mtype_stat_sub(MT_FREE, num);
3877 }
3878 num /= nsegs;
3879
3880 /* We've got them all; return to caller */
3881 if (num == *numlist)
3882 return (top);
3883
3884 goto fail;
3885 }
3886
3887 /*
3888 * Complex cases where elements are made up of one or more composite
3889 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3890 * be illustrated as follows:
3891 *
3892 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3893 *
3894 * Every composite mbuf + cluster element comes from the intermediate
3895 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3896 * the last composite element will come from the MC_MBUF_CL cache,
3897 * unless the residual data is larger than 2KB where we use the
3898 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3899 * data is defined as extra data beyond the first element that cannot
3900 * fit into the previous element, i.e. there is no residual data if
3901 * the chain only has 1 segment.
3902 */
3903 r_bufsize = bufsize;
3904 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3905 if (resid > 0) {
3906 /* There is residual data; figure out the cluster size */
3907 if (wantsize == 0 && packetlen > MINCLSIZE) {
3908 /*
3909 * Caller didn't request that all of the segments
3910 * in the chain use the same cluster size; use the
3911 * smaller of the cluster sizes.
3912 */
3913 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3914 r_bufsize = m_maxsize(MC_16KCL);
3915 else if (resid > m_maxsize(MC_CL))
3916 r_bufsize = m_maxsize(MC_BIGCL);
3917 else
3918 r_bufsize = m_maxsize(MC_CL);
3919 } else {
3920 /* Use the same cluster size as the other segments */
3921 resid = 0;
3922 }
3923 }
3924
3925 needed = *numlist;
3926 if (resid > 0) {
3927 /*
3928 * Attempt to allocate composite mbuf + cluster elements for
3929 * the residual data in each chain; record the number of such
3930 * elements that can be allocated so that we know how many
3931 * segment chains we can afford to create.
3932 */
3933 if (r_bufsize <= m_maxsize(MC_CL))
3934 rcp = m_cache(MC_MBUF_CL);
3935 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3936 rcp = m_cache(MC_MBUF_BIGCL);
3937 else
3938 rcp = m_cache(MC_MBUF_16KCL);
3939 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3940
3941 if (needed == 0)
3942 goto fail;
3943
3944 /* This is temporarily reduced for calculation */
3945 ASSERT(nsegs > 1);
3946 nsegs--;
3947 }
3948
3949 /*
3950 * Attempt to allocate the rest of the composite mbuf + cluster
3951 * elements for the number of segment chains that we need.
3952 */
3953 if (bufsize <= m_maxsize(MC_CL))
3954 cp = m_cache(MC_MBUF_CL);
3955 else if (bufsize <= m_maxsize(MC_BIGCL))
3956 cp = m_cache(MC_MBUF_BIGCL);
3957 else
3958 cp = m_cache(MC_MBUF_16KCL);
3959 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3960
3961 /* Round it down to avoid creating a partial segment chain */
3962 needed = (needed / nsegs) * nsegs;
3963 if (needed == 0)
3964 goto fail;
3965
3966 if (resid > 0) {
3967 /*
3968 * We're about to construct the chain(s); take into account
3969 * the number of segments we have created above to hold the
3970 * residual data for each chain, as well as restore the
3971 * original count of segments per chain.
3972 */
3973 ASSERT(nsegs > 0);
3974 needed += needed / nsegs;
3975 nsegs++;
3976 }
3977
3978 for (;;) {
3979 struct mbuf *m;
3980 u_int32_t flag;
3981 struct ext_ref *rfa;
3982 void *cl;
3983 int pkthdr;
3984
3985 ++num;
3986 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3987 m = (struct mbuf *)mp_list;
3988 mp_list = mp_list->obj_next;
3989 } else {
3990 m = (struct mbuf *)rmp_list;
3991 rmp_list = rmp_list->obj_next;
3992 }
3993 ASSERT(m != NULL);
3994 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3995 VERIFY(m->m_ext.ext_free == NULL ||
3996 m->m_ext.ext_free == m_bigfree ||
3997 m->m_ext.ext_free == m_16kfree);
3998
3999 cl = m->m_ext.ext_buf;
4000 rfa = MEXT_RFA(m);
4001
4002 ASSERT(cl != NULL && rfa != NULL);
4003 VERIFY(MBUF_IS_COMPOSITE(m));
4004
4005 flag = MEXT_FLAGS(m);
4006
4007 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4008 if (pkthdr)
4009 first = m;
4010 MBUF_INIT(m, pkthdr, MT_DATA);
4011 if (m->m_ext.ext_free == m_16kfree) {
4012 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4013 } else if (m->m_ext.ext_free == m_bigfree) {
4014 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4015 } else {
4016 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4017 }
4018 #if CONFIG_MACF_NET
4019 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4020 --num;
4021 m_freem(m);
4022 break;
4023 }
4024 #endif /* MAC_NET */
4025
4026 *np = m;
4027 if ((num % nsegs) == 0)
4028 np = &first->m_nextpkt;
4029 else
4030 np = &m->m_next;
4031
4032 if (num == needed)
4033 break;
4034 }
4035
4036 if (num > 0) {
4037 mtype_stat_add(MT_DATA, num);
4038 mtype_stat_sub(MT_FREE, num);
4039 }
4040
4041 num /= nsegs;
4042
4043 /* We've got them all; return to caller */
4044 if (num == *numlist) {
4045 ASSERT(mp_list == NULL && rmp_list == NULL);
4046 return (top);
4047 }
4048
4049 fail:
4050 /* Free up what's left of the above */
4051 if (mp_list != NULL)
4052 mcache_free_ext(cp, mp_list);
4053 if (rmp_list != NULL)
4054 mcache_free_ext(rcp, rmp_list);
4055 if (wantall && top != NULL) {
4056 m_freem(top);
4057 return (NULL);
4058 }
4059 *numlist = num;
4060 return (top);
4061 }
4062
4063 /*
4064  * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4065  * packets on the receive ring.
4066 */
4067 __private_extern__ struct mbuf *
4068 m_getpacket_how(int wait)
4069 {
4070 unsigned int num_needed = 1;
4071
4072 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4073 m_maxsize(MC_CL)));
4074 }
4075
4076 /*
4077  * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4078  * packets on the receive ring.
4079 */
4080 struct mbuf *
4081 m_getpacket(void)
4082 {
4083 unsigned int num_needed = 1;
4084
4085 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4086 m_maxsize(MC_CL)));
4087 }
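
/*
 * Editor's illustrative sketch (not part of the original source): a driver
 * replenishing a hypothetical receive ring with cluster-backed packets via
 * m_getpacket_how() above.  EXAMPLE_RX_RING_SIZE and example_rx_slots are
 * invented for the example.
 */
#define EXAMPLE_RX_RING_SIZE	64
static struct mbuf *example_rx_slots[EXAMPLE_RX_RING_SIZE];

static void
example_rx_ring_refill(void)
{
	int i;

	for (i = 0; i < EXAMPLE_RX_RING_SIZE; i++) {
		if (example_rx_slots[i] != NULL)
			continue;
		/* Non-blocking; empty slots can be retried on a later pass */
		example_rx_slots[i] = m_getpacket_how(M_DONTWAIT);
	}
}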
4088
4089 /*
4090 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4091 * if this can't be met, return whatever number were available. Set up the
4092 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4093 * are chained on the m_nextpkt field. Any packets requested beyond this are
4094 * chained onto the last packet header's m_next field.
4095 */
4096 struct mbuf *
4097 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4098 {
4099 unsigned int n = num_needed;
4100
4101 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4102 m_maxsize(MC_CL)));
4103 }
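
/*
 * Editor's illustrative sketch (not part of the original source): walking the
 * list returned by m_getpackets() above.  Packet headers are linked through
 * m_nextpkt, and the buffers within (or beyond) a packet through m_next, as
 * described in the comment preceding the function.
 */
static unsigned int
example_list_bytes(struct mbuf *list)
{
	struct mbuf *pkt, *seg;
	unsigned int total = 0;

	for (pkt = list; pkt != NULL; pkt = pkt->m_nextpkt) {
		for (seg = pkt; seg != NULL; seg = seg->m_next)
			total += seg->m_len;
	}
	return (total);
}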
4104
4105 /*
4106 * Return a list of mbuf hdrs set up as packet hdrs chained together
4107  * on the m_nextpkt field.
4108 */
4109 struct mbuf *
4110 m_getpackethdrs(int num_needed, int how)
4111 {
4112 struct mbuf *m;
4113 struct mbuf **np, *top;
4114
4115 top = NULL;
4116 np = &top;
4117
4118 while (num_needed--) {
4119 m = _M_RETRYHDR(how, MT_DATA);
4120 if (m == NULL)
4121 break;
4122
4123 *np = m;
4124 np = &m->m_nextpkt;
4125 }
4126
4127 return (top);
4128 }
4129
4130 /*
4131  * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4132  * of packets freed. Used by the drivers.
4133 */
4134 int
4135 m_freem_list(struct mbuf *m)
4136 {
4137 struct mbuf *nextpkt;
4138 mcache_obj_t *mp_list = NULL;
4139 mcache_obj_t *mcl_list = NULL;
4140 mcache_obj_t *mbc_list = NULL;
4141 mcache_obj_t *m16k_list = NULL;
4142 mcache_obj_t *m_mcl_list = NULL;
4143 mcache_obj_t *m_mbc_list = NULL;
4144 mcache_obj_t *m_m16k_list = NULL;
4145 mcache_obj_t *ref_list = NULL;
4146 int pktcount = 0;
4147 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4148
4149 while (m != NULL) {
4150 pktcount++;
4151
4152 nextpkt = m->m_nextpkt;
4153 m->m_nextpkt = NULL;
4154
4155 while (m != NULL) {
4156 struct mbuf *next = m->m_next;
4157 mcache_obj_t *o, *rfa;
4158 u_int32_t refcnt, composite;
4159
4160 if (m->m_type == MT_FREE)
4161 panic("m_free: freeing an already freed mbuf");
4162
4163 if (m->m_type != MT_FREE)
4164 mt_free++;
4165
4166 if (m->m_flags & M_PKTHDR) {
4167 m_tag_delete_chain(m, NULL);
4168 }
4169
4170 if (!(m->m_flags & M_EXT))
4171 goto simple_free;
4172
4173 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4174 refcnt = m_decref(m);
4175 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4176 if (refcnt == 0 && !composite) {
4177 if (m->m_ext.ext_free == NULL) {
4178 o->obj_next = mcl_list;
4179 mcl_list = o;
4180 } else if (m->m_ext.ext_free == m_bigfree) {
4181 o->obj_next = mbc_list;
4182 mbc_list = o;
4183 } else if (m->m_ext.ext_free == m_16kfree) {
4184 o->obj_next = m16k_list;
4185 m16k_list = o;
4186 } else {
4187 (*(m->m_ext.ext_free))((caddr_t)o,
4188 m->m_ext.ext_size,
4189 m->m_ext.ext_arg);
4190 }
4191 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4192 rfa->obj_next = ref_list;
4193 ref_list = rfa;
4194 MEXT_RFA(m) = NULL;
4195 } else if (refcnt == 0 && composite) {
4196 VERIFY(m->m_type != MT_FREE);
4197 /*
4198 * Amortize the costs of atomic operations
4199 * by doing them at the end, if possible.
4200 */
4201 if (m->m_type == MT_DATA)
4202 mt_data++;
4203 else if (m->m_type == MT_HEADER)
4204 mt_header++;
4205 else if (m->m_type == MT_SONAME)
4206 mt_soname++;
4207 else if (m->m_type == MT_TAG)
4208 mt_tag++;
4209 else
4210 mtype_stat_dec(m->m_type);
4211
4212 m->m_type = MT_FREE;
4213 m->m_flags = M_EXT;
4214 m->m_len = 0;
4215 m->m_next = m->m_nextpkt = NULL;
4216
4217 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4218
4219 /* "Free" into the intermediate cache */
4220 o = (mcache_obj_t *)m;
4221 if (m->m_ext.ext_free == NULL) {
4222 o->obj_next = m_mcl_list;
4223 m_mcl_list = o;
4224 } else if (m->m_ext.ext_free == m_bigfree) {
4225 o->obj_next = m_mbc_list;
4226 m_mbc_list = o;
4227 } else {
4228 VERIFY(m->m_ext.ext_free == m_16kfree);
4229 o->obj_next = m_m16k_list;
4230 m_m16k_list = o;
4231 }
4232 m = next;
4233 continue;
4234 }
4235 simple_free:
4236 /*
4237 * Amortize the costs of atomic operations
4238 * by doing them at the end, if possible.
4239 */
4240 if (m->m_type == MT_DATA)
4241 mt_data++;
4242 else if (m->m_type == MT_HEADER)
4243 mt_header++;
4244 else if (m->m_type == MT_SONAME)
4245 mt_soname++;
4246 else if (m->m_type == MT_TAG)
4247 mt_tag++;
4248 else if (m->m_type != MT_FREE)
4249 mtype_stat_dec(m->m_type);
4250
4251 m->m_type = MT_FREE;
4252 m->m_flags = m->m_len = 0;
4253 m->m_next = m->m_nextpkt = NULL;
4254
4255 ((mcache_obj_t *)m)->obj_next = mp_list;
4256 mp_list = (mcache_obj_t *)m;
4257
4258 m = next;
4259 }
4260
4261 m = nextpkt;
4262 }
4263
4264 if (mt_free > 0)
4265 mtype_stat_add(MT_FREE, mt_free);
4266 if (mt_data > 0)
4267 mtype_stat_sub(MT_DATA, mt_data);
4268 if (mt_header > 0)
4269 mtype_stat_sub(MT_HEADER, mt_header);
4270 if (mt_soname > 0)
4271 mtype_stat_sub(MT_SONAME, mt_soname);
4272 if (mt_tag > 0)
4273 mtype_stat_sub(MT_TAG, mt_tag);
4274
4275 if (mp_list != NULL)
4276 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4277 if (mcl_list != NULL)
4278 mcache_free_ext(m_cache(MC_CL), mcl_list);
4279 if (mbc_list != NULL)
4280 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4281 if (m16k_list != NULL)
4282 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4283 if (m_mcl_list != NULL)
4284 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4285 if (m_mbc_list != NULL)
4286 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4287 if (m_m16k_list != NULL)
4288 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4289 if (ref_list != NULL)
4290 mcache_free_ext(ref_cache, ref_list);
4291
4292 return (pktcount);
4293 }
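
/*
 * Editor's illustrative sketch (not part of the original source): a driver
 * completing a batch of transmitted packets can link them through m_nextpkt
 * and release them with a single m_freem_list() call, so the type/cache
 * updates above are amortized over the whole batch.  Names are hypothetical.
 */
static void
example_tx_complete(struct mbuf **done, int n)
{
	struct mbuf *head = NULL;
	int i, freed;

	/* Link the completed packets through m_nextpkt ... */
	for (i = 0; i < n; i++) {
		done[i]->m_nextpkt = head;
		head = done[i];
	}
	/* ... and free them all in one batched call */
	freed = m_freem_list(head);
	VERIFY(freed == n);
}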
4294
4295 void
4296 m_freem(struct mbuf *m)
4297 {
4298 while (m != NULL)
4299 m = m_free(m);
4300 }
4301
4302 /*
4303 * Mbuffer utility routines.
4304 */
4305
4306 /*
4307 * Compute the amount of space available before the current start
4308 * of data in an mbuf.
4309 */
4310 int
4311 m_leadingspace(struct mbuf *m)
4312 {
4313 if (m->m_flags & M_EXT) {
4314 if (MCLHASREFERENCE(m))
4315 return (0);
4316 return (m->m_data - m->m_ext.ext_buf);
4317 }
4318 if (m->m_flags & M_PKTHDR)
4319 return (m->m_data - m->m_pktdat);
4320 return (m->m_data - m->m_dat);
4321 }
4322
4323 /*
4324 * Compute the amount of space available after the end of data in an mbuf.
4325 */
4326 int
4327 m_trailingspace(struct mbuf *m)
4328 {
4329 if (m->m_flags & M_EXT) {
4330 if (MCLHASREFERENCE(m))
4331 return (0);
4332 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4333 (m->m_data + m->m_len));
4334 }
4335 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4336 }
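
/*
 * Editor's illustrative sketch (not part of the original source): appending
 * data in place only when m_trailingspace() says there is room; the routine
 * above returns 0 for a referenced cluster, so a writer never touches data
 * that another mbuf can still see.  The function name is hypothetical.
 */
static int
example_append_inplace(struct mbuf *m, caddr_t cp, int len)
{
	if (m_trailingspace(m) < len)
		return (0);

	bcopy(cp, mtod(m, caddr_t) + m->m_len, (unsigned)len);
	m->m_len += len;
	if (m->m_flags & M_PKTHDR)
		m->m_pkthdr.len += len;
	return (1);
}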
4337
4338 /*
4339 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4340 * copy junk along. Does not adjust packet header length.
4341 */
4342 struct mbuf *
4343 m_prepend(struct mbuf *m, int len, int how)
4344 {
4345 struct mbuf *mn;
4346
4347 _MGET(mn, how, m->m_type);
4348 if (mn == NULL) {
4349 m_freem(m);
4350 return (NULL);
4351 }
4352 if (m->m_flags & M_PKTHDR) {
4353 M_COPY_PKTHDR(mn, m);
4354 m->m_flags &= ~M_PKTHDR;
4355 }
4356 mn->m_next = m;
4357 m = mn;
4358 if (len < MHLEN)
4359 MH_ALIGN(m, len);
4360 m->m_len = len;
4361 return (m);
4362 }
4363
4364 /*
4365 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4366 * chain, copy junk along, and adjust length.
4367 */
4368 struct mbuf *
4369 m_prepend_2(struct mbuf *m, int len, int how)
4370 {
4371 if (M_LEADINGSPACE(m) >= len) {
4372 m->m_data -= len;
4373 m->m_len += len;
4374 } else {
4375 m = m_prepend(m, len, how);
4376 }
4377 if ((m) && (m->m_flags & M_PKTHDR))
4378 m->m_pkthdr.len += len;
4379 return (m);
4380 }
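
/*
 * Editor's illustrative sketch (not part of the original source): using
 * m_prepend_2() above to make room for a small encapsulation header.
 * EXAMPLE_HDR_LEN and the function name are invented for the example.
 */
#define EXAMPLE_HDR_LEN	8

static struct mbuf *
example_prepend_header(struct mbuf *m, int how)
{
	m = m_prepend_2(m, EXAMPLE_HDR_LEN, how);
	if (m == NULL) {
		/* m_prepend() has already freed the original chain */
		return (NULL);
	}
	/* New header bytes start at m->m_data; the caller fills them in */
	bzero(mtod(m, caddr_t), EXAMPLE_HDR_LEN);
	return (m);
}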
4381
4382 /*
4383 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4384 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4385 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4386 */
4387 int MCFail;
4388
4389 struct mbuf *
4390 m_copym(struct mbuf *m, int off0, int len, int wait)
4391 {
4392 struct mbuf *n, *mhdr = NULL, **np;
4393 int off = off0;
4394 struct mbuf *top;
4395 int copyhdr = 0;
4396
4397 if (off < 0 || len < 0)
4398 panic("m_copym: invalid offset %d or len %d", off, len);
4399
4400 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4401 mhdr = m;
4402 copyhdr = 1;
4403 }
4404
4405 while (off >= m->m_len) {
4406 if (m->m_next == NULL)
4407 panic("m_copym: invalid mbuf chain");
4408 off -= m->m_len;
4409 m = m->m_next;
4410 }
4411 np = &top;
4412 top = NULL;
4413
4414 while (len > 0) {
4415 if (m == NULL) {
4416 if (len != M_COPYALL)
4417 panic("m_copym: len != M_COPYALL");
4418 break;
4419 }
4420
4421 n = _M_RETRY(wait, m->m_type);
4422 *np = n;
4423
4424 if (n == NULL)
4425 goto nospace;
4426
4427 if (copyhdr != 0) {
4428 M_COPY_PKTHDR(n, mhdr);
4429 if (len == M_COPYALL)
4430 n->m_pkthdr.len -= off0;
4431 else
4432 n->m_pkthdr.len = len;
4433 copyhdr = 0;
4434 }
4435 if (len == M_COPYALL) {
4436 if (MIN(len, (m->m_len - off)) == len) {
4437 printf("m->m_len %d - off %d = %d, %d\n",
4438 m->m_len, off, m->m_len - off,
4439 MIN(len, (m->m_len - off)));
4440 }
4441 }
4442 n->m_len = MIN(len, (m->m_len - off));
4443 if (n->m_len == M_COPYALL) {
4444 printf("n->m_len == M_COPYALL, fixing\n");
4445 n->m_len = MHLEN;
4446 }
4447 if (m->m_flags & M_EXT) {
4448 n->m_ext = m->m_ext;
4449 m_incref(m);
4450 n->m_data = m->m_data + off;
4451 n->m_flags |= M_EXT;
4452 } else {
4453 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4454 (unsigned)n->m_len);
4455 }
4456 if (len != M_COPYALL)
4457 len -= n->m_len;
4458 off = 0;
4459 m = m->m_next;
4460 np = &n->m_next;
4461 }
4462
4463 if (top == NULL)
4464 MCFail++;
4465
4466 return (top);
4467 nospace:
4468
4469 m_freem(top);
4470 MCFail++;
4471 return (NULL);
4472 }
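
/*
 * Editor's illustrative sketch (not part of the original source): taking a
 * reference-counted copy of a whole packet with m_copym() and M_COPYALL.
 * As the code above shows, cluster-backed data is shared via m_incref()
 * rather than copied, so the copy should be treated as read-only.
 */
static struct mbuf *
example_snapshot_packet(struct mbuf *m, int wait)
{
	return (m_copym(m, 0, M_COPYALL, wait));
}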
4473
4474 /*
4475  * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4476  * within this routine. Also, the last mbuf and offset accessed are passed
4477  * out and can be passed back in to avoid having to rescan the entire mbuf
4478  * list (normally hung off of the socket).
4479 */
4480 struct mbuf *
4481 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4482 struct mbuf **m_lastm, int *m_off)
4483 {
4484 struct mbuf *n, **np = NULL;
4485 int off = off0, len = len0;
4486 struct mbuf *top = NULL;
4487 int mcflags = MSLEEPF(wait);
4488 int copyhdr = 0;
4489 int type = 0;
4490 mcache_obj_t *list = NULL;
4491 int needed = 0;
4492
4493 if (off == 0 && (m->m_flags & M_PKTHDR))
4494 copyhdr = 1;
4495
4496 if (*m_lastm != NULL) {
4497 m = *m_lastm;
4498 off = *m_off;
4499 } else {
4500 while (off >= m->m_len) {
4501 off -= m->m_len;
4502 m = m->m_next;
4503 }
4504 }
4505
4506 n = m;
4507 while (len > 0) {
4508 needed++;
4509 ASSERT(n != NULL);
4510 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4511 n = n->m_next;
4512 }
4513 needed++;
4514 len = len0;
4515
4516 /*
4517 * If the caller doesn't want to be put to sleep, mark it with
4518 * MCR_TRYHARD so that we may reclaim buffers from other places
4519 * before giving up.
4520 */
4521 if (mcflags & MCR_NOSLEEP)
4522 mcflags |= MCR_TRYHARD;
4523
4524 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4525 mcflags) != needed)
4526 goto nospace;
4527
4528 needed = 0;
4529 while (len > 0) {
4530 n = (struct mbuf *)list;
4531 list = list->obj_next;
4532 ASSERT(n != NULL && m != NULL);
4533
4534 type = (top == NULL) ? MT_HEADER : m->m_type;
4535 MBUF_INIT(n, (top == NULL), type);
4536 #if CONFIG_MACF_NET
4537 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4538 mtype_stat_inc(MT_HEADER);
4539 mtype_stat_dec(MT_FREE);
4540 m_free(n);
4541 goto nospace;
4542 }
4543 #endif /* MAC_NET */
4544
4545 if (top == NULL) {
4546 top = n;
4547 np = &top->m_next;
4548 continue;
4549 } else {
4550 needed++;
4551 *np = n;
4552 }
4553
4554 if (copyhdr) {
4555 M_COPY_PKTHDR(n, m);
4556 n->m_pkthdr.len = len;
4557 copyhdr = 0;
4558 }
4559 n->m_len = MIN(len, (m->m_len - off));
4560
4561 if (m->m_flags & M_EXT) {
4562 n->m_ext = m->m_ext;
4563 m_incref(m);
4564 n->m_data = m->m_data + off;
4565 n->m_flags |= M_EXT;
4566 } else {
4567 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4568 (unsigned)n->m_len);
4569 }
4570 len -= n->m_len;
4571
4572 if (len == 0) {
4573 if ((off + n->m_len) == m->m_len) {
4574 *m_lastm = m->m_next;
4575 *m_off = 0;
4576 } else {
4577 *m_lastm = m;
4578 *m_off = off + n->m_len;
4579 }
4580 break;
4581 }
4582 off = 0;
4583 m = m->m_next;
4584 np = &n->m_next;
4585 }
4586
4587 mtype_stat_inc(MT_HEADER);
4588 mtype_stat_add(type, needed);
4589 mtype_stat_sub(MT_FREE, needed + 1);
4590
4591 ASSERT(list == NULL);
4592 return (top);
4593
4594 nospace:
4595 if (list != NULL)
4596 mcache_free_ext(m_cache(MC_MBUF), list);
4597 if (top != NULL)
4598 m_freem(top);
4599 MCFail++;
4600 return (NULL);
4601 }
4602
4603 /*
4604 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4605 * continuing for "len" bytes, into the indicated buffer.
4606 */
4607 void
4608 m_copydata(struct mbuf *m, int off, int len, void *vp)
4609 {
4610 unsigned count;
4611 char *cp = vp;
4612
4613 if (off < 0 || len < 0)
4614 panic("m_copydata: invalid offset %d or len %d", off, len);
4615
4616 while (off > 0) {
4617 if (m == NULL)
4618 panic("m_copydata: invalid mbuf chain");
4619 if (off < m->m_len)
4620 break;
4621 off -= m->m_len;
4622 m = m->m_next;
4623 }
4624 while (len > 0) {
4625 if (m == NULL)
4626 panic("m_copydata: invalid mbuf chain");
4627 count = MIN(m->m_len - off, len);
4628 bcopy(MTOD(m, caddr_t) + off, cp, count);
4629 len -= count;
4630 cp += count;
4631 off = 0;
4632 m = m->m_next;
4633 }
4634 }
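
/*
 * Editor's illustrative sketch (not part of the original source): using
 * m_copydata() above to linearize a fixed-size header into a caller buffer
 * without touching the chain.  EXAMPLE_HDR_BYTES and the function name are
 * invented for the example.
 */
#define EXAMPLE_HDR_BYTES	20

static int
example_peek_header(struct mbuf *m, char *buf)
{
	/* Only packet-header mbufs carry a total length we can check */
	if (!(m->m_flags & M_PKTHDR) || m->m_pkthdr.len < EXAMPLE_HDR_BYTES)
		return (0);

	/* Copies across mbuf boundaries; the chain itself is unchanged */
	m_copydata(m, 0, EXAMPLE_HDR_BYTES, buf);
	return (1);
}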
4635
4636 /*
4637 * Concatenate mbuf chain n to m. Both chains must be of the same type
4638 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4639 */
4640 void
4641 m_cat(struct mbuf *m, struct mbuf *n)
4642 {
4643 while (m->m_next)
4644 m = m->m_next;
4645 while (n) {
4646 if ((m->m_flags & M_EXT) ||
4647 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4648 /* just join the two chains */
4649 m->m_next = n;
4650 return;
4651 }
4652 /* splat the data from one into the other */
4653 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4654 (u_int)n->m_len);
4655 m->m_len += n->m_len;
4656 n = m_free(n);
4657 }
4658 }
4659
4660 void
4661 m_adj(struct mbuf *mp, int req_len)
4662 {
4663 int len = req_len;
4664 struct mbuf *m;
4665 int count;
4666
4667 if ((m = mp) == NULL)
4668 return;
4669 if (len >= 0) {
4670 /*
4671 * Trim from head.
4672 */
4673 while (m != NULL && len > 0) {
4674 if (m->m_len <= len) {
4675 len -= m->m_len;
4676 m->m_len = 0;
4677 m = m->m_next;
4678 } else {
4679 m->m_len -= len;
4680 m->m_data += len;
4681 len = 0;
4682 }
4683 }
4684 m = mp;
4685 if (m->m_flags & M_PKTHDR)
4686 m->m_pkthdr.len -= (req_len - len);
4687 } else {
4688 /*
4689 * Trim from tail. Scan the mbuf chain,
4690 * calculating its length and finding the last mbuf.
4691 * If the adjustment only affects this mbuf, then just
4692 * adjust and return. Otherwise, rescan and truncate
4693 * after the remaining size.
4694 */
4695 len = -len;
4696 count = 0;
4697 for (;;) {
4698 count += m->m_len;
4699 if (m->m_next == (struct mbuf *)0)
4700 break;
4701 m = m->m_next;
4702 }
4703 if (m->m_len >= len) {
4704 m->m_len -= len;
4705 m = mp;
4706 if (m->m_flags & M_PKTHDR)
4707 m->m_pkthdr.len -= len;
4708 return;
4709 }
4710 count -= len;
4711 if (count < 0)
4712 count = 0;
4713 /*
4714 * Correct length for chain is "count".
4715 * Find the mbuf with last data, adjust its length,
4716 * and toss data from remaining mbufs on chain.
4717 */
4718 m = mp;
4719 if (m->m_flags & M_PKTHDR)
4720 m->m_pkthdr.len = count;
4721 for (; m; m = m->m_next) {
4722 if (m->m_len >= count) {
4723 m->m_len = count;
4724 break;
4725 }
4726 count -= m->m_len;
4727 }
4728 while ((m = m->m_next))
4729 m->m_len = 0;
4730 }
4731 }
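
/*
 * Editor's illustrative sketch (not part of the original source): m_adj()
 * above trims from the head for a positive request and from the tail for a
 * negative one; a receive path might strip framing like this.  The lengths
 * and function name are hypothetical.
 */
static void
example_strip_framing(struct mbuf *m, int hdrlen, int trailerlen)
{
	m_adj(m, hdrlen);	/* drop hdrlen bytes from the front */
	m_adj(m, -trailerlen);	/* drop trailerlen bytes from the back */
}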
4732
4733 /*
4734  * Rearrange an mbuf chain so that len bytes are contiguous
4735 * and in the data area of an mbuf (so that mtod and dtom
4736 * will work for a structure of size len). Returns the resulting
4737 * mbuf chain on success, frees it and returns null on failure.
4738 * If there is room, it will add up to max_protohdr-len extra bytes to the
4739 * contiguous region in an attempt to avoid being called next time.
4740 */
4741 int MPFail;
4742
4743 struct mbuf *
4744 m_pullup(struct mbuf *n, int len)
4745 {
4746 struct mbuf *m;
4747 int count;
4748 int space;
4749
4750 /*
4751 * If first mbuf has no cluster, and has room for len bytes
4752 * without shifting current data, pullup into it,
4753 * otherwise allocate a new mbuf to prepend to the chain.
4754 */
4755 if ((n->m_flags & M_EXT) == 0 &&
4756 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4757 if (n->m_len >= len)
4758 return (n);
4759 m = n;
4760 n = n->m_next;
4761 len -= m->m_len;
4762 } else {
4763 if (len > MHLEN)
4764 goto bad;
4765 _MGET(m, M_DONTWAIT, n->m_type);
4766 if (m == 0)
4767 goto bad;
4768 m->m_len = 0;
4769 if (n->m_flags & M_PKTHDR) {
4770 M_COPY_PKTHDR(m, n);
4771 n->m_flags &= ~M_PKTHDR;
4772 }
4773 }
4774 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4775 do {
4776 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4777 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4778 (unsigned)count);
4779 len -= count;
4780 m->m_len += count;
4781 n->m_len -= count;
4782 space -= count;
4783 if (n->m_len)
4784 n->m_data += count;
4785 else
4786 n = m_free(n);
4787 } while (len > 0 && n);
4788 if (len > 0) {
4789 (void) m_free(m);
4790 goto bad;
4791 }
4792 m->m_next = n;
4793 return (m);
4794 bad:
4795 m_freem(n);
4796 MPFail++;
4797 return (0);
4798 }
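
/*
 * Editor's illustrative sketch (not part of the original source): the usual
 * m_pullup() pattern before casting m_data to a header structure.  Note that
 * on failure the chain has already been freed, so the caller must not touch
 * it again.  The function name is hypothetical.
 */
static struct mbuf *
example_pullup_header(struct mbuf *m, int hlen)
{
	if (m->m_len < hlen && (m = m_pullup(m, hlen)) == NULL)
		return (NULL);

	/* The first hlen bytes are now contiguous at mtod(m, caddr_t) */
	return (m);
}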
4799
4800 /*
4801 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4802 * the amount of empty space before the data in the new mbuf to be specified
4803 * (in the event that the caller expects to prepend later).
4804 */
4805 __private_extern__ int MSFail = 0;
4806
4807 __private_extern__ struct mbuf *
4808 m_copyup(struct mbuf *n, int len, int dstoff)
4809 {
4810 struct mbuf *m;
4811 int count, space;
4812
4813 if (len > (MHLEN - dstoff))
4814 goto bad;
4815 MGET(m, M_DONTWAIT, n->m_type);
4816 if (m == NULL)
4817 goto bad;
4818 m->m_len = 0;
4819 if (n->m_flags & M_PKTHDR) {
4820 m_copy_pkthdr(m, n);
4821 n->m_flags &= ~M_PKTHDR;
4822 }
4823 m->m_data += dstoff;
4824 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4825 do {
4826 count = min(min(max(len, max_protohdr), space), n->m_len);
4827 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4828 (unsigned)count);
4829 len -= count;
4830 m->m_len += count;
4831 n->m_len -= count;
4832 space -= count;
4833 if (n->m_len)
4834 n->m_data += count;
4835 else
4836 n = m_free(n);
4837 } while (len > 0 && n);
4838 if (len > 0) {
4839 (void) m_free(m);
4840 goto bad;
4841 }
4842 m->m_next = n;
4843 return (m);
4844 bad:
4845 m_freem(n);
4846 MSFail++;
4847 return (NULL);
4848 }
4849
4850 /*
4851 * Partition an mbuf chain in two pieces, returning the tail --
4852 * all but the first len0 bytes. In case of failure, it returns NULL and
4853 * attempts to restore the chain to its original state.
4854 */
4855 struct mbuf *
4856 m_split(struct mbuf *m0, int len0, int wait)
4857 {
4858 return (m_split0(m0, len0, wait, 1));
4859 }
4860
4861 static struct mbuf *
4862 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4863 {
4864 struct mbuf *m, *n;
4865 unsigned len = len0, remain;
4866
4867 for (m = m0; m && len > m->m_len; m = m->m_next)
4868 len -= m->m_len;
4869 if (m == NULL)
4870 return (NULL);
4871 remain = m->m_len - len;
4872 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4873 _MGETHDR(n, wait, m0->m_type);
4874 if (n == NULL)
4875 return (NULL);
4876 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4877 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4878 m0->m_pkthdr.len = len0;
4879 if (m->m_flags & M_EXT)
4880 goto extpacket;
4881 if (remain > MHLEN) {
4882 /* m can't be the lead packet */
4883 MH_ALIGN(n, 0);
4884 n->m_next = m_split(m, len, wait);
4885 if (n->m_next == NULL) {
4886 (void) m_free(n);
4887 return (NULL);
4888 } else
4889 return (n);
4890 } else
4891 MH_ALIGN(n, remain);
4892 } else if (remain == 0) {
4893 n = m->m_next;
4894 m->m_next = NULL;
4895 return (n);
4896 } else {
4897 _MGET(n, wait, m->m_type);
4898 if (n == NULL)
4899 return (NULL);
4900 M_ALIGN(n, remain);
4901 }
4902 extpacket:
4903 if (m->m_flags & M_EXT) {
4904 n->m_flags |= M_EXT;
4905 n->m_ext = m->m_ext;
4906 m_incref(m);
4907 n->m_data = m->m_data + len;
4908 } else {
4909 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4910 }
4911 n->m_len = remain;
4912 m->m_len = len;
4913 n->m_next = m->m_next;
4914 m->m_next = NULL;
4915 return (n);
4916 }
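
/*
 * Editor's illustrative sketch (not part of the original source): m_split()
 * hands back the tail of a chain beyond the first len0 bytes and leaves the
 * head in place, e.g. to separate a header from its payload.  On failure it
 * returns NULL and attempts to restore the original chain.
 */
static struct mbuf *
example_split_payload(struct mbuf *pkt, int hdrlen, int wait)
{
	struct mbuf *payload;

	payload = m_split(pkt, hdrlen, wait);	/* pkt keeps first hdrlen bytes */
	return (payload);
}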
4917
4918 /*
4919 * Routine to copy from device local memory into mbufs.
4920 */
4921 struct mbuf *
4922 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4923 void (*copy)(const void *, void *, size_t))
4924 {
4925 struct mbuf *m;
4926 struct mbuf *top = NULL, **mp = &top;
4927 int off = off0, len;
4928 char *cp;
4929 char *epkt;
4930
4931 cp = buf;
4932 epkt = cp + totlen;
4933 if (off) {
4934 /*
4935 * If 'off' is non-zero, packet is trailer-encapsulated,
4936 * so we have to skip the type and length fields.
4937 */
4938 cp += off + 2 * sizeof (u_int16_t);
4939 totlen -= 2 * sizeof (u_int16_t);
4940 }
4941 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4942 if (m == NULL)
4943 return (NULL);
4944 m->m_pkthdr.rcvif = ifp;
4945 m->m_pkthdr.len = totlen;
4946 m->m_len = MHLEN;
4947
4948 while (totlen > 0) {
4949 if (top != NULL) {
4950 _MGET(m, M_DONTWAIT, MT_DATA);
4951 if (m == NULL) {
4952 m_freem(top);
4953 return (NULL);
4954 }
4955 m->m_len = MLEN;
4956 }
4957 len = MIN(totlen, epkt - cp);
4958 if (len >= MINCLSIZE) {
4959 MCLGET(m, M_DONTWAIT);
4960 if (m->m_flags & M_EXT) {
4961 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4962 } else {
4963 /* give up when it's out of cluster mbufs */
4964 if (top != NULL)
4965 m_freem(top);
4966 m_freem(m);
4967 return (NULL);
4968 }
4969 } else {
4970 /*
4971 * Place initial small packet/header at end of mbuf.
4972 */
4973 if (len < m->m_len) {
4974 if (top == NULL &&
4975 len + max_linkhdr <= m->m_len)
4976 m->m_data += max_linkhdr;
4977 m->m_len = len;
4978 } else {
4979 len = m->m_len;
4980 }
4981 }
4982 if (copy)
4983 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4984 else
4985 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4986 cp += len;
4987 *mp = m;
4988 mp = &m->m_next;
4989 totlen -= len;
4990 if (cp == epkt)
4991 cp = buf;
4992 }
4993 return (top);
4994 }
4995
4996 #ifndef MBUF_GROWTH_NORMAL_THRESH
4997 #define MBUF_GROWTH_NORMAL_THRESH 25
4998 #endif
4999
5000 /*
5001 * Cluster freelist allocation check.
5002 */
5003 static int
5004 m_howmany(int num, size_t bufsize)
5005 {
5006 int i = 0, j = 0;
5007 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5008 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5009 u_int32_t sumclusters, freeclusters;
5010 u_int32_t percent_pool, percent_kmem;
5011 u_int32_t mb_growth, mb_growth_thresh;
5012
5013 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5014 bufsize == m_maxsize(MC_16KCL));
5015
5016 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5017
5018 /* Numbers in 2K cluster units */
5019 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5020 m_clusters = m_total(MC_CL);
5021 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5022 m_16kclusters = m_total(MC_16KCL);
5023 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5024
5025 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5026 m_clfree = m_infree(MC_CL);
5027 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5028 m_16kclfree = m_infree(MC_16KCL);
5029 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5030
5031 /* Bail if we've maxed out the mbuf memory map */
5032 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5033 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5034 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5035 return (0);
5036 }
5037
5038 if (bufsize == m_maxsize(MC_BIGCL)) {
5039 /* Under minimum */
5040 if (m_bigclusters < m_minlimit(MC_BIGCL))
5041 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5042
5043 percent_pool =
5044 ((sumclusters - freeclusters) * 100) / sumclusters;
5045 percent_kmem = (sumclusters * 100) / nclusters;
5046
5047 /*
5048  * If a light/normal user, grow conservatively (75%);
5049  * if a heavy user, grow aggressively (50%).
5050 */
5051 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5052 mb_growth = MB_GROWTH_NORMAL;
5053 else
5054 mb_growth = MB_GROWTH_AGGRESSIVE;
5055
5056 if (percent_kmem < 5) {
5057 /* For initial allocations */
5058 i = num;
5059 } else {
5060 /* Return if >= MBIGCL_LOWAT clusters available */
5061 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5062 m_total(MC_BIGCL) >=
5063 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5064 return (0);
5065
5066 /* Ensure at least num clusters are accessible */
5067 if (num >= m_infree(MC_BIGCL))
5068 i = num - m_infree(MC_BIGCL);
5069 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5070 j = num - (m_total(MC_BIGCL) -
5071 m_minlimit(MC_BIGCL));
5072
5073 i = MAX(i, j);
5074
5075 /*
5076 * Grow pool if percent_pool > 75 (normal growth)
5077 * or percent_pool > 50 (aggressive growth).
5078 */
5079 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5080 if (percent_pool > mb_growth_thresh)
5081 j = ((sumclusters + num) >> mb_growth) -
5082 freeclusters;
5083 i = MAX(i, j);
5084 }
5085
5086 /* Check to ensure we didn't go over limits */
5087 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5088 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5089 if ((i << 1) + sumclusters >= nclusters)
5090 i = (nclusters - sumclusters) >> 1;
5091 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5092 VERIFY(sumclusters + (i << 1) <= nclusters);
5093
5094 } else { /* 16K CL */
5095 VERIFY(njcl > 0);
5096 /* Under minimum */
5097 if (m_16kclusters < MIN16KCL)
5098 return (MIN16KCL - m_16kclusters);
5099 if (m_16kclfree >= M16KCL_LOWAT)
5100 return (0);
5101
5102 /* Ensure at least num clusters are available */
5103 if (num >= m_16kclfree)
5104 i = num - m_16kclfree;
5105
5106 /* Always grow 16KCL pool aggressively */
5107 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5108 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5109 i = MAX(i, j);
5110
5111 /* Check to ensure we don't go over limit */
5112 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5113 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5114 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5115 }
5116 return (i);
5117 }
5118 /*
5119 * Return the number of bytes in the mbuf chain, m.
5120 */
5121 unsigned int
5122 m_length(struct mbuf *m)
5123 {
5124 struct mbuf *m0;
5125 unsigned int pktlen;
5126
5127 if (m->m_flags & M_PKTHDR)
5128 return (m->m_pkthdr.len);
5129
5130 pktlen = 0;
5131 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5132 pktlen += m0->m_len;
5133 return (pktlen);
5134 }
5135
5136 /*
5137 * Copy data from a buffer back into the indicated mbuf chain,
5138 * starting "off" bytes from the beginning, extending the mbuf
5139 * chain if necessary.
5140 */
5141 void
5142 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5143 {
5144 #if DEBUG
5145 struct mbuf *origm = m0;
5146 int error;
5147 #endif /* DEBUG */
5148
5149 if (m0 == NULL)
5150 return;
5151
5152 #if DEBUG
5153 error =
5154 #endif /* DEBUG */
5155 m_copyback0(&m0, off, len, cp,
5156 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5157
5158 #if DEBUG
5159 if (error != 0 || (m0 != NULL && origm != m0))
5160 panic("m_copyback");
5161 #endif /* DEBUG */
5162 }
5163
5164 struct mbuf *
5165 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5166 {
5167 int error;
5168
5169 /* don't support chain expansion */
5170 VERIFY(off + len <= m_length(m0));
5171
5172 error = m_copyback0(&m0, off, len, cp,
5173 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5174 if (error) {
5175 /*
5176 * no way to recover from partial success.
5177 * just free the chain.
5178 */
5179 m_freem(m0);
5180 return (NULL);
5181 }
5182 return (m0);
5183 }
5184
5185 /*
5186  * m_makewritable: ensure the specified range is writable.
5187 */
5188 int
5189 m_makewritable(struct mbuf **mp, int off, int len, int how)
5190 {
5191 int error;
5192 #if DEBUG
5193 struct mbuf *n;
5194 int origlen, reslen;
5195
5196 origlen = m_length(*mp);
5197 #endif /* DEBUG */
5198
5199 #if 0 /* M_COPYALL is large enough */
5200 if (len == M_COPYALL)
5201 len = m_length(*mp) - off; /* XXX */
5202 #endif
5203
5204 error = m_copyback0(mp, off, len, NULL,
5205 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5206
5207 #if DEBUG
5208 reslen = 0;
5209 for (n = *mp; n; n = n->m_next)
5210 reslen += n->m_len;
5211 if (origlen != reslen)
5212 panic("m_makewritable: length changed");
5213 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5214 panic("m_makewritable: inconsist");
5215 #endif /* DEBUG */
5216
5217 return (error);
5218 }
5219
5220 static int
5221 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5222 int how)
5223 {
5224 int mlen;
5225 struct mbuf *m, *n;
5226 struct mbuf **mp;
5227 int totlen = 0;
5228 const char *cp = vp;
5229
5230 VERIFY(mp0 != NULL);
5231 VERIFY(*mp0 != NULL);
5232 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5233 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5234
5235 /*
5236 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5237 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5238 */
5239
5240 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5241
5242 mp = mp0;
5243 m = *mp;
5244 while (off > (mlen = m->m_len)) {
5245 off -= mlen;
5246 totlen += mlen;
5247 if (m->m_next == NULL) {
5248 int tspace;
5249 extend:
5250 if (!(flags & M_COPYBACK0_EXTEND))
5251 goto out;
5252
5253 /*
5254 * try to make some space at the end of "m".
5255 */
5256
5257 mlen = m->m_len;
5258 if (off + len >= MINCLSIZE &&
5259 !(m->m_flags & M_EXT) && m->m_len == 0) {
5260 MCLGET(m, how);
5261 }
5262 tspace = M_TRAILINGSPACE(m);
5263 if (tspace > 0) {
5264 tspace = MIN(tspace, off + len);
5265 VERIFY(tspace > 0);
5266 bzero(mtod(m, char *) + m->m_len,
5267 MIN(off, tspace));
5268 m->m_len += tspace;
5269 off += mlen;
5270 totlen -= mlen;
5271 continue;
5272 }
5273
5274 /*
5275 * need to allocate an mbuf.
5276 */
5277
5278 if (off + len >= MINCLSIZE) {
5279 n = m_getcl(how, m->m_type, 0);
5280 } else {
5281 n = _M_GET(how, m->m_type);
5282 }
5283 if (n == NULL) {
5284 goto out;
5285 }
5286 n->m_len = 0;
5287 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5288 bzero(mtod(n, char *), MIN(n->m_len, off));
5289 m->m_next = n;
5290 }
5291 mp = &m->m_next;
5292 m = m->m_next;
5293 }
5294 while (len > 0) {
5295 mlen = m->m_len - off;
5296 if (mlen != 0 && m_mclhasreference(m)) {
5297 char *datap;
5298 int eatlen;
5299
5300 /*
5301 * this mbuf is read-only.
5302 * allocate a new writable mbuf and try again.
5303 */
5304
5305 #if defined(DIAGNOSTIC)
5306 if (!(flags & M_COPYBACK0_COW))
5307 panic("m_copyback0: read-only");
5308 #endif /* defined(DIAGNOSTIC) */
5309
5310 /*
5311 * if we're going to write into the middle of
5312 * a mbuf, split it first.
5313 */
5314 if (off > 0 && len < mlen) {
5315 n = m_split0(m, off, how, 0);
5316 if (n == NULL)
5317 goto enobufs;
5318 m->m_next = n;
5319 mp = &m->m_next;
5320 m = n;
5321 off = 0;
5322 continue;
5323 }
5324
5325 /*
5326 * XXX TODO coalesce into the trailingspace of
5327 * the previous mbuf when possible.
5328 */
5329
5330 /*
5331 * allocate a new mbuf. copy packet header if needed.
5332 */
5333 n = _M_GET(how, m->m_type);
5334 if (n == NULL)
5335 goto enobufs;
5336 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5337 M_COPY_PKTHDR(n, m);
5338 n->m_len = MHLEN;
5339 } else {
5340 if (len >= MINCLSIZE)
5341 MCLGET(n, M_DONTWAIT);
5342 n->m_len =
5343 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5344 }
5345 if (n->m_len > len)
5346 n->m_len = len;
5347
5348 /*
5349  * free the region which has been overwritten,
5350  * copying data from old mbufs if requested.
5351 */
5352 if (flags & M_COPYBACK0_PRESERVE)
5353 datap = mtod(n, char *);
5354 else
5355 datap = NULL;
5356 eatlen = n->m_len;
5357 VERIFY(off == 0 || eatlen >= mlen);
5358 if (off > 0) {
5359 VERIFY(len >= mlen);
5360 m->m_len = off;
5361 m->m_next = n;
5362 if (datap) {
5363 m_copydata(m, off, mlen, datap);
5364 datap += mlen;
5365 }
5366 eatlen -= mlen;
5367 mp = &m->m_next;
5368 m = m->m_next;
5369 }
5370 while (m != NULL && m_mclhasreference(m) &&
5371 n->m_type == m->m_type && eatlen > 0) {
5372 mlen = MIN(eatlen, m->m_len);
5373 if (datap) {
5374 m_copydata(m, 0, mlen, datap);
5375 datap += mlen;
5376 }
5377 m->m_data += mlen;
5378 m->m_len -= mlen;
5379 eatlen -= mlen;
5380 if (m->m_len == 0)
5381 *mp = m = m_free(m);
5382 }
5383 if (eatlen > 0)
5384 n->m_len -= eatlen;
5385 n->m_next = m;
5386 *mp = m = n;
5387 continue;
5388 }
5389 mlen = MIN(mlen, len);
5390 if (flags & M_COPYBACK0_COPYBACK) {
5391 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5392 cp += mlen;
5393 }
5394 len -= mlen;
5395 mlen += off;
5396 off = 0;
5397 totlen += mlen;
5398 if (len == 0)
5399 break;
5400 if (m->m_next == NULL) {
5401 goto extend;
5402 }
5403 mp = &m->m_next;
5404 m = m->m_next;
5405 }
5406 out:
5407 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5408 VERIFY(flags & M_COPYBACK0_EXTEND);
5409 m->m_pkthdr.len = totlen;
5410 }
5411
5412 return (0);
5413
5414 enobufs:
5415 return (ENOBUFS);
5416 }
5417
5418 char *
5419 mcl_to_paddr(char *addr)
5420 {
5421 vm_offset_t base_phys;
5422
5423 if (!MBUF_IN_MAP(addr))
5424 return (NULL);
5425 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5426
5427 if (base_phys == 0)
5428 return (NULL);
5429 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5430 }
5431
5432 /*
5433 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5434 * And really copy the thing. That way, we don't "precompute" checksums
5435 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5436 * small packets, don't dup into a cluster. That way received packets
5437 * don't take up too much room in the sockbuf (cf. sbspace()).
5438 */
5439 int MDFail;
5440
5441 struct mbuf *
5442 m_dup(struct mbuf *m, int how)
5443 {
5444 struct mbuf *n, **np;
5445 struct mbuf *top;
5446 int copyhdr = 0;
5447
5448 np = &top;
5449 top = NULL;
5450 if (m->m_flags & M_PKTHDR)
5451 copyhdr = 1;
5452
5453 /*
5454 * Quick check: if we have one mbuf and its data fits in an
5455 * mbuf with packet header, just copy and go.
5456 */
5457 if (m->m_next == NULL) {
5458 /* Then just move the data into an mbuf and be done... */
5459 if (copyhdr) {
5460 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5461 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5462 return (NULL);
5463 n->m_len = m->m_len;
5464 m_dup_pkthdr(n, m, how);
5465 bcopy(m->m_data, n->m_data, m->m_len);
5466 return (n);
5467 }
5468 } else if (m->m_len <= MLEN) {
5469 if ((n = _M_GET(how, m->m_type)) == NULL)
5470 return (NULL);
5471 bcopy(m->m_data, n->m_data, m->m_len);
5472 n->m_len = m->m_len;
5473 return (n);
5474 }
5475 }
5476 while (m != NULL) {
5477 #if BLUE_DEBUG
5478 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5479 m->m_data);
5480 #endif
5481 if (copyhdr)
5482 n = _M_GETHDR(how, m->m_type);
5483 else
5484 n = _M_GET(how, m->m_type);
5485 if (n == NULL)
5486 goto nospace;
5487 if (m->m_flags & M_EXT) {
5488 if (m->m_len <= m_maxsize(MC_CL))
5489 MCLGET(n, how);
5490 else if (m->m_len <= m_maxsize(MC_BIGCL))
5491 n = m_mbigget(n, how);
5492 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5493 n = m_m16kget(n, how);
5494 if (!(n->m_flags & M_EXT)) {
5495 (void) m_free(n);
5496 goto nospace;
5497 }
5498 }
5499 *np = n;
5500 if (copyhdr) {
5501 /* Don't use M_COPY_PKTHDR: preserve m_data */
5502 m_dup_pkthdr(n, m, how);
5503 copyhdr = 0;
5504 if (!(n->m_flags & M_EXT))
5505 n->m_data = n->m_pktdat;
5506 }
5507 n->m_len = m->m_len;
5508 /*
5509  * Get the dup on the same boundary as the original.
5510  * Assume that the two mbufs have the same offset to the data area
5511  * (up to word boundaries).
5512 */
5513 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5514 m = m->m_next;
5515 np = &n->m_next;
5516 #if BLUE_DEBUG
5517 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5518 n->m_data);
5519 #endif
5520 }
5521
5522 if (top == NULL)
5523 MDFail++;
5524 return (top);
5525
5526 nospace:
5527 m_freem(top);
5528 MDFail++;
5529 return (NULL);
5530 }
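
/*
 * Editor's illustrative sketch (not part of the original source): unlike
 * m_copym(), m_dup() above makes a genuinely private copy of the data, which
 * is what a caller wants before modifying a packet whose clusters may still
 * be shared.  The function name is hypothetical.
 */
static struct mbuf *
example_private_copy(struct mbuf *m, int how)
{
	struct mbuf *dup;

	dup = m_dup(m, how);
	if (dup != NULL)
		m_freem(m);	/* replace the shared original with the copy */
	return (dup);
}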
5531
5532 #define MBUF_MULTIPAGES(m) \
5533 (((m)->m_flags & M_EXT) && \
5534 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5535 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5536 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5537
5538 static struct mbuf *
5539 m_expand(struct mbuf *m, struct mbuf **last)
5540 {
5541 struct mbuf *top = NULL;
5542 struct mbuf **nm = &top;
5543 uintptr_t data0, data;
5544 unsigned int len0, len;
5545
5546 VERIFY(MBUF_MULTIPAGES(m));
5547 VERIFY(m->m_next == NULL);
5548 data0 = (uintptr_t)m->m_data;
5549 len0 = m->m_len;
5550 *last = top;
5551
5552 for (;;) {
5553 struct mbuf *n;
5554
5555 data = data0;
5556 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5557 len = NBPG;
5558 else if (!IS_P2ALIGNED(data, NBPG) &&
5559 P2ROUNDUP(data, NBPG) < (data + len0))
5560 len = P2ROUNDUP(data, NBPG) - data;
5561 else
5562 len = len0;
5563
5564 VERIFY(len > 0);
5565 VERIFY(m->m_flags & M_EXT);
5566 m->m_data = (void *)data;
5567 m->m_len = len;
5568
5569 *nm = *last = m;
5570 nm = &m->m_next;
5571 m->m_next = NULL;
5572
5573 data0 += len;
5574 len0 -= len;
5575 if (len0 == 0)
5576 break;
5577
5578 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5579 if (n == NULL) {
5580 m_freem(top);
5581 top = *last = NULL;
5582 break;
5583 }
5584
5585 n->m_ext = m->m_ext;
5586 m_incref(m);
5587 n->m_flags |= M_EXT;
5588 m = n;
5589 }
5590 return (top);
5591 }
5592
5593 struct mbuf *
5594 m_normalize(struct mbuf *m)
5595 {
5596 struct mbuf *top = NULL;
5597 struct mbuf **nm = &top;
5598 boolean_t expanded = FALSE;
5599
5600 while (m != NULL) {
5601 struct mbuf *n;
5602
5603 n = m->m_next;
5604 m->m_next = NULL;
5605
5606 /* Does the data cross one or more page boundaries? */
5607 if (MBUF_MULTIPAGES(m)) {
5608 struct mbuf *last;
5609 if ((m = m_expand(m, &last)) == NULL) {
5610 m_freem(n);
5611 m_freem(top);
5612 top = NULL;
5613 break;
5614 }
5615 *nm = m;
5616 nm = &last->m_next;
5617 expanded = TRUE;
5618 } else {
5619 *nm = m;
5620 nm = &m->m_next;
5621 }
5622 m = n;
5623 }
5624 if (expanded)
5625 atomic_add_32(&mb_normalized, 1);
5626 return (top);
5627 }
5628
5629 /*
5630  * Append the specified data to the indicated mbuf chain,
5631  * extending the mbuf chain if the new data does not fit in
5632 * existing space.
5633 *
5634 * Return 1 if able to complete the job; otherwise 0.
5635 */
5636 int
5637 m_append(struct mbuf *m0, int len, caddr_t cp)
5638 {
5639 struct mbuf *m, *n;
5640 int remainder, space;
5641
5642 for (m = m0; m->m_next != NULL; m = m->m_next)
5643 ;
5644 remainder = len;
5645 space = M_TRAILINGSPACE(m);
5646 if (space > 0) {
5647 /*
5648 * Copy into available space.
5649 */
5650 if (space > remainder)
5651 space = remainder;
5652 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5653 m->m_len += space;
5654 cp += space, remainder -= space;
5655 }
5656 while (remainder > 0) {
5657 /*
5658 * Allocate a new mbuf; could check space
5659 * and allocate a cluster instead.
5660 */
5661 n = m_get(M_WAITOK, m->m_type);
5662 if (n == NULL)
5663 break;
5664 n->m_len = min(MLEN, remainder);
5665 bcopy(cp, mtod(n, caddr_t), n->m_len);
5666 cp += n->m_len;
5667 remainder -= n->m_len;
5668 m->m_next = n;
5669 m = n;
5670 }
5671 if (m0->m_flags & M_PKTHDR)
5672 m0->m_pkthdr.len += len - remainder;
5673 return (remainder == 0);
5674 }
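
/*
 * Usage sketch (illustrative only; "trailer" is a hypothetical local
 * buffer):
 *
 *	if (m_append(m0, sizeof (trailer), (caddr_t)&trailer) == 0) {
 *		m_freem(m0);
 *		return (ENOBUFS);
 *	}
 *
 * On failure the chain may hold a partial copy of the data, but
 * m_pkthdr.len is kept consistent with what was actually appended.
 */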
5675
5676 struct mbuf *
5677 m_last(struct mbuf *m)
5678 {
5679 while (m->m_next != NULL)
5680 m = m->m_next;
5681 return (m);
5682 }
5683
5684 unsigned int
5685 m_fixhdr(struct mbuf *m0)
5686 {
5687 u_int len;
5688
5689 len = m_length2(m0, NULL);
5690 m0->m_pkthdr.len = len;
5691 return (len);
5692 }
5693
5694 unsigned int
5695 m_length2(struct mbuf *m0, struct mbuf **last)
5696 {
5697 struct mbuf *m;
5698 u_int len;
5699
5700 len = 0;
5701 for (m = m0; m != NULL; m = m->m_next) {
5702 len += m->m_len;
5703 if (m->m_next == NULL)
5704 break;
5705 }
5706 if (last != NULL)
5707 *last = m;
5708 return (len);
5709 }
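
/*
 * Usage sketch (illustrative only): fetch the total length of a chain
 * and a pointer to its final mbuf in one pass, e.g. before appending:
 *
 *	struct mbuf *last;
 *	unsigned int totlen = m_length2(m0, &last);
 *
 * "last" can then be used with M_TRAILINGSPACE() to see how much room
 * remains at the tail of the chain.
 */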
5710
5711 /*
5712 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5713 * and clusters. If allocation fails and this cannot be completed, NULL will
5714 * be returned, but the passed-in chain will be unchanged. Upon success,
5715 * the original chain will be freed, and the new chain will be returned.
5716 *
5717 * If an mbuf without a packet header is passed in, the original mbuf
5718 * chain will be returned unharmed.
5719 *
5720 * If an offset is specified, the first mbuf in the chain will have a
5721 * leading space of the amount given by the "off" parameter.
5722 *
5723 * This routine requires that the m_pkthdr.header field of the original
5724 * mbuf chain is cleared by the caller.
5725 */
5726 struct mbuf *
5727 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5728 {
5729 struct mbuf *m_new = NULL, *m_final = NULL;
5730 int progress = 0, length, pktlen;
5731
5732 if (!(m0->m_flags & M_PKTHDR))
5733 return (m0);
5734
5735 VERIFY(off < MHLEN);
5736 m_fixhdr(m0); /* Needed sanity check */
5737
5738 pktlen = m0->m_pkthdr.len + off;
5739 if (pktlen > MHLEN)
5740 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5741 else
5742 m_final = m_gethdr(how, MT_DATA);
5743
5744 if (m_final == NULL)
5745 goto nospace;
5746
5747 if (off > 0) {
5748 pktlen -= off;
5749 m_final->m_len -= off;
5750 m_final->m_data += off;
5751 }
5752
5753 /*
5754 * The caller must have handled the contents pointed to by this
5755 * pointer before coming here, as otherwise it will point to
5756 * the original mbuf, which will be freed upon success.
5757 */
5758 VERIFY(m0->m_pkthdr.header == NULL);
5759
5760 if (m_dup_pkthdr(m_final, m0, how) == 0)
5761 goto nospace;
5762
5763 m_new = m_final;
5764
5765 while (progress < pktlen) {
5766 length = pktlen - progress;
5767 if (length > MCLBYTES)
5768 length = MCLBYTES;
5769
5770 if (m_new == NULL) {
5771 if (length > MLEN)
5772 m_new = m_getcl(how, MT_DATA, 0);
5773 else
5774 m_new = m_get(how, MT_DATA);
5775 if (m_new == NULL)
5776 goto nospace;
5777 }
5778
5779 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5780 progress += length;
5781 m_new->m_len = length;
5782 if (m_new != m_final)
5783 m_cat(m_final, m_new);
5784 m_new = NULL;
5785 }
5786 m_freem(m0);
5787 m0 = m_final;
5788 return (m0);
5789 nospace:
5790 if (m_final)
5791 m_freem(m_final);
5792 return (NULL);
5793 }
5794
5795 struct mbuf *
5796 m_defrag(struct mbuf *m0, int how)
5797 {
5798 return (m_defrag_offset(m0, 0, how));
5799 }
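
/*
 * Usage sketch (illustrative only): collapse a long chain into as few
 * mbufs/clusters as possible, e.g. for hardware with a small
 * scatter-gather limit:
 *
 *	struct mbuf *n = m_defrag(m0, M_DONTWAIT);
 *	if (n != NULL)
 *		m0 = n;
 *
 * On success the original chain has already been freed; on failure
 * (NULL) the original chain is left untouched and is still owned by
 * the caller.
 */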
5800
5801 void
5802 m_mchtype(struct mbuf *m, int t)
5803 {
5804 mtype_stat_inc(t);
5805 mtype_stat_dec(m->m_type);
5806 (m)->m_type = t;
5807 }
5808
5809 void *
5810 m_mtod(struct mbuf *m)
5811 {
5812 return (MTOD(m, void *));
5813 }
5814
5815 struct mbuf *
5816 m_dtom(void *x)
5817 {
5818 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5819 }
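
/*
 * Illustrative note: m_dtom() relies on mbufs being allocated on
 * MSIZE-aligned boundaries, so masking off the low bits of a pointer
 * into the mbuf's internal storage recovers the mbuf header itself.
 * It is only meaningful for data stored inside the mbuf; a pointer
 * into an external cluster does not map back to its owning mbuf.
 */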
5820
5821 void
5822 m_mcheck(struct mbuf *m)
5823 {
5824 _MCHECK(m);
5825 }
5826
5827 /*
5828 * Return a pointer to mbuf/offset of location in mbuf chain.
5829 */
5830 struct mbuf *
5831 m_getptr(struct mbuf *m, int loc, int *off)
5832 {
5833
5834 while (loc >= 0) {
5835 /* Normal end of search. */
5836 if (m->m_len > loc) {
5837 *off = loc;
5838 return (m);
5839 } else {
5840 loc -= m->m_len;
5841 if (m->m_next == NULL) {
5842 if (loc == 0) {
5843 /* Point at the end of valid data. */
5844 *off = m->m_len;
5845 return (m);
5846 }
5847 return (NULL);
5848 }
5849 m = m->m_next;
5850 }
5851 }
5852 return (NULL);
5853 }
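
/*
 * Usage sketch (illustrative only): locate the byte at offset 100 of a
 * chain, e.g. to patch a field in place:
 *
 *	int off;
 *	struct mbuf *n = m_getptr(m0, 100, &off);
 *	if (n != NULL && off < n->m_len)
 *		*(mtod(n, u_int8_t *) + off) = 0;
 *
 * The extra "off < n->m_len" check matters because m_getptr() may
 * return the end-of-data position when the offset equals the chain
 * length.
 */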
5854
5855 /*
5856 * Inform the corresponding mcache(s) that there's a waiter below.
5857 */
5858 static void
5859 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5860 {
5861 mcache_waiter_inc(m_cache(class));
5862 if (comp) {
5863 if (class == MC_CL) {
5864 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5865 } else if (class == MC_BIGCL) {
5866 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5867 } else if (class == MC_16KCL) {
5868 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5869 } else {
5870 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5871 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5872 }
5873 }
5874 }
5875
5876 /*
5877 * Inform the corresponding mcache(s) that there's no more waiter below.
5878 */
5879 static void
5880 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5881 {
5882 mcache_waiter_dec(m_cache(class));
5883 if (comp) {
5884 if (class == MC_CL) {
5885 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5886 } else if (class == MC_BIGCL) {
5887 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5888 } else if (class == MC_16KCL) {
5889 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5890 } else {
5891 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5892 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5893 }
5894 }
5895 }
5896
5897 /*
5898 * Called during slab (blocking and non-blocking) allocation. If there
5899 * is at least one waiter, and the time since the first waiter was blocked
5900 * is greater than the watchdog timeout, panic the system.
5901 */
5902 static void
5903 mbuf_watchdog(void)
5904 {
5905 struct timeval now;
5906 unsigned int since;
5907
5908 if (mb_waiters == 0 || !mb_watchdog)
5909 return;
5910
5911 microuptime(&now);
5912 since = now.tv_sec - mb_wdtstart.tv_sec;
5913 if (since >= MB_WDT_MAXTIME) {
5914 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5915 mb_waiters, since, mbuf_dump());
5916 /* NOTREACHED */
5917 }
5918 }
5919
5920 /*
5921 * Called during blocking allocation. Returns TRUE if one or more objects
5922 * are available at the per-CPU cache layer and that allocation should be
5923 * retried at that level.
5924 */
5925 static boolean_t
5926 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5927 {
5928 boolean_t mcache_retry = FALSE;
5929
5930 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5931
5932 /* Check if there's anything at the cache layer */
5933 if (mbuf_cached_above(class, wait)) {
5934 mcache_retry = TRUE;
5935 goto done;
5936 }
5937
5938 /* Nothing? Then try hard to get it from somewhere */
5939 m_reclaim(class, num, (wait & MCR_COMP));
5940
5941 /* We tried hard and got something? */
5942 if (m_infree(class) > 0) {
5943 mbstat.m_wait++;
5944 goto done;
5945 } else if (mbuf_cached_above(class, wait)) {
5946 mbstat.m_wait++;
5947 mcache_retry = TRUE;
5948 goto done;
5949 } else if (wait & MCR_TRYHARD) {
5950 mcache_retry = TRUE;
5951 goto done;
5952 }
5953
5954 /*
5955 * There's really nothing for us right now; inform the
5956 * cache(s) that there is a waiter below and go to sleep.
5957 */
5958 mbuf_waiter_inc(class, (wait & MCR_COMP));
5959
5960 VERIFY(!(wait & MCR_NOSLEEP));
5961
5962 /*
5963 * If this is the first waiter, arm the watchdog timer. Otherwise
5964 * check if we need to panic the system due to watchdog timeout.
5965 */
5966 if (mb_waiters == 0)
5967 microuptime(&mb_wdtstart);
5968 else
5969 mbuf_watchdog();
5970
5971 mb_waiters++;
5972 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5973
5974 /* We are now up; stop getting notified until next round */
5975 mbuf_waiter_dec(class, (wait & MCR_COMP));
5976
5977 /* We waited and got something */
5978 if (m_infree(class) > 0) {
5979 mbstat.m_wait++;
5980 goto done;
5981 } else if (mbuf_cached_above(class, wait)) {
5982 mbstat.m_wait++;
5983 mcache_retry = TRUE;
5984 }
5985 done:
5986 return (mcache_retry);
5987 }
5988
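/*
 * Worker thread that grows the mbuf and cluster freelists on demand.
 * It sleeps on mbuf_worker_run and, when woken, populates the 2 KB,
 * 4 KB and 16 KB cluster pools according to the pending
 * mbuf_expand_mcl/mbuf_expand_big/mbuf_expand_16k requests, keeping
 * the mbuf count at least on par with the cluster count.
 */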
5989 static void
5990 mbuf_worker_thread(void)
5991 {
5992 int mbuf_expand;
5993
5994 while (1) {
5995 lck_mtx_lock(mbuf_mlock);
5996
5997 mbuf_expand = 0;
5998 if (mbuf_expand_mcl) {
5999 int n;
6000
6001 /* Adjust to the current number of 2 KB clusters in use */
6002 n = mbuf_expand_mcl -
6003 (m_total(MC_CL) - m_infree(MC_CL));
6004 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6005 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6006 mbuf_expand_mcl = 0;
6007
6008 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6009 mbuf_expand++;
6010 }
6011 if (mbuf_expand_big) {
6012 int n;
6013
6014 /* Adjust to the current number of 4 KB clusters in use */
6015 n = mbuf_expand_big -
6016 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6017 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6018 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6019 mbuf_expand_big = 0;
6020
6021 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6022 mbuf_expand++;
6023 }
6024 if (mbuf_expand_16k) {
6025 int n;
6026
6027 /* Adjust to the current number of 16 KB clusters in use */
6028 n = mbuf_expand_16k -
6029 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6030 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6031 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6032 mbuf_expand_16k = 0;
6033
6034 if (n > 0)
6035 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6036 }
6037
6038 /*
6039 * Because we can run out of memory before filling the mbuf
6040 * map, we should not allocate more clusters than there are
6041 * mbufs -- otherwise we could have a large number of useless
6042 * clusters allocated.
6043 */
6044 if (mbuf_expand) {
6045 while (m_total(MC_MBUF) <
6046 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6047 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6048 break;
6049 }
6050 }
6051
6052 lck_mtx_unlock(mbuf_mlock);
6053
6054 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6055 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6056 }
6057 }
6058
6059 static void
6060 mbuf_worker_thread_init(void)
6061 {
6062 mbuf_worker_ready++;
6063 mbuf_worker_thread();
6064 }
6065
6066 static mcl_slab_t *
6067 slab_get(void *buf)
6068 {
6069 mcl_slabg_t *slg;
6070 unsigned int ix, k;
6071
6072 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6073
6074 VERIFY(MBUF_IN_MAP(buf));
6075 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6076 VERIFY(ix < maxslabgrp);
6077
6078 if ((slg = slabstbl[ix]) == NULL) {
6079 /*
6080 * In the current implementation, we never shrink the memory
6081 * pool (hence the cluster map); if we attempt to reallocate
6082 * a cluster group when it's already allocated, panic since
6083 * this is a sign of memory corruption (slabstbl[ix] got
6084 * nullified). This also means that there shouldn't be any
6085 * hole in the kernel sub-map for the mbuf pool.
6086 */
6087 ++slabgrp;
6088 VERIFY(ix < slabgrp);
6089 /*
6090 * Slab expansion can only be done single-threaded; when
6091 * we get here, it must be as a result of m_clalloc() which
6092 * is serialized and therefore mb_clalloc_busy must be set.
6093 */
6094 VERIFY(mb_clalloc_busy);
6095 lck_mtx_unlock(mbuf_mlock);
6096
6097 /* This is a new buffer; create the slab group for it */
6098 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6099 M_WAITOK | M_ZERO);
6100 VERIFY(slg != NULL);
6101
6102 lck_mtx_lock(mbuf_mlock);
6103 /*
6104 * No other thread could have gone into m_clalloc() after
6105 * we dropped the lock above, so verify that it's true.
6106 */
6107 VERIFY(mb_clalloc_busy);
6108
6109 slabstbl[ix] = slg;
6110
6111 /* Chain each slab in the group to its forward neighbor */
6112 for (k = 1; k < NSLABSPMB; k++)
6113 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6114 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6115
6116 /* And chain the last slab in the previous group to this */
6117 if (ix > 0) {
6118 VERIFY(slabstbl[ix - 1]->
6119 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6120 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6121 &slg->slg_slab[0];
6122 }
6123 }
6124
6125 ix = MTOBG(buf) % NSLABSPMB;
6126 VERIFY(ix < NSLABSPMB);
6127
6128 return (&slg->slg_slab[ix]);
6129 }
6130
6131 static void
6132 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6133 void *base, void *head, unsigned int len, int refcnt, int chunks)
6134 {
6135 sp->sl_class = class;
6136 sp->sl_flags = flags;
6137 sp->sl_base = base;
6138 sp->sl_head = head;
6139 sp->sl_len = len;
6140 sp->sl_refcnt = refcnt;
6141 sp->sl_chunks = chunks;
6142 slab_detach(sp);
6143 }
6144
6145 static void
6146 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6147 {
6148 VERIFY(slab_is_detached(sp));
6149 m_slab_cnt(class)++;
6150 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6151 sp->sl_flags &= ~SLF_DETACHED;
6152 if (class == MC_16KCL) {
6153 int k;
6154 for (k = 1; k < NSLABSP16KB; k++) {
6155 sp = sp->sl_next;
6156 /* Next slab must already be present */
6157 VERIFY(sp != NULL);
6158 VERIFY(slab_is_detached(sp));
6159 sp->sl_flags &= ~SLF_DETACHED;
6160 }
6161 }
6162 }
6163
6164 static void
6165 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6166 {
6167 VERIFY(!slab_is_detached(sp));
6168 VERIFY(m_slab_cnt(class) > 0);
6169 m_slab_cnt(class)--;
6170 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6171 slab_detach(sp);
6172 if (class == MC_16KCL) {
6173 int k;
6174 for (k = 1; k < NSLABSP16KB; k++) {
6175 sp = sp->sl_next;
6176 /* Next slab must already be present */
6177 VERIFY(sp != NULL);
6178 VERIFY(!slab_is_detached(sp));
6179 slab_detach(sp);
6180 }
6181 }
6182 }
6183
6184 static boolean_t
6185 slab_inrange(mcl_slab_t *sp, void *buf)
6186 {
6187 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6188 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6189 }
6190
6191 #undef panic
6192
6193 static void
6194 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6195 {
6196 int i;
6197 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6198 uintptr_t buf = (uintptr_t)sp->sl_base;
6199
6200 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6201 void *next = ((mcache_obj_t *)buf)->obj_next;
6202 if (next != addr)
6203 continue;
6204 if (!mclverify) {
6205 if (next != NULL && !MBUF_IN_MAP(next)) {
6206 mcache_t *cp = m_cache(sp->sl_class);
6207 panic("%s: %s buffer %p in slab %p modified "
6208 "after free at offset 0: %p out of range "
6209 "[%p-%p)\n", __func__, cp->mc_name,
6210 (void *)buf, sp, next, mbutl, embutl);
6211 /* NOTREACHED */
6212 }
6213 } else {
6214 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6215 (mcache_obj_t *)buf);
6216 mcl_audit_verify_nextptr(next, mca);
6217 }
6218 }
6219 }
6220
6221 static void
6222 slab_detach(mcl_slab_t *sp)
6223 {
6224 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6225 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6226 sp->sl_flags |= SLF_DETACHED;
6227 }
6228
6229 static boolean_t
6230 slab_is_detached(mcl_slab_t *sp)
6231 {
6232 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6233 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6234 (sp->sl_flags & SLF_DETACHED));
6235 }
6236
6237 static void
6238 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6239 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6240 {
6241 mcache_audit_t *mca, *mca_tail;
6242 mcache_obj_t *con = NULL;
6243 boolean_t save_contents = (con_list != NULL);
6244 unsigned int i, ix;
6245
6246 ASSERT(num <= NMBPBG);
6247 ASSERT(con_list == NULL || con_size != 0);
6248
6249 ix = MTOBG(buf);
6250 VERIFY(ix < maxclaudit);
6251
6252 /* Make sure we haven't been here before */
6253 for (i = 0; i < NMBPBG; i++)
6254 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6255
6256 mca = mca_tail = *mca_list;
6257 if (save_contents)
6258 con = *con_list;
6259
6260 for (i = 0; i < num; i++) {
6261 mcache_audit_t *next;
6262
6263 next = mca->mca_next;
6264 bzero(mca, sizeof (*mca));
6265 mca->mca_next = next;
6266 mclaudit[ix].cl_audit[i] = mca;
6267
6268 /* Attach the contents buffer if requested */
6269 if (save_contents) {
6270 VERIFY(con != NULL);
6271 mca->mca_contents_size = con_size;
6272 mca->mca_contents = con;
6273 con = con->obj_next;
6274 bzero(mca->mca_contents, mca->mca_contents_size);
6275 }
6276
6277 mca_tail = mca;
6278 mca = mca->mca_next;
6279 }
6280
6281 if (save_contents)
6282 *con_list = con;
6283
6284 *mca_list = mca_tail->mca_next;
6285 mca_tail->mca_next = NULL;
6286 }
6287
6288 /*
6289 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6290 * the corresponding audit structure for that buffer.
6291 */
6292 static mcache_audit_t *
6293 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6294 {
6295 mcache_audit_t *mca = NULL;
6296 int ix = MTOBG(o);
6297
6298 VERIFY(ix < maxclaudit);
6299 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6300
6301 switch (class) {
6302 case MC_MBUF:
6303 /*
6304 * For the mbuf case, find the index of the page
6305 * used by the mbuf and use that index to locate the
6306 * base address of the page. Then find out the
6307 * mbuf index relative to the page base and use
6308 * it to locate the audit structure.
6309 */
6310 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6311 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6312 break;
6313
6314 case MC_CL:
6315 /*
6316 * Same thing as above, but for 2KB clusters in a page.
6317 */
6318 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6319 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6320 break;
6321
6322 case MC_BIGCL:
6323 case MC_16KCL:
6324 /*
6325 * Same as above, but only return the first element.
6326 */
6327 mca = mclaudit[ix].cl_audit[0];
6328 break;
6329
6330 default:
6331 VERIFY(0);
6332 /* NOTREACHED */
6333 }
6334
6335 return (mca);
6336 }
6337
6338 static void
6339 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6340 boolean_t alloc)
6341 {
6342 struct mbuf *m = addr;
6343 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6344
6345 VERIFY(mca->mca_contents != NULL &&
6346 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6347
6348 if (mclverify)
6349 mcl_audit_verify_nextptr(next, mca);
6350
6351 if (!alloc) {
6352 /* Save constructed mbuf fields */
6353 mcl_audit_save_mbuf(m, mca);
6354 if (mclverify) {
6355 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6356 m_maxsize(MC_MBUF));
6357 }
6358 ((mcache_obj_t *)m)->obj_next = next;
6359 return;
6360 }
6361
6362 /* Check if the buffer has been corrupted while in freelist */
6363 if (mclverify) {
6364 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6365 }
6366 /* Restore constructed mbuf fields */
6367 mcl_audit_restore_mbuf(m, mca, composite);
6368 }
6369
6370 static void
6371 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6372 {
6373 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6374
6375 if (composite) {
6376 struct mbuf *next = m->m_next;
6377 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6378 MBUF_IS_COMPOSITE(ms));
6379 /*
6380 * We could have hand-picked the mbuf fields and restored
6381 * them individually, but that would be a maintenance
6382 * headache. Instead, restore everything that was saved;
6383 * the mbuf layer will recheck and reinitialize anyway.
6384 */
6385 bcopy(ms, m, mca->mca_contents_size);
6386 m->m_next = next;
6387 } else {
6388 /*
6389 * For a regular mbuf (no cluster attached) there's nothing
6390 * to restore other than the type field, which is expected
6391 * to be MT_FREE.
6392 */
6393 m->m_type = ms->m_type;
6394 }
6395 _MCHECK(m);
6396 }
6397
6398 static void
6399 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6400 {
6401 _MCHECK(m);
6402 bcopy(m, mca->mca_contents, mca->mca_contents_size);
6403 }
6404
6405 static void
6406 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6407 boolean_t save_next)
6408 {
6409 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6410
6411 if (!alloc) {
6412 if (mclverify) {
6413 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6414 }
6415 if (save_next) {
6416 mcl_audit_verify_nextptr(next, mca);
6417 ((mcache_obj_t *)addr)->obj_next = next;
6418 }
6419 } else if (mclverify) {
6420 /* Check if the buffer has been corrupted while in freelist */
6421 mcl_audit_verify_nextptr(next, mca);
6422 mcache_audit_free_verify_set(mca, addr, 0, size);
6423 }
6424 }
6425
6426 static void
6427 mcl_audit_mcheck_panic(struct mbuf *m)
6428 {
6429 mcache_audit_t *mca;
6430
6431 MRANGE(m);
6432 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6433
6434 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6435 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6436 /* NOTREACHED */
6437 }
6438
6439 static void
6440 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6441 {
6442 if (next != NULL && !MBUF_IN_MAP(next) &&
6443 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6444 panic("mcl_audit: buffer %p modified after free at offset 0: "
6445 "%p out of range [%p-%p)\n%s\n",
6446 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6447 /* NOTREACHED */
6448 }
6449 }
6450
6451 /* This function turns on mbuf leak detection */
6452 static void
6453 mleak_activate(void)
6454 {
6455 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6456 PE_parse_boot_argn("mleak_sample_factor",
6457 &mleak_table.mleak_sample_factor,
6458 sizeof (mleak_table.mleak_sample_factor));
6459
6460 if (mleak_table.mleak_sample_factor == 0)
6461 mclfindleak = 0;
6462
6463 if (mclfindleak == 0)
6464 return;
6465
6466 vm_size_t alloc_size =
6467 mleak_alloc_buckets * sizeof (struct mallocation);
6468 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6469
6470 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6471 M_TEMP, M_WAITOK | M_ZERO);
6472 VERIFY(mleak_allocations != NULL);
6473
6474 MALLOC(mleak_traces, struct mtrace *, trace_size,
6475 M_TEMP, M_WAITOK | M_ZERO);
6476 VERIFY(mleak_traces != NULL);
6477
6478 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6479 M_TEMP, M_WAITOK | M_ZERO);
6480 VERIFY(mleak_stat != NULL);
6481 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6482 #ifdef __LP64__
6483 mleak_stat->ml_isaddr64 = 1;
6484 #endif /* __LP64__ */
6485 }
6486
6487 static void
6488 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6489 {
6490 int temp;
6491
6492 if (mclfindleak == 0)
6493 return;
6494
6495 if (!alloc)
6496 return (mleak_free(addr));
6497
6498 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6499
6500 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6501 uintptr_t bt[MLEAK_STACK_DEPTH];
6502 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6503 mleak_log(bt, addr, logged, num);
6504 }
6505 }
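
/*
 * Illustrative note: only one in every mleak_table.mleak_sample_factor
 * allocations is backtraced and handed to mleak_log(), which keeps the
 * overhead of leak detection low.  For example, with a (hypothetical)
 * factor of 500, roughly 0.2% of allocations end up in the tables; the
 * factor can be tuned with the "mleak_sample_factor" boot-arg parsed in
 * mleak_activate() above.
 */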
6506
6507 /*
6508 * This function records the allocation in the mleak_allocations table
6509 * and the backtrace in the mleak_traces table.  If the allocation slot is
6510 * in use, the old record is replaced; if the trace slot holds a different
6511 * trace, the function bails out (the refcount is bumped if it is the same).
6512 */
6513 static boolean_t
6514 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6515 {
6516 struct mallocation *allocation;
6517 struct mtrace *trace;
6518 uint32_t trace_index;
6519
6520 /* Quit if someone else is modifying the tables */
6521 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6522 mleak_table.total_conflicts++;
6523 return (FALSE);
6524 }
6525
6526 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6527 mleak_alloc_buckets)];
6528 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6529 trace = &mleak_traces[trace_index];
6530
6531 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6532 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6533
6534 allocation->hitcount++;
6535 trace->hitcount++;
6536
6537 /*
6538 * If the allocation bucket we want is occupied
6539 * and the occupier has the same trace, just bail.
6540 */
6541 if (allocation->element != NULL &&
6542 trace_index == allocation->trace_index) {
6543 mleak_table.alloc_collisions++;
6544 lck_mtx_unlock(mleak_lock);
6545 return (TRUE);
6546 }
6547
6548 /*
6549 * Store the backtrace in the traces array;
6550 * an allocs count of zero means the trace bucket is free.
6551 */
6552 if (trace->allocs > 0 &&
6553 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6554 /* Different, unique trace, but the same hash! Bail out. */
6555 trace->collisions++;
6556 mleak_table.trace_collisions++;
6557 lck_mtx_unlock(mleak_lock);
6558 return (TRUE);
6559 } else if (trace->allocs > 0) {
6560 /* Same trace, already added, so increment refcount */
6561 trace->allocs++;
6562 } else {
6563 /* Found an unused trace bucket, so record the trace here */
6564 if (trace->depth != 0) {
6565 /* this slot was previously used but is not currently in use */
6566 mleak_table.trace_overwrites++;
6567 }
6568 mleak_table.trace_recorded++;
6569 trace->allocs = 1;
6570 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6571 trace->depth = depth;
6572 trace->collisions = 0;
6573 }
6574
6575 /* Step 2: Store the allocation record in the allocations array */
6576 if (allocation->element != NULL) {
6577 /*
6578 * Replace an existing allocation. No need to preserve it
6579 * because only a subset of the allocations are being
6580 * recorded anyway.
6581 */
6582 mleak_table.alloc_collisions++;
6583 } else if (allocation->trace_index != 0) {
6584 mleak_table.alloc_overwrites++;
6585 }
6586 allocation->element = addr;
6587 allocation->trace_index = trace_index;
6588 allocation->count = num;
6589 mleak_table.alloc_recorded++;
6590 mleak_table.outstanding_allocs++;
6591
6592 lck_mtx_unlock(mleak_lock);
6593 return (TRUE);
6594 }
6595
6596 static void
6597 mleak_free(mcache_obj_t *addr)
6598 {
6599 while (addr != NULL) {
6600 struct mallocation *allocation = &mleak_allocations
6601 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6602
6603 if (allocation->element == addr &&
6604 allocation->trace_index < mleak_trace_buckets) {
6605 lck_mtx_lock_spin(mleak_lock);
6606 if (allocation->element == addr &&
6607 allocation->trace_index < mleak_trace_buckets) {
6608 struct mtrace *trace;
6609 trace = &mleak_traces[allocation->trace_index];
6610 /* allocs = 0 means trace bucket is unused */
6611 if (trace->allocs > 0)
6612 trace->allocs--;
6613 if (trace->allocs == 0)
6614 trace->depth = 0;
6615 /* NULL element means alloc bucket is unused */
6616 allocation->element = NULL;
6617 mleak_table.outstanding_allocs--;
6618 }
6619 lck_mtx_unlock(mleak_lock);
6620 }
6621 addr = addr->obj_next;
6622 }
6623 }
6624
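/*
 * Keep the traces with the most outstanding allocations at the front of
 * mleak_top_trace[] (at most MLEAK_NUM_TRACES entries, in descending
 * order of allocs), using a simple insertion sort over the trace table.
 */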
6625 static void
6626 mleak_sort_traces(void)
6627 {
6628 int i, j, k;
6629 struct mtrace *swap;
6630
6631 for (i = 0; i < MLEAK_NUM_TRACES; i++)
6632 mleak_top_trace[i] = NULL;
6633
6634 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6635 {
6636 if (mleak_traces[i].allocs <= 0)
6637 continue;
6638
6639 mleak_top_trace[j] = &mleak_traces[i];
6640 for (k = j; k > 0; k--) {
6641 if (mleak_top_trace[k]->allocs <=
6642 mleak_top_trace[k-1]->allocs)
6643 break;
6644
6645 swap = mleak_top_trace[k-1];
6646 mleak_top_trace[k-1] = mleak_top_trace[k];
6647 mleak_top_trace[k] = swap;
6648 }
6649 j++;
6650 }
6651
6652 j--;
6653 for (; i < mleak_trace_buckets; i++) {
6654 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6655 continue;
6656
6657 mleak_top_trace[j] = &mleak_traces[i];
6658
6659 for (k = j; k > 0; k--) {
6660 if (mleak_top_trace[k]->allocs <=
6661 mleak_top_trace[k-1]->allocs)
6662 break;
6663
6664 swap = mleak_top_trace[k-1];
6665 mleak_top_trace[k-1] = mleak_top_trace[k];
6666 mleak_top_trace[k] = swap;
6667 }
6668 }
6669 }
6670
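/*
 * Refresh mleak_stat (the exported statistics snapshot) from the current
 * top traces, for consumers such as mbuf_dump() below.
 */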
6671 static void
6672 mleak_update_stats(void)
6673 {
6674 mleak_trace_stat_t *mltr;
6675 int i;
6676
6677 VERIFY(mleak_stat != NULL);
6678 #ifdef __LP64__
6679 VERIFY(mleak_stat->ml_isaddr64);
6680 #else
6681 VERIFY(!mleak_stat->ml_isaddr64);
6682 #endif /* !__LP64__ */
6683 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6684
6685 mleak_sort_traces();
6686
6687 mltr = &mleak_stat->ml_trace[0];
6688 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6689 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6690 int j;
6691
6692 if (mleak_top_trace[i] == NULL ||
6693 mleak_top_trace[i]->allocs == 0)
6694 continue;
6695
6696 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6697 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6698 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6699 mltr->mltr_depth = mleak_top_trace[i]->depth;
6700
6701 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6702 for (j = 0; j < mltr->mltr_depth; j++)
6703 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6704
6705 mltr++;
6706 }
6707 }
6708
6709 static struct mbtypes {
6710 int mt_type;
6711 const char *mt_name;
6712 } mbtypes[] = {
6713 { MT_DATA, "data" },
6714 { MT_OOBDATA, "oob data" },
6715 { MT_CONTROL, "ancillary data" },
6716 { MT_HEADER, "packet headers" },
6717 { MT_SOCKET, "socket structures" },
6718 { MT_PCB, "protocol control blocks" },
6719 { MT_RTABLE, "routing table entries" },
6720 { MT_HTABLE, "IMP host table entries" },
6721 { MT_ATABLE, "address resolution tables" },
6722 { MT_FTABLE, "fragment reassembly queue headers" },
6723 { MT_SONAME, "socket names and addresses" },
6724 { MT_SOOPTS, "socket options" },
6725 { MT_RIGHTS, "access rights" },
6726 { MT_IFADDR, "interface addresses" },
6727 { MT_TAG, "packet tags" },
6728 { 0, NULL }
6729 };
6730
6731 #define MBUF_DUMP_BUF_CHK() { \
6732 clen -= k; \
6733 if (clen < 1) \
6734 goto done; \
6735 c += k; \
6736 }
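
/*
 * Illustrative note: each snprintf() in mbuf_dump() below is followed by
 * MBUF_DUMP_BUF_CHK(), which consumes "k" bytes of the remaining space
 * "clen", bails out to the "done" label once the buffer is effectively
 * full, and otherwise advances the output cursor "c".
 */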
6737
6738 static char *
6739 mbuf_dump(void)
6740 {
6741 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6742 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6743 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6744 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6745 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6746 uint8_t seen[256];
6747 struct mbtypes *mp;
6748 mb_class_stat_t *sp;
6749 mleak_trace_stat_t *mltr;
6750 char *c = mbuf_dump_buf;
6751 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6752
6753 mbuf_dump_buf[0] = '\0';
6754
6755 /* synchronize all statistics in the mbuf table */
6756 mbuf_stat_sync();
6757 mbuf_mtypes_sync(TRUE);
6758
6759 sp = &mb_stat->mbs_class[0];
6760 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6761 u_int32_t mem;
6762
6763 if (m_class(i) == MC_MBUF) {
6764 m_mbufs = sp->mbcl_active;
6765 } else if (m_class(i) == MC_CL) {
6766 m_clfree = sp->mbcl_total - sp->mbcl_active;
6767 } else if (m_class(i) == MC_BIGCL) {
6768 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6769 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6770 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6771 m_16kclusters = sp->mbcl_total;
6772 } else if (m_class(i) == MC_MBUF_CL) {
6773 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6774 } else if (m_class(i) == MC_MBUF_BIGCL) {
6775 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6776 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6777 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6778 }
6779
6780 mem = sp->mbcl_ctotal * sp->mbcl_size;
6781 totmem += mem;
6782 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6783 sp->mbcl_size;
6784
6785 }
6786
6787 /* adjust free counts to include composite caches */
6788 m_clfree += m_mbufclfree;
6789 m_bigclfree += m_mbufbigclfree;
6790 m_16kclfree += m_mbuf16kclfree;
6791
6792 totmbufs = 0;
6793 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6794 totmbufs += mbstat.m_mtypes[mp->mt_type];
6795 if (totmbufs > m_mbufs)
6796 totmbufs = m_mbufs;
6797 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6798 MBUF_DUMP_BUF_CHK();
6799
6800 bzero(&seen, sizeof (seen));
6801 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6802 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6803 seen[mp->mt_type] = 1;
6804 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6805 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6806 MBUF_DUMP_BUF_CHK();
6807 }
6808 }
6809 seen[MT_FREE] = 1;
6810 for (i = 0; i < nmbtypes; i++)
6811 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6812 k = snprintf(c, clen, "\t%u mbufs allocated to "
6813 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6814 MBUF_DUMP_BUF_CHK();
6815 }
6816 if ((m_mbufs - totmbufs) > 0) {
6817 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6818 m_mbufs - totmbufs);
6819 MBUF_DUMP_BUF_CHK();
6820 }
6821 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6822 "%u/%u mbuf 4KB clusters in use\n",
6823 (unsigned int)(mbstat.m_clusters - m_clfree),
6824 (unsigned int)mbstat.m_clusters,
6825 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6826 (unsigned int)mbstat.m_bigclusters);
6827 MBUF_DUMP_BUF_CHK();
6828
6829 if (njcl > 0) {
6830 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6831 m_16kclusters - m_16kclfree, m_16kclusters,
6832 njclbytes / 1024);
6833 MBUF_DUMP_BUF_CHK();
6834 }
6835 totused = totmem - totfree;
6836 if (totmem == 0) {
6837 totpct = 0;
6838 } else if (totused < (ULONG_MAX / 100)) {
6839 totpct = (totused * 100) / totmem;
6840 } else {
6841 u_long totmem1 = totmem / 100;
6842 u_long totused1 = totused / 100;
6843 totpct = (totused1 * 100) / totmem1;
6844 }
6845 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6846 "in use)\n", totmem / 1024, totpct);
6847 MBUF_DUMP_BUF_CHK();
6848
6849 /* mbuf leak detection statistics */
6850 mleak_update_stats();
6851
6852 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
6853 MBUF_DUMP_BUF_CHK();
6854 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
6855 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
6856 mleak_table.mleak_sample_factor);
6857 MBUF_DUMP_BUF_CHK();
6858 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
6859 mleak_table.outstanding_allocs);
6860 MBUF_DUMP_BUF_CHK();
6861 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
6862 mleak_table.alloc_recorded, mleak_table.trace_recorded);
6863 MBUF_DUMP_BUF_CHK();
6864 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
6865 mleak_table.alloc_collisions, mleak_table.trace_collisions);
6866 MBUF_DUMP_BUF_CHK();
6867 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
6868 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
6869 MBUF_DUMP_BUF_CHK();
6870 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
6871 mleak_table.total_conflicts);
6872 MBUF_DUMP_BUF_CHK();
6873
6874 k = snprintf(c, clen, "top %d outstanding traces:\n",
6875 mleak_stat->ml_cnt);
6876 MBUF_DUMP_BUF_CHK();
6877 for (i = 0; i < mleak_stat->ml_cnt; i++) {
6878 mltr = &mleak_stat->ml_trace[i];
6879 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
6880 "%llu hit(s), %llu collision(s)\n", (i + 1),
6881 mltr->mltr_allocs, mltr->mltr_hitcount,
6882 mltr->mltr_collisions);
6883 MBUF_DUMP_BUF_CHK();
6884 }
6885
6886 if (mleak_stat->ml_isaddr64)
6887 k = snprintf(c, clen, MB_LEAK_HDR_64);
6888 else
6889 k = snprintf(c, clen, MB_LEAK_HDR_32);
6890 MBUF_DUMP_BUF_CHK();
6891
6892 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
6893 int j;
6894 k = snprintf(c, clen, "%2d: ", (i + 1));
6895 MBUF_DUMP_BUF_CHK();
6896 for (j = 0; j < mleak_stat->ml_cnt; j++) {
6897 mltr = &mleak_stat->ml_trace[j];
6898 if (i < mltr->mltr_depth) {
6899 if (mleak_stat->ml_isaddr64) {
6900 k = snprintf(c, clen, "0x%0llx ",
6901 mltr->mltr_addr[i]);
6902 } else {
6903 k = snprintf(c, clen,
6904 "0x%08x ",
6905 (u_int32_t)mltr->mltr_addr[i]);
6906 }
6907 } else {
6908 if (mleak_stat->ml_isaddr64)
6909 k = snprintf(c, clen,
6910 MB_LEAK_SPACING_64);
6911 else
6912 k = snprintf(c, clen,
6913 MB_LEAK_SPACING_32);
6914 }
6915 MBUF_DUMP_BUF_CHK();
6916 }
6917 k = snprintf(c, clen, "\n");
6918 MBUF_DUMP_BUF_CHK();
6919 }
6920 done:
6921 return (mbuf_dump_buf);
6922 }
6923
6924 #undef MBUF_DUMP_BUF_CHK
6925
6926 SYSCTL_DECL(_kern_ipc);
6927 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6928 CTLFLAG_RD | CTLFLAG_LOCKED,
6929 0, 0, mbstat_sysctl, "S,mbstat", "");
6930 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6931 CTLFLAG_RD | CTLFLAG_LOCKED,
6932 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6933 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6934 CTLFLAG_RD | CTLFLAG_LOCKED,
6935 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6936 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6937 CTLFLAG_RD | CTLFLAG_LOCKED,
6938 0, 0, mleak_table_sysctl, "S,mleak_table", "");
6939 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6940 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6941 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6942 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6943 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6944 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");