[apple/xnu.git] / bsd / kern / uipc_mbuf.c
1 /*
2 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <kern/kern_types.h>
83 #include <kern/simple_lock.h>
84 #include <kern/queue.h>
85 #include <kern/sched_prim.h>
86 #include <kern/cpu_number.h>
87 #include <kern/zalloc.h>
88
89 #include <libkern/OSAtomic.h>
90 #include <libkern/libkern.h>
91
92 #include <IOKit/IOMapper.h>
93
94 #include <machine/limits.h>
95 #include <machine/machine_routines.h>
96
97 #if CONFIG_MACF_NET
98 #include <security/mac_framework.h>
99 #endif /* CONFIG_MACF_NET */
100
101 #include <sys/mcache.h>
102
103 /*
104 * MBUF IMPLEMENTATION NOTES.
105 *
106 * There is a total of 5 per-CPU caches:
107 *
108 * MC_MBUF:
109 * This is a cache of rudimentary objects of MSIZE in size; each
110 * object represents an mbuf structure. This cache preserves only
111 * the m_type field of the mbuf during its transactions.
112 *
113 * MC_CL:
114 * This is a cache of rudimentary objects of MCLBYTES in size; each
115 * object represents a mcluster structure. This cache does not
116 * preserve the contents of the objects during its transactions.
117 *
118 * MC_BIGCL:
119 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
120 * object represents a mbigcluster structure. This cache does not
121 * preserve the contents of the objects during its transaction.
122 *
123 * MC_MBUF_CL:
124 * This is a cache of mbufs each having a cluster attached to it.
125 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
126 * fields of the mbuf related to the external cluster are preserved
127 * during transactions.
128 *
129 * MC_MBUF_BIGCL:
130 * This is a cache of mbufs each having a big cluster attached to it.
131 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
132 * fields of the mbuf related to the external cluster are preserved
133 * during transactions.
134 *
135 * OBJECT ALLOCATION:
136 *
137 * Allocation requests are handled first at the per-CPU (mcache) layer
138 * before falling back to the slab layer. Performance is optimal when
139 * the request is satisfied at the CPU layer because global data/lock
140 * never gets accessed. When the slab layer is entered for allocation,
141 * the slab freelist will be checked first for available objects before
142 * the VM backing store is invoked. Slab layer operations are serialized
143 * for all of the caches as the mbuf global lock is held most of the time.
144 * Allocation paths are different depending on the class of objects:
145 *
146 * a. Rudimentary object:
147 *
148 * { m_get_common(), m_clattach(), m_mclget(),
149 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
150 * composite object allocation }
151 * | ^
152 * | |
153 * | +-----------------------+
154 * v |
155 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
156 * | ^
157 * v |
158 * [CPU cache] -------> (found?) -------+
159 * | |
160 * v |
161 * mbuf_slab_alloc() |
162 * | |
163 * v |
164 * +---------> [freelist] -------> (found?) -------+
165 * | |
166 * | v
167 * | m_clalloc()
168 * | |
169 * | v
170 * +---<<---- kmem_mb_alloc()
171 *
172 * b. Composite object:
173 *
174 * { m_getpackets_internal(), m_allocpacket_internal() }
175 * | ^
176 * | |
177 * | +------ (done) ---------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_cslab_alloc() |
186 * | |
187 * v |
188 * [freelist] -------> (found?) -------+
189 * | |
190 * v |
191 * (rudimentary object) |
192 * mcache_alloc/mcache_alloc_ext() ------>>-----+
193 *
194 * Auditing notes: If auditing is enabled, buffers will be subjected to
195 * integrity checks by the audit routine. This is done by verifying their
196 * contents against the DEADBEEF (free) pattern before returning them to the caller.
197 * As part of this step, the routine will also record the transaction and
198 * pattern-fill the buffers with the BADDCAFE (uninitialized) pattern. It will
199 * also restore any constructed data structure fields if necessary.
200 *
201 * OBJECT DEALLOCATION:
202 *
203 * Freeing an object simply involves placing it into the CPU cache; this
204 * pollutes the cache to benefit subsequent allocations. The slab layer
205 * will only be entered if the object is to be purged out of the cache.
206 * During normal operations, this happens only when the CPU layer resizes
207 * its bucket while it's adjusting to the allocation load. Deallocation
208 * paths are different depending on the class of objects:
209 *
210 * a. Rudimentary object:
211 *
212 * { m_free(), m_freem_list(), composite object deallocation }
213 * | ^
214 * | |
215 * | +------ (done) ---------+
216 * v |
217 * mcache_free/mcache_free_ext() |
218 * | |
219 * v |
220 * mbuf_slab_audit() |
221 * | |
222 * v |
223 * [CPU cache] ---> (not purging?) -----+
224 * | |
225 * v |
226 * mbuf_slab_free() |
227 * | |
228 * v |
229 * [freelist] ----------->>------------+
230 * (objects never get purged to VM)
231 *
232 * b. Composite object:
233 *
234 * { m_free(), m_freem_list() }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_cslab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_cslab_free() |
249 * | |
250 * v |
251 * [freelist] ---> (not purging?) -----+
252 * | |
253 * v |
254 * (rudimentary object) |
255 * mcache_free/mcache_free_ext() ------->>------+
256 *
257 * Auditing notes: If auditing is enabled, the audit routine will save
258 * any constructed data structure fields (if necessary) before filling the
259 * contents of the buffers with DEADBEEF (free) pattern and recording the
260 * transaction. Buffers that are freed (whether at CPU or slab layer) are
261 * expected to contain the free pattern.
262 *
263 * DEBUGGING:
264 *
265 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
266 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
267 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
268 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
269 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
270 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
271 *
272 * Each object is associated with exactly one mcache_audit_t structure that
273 * contains the information related to its last buffer transaction. Given
274 * the address of an object, the audit structure can be retrieved by finding
275 * the position of the object relative to the base address of the cluster:
276 *
277 * +------------+ +=============+
278 * | mbuf addr | | mclaudit[i] |
279 * +------------+ +=============+
280 * | | cl_audit[0] |
281 * i = MTOBG(addr) +-------------+
282 * | +-----> | cl_audit[1] | -----> mcache_audit_t
283 * b = BGTOM(i) | +-------------+
284 * | | | ... |
285 * x = MCLIDX(b, addr) | +-------------+
286 * | | | cl_audit[7] |
287 * +-----------------+ +-------------+
288 * (e.g. x == 1)
289 *
290 * The mclaudit[] array is allocated at initialization time, but its contents
291 * get populated when the corresponding cluster is created. Because a page
292 * can be turned into NMBPBG mbufs, we preserve enough space for all of them
293 * so that there is a 1-to-1 mapping between an mbuf and its audit entry. A
294 * page that never gets (or has not yet been) turned into mbufs will use only
295 * cl_audit[0], with the remaining entries unused. For a 16KB cluster, only
296 * one entry from the first page is allocated and used for the entire object.
297 */
298
299 /* TODO: should be in header file */
300 /* kernel translator */
301 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
302 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
303 extern vm_map_t mb_map; /* special map */
304
305 /* Global lock */
306 decl_lck_mtx_data(static, mbuf_mlock_data);
307 static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
308 static lck_attr_t *mbuf_mlock_attr;
309 static lck_grp_t *mbuf_mlock_grp;
310 static lck_grp_attr_t *mbuf_mlock_grp_attr;
311
312 /* Back-end (common) layer */
313 static void *mbuf_worker_run; /* wait channel for worker thread */
314 static int mbuf_worker_ready; /* worker thread is runnable */
315 static int mbuf_expand_mcl; /* number of cluster creation requests */
316 static int mbuf_expand_big; /* number of big cluster creation requests */
317 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
318 static int ncpu; /* number of CPUs */
319 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
320 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
321 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
322 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
323 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
324 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
325 static unsigned int mb_normalized; /* number of packets "normalized" */
326
327 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
328 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
329
330 typedef enum {
331 MC_MBUF = 0, /* Regular mbuf */
332 MC_CL, /* Cluster */
333 MC_BIGCL, /* Large (4KB) cluster */
334 MC_16KCL, /* Jumbo (16KB) cluster */
335 MC_MBUF_CL, /* mbuf + cluster */
336 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
337 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
338 } mbuf_class_t;
339
340 #define MBUF_CLASS_MIN MC_MBUF
341 #define MBUF_CLASS_MAX MC_MBUF_16KCL
342 #define MBUF_CLASS_LAST MC_16KCL
343 #define MBUF_CLASS_VALID(c) \
344 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
345 #define MBUF_CLASS_COMPOSITE(c) \
346 ((int)(c) > MBUF_CLASS_LAST)
347
348
349 /*
350 * mbuf specific mcache allocation request flags.
351 */
352 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
353
354 /*
355 * Per-cluster slab structure.
356 *
357 * A slab is a cluster control structure that contains one or more object
358 * chunks; the available chunks are chained in the slab's freelist (sl_head).
359 * Each time a chunk is taken out of the slab, the slab's reference count
360 * gets incremented. When all chunks have been taken out, the empty slab
361 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
362 * returned to a slab causes the slab's reference count to be decremented;
363 * it also causes the slab to be reinserted back into the class's slab list,
364 * if it is not already there.
365 *
366 * Compartmentalizing the object chunks into slabs allows us to easily
367 * merge one or more slabs together when the adjacent slabs are idle, as
368 * well as to convert or move a slab from one class to another; e.g. the
369 * mbuf cluster slab can be converted to a regular cluster slab when all
370 * mbufs in the slab have been freed.
371 *
372 * A slab may also span across multiple clusters for chunks larger than
373 * a cluster's size. In this case, only the slab of the first cluster is
374 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
375 * that they are part of the larger slab.
376 *
377 * Each slab controls a page of memory.
378 */
379 typedef struct mcl_slab {
380 struct mcl_slab *sl_next; /* neighboring slab */
381 u_int8_t sl_class; /* controlling mbuf class */
382 int8_t sl_refcnt; /* outstanding allocations */
383 int8_t sl_chunks; /* chunks (bufs) in this slab */
384 u_int16_t sl_flags; /* slab flags (see below) */
385 u_int16_t sl_len; /* slab length */
386 void *sl_base; /* base of allocated memory */
387 void *sl_head; /* first free buffer */
388 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
389 } mcl_slab_t;
390
391 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
392 #define SLF_PARTIAL 0x0002 /* part of another slab */
393 #define SLF_DETACHED 0x0004 /* not in slab freelist */
394
395 /*
396 * The array of slabs is broken into groups of arrays per 1MB of kernel
397 * memory to reduce the footprint. Each group is allocated on demand
398 * whenever a new piece of memory mapped in from the VM crosses the 1MB
399 * boundary.
400 */
401 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
402
403 typedef struct mcl_slabg {
404 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
405 } mcl_slabg_t;
406
407 /*
408 * Number of slabs needed to control a 16KB cluster object.
409 */
410 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
411
412 /*
413 * Per-cluster audit structure.
414 */
415 typedef struct {
416 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
417 } mcl_audit_t;
418
419 /*
420 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
421 * and m_ext structures. If auditing is enabled, we allocate a shadow
422 * mbuf structure of this size inside each audit structure, and the
423 * contents of the real mbuf get copied into it when the mbuf is freed.
424 * This allows us to pattern-fill the mbuf for integrity check, and to
425 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
426 * Note that we don't save the contents of clusters when they are freed;
427 * we simply pattern-fill them.
428 */
429 #define AUDIT_CONTENTS_SIZE ((MSIZE - MHLEN) + sizeof (_m_ext_t))
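
/*
 * Illustrative note (not part of the original file), assuming the usual BSD
 * definition of MHLEN (MSIZE minus the m_hdr and pkthdr areas): the
 * expression above works out to
 *
 *	AUDIT_CONTENTS_SIZE == sizeof (struct m_hdr) +
 *	    sizeof (struct pkthdr) + sizeof (_m_ext_t)
 *
 * i.e. exactly the header region described in the comment above that needs
 * to be saved and restored.
 */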
430
431 /*
432 * mbuf specific mcache audit flags
433 */
434 #define MB_INUSE 0x01 /* object has not been returned to slab */
435 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
436 #define MB_SCVALID 0x04 /* object has valid saved contents */
437
438 /*
439 * Each of the following two arrays hold up to nmbclusters elements.
440 */
441 static mcl_audit_t *mclaudit; /* array of cluster audit information */
442 static unsigned int maxclaudit; /* max # of entries in audit table */
443 static mcl_slabg_t **slabstbl; /* cluster slabs table */
444 static unsigned int maxslabgrp; /* max # of entries in slabs table */
445 static unsigned int slabgrp; /* # of entries in slabs table */
446
447 /* Globals */
448 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
449 int njcl; /* # of clusters for jumbo sizes */
450 int njclbytes; /* size of a jumbo cluster */
451 union mbigcluster *mbutl; /* first mapped cluster address */
452 union mbigcluster *embutl; /* ending virtual address of mclusters */
453 int _max_linkhdr; /* largest link-level header */
454 int _max_protohdr; /* largest protocol header */
455 int max_hdr; /* largest link+protocol header */
456 int max_datalen; /* MHLEN - max_hdr */
457
458 static boolean_t mclverify; /* debug: pattern-checking */
459 static boolean_t mcltrace; /* debug: stack tracing */
460 static boolean_t mclfindleak; /* debug: leak detection */
461 static boolean_t mclexpleak; /* debug: expose leak info to user space */
462
463 /* mbuf leak detection variables */
464 static struct mleak_table mleak_table;
465 static mleak_stat_t *mleak_stat;
466
467 #define MLEAK_STAT_SIZE(n) \
468 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
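
/*
 * Illustrative note (not part of the original file): this is the classic
 * hand-rolled offsetof idiom for sizing a structure that ends in a
 * variable-length array; it is equivalent to
 *
 *	offsetof(mleak_stat_t, ml_trace[n])
 *
 * The MB_STAT_SIZE()/OMB_STAT_SIZE() macros further down use the same trick
 * for their mbs_class[] arrays.
 */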
469
470 struct mallocation {
471 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
472 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
473 u_int32_t count; /* How many objects were requested */
474 u_int64_t hitcount; /* for determining hash effectiveness */
475 };
476
477 struct mtrace {
478 u_int64_t collisions;
479 u_int64_t hitcount;
480 u_int64_t allocs;
481 u_int64_t depth;
482 uintptr_t addr[MLEAK_STACK_DEPTH];
483 };
484
485 /* Size must be a power of two for the zhash to be able to just mask off bits */
486 #define MLEAK_ALLOCATION_MAP_NUM 512
487 #define MLEAK_TRACE_MAP_NUM 256
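
/*
 * Illustrative sketch (not part of the original file): because the bucket
 * counts above are powers of two, hashing an address into a bucket can use
 * a mask instead of a modulo, e.g.:
 *
 *	idx = hash & (MLEAK_ALLOCATION_MAP_NUM - 1);
 */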
488
489 /*
490 * Sample factor for how often to record a trace. This can be overridden
491 * via the boot-arg mleak_sample_factor.
492 */
493 #define MLEAK_SAMPLE_FACTOR 500
494
495 /*
496 * Number of top leakers recorded.
497 */
498 #define MLEAK_NUM_TRACES 5
499
500 #define MB_LEAK_SPACING_64 " "
501 #define MB_LEAK_SPACING_32 " "
502
503
504 #define MB_LEAK_HDR_32 "\n\
505 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
506 ---------- ---------- ---------- ---------- ---------- \n\
507 "
508
509 #define MB_LEAK_HDR_64 "\n\
510 trace [1] trace [2] trace [3] \
511 trace [4] trace [5] \n\
512 ------------------ ------------------ ------------------ \
513 ------------------ ------------------ \n\
514 "
515
516 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
517 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
518
519 /* Hashmaps of allocations and their corresponding traces */
520 static struct mallocation *mleak_allocations;
521 static struct mtrace *mleak_traces;
522 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
523
524 /* Lock to protect mleak tables from concurrent modification */
525 decl_lck_mtx_data(static, mleak_lock_data);
526 static lck_mtx_t *mleak_lock = &mleak_lock_data;
527 static lck_attr_t *mleak_lock_attr;
528 static lck_grp_t *mleak_lock_grp;
529 static lck_grp_attr_t *mleak_lock_grp_attr;
530
531 extern u_int32_t high_sb_max;
532
533 /* TODO: should be in header file */
534 int do_reclaim = 0;
535
536 /* The minimum number of objects to allocate at startup. */
537 #define MINCL 32
538 #define MINBIGCL (MINCL >> 1)
539 #define MIN16KCL (MINCL >> 2)
540
541 /* Low watermarks (only map in pages once free counts go below) */
542 #define MBIGCL_LOWAT MINBIGCL
543 #define M16KCL_LOWAT MIN16KCL
544
545 typedef struct {
546 mbuf_class_t mtbl_class; /* class type */
547 mcache_t *mtbl_cache; /* mcache for this buffer class */
548 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
549 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
550 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
551 u_int32_t mtbl_maxsize; /* maximum buffer size */
552 int mtbl_minlimit; /* minimum allowed */
553 int mtbl_maxlimit; /* maximum allowed */
554 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
555 } mbuf_table_t;
556
557 #define m_class(c) mbuf_table[c].mtbl_class
558 #define m_cache(c) mbuf_table[c].mtbl_cache
559 #define m_slablist(c) mbuf_table[c].mtbl_slablist
560 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
561 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
562 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
563 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
564 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
565 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
566 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
567 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
568 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
569 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
570 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
571 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
572 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
573 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
574 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
575 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
576 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
577
578 static mbuf_table_t mbuf_table[] = {
579 /*
580 * The caches for mbufs, regular clusters and big clusters.
581 */
582 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
583 NULL, NULL, 0, 0, 0, 0 },
584 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
585 NULL, NULL, 0, 0, 0, 0 },
586 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
587 NULL, NULL, 0, 0, 0, 0 },
588 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
589 NULL, NULL, 0, 0, 0, 0 },
590 /*
591 * The following are special caches; they serve as intermediate
592 * caches backed by the above rudimentary caches. Each object
593 * in the cache is an mbuf with a cluster attached to it. Unlike
594 * the above caches, these intermediate caches do not directly
595 * deal with the slab structures; instead, the constructed
596 * cached elements are simply stored in the freelists.
597 */
598 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
599 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
600 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
601 };
602
603 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
604
605 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
606 static int mb_waiters; /* number of waiters */
607
608 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
609 static struct timeval mb_wdtstart; /* watchdog start timestamp */
610 static char *mbuf_dump_buf;
611
612 #define MBUF_DUMP_BUF_SIZE 2048
613
614 /*
615 * The mbuf watchdog is enabled by default on embedded platforms. It is
616 * also toggleable via the kern.ipc.mb_watchdog sysctl.
617 */
618 #if CONFIG_EMBEDDED
619 static unsigned int mb_watchdog = 1;
620 #else
621 static unsigned int mb_watchdog = 0;
622 #endif /* CONFIG_EMBEDDED */
623
624 /* The following are used to serialize m_clalloc() */
625 static boolean_t mb_clalloc_busy;
626 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
627 static int mb_clalloc_waiters;
628
629 static void mbuf_mtypes_sync(boolean_t);
630 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
631 static void mbuf_stat_sync(void);
632 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
633 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
634 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
635 static char *mbuf_dump(void);
636 static void mbuf_table_init(void);
637 static inline void m_incref(struct mbuf *);
638 static inline u_int32_t m_decref(struct mbuf *);
639 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
640 static void mbuf_worker_thread_init(void);
641 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
642 static void slab_free(mbuf_class_t, mcache_obj_t *);
643 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
644 unsigned int, int);
645 static void mbuf_slab_free(void *, mcache_obj_t *, int);
646 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
647 static void mbuf_slab_notify(void *, u_int32_t);
648 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
649 unsigned int);
650 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
651 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
652 unsigned int, int);
653 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
654 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
655 static int freelist_populate(mbuf_class_t, unsigned int, int);
656 static void freelist_init(mbuf_class_t);
657 static boolean_t mbuf_cached_above(mbuf_class_t, int);
658 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
659 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
660 static int m_howmany(int, size_t);
661 static void mbuf_worker_thread(void);
662 static void mbuf_watchdog(void);
663 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
664
665 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
666 size_t, unsigned int);
667 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
668 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
669 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
670 boolean_t);
671 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
672 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
673 static void mcl_audit_mcheck_panic(struct mbuf *);
674 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
675
676 static void mleak_activate(void);
677 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
678 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
679 static void mleak_free(mcache_obj_t *);
680 static void mleak_sort_traces(void);
681 static void mleak_update_stats(void);
682
683 static mcl_slab_t *slab_get(void *);
684 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
685 void *, void *, unsigned int, int, int);
686 static void slab_insert(mcl_slab_t *, mbuf_class_t);
687 static void slab_remove(mcl_slab_t *, mbuf_class_t);
688 static boolean_t slab_inrange(mcl_slab_t *, void *);
689 static void slab_nextptr_panic(mcl_slab_t *, void *);
690 static void slab_detach(mcl_slab_t *);
691 static boolean_t slab_is_detached(mcl_slab_t *);
692
693 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
694 static struct mbuf *m_split0(struct mbuf *, int, int, int);
695
696 /* flags for m_copyback0 */
697 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
698 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
699 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
700 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
701
702 /*
703 * This flag is set for all mbufs that come out of and into the composite
704 * mbuf + cluster caches, i.e. MC_MBUF_CL, MC_MBUF_BIGCL and MC_MBUF_16KCL.
705 * mbufs marked with such a flag have clusters attached to them, and will be
706 * treated differently when they are freed; instead of being placed back
707 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
708 * are placed back into the appropriate composite cache's freelist, and the
709 * actual freeing is deferred until the composite objects are purged. At
710 * such a time, this flag will be cleared from the mbufs and the objects
711 * will be freed into their own separate freelists.
712 */
713 #define EXTF_COMPOSITE 0x1
714
715 /*
716 * This flag indicates that the external cluster is read-only, i.e. it is
717 * or was referred to by more than one mbuf. Once set, this flag is never
718 * cleared.
719 */
720 #define EXTF_READONLY 0x2
721 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
722
723 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
724 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
725 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
726 #define MBUF_IS_COMPOSITE(m) \
727 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
728
729 /*
730 * Macros used to verify the integrity of the mbuf.
731 */
732 #define _MCHECK(m) { \
733 if ((m)->m_type != MT_FREE) { \
734 if (mclaudit == NULL) \
735 panic("MCHECK: m_type=%d m=%p", \
736 (u_int16_t)(m)->m_type, m); \
737 else \
738 mcl_audit_mcheck_panic(m); \
739 } \
740 }
741
742 #define MBUF_IN_MAP(addr) \
743 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
744
745 #define MRANGE(addr) { \
746 if (!MBUF_IN_MAP(addr)) \
747 panic("MRANGE: address out of range 0x%p", addr); \
748 }
749
750 /*
751 * Macro version of mtod.
752 */
753 #define MTOD(m, t) ((t)((m)->m_data))
754
755 /*
756 * Macros to obtain (4KB) cluster index and base cluster address.
757 */
758
759 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
760 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
761
762 /*
763 * Macro to find the mbuf index relative to a base.
764 */
765 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
766
767 /*
768 * Same thing for 2KB cluster index.
769 */
770 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
771
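/*
 * Illustrative sketch (not part of the original file) of how the macros
 * above combine to locate the audit entry for an mbuf address, mirroring
 * the mclaudit[] diagram in the implementation notes at the top of this
 * file.  The helper name is hypothetical; the real lookup is done by
 * mcl_audit_buf2mca().
 */
#if 0
static mcache_audit_t *
example_mbuf_to_audit(void *addr)
{
	unsigned int i = MTOBG(addr);		/* index of the 4KB cluster (page) */
	union mbigcluster *b = BGTOM(i);	/* base address of that cluster */
	unsigned int x = MCLIDX(b, addr);	/* mbuf slot within the cluster */

	return (mclaudit[i].cl_audit[x]);
}
#endif
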
772 /*
773 * Macros used during mbuf and cluster initialization.
774 */
775 #define MBUF_INIT(m, pkthdr, type) { \
776 _MCHECK(m); \
777 (m)->m_next = (m)->m_nextpkt = NULL; \
778 (m)->m_len = 0; \
779 (m)->m_type = type; \
780 if ((pkthdr) == 0) { \
781 (m)->m_data = (m)->m_dat; \
782 (m)->m_flags = 0; \
783 } else { \
784 (m)->m_data = (m)->m_pktdat; \
785 (m)->m_flags = M_PKTHDR; \
786 (m)->m_pkthdr.rcvif = NULL; \
787 (m)->m_pkthdr.len = 0; \
788 (m)->m_pkthdr.header = NULL; \
789 (m)->m_pkthdr.csum_flags = 0; \
790 (m)->m_pkthdr.csum_data = 0; \
791 (m)->m_pkthdr.tso_segsz = 0; \
792 (m)->m_pkthdr.vlan_tag = 0; \
793 (m)->m_pkthdr.socket_id = 0; \
794 (m)->m_pkthdr.vt_nrecs = 0; \
795 (m)->m_pkthdr.aux_flags = 0; \
796 m_tag_init(m); \
797 m_service_class_init(m); \
798 } \
799 }
800
801 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
802 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
803 (m)->m_flags |= M_EXT; \
804 (m)->m_ext.ext_size = (size); \
805 (m)->m_ext.ext_free = (free); \
806 (m)->m_ext.ext_arg = (arg); \
807 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
808 &(m)->m_ext.ext_refs; \
809 MEXT_RFA(m) = (rfa); \
810 MEXT_REF(m) = (ref); \
811 MEXT_FLAGS(m) = (flag); \
812 }
813
814 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
815 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
816
817 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
818 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
819
820 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
821 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
822
823 /*
824 * Macro to convert BSD malloc sleep flag to mcache's
825 */
826 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
827
828 /*
829 * The structure that holds all mbuf class statistics exportable via sysctl.
830 * Similar to the mbstat structure, the mb_stat structure is protected by the
831 * global mbuf lock. It contains additional information about the classes
832 * that allows for a more accurate view of the state of the allocator.
833 */
834 struct mb_stat *mb_stat;
835 struct omb_stat *omb_stat; /* For backwards compatibility */
836
837 #define MB_STAT_SIZE(n) \
838 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
839 #define OMB_STAT_SIZE(n) \
840 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
841
842 /*
843 * The legacy structure holding all of the mbuf allocation statistics.
844 * The actual statistics used by the kernel are stored in the mbuf_table
845 * instead, and are updated atomically while the global mbuf lock is held.
846 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
847 * Unlike before, the kernel no longer relies on the contents of mbstat for
848 * its operations (e.g. cluster expansion) because the structure is exposed
849 * to the outside and could possibly be modified, therefore making it unsafe.
850 * With the exception of the mbstat.m_mtypes array (see below), all of the
851 * statistics are updated as they change.
852 */
853 struct mbstat mbstat;
854
855 #define MBSTAT_MTYPES_MAX \
856 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
857
858 /*
859 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
860 * atomically and stored in a per-CPU structure which is lock-free; this is
861 * done in order to avoid writing to the global mbstat data structure which
862 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
863 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
864 * array and returned to the application. Any updates for types greater than
865 * or equal to MT_MAX are done atomically to mbstat; this slows down
866 * performance but is okay since the kernel uses only up to MT_MAX-1 while
867 * anything beyond that (up to type 255) is considered a corner case.
868 */
869 typedef struct {
870 unsigned int cpu_mtypes[MT_MAX];
871 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
872
873 typedef struct {
874 mtypes_cpu_t mbs_cpu[1];
875 } mbuf_mtypes_t;
876
877 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
878
879 #define MBUF_MTYPES_SIZE(n) \
880 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
881
882 #define MTYPES_CPU(p) \
883 ((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
884
885 #define mtype_stat_add(type, n) { \
886 if ((unsigned)(type) < MT_MAX) { \
887 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
888 atomic_add_32(&mbs->cpu_mtypes[type], n); \
889 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
890 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
891 } \
892 }
893
894 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
895 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
896 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
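
/*
 * Illustrative usage sketch (not part of the original file): when an mbuf
 * changes type, callers elsewhere in this file effectively do
 *
 *	mtype_stat_inc(MT_DATA);
 *	mtype_stat_dec(MT_FREE);
 *
 * which updates only the current CPU's lock-free cpu_mtypes[] slot;
 * mbuf_mtypes_sync() below folds all of the per-CPU slots back into
 * mbstat.m_mtypes[] when kern.ipc.mbstat is queried.
 */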
897
898 static void
899 mbuf_mtypes_sync(boolean_t locked)
900 {
901 int m, n;
902 mtypes_cpu_t mtc;
903
904 if (locked)
905 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
906
907 bzero(&mtc, sizeof (mtc));
908 for (m = 0; m < ncpu; m++) {
909 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
910 mtypes_cpu_t temp;
911
912 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
913 sizeof (temp.cpu_mtypes));
914
915 for (n = 0; n < MT_MAX; n++)
916 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
917 }
918 if (!locked)
919 lck_mtx_lock(mbuf_mlock);
920 for (n = 0; n < MT_MAX; n++)
921 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
922 if (!locked)
923 lck_mtx_unlock(mbuf_mlock);
924 }
925
926 static int
927 mbstat_sysctl SYSCTL_HANDLER_ARGS
928 {
929 #pragma unused(oidp, arg1, arg2)
930 mbuf_mtypes_sync(FALSE);
931
932 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
933 }
934
935 static void
936 mbuf_stat_sync(void)
937 {
938 mb_class_stat_t *sp;
939 mcache_cpu_t *ccp;
940 mcache_t *cp;
941 int k, m, bktsize;
942
943 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
944
945 for (k = 0; k < NELEM(mbuf_table); k++) {
946 cp = m_cache(k);
947 ccp = &cp->mc_cpu[0];
948 bktsize = ccp->cc_bktsize;
949 sp = mbuf_table[k].mtbl_stats;
950
951 if (cp->mc_flags & MCF_NOCPUCACHE)
952 sp->mbcl_mc_state = MCS_DISABLED;
953 else if (cp->mc_purge_cnt > 0)
954 sp->mbcl_mc_state = MCS_PURGING;
955 else if (bktsize == 0)
956 sp->mbcl_mc_state = MCS_OFFLINE;
957 else
958 sp->mbcl_mc_state = MCS_ONLINE;
959
960 sp->mbcl_mc_cached = 0;
961 for (m = 0; m < ncpu; m++) {
962 ccp = &cp->mc_cpu[m];
963 if (ccp->cc_objs > 0)
964 sp->mbcl_mc_cached += ccp->cc_objs;
965 if (ccp->cc_pobjs > 0)
966 sp->mbcl_mc_cached += ccp->cc_pobjs;
967 }
968 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
969 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
970 sp->mbcl_infree;
971
972 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
973 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
974 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
975
976 /* Calculate total count specific to each class */
977 sp->mbcl_ctotal = sp->mbcl_total;
978 switch (m_class(k)) {
979 case MC_MBUF:
980 /* Deduct mbufs used in composite caches */
981 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
982 m_total(MC_MBUF_BIGCL));
983 break;
984
985 case MC_CL:
986 /* Deduct clusters used in composite cache */
987 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
988 break;
989
990 case MC_BIGCL:
991 /* Deduct clusters used in composite cache */
992 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
993 break;
994
995 case MC_16KCL:
996 /* Deduct clusters used in composite cache */
997 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
998 break;
999
1000 default:
1001 break;
1002 }
1003 }
1004 }
1005
1006 static int
1007 mb_stat_sysctl SYSCTL_HANDLER_ARGS
1008 {
1009 #pragma unused(oidp, arg1, arg2)
1010 void *statp;
1011 int k, statsz, proc64 = proc_is64bit(req->p);
1012
1013 lck_mtx_lock(mbuf_mlock);
1014 mbuf_stat_sync();
1015
1016 if (!proc64) {
1017 struct omb_class_stat *oc;
1018 struct mb_class_stat *c;
1019
1020 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1021 oc = &omb_stat->mbs_class[0];
1022 c = &mb_stat->mbs_class[0];
1023 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1024 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1025 "%s", c->mbcl_cname);
1026 oc->mbcl_size = c->mbcl_size;
1027 oc->mbcl_total = c->mbcl_total;
1028 oc->mbcl_active = c->mbcl_active;
1029 oc->mbcl_infree = c->mbcl_infree;
1030 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1031 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1032 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1033 oc->mbcl_notified = c->mbcl_notified;
1034 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1035 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1036 oc->mbcl_ctotal = c->mbcl_ctotal;
1037 oc->mbcl_mc_state = c->mbcl_mc_state;
1038 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1039 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1040 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1041 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1042 }
1043 statp = omb_stat;
1044 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1045 } else {
1046 statp = mb_stat;
1047 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1048 }
1049
1050 lck_mtx_unlock(mbuf_mlock);
1051
1052 return (SYSCTL_OUT(req, statp, statsz));
1053 }
1054
1055 static int
1056 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1057 {
1058 #pragma unused(oidp, arg1, arg2)
1059 int i;
1060
1061 /* Ensure leak tracing turned on */
1062 if (!mclfindleak || !mclexpleak)
1063 return (ENXIO);
1064
1065 lck_mtx_lock(mleak_lock);
1066 mleak_update_stats();
1067 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1068 lck_mtx_unlock(mleak_lock);
1069
1070 return (i);
1071 }
1072
1073 static int
1074 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1075 {
1076 #pragma unused(oidp, arg1, arg2)
1077 int i = 0;
1078
1079 /* Ensure leak tracing turned on */
1080 if (!mclfindleak || !mclexpleak)
1081 return (ENXIO);
1082
1083 lck_mtx_lock(mleak_lock);
1084 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1085 lck_mtx_unlock(mleak_lock);
1086
1087 return (i);
1088 }
1089
1090 static inline void
1091 m_incref(struct mbuf *m)
1092 {
1093 UInt32 old, new;
1094 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1095
1096 do {
1097 old = *addr;
1098 new = old + 1;
1099 ASSERT(new != 0);
1100 } while (!OSCompareAndSwap(old, new, addr));
1101
1102 /*
1103 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1104 * we don't clear the flag when the refcount goes back to 1
1105 * to simplify code calling m_mclhasreference().
1106 */
1107 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1108 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1109 }
1110
1111 static inline u_int32_t
1112 m_decref(struct mbuf *m)
1113 {
1114 UInt32 old, new;
1115 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1116
1117 do {
1118 old = *addr;
1119 new = old - 1;
1120 ASSERT(old != 0);
1121 } while (!OSCompareAndSwap(old, new, addr));
1122
1123 return (new);
1124 }
1125
1126 static void
1127 mbuf_table_init(void)
1128 {
1129 unsigned int b, c, s;
1130 int m;
1131
1132 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1133 M_TEMP, M_WAITOK | M_ZERO);
1134 VERIFY(omb_stat != NULL);
1135
1136 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1137 M_TEMP, M_WAITOK | M_ZERO);
1138 VERIFY(mb_stat != NULL);
1139
1140 mb_stat->mbs_cnt = NELEM(mbuf_table);
1141 for (m = 0; m < NELEM(mbuf_table); m++)
1142 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1143
1144 #if CONFIG_MBUF_JUMBO
1145 /*
1146 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1147 * this only on platforms where the jumbo cluster pool is enabled.
1148 */
1149 njcl = nmbclusters / 3;
1150 njclbytes = M16KCLBYTES;
1151 #endif /* CONFIG_MBUF_JUMBO */
1152
1153 /*
1154 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1155 * a multiple of 4KB clusters.
1156 */
1157 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1158 if (njcl > 0) {
1159 /*
1160 * Each jumbo cluster takes 8 2KB clusters, so make
1161 * sure that the pool size is evenly divisible by 8;
1162 * njcl is in 2KB unit, hence treated as such.
1163 */
1164 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1165
1166 /* Update nclusters with rounded down value of njcl */
1167 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1168 }
1169
1170 /*
1171 * njcl is valid only on platforms with 16KB jumbo clusters, where
1172 * it is configured to 1/3 of the pool size. On these platforms,
1173 * the remaining is used for 2KB and 4KB clusters. On platforms
1174 * without 16KB jumbo clusters, the entire pool is used for both
1175 * 2KB and 4KB clusters. A 4KB cluster can either be splitted into
1176 * 16 mbufs, or into 2 2KB clusters.
1177 *
1178 * +---+---+------------ ... -----------+------- ... -------+
1179 * | c | b | s | njcl |
1180 * +---+---+------------ ... -----------+------- ... -------+
1181 *
1182 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1183 * clusters (1/64th each.)
1184 */
1185 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1186 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1187 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
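	/*
	 * Worked example (illustrative, not part of the original file),
	 * assuming 2KB clusters and NCLPBGSHIFT == 1: with nclusters ==
	 * 32768 (a 64MB pool), c == 512 (1MB of pure 2KB clusters),
	 * b == 256 (1MB of pure 4KB clusters) and s == 31744 2KB units
	 * (62MB of all-purpose memory).
	 */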
1188
1189 /*
1190 * 1/64th (c) is reserved for 2KB clusters.
1191 */
1192 m_minlimit(MC_CL) = c;
1193 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1194 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1195 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1196
1197 /*
1198 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1199 * It cannot be turned into 2KB clusters or mbufs.
1200 */
1201 m_minlimit(MC_BIGCL) = b;
1202 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1203 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1204 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1205
1206 /*
1207 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1208 */
1209 m_minlimit(MC_MBUF) = 0;
1210 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1211 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1212 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1213
1214 /*
1215 * Set limits for the composite classes.
1216 */
1217 m_minlimit(MC_MBUF_CL) = 0;
1218 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1219 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1220 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1221 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1222
1223 m_minlimit(MC_MBUF_BIGCL) = 0;
1224 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1225 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1226 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1227 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1228
1229 /*
1230 * And for jumbo classes.
1231 */
1232 m_minlimit(MC_16KCL) = 0;
1233 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1234 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1235 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1236
1237 m_minlimit(MC_MBUF_16KCL) = 0;
1238 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1239 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1240 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1241 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1242
1243 /*
1244 * Initialize the legacy mbstat structure.
1245 */
1246 bzero(&mbstat, sizeof (mbstat));
1247 mbstat.m_msize = m_maxsize(MC_MBUF);
1248 mbstat.m_mclbytes = m_maxsize(MC_CL);
1249 mbstat.m_minclsize = MINCLSIZE;
1250 mbstat.m_mlen = MLEN;
1251 mbstat.m_mhlen = MHLEN;
1252 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1253 }
1254
1255 #if defined(__LP64__)
1256 typedef struct ncl_tbl {
1257 uint64_t nt_maxmem; /* memory (sane) size */
1258 uint32_t nt_mbpool; /* mbuf pool size */
1259 } ncl_tbl_t;
1260
1261 /* Non-server */
1262 static ncl_tbl_t ncl_table[] = {
1263 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1264 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1265 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1266 { 0, 0 }
1267 };
1268
1269 /* Server */
1270 static ncl_tbl_t ncl_table_srv[] = {
1271 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1272 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1273 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1274 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1275 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1276 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1277 { 0, 0 }
1278 };
1279 #endif /* __LP64__ */
1280
1281 __private_extern__ unsigned int
1282 mbuf_default_ncl(int server, uint64_t mem)
1283 {
1284 #if !defined(__LP64__)
1285 #pragma unused(server)
1286 unsigned int n;
1287 /*
1288 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1289 */
1290 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1291 n = 32768;
1292 #else
1293 unsigned int n, i;
1294 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1295 /*
1296 * 64-bit kernel (mbuf pool size based on table).
1297 */
1298 n = tbl[0].nt_mbpool;
1299 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1300 if (mem < tbl[i].nt_maxmem)
1301 break;
1302 n = tbl[i].nt_mbpool;
1303 }
1304 n >>= MCLSHIFT;
1305 #endif /* !__LP64__ */
1306 return (n);
1307 }
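
/*
 * Worked example (illustrative, not part of the original file), assuming
 * 2KB clusters (MCLSHIFT == 11): on a non-server 64-bit machine with 8GB
 * of memory the table walk above settles on the 96MB pool entry, so
 * mbuf_default_ncl(0, 8ULL << 30) returns (96 << 20) >> 11 == 49152
 * clusters; a 16GB machine gets 128MB, i.e. 65536 clusters.
 */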
1308
1309 __private_extern__ void
1310 mbinit(void)
1311 {
1312 unsigned int m;
1313 unsigned int initmcl = 0;
1314 void *buf;
1315 thread_t thread = THREAD_NULL;
1316
1317 /*
1318 * These MBUF_ values must be equal to their private counterparts.
1319 */
1320 _CASSERT(MBUF_EXT == M_EXT);
1321 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
1322 _CASSERT(MBUF_EOR == M_EOR);
1323 _CASSERT(MBUF_LOOP == M_LOOP);
1324 _CASSERT(MBUF_BCAST == M_BCAST);
1325 _CASSERT(MBUF_MCAST == M_MCAST);
1326 _CASSERT(MBUF_FRAG == M_FRAG);
1327 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1328 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1329 _CASSERT(MBUF_PROMISC == M_PROMISC);
1330 _CASSERT(MBUF_HASFCS == M_HASFCS);
1331
1332 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
1333 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
1334 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1335 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1336 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
1337 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1338 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1339 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1340 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1341 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1342 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1343 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1344 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1345 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1346 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1347
1348 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1349 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1350 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16);
1351 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1352 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1353 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1354 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1355 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1356 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1357 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1358 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1359 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1360 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1361
1362 _CASSERT(MBUF_WAITOK == M_WAIT);
1363 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1364 _CASSERT(MBUF_COPYALL == M_COPYALL);
1365
1366 _CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR);
1367 _CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR);
1368
1369 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1370 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1371 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1372 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1373 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1374 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1375 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1376 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1377 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1378 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1379
1380 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1381 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1382 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1383 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1384
1385 if (nmbclusters == 0)
1386 nmbclusters = NMBCLUSTERS;
1387
1388 /* This should be a sane (at least even) value by now */
1389 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1390
1391 /* Setup the mbuf table */
1392 mbuf_table_init();
1393
1394 /* Global lock for common layer */
1395 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1396 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1397 mbuf_mlock_attr = lck_attr_alloc_init();
1398 lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1399
1400 /*
1401 * Allocate cluster slabs table:
1402 *
1403 * maxslabgrp = (N * 2048) / (1024 * 1024)
1404 *
1405 * Where N is nmbclusters rounded up to the nearest 512. This yields
1406 * mcl_slabg_t units, each one representing 1 MB of memory.
1407 */
1408 maxslabgrp =
1409 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
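	/*
	 * Worked example (illustrative, not part of the original file):
	 * with nmbclusters == 32768 and 2KB clusters, N is already a
	 * multiple of 512, so maxslabgrp == (32768 * 2048) / (1024 * 1024)
	 * == 64 slab groups, one per MB of the cluster map.
	 */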
1410 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1411 M_TEMP, M_WAITOK | M_ZERO);
1412 VERIFY(slabstbl != NULL);
1413
1414 /*
1415 * Allocate audit structures, if needed:
1416 *
1417 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1418 *
1419 * This yields mcl_audit_t units, each one representing a page.
1420 */
1421 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1422 mbuf_debug |= mcache_getflags();
1423 if (mbuf_debug & MCF_DEBUG) {
1424 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1425 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1426 M_TEMP, M_WAITOK | M_ZERO);
1427 VERIFY(mclaudit != NULL);
1428
1429 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1430 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1431 VERIFY(mcl_audit_con_cache != NULL);
1432 }
1433 mclverify = (mbuf_debug & MCF_VERIFY);
1434 mcltrace = (mbuf_debug & MCF_TRACE);
1435 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1436 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1437
1438 /* Enable mbuf leak logging, with a lock to protect the tables */
1439
1440 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1441 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1442 mleak_lock_attr = lck_attr_alloc_init();
1443 lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1444
1445 mleak_activate();
1446
1447 /* Calculate the number of pages assigned to the cluster pool */
1448 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1449 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1450 M_TEMP, M_WAITOK);
1451 VERIFY(mcl_paddr != NULL);
1452
1453 /* Register with the I/O Bus mapper */
1454 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1455 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1456
1457 embutl = (union mbigcluster *)
1458 ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1459 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1460
1461 /* Prime up the freelist */
1462 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1463 if (initmcl != 0) {
1464 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1465 if (initmcl > m_maxlimit(MC_BIGCL))
1466 initmcl = m_maxlimit(MC_BIGCL);
1467 }
1468 if (initmcl < m_minlimit(MC_BIGCL))
1469 initmcl = m_minlimit(MC_BIGCL);
1470
1471 lck_mtx_lock(mbuf_mlock);
1472
1473 /*
1474 * For classes with non-zero minimum limits, populate their freelists
1475 * so that m_total(class) is at least m_minlimit(class).
1476 */
1477 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1478 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1479 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1480 freelist_init(m_class(MC_CL));
1481
1482 for (m = 0; m < NELEM(mbuf_table); m++) {
1483 /* Make sure we didn't miss any */
1484 VERIFY(m_minlimit(m_class(m)) == 0 ||
1485 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1486 }
1487
1488 lck_mtx_unlock(mbuf_mlock);
1489
1490 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1491 NULL, &thread);
1492 thread_deallocate(thread);
1493
1494 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1495 0, 0, MCR_SLEEP);
1496
1497 /* Create the cache for each class */
1498 for (m = 0; m < NELEM(mbuf_table); m++) {
1499 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1500 u_int32_t flags;
1501
1502 flags = mbuf_debug;
1503 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1504 m_class(m) == MC_MBUF_16KCL) {
1505 allocfunc = mbuf_cslab_alloc;
1506 freefunc = mbuf_cslab_free;
1507 auditfunc = mbuf_cslab_audit;
1508 logfunc = mleak_logger;
1509 } else {
1510 allocfunc = mbuf_slab_alloc;
1511 freefunc = mbuf_slab_free;
1512 auditfunc = mbuf_slab_audit;
1513 logfunc = mleak_logger;
1514 }
1515
1516 /*
1517 * Disable per-CPU caches for jumbo classes if there
1518 * is no jumbo cluster pool available in the system.
1519 * The cache itself is still created (but will never
1520 * be populated) since it simplifies the code.
1521 */
1522 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1523 njcl == 0)
1524 flags |= MCF_NOCPUCACHE;
1525
1526 if (!mclfindleak)
1527 flags |= MCF_NOLEAKLOG;
1528
1529 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1530 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1531 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1532 }
1533
1534 /*
1535 * Allocate structure for per-CPU statistics that's aligned
1536 * on the CPU cache boundary; this code assumes that we never
1537 * uninitialize this framework, since the original address
1538 * before alignment is not saved.
1539 */
1540 ncpu = ml_get_max_cpus();
1541 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1542 M_TEMP, M_WAITOK);
1543 VERIFY(buf != NULL);
1544
1545 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1546 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
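
/*
 * Worked example (illustrative sketch, assuming a hypothetical 64-byte
 * CPU_CACHE_SIZE): P2ROUNDUP rounds the address up to the next multiple
 * of the cache line, e.g. a buf of 0x1234 becomes 0x1240; this is why
 * CPU_CACHE_SIZE extra bytes are requested from MALLOC above.
 */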
1547
1548 /*
1549 * Set the max limit on sb_max to be 1/16th of the size of
1550 * memory allocated for mbuf clusters.
1551 */
1552 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1553 if (high_sb_max < sb_max) {
1554 /* sb_max is too large for this configuration, scale it down */
1555 if (high_sb_max > (1 << MBSHIFT)) {
1556 /* We have at least 16MB of mbuf pool */
1557 sb_max = high_sb_max;
1558 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1559 /*
1560 * If we have more than 1MB of mbuf pool, cap the max
1561 * socket buffer size at 1MB
1562 */
1563 sb_max = high_sb_max = (1 << MBSHIFT);
1564 } else {
1565 sb_max = high_sb_max;
1566 }
1567 }
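
/*
 * Worked example (illustrative sketch, assuming MCLSHIFT == 11 and
 * MBSHIFT == 20): a pool of 65536 2KB clusters occupies 128MB, so
 * high_sb_max = 65536 << 7 = 8MB, i.e. 1/16th of the pool; an sb_max
 * larger than that would be scaled down by the logic above.
 */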
1568
1569 /* allocate space for mbuf_dump_buf */
1570 MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1571 VERIFY(mbuf_dump_buf != NULL);
1572
1573 printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1574 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1575 (nclusters << MCLSHIFT) >> MBSHIFT,
1576 (njcl << MCLSHIFT) >> MBSHIFT);
1577 }
1578
1579 /*
1580 * Obtain a slab of object(s) from the class's freelist.
1581 */
1582 static mcache_obj_t *
1583 slab_alloc(mbuf_class_t class, int wait)
1584 {
1585 mcl_slab_t *sp;
1586 mcache_obj_t *buf;
1587
1588 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1589
1590 VERIFY(class != MC_16KCL || njcl > 0);
1591
1592 /* This should always be NULL for us */
1593 VERIFY(m_cobjlist(class) == NULL);
1594
1595 /*
1596 * Treat composite objects as having a longer lifespan by using
1597 * a slab from the reverse direction, in the hope that this
1598 * reduces the probability of fragmentation for slabs that hold
1599 * more than one buffer chunk (e.g. mbuf slabs). For other
1600 * slabs, this probably doesn't make much of a difference.
1601 */
1602 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1603 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1604 else
1605 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1606
1607 if (sp == NULL) {
1608 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1609 /* The slab list for this class is empty */
1610 return (NULL);
1611 }
1612
1613 VERIFY(m_infree(class) > 0);
1614 VERIFY(!slab_is_detached(sp));
1615 VERIFY(sp->sl_class == class &&
1616 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1617 buf = sp->sl_head;
1618 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1619
1620 if (class == MC_MBUF) {
1621 sp->sl_head = buf->obj_next;
1622 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1623 } else if (class == MC_CL) {
1624 sp->sl_head = buf->obj_next;
1625 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1626 } else {
1627 sp->sl_head = NULL;
1628 }
1629 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1630 slab_nextptr_panic(sp, sp->sl_head);
1631 /* In case sl_head is in the map but not in the slab */
1632 VERIFY(slab_inrange(sp, sp->sl_head));
1633 /* NOTREACHED */
1634 }
1635
1636 /* Increment slab reference */
1637 sp->sl_refcnt++;
1638
1639 if (mclaudit != NULL) {
1640 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1641 mca->mca_uflags = 0;
1642 /* Save contents on mbuf objects only */
1643 if (class == MC_MBUF)
1644 mca->mca_uflags |= MB_SCVALID;
1645 }
1646
1647 if (class == MC_CL) {
1648 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1649 /*
1650 * A 2K cluster slab can have at most NCLPBG references.
1651 */
1652 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1653 sp->sl_chunks == NCLPBG &&
1654 sp->sl_len == m_maxsize(MC_BIGCL));
1655 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1656 } else if (class == MC_BIGCL) {
1657 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1658 m_infree(MC_MBUF_BIGCL);
1659 /*
1660 * A 4K cluster slab can have at most 1 reference.
1661 */
1662 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1663 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1664 } else if (class == MC_16KCL) {
1665 mcl_slab_t *nsp;
1666 int k;
1667
1668 --m_infree(MC_16KCL);
1669 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1670 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1671 /*
1672 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1673 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1674 * most 1 reference.
1675 */
1676 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1677 nsp = nsp->sl_next;
1678 /* Next slab must already be present */
1679 VERIFY(nsp != NULL);
1680 nsp->sl_refcnt++;
1681 VERIFY(!slab_is_detached(nsp));
1682 VERIFY(nsp->sl_class == MC_16KCL &&
1683 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1684 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1685 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1686 nsp->sl_head == NULL);
1687 }
1688 } else {
1689 VERIFY(class == MC_MBUF);
1690 --m_infree(MC_MBUF);
1691 /*
1692 * If auditing is turned on, this check is
1693 * deferred until later in mbuf_slab_audit().
1694 */
1695 if (mclaudit == NULL)
1696 _MCHECK((struct mbuf *)buf);
1697 /*
1698 * Since we have incremented the reference count above,
1699 * an mbuf slab (formerly a 4KB cluster slab that was cut
1700 * up into mbufs) must have a reference count between 1
1701 * and NMBPBG at this point.
1702 */
1703 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1704 sp->sl_chunks == NMBPBG &&
1705 sp->sl_len == m_maxsize(MC_BIGCL));
1706 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1707 }
1708
1709 /* If empty, remove this slab from the class's freelist */
1710 if (sp->sl_head == NULL) {
1711 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1712 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1713 slab_remove(sp, class);
1714 }
1715
1716 return (buf);
1717 }
1718
1719 /*
1720 * Place a slab of object(s) back into a class's slab list.
1721 */
1722 static void
1723 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1724 {
1725 mcl_slab_t *sp;
1726
1727 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1728
1729 VERIFY(class != MC_16KCL || njcl > 0);
1730 VERIFY(buf->obj_next == NULL);
1731 sp = slab_get(buf);
1732 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1733 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1734
1735 /* Decrement slab reference */
1736 sp->sl_refcnt--;
1737
1738 if (class == MC_CL) {
1739 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1740 /*
1741 * A slab that has been split into 2KB clusters can have
1742 * at most 1 outstanding reference at this point.
1743 */
1744 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1745 sp->sl_chunks == NCLPBG &&
1746 sp->sl_len == m_maxsize(MC_BIGCL));
1747 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1748 (slab_is_detached(sp) && sp->sl_head == NULL));
1749 } else if (class == MC_BIGCL) {
1750 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1751 /*
1752 * A 4KB cluster slab can have at most 1 reference,
1753 * and the count must be 0 at this point.
1754 */
1755 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1756 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1757 VERIFY(slab_is_detached(sp));
1758 } else if (class == MC_16KCL) {
1759 mcl_slab_t *nsp;
1760 int k;
1761 /*
1762 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1763 * must now have a reference count of 0.
1764 */
1765 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1766 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1767 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1768 VERIFY(slab_is_detached(sp));
1769 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1770 nsp = nsp->sl_next;
1771 /* Next slab must already be present */
1772 VERIFY(nsp != NULL);
1773 nsp->sl_refcnt--;
1774 VERIFY(slab_is_detached(nsp));
1775 VERIFY(nsp->sl_class == MC_16KCL &&
1776 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1777 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1778 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1779 nsp->sl_head == NULL);
1780 }
1781 } else {
1782 /*
1783 * A slab that has been split into mbufs can have at most NMBPBG
1784 * references. Since we have decremented one reference
1785 * above, it must now be between 0 and NMBPBG-1.
1786 */
1787 VERIFY(class == MC_MBUF);
1788 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1789 sp->sl_chunks == NMBPBG &&
1790 sp->sl_len == m_maxsize(MC_BIGCL));
1791 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1792 (slab_is_detached(sp) && sp->sl_head == NULL));
1793 }
1794
1795 /*
1796 * When auditing is enabled, ensure that the buffer still
1797 * contains the free pattern; if it does not, the buffer was
1798 * corrupted while it sat in the CPU cache layer.
1799 */
1800 if (mclaudit != NULL) {
1801 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1802 if (mclverify) {
1803 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1804 }
1805 mca->mca_uflags &= ~MB_SCVALID;
1806 }
1807
1808 if (class == MC_CL) {
1809 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1810 buf->obj_next = sp->sl_head;
1811 } else if (class == MC_BIGCL) {
1812 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1813 m_infree(MC_MBUF_BIGCL);
1814 } else if (class == MC_16KCL) {
1815 ++m_infree(MC_16KCL);
1816 } else {
1817 ++m_infree(MC_MBUF);
1818 buf->obj_next = sp->sl_head;
1819 }
1820 sp->sl_head = buf;
1821
1822 /*
1823 * If a slab has been split into either 2KB clusters or mbufs,
1824 * turn it back into one which holds a single 4KB
1825 * cluster.
1826 */
1827 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1828 m_total(class) > m_minlimit(class) &&
1829 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1830 int i = NMBPBG;
1831
1832 m_total(MC_BIGCL)++;
1833 mbstat.m_bigclusters = m_total(MC_BIGCL);
1834 m_total(MC_MBUF) -= NMBPBG;
1835 mbstat.m_mbufs = m_total(MC_MBUF);
1836 m_infree(MC_MBUF) -= NMBPBG;
1837 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1838
1839 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1840 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1841
1842 while (i--) {
1843 struct mbuf *m = sp->sl_head;
1844 VERIFY(m != NULL);
1845 sp->sl_head = m->m_next;
1846 m->m_next = NULL;
1847 }
1848 VERIFY(sp->sl_head == NULL);
1849
1850 /* Remove the slab from the mbuf class's slab list */
1851 slab_remove(sp, class);
1852
1853 /* Reinitialize it as a 4KB cluster slab */
1854 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1855 sp->sl_len, 0, 1);
1856
1857 if (mclverify) {
1858 mcache_set_pattern(MCACHE_FREE_PATTERN,
1859 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1860 }
1861 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1862 m_infree(MC_MBUF_BIGCL);
1863
1864 VERIFY(slab_is_detached(sp));
1865 /* And finally switch class */
1866 class = MC_BIGCL;
1867 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1868 m_total(class) > m_minlimit(class) &&
1869 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1870 int i = NCLPBG;
1871
1872 m_total(MC_BIGCL)++;
1873 mbstat.m_bigclusters = m_total(MC_BIGCL);
1874 m_total(MC_CL) -= NCLPBG;
1875 mbstat.m_clusters = m_total(MC_CL);
1876 m_infree(MC_CL) -= NCLPBG;
1877 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1878 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1879
1880 while (i--) {
1881 union mcluster *c = sp->sl_head;
1882 VERIFY(c != NULL);
1883 sp->sl_head = c->mcl_next;
1884 c->mcl_next = NULL;
1885 }
1886 VERIFY(sp->sl_head == NULL);
1887
1888 /* Remove the slab from the 2KB cluster class's slab list */
1889 slab_remove(sp, class);
1890
1891 /* Reinitialize it as a 4KB cluster slab */
1892 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1893 sp->sl_len, 0, 1);
1894
1895 if (mclverify) {
1896 mcache_set_pattern(MCACHE_FREE_PATTERN,
1897 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1898 }
1899 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1900 m_infree(MC_MBUF_BIGCL);
1901
1902 VERIFY(slab_is_detached(sp));
1903 /* And finally switch class */
1904 class = MC_BIGCL;
1905 }
1906
1907 /* Reinsert the slab to the class's slab list */
1908 if (slab_is_detached(sp))
1909 slab_insert(sp, class);
1910 }
1911
1912 /*
1913 * Common allocator for rudimentary objects called by the CPU cache layer
1914 * during an allocation request whenever there is no available element in the
1915 * bucket layer. It returns one or more elements from the appropriate global
1916 * freelist. If the freelist is empty, it will attempt to populate it and
1917 * retry the allocation.
1918 */
1919 static unsigned int
1920 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1921 {
1922 mbuf_class_t class = (mbuf_class_t)arg;
1923 unsigned int need = num;
1924 mcache_obj_t **list = *plist;
1925
1926 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1927 ASSERT(need > 0);
1928
1929 lck_mtx_lock(mbuf_mlock);
1930
1931 for (;;) {
1932 if ((*list = slab_alloc(class, wait)) != NULL) {
1933 (*list)->obj_next = NULL;
1934 list = *plist = &(*list)->obj_next;
1935
1936 if (--need == 0) {
1937 /*
1938 * If the number of elements in the freelist has
1939 * dropped below the low watermark, asynchronously
1940 * populate the freelist now rather than doing
1941 * it later when we run out of elements.
1942 */
1943 if (!mbuf_cached_above(class, wait) &&
1944 m_infree(class) < m_total(class) >> 5) {
1945 (void) freelist_populate(class, 1,
1946 M_DONTWAIT);
1947 }
1948 break;
1949 }
1950 } else {
1951 VERIFY(m_infree(class) == 0 || class == MC_CL);
1952
1953 (void) freelist_populate(class, 1,
1954 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1955
1956 if (m_infree(class) > 0)
1957 continue;
1958
1959 /* Check if there's anything at the cache layer */
1960 if (mbuf_cached_above(class, wait))
1961 break;
1962
1963 /* watchdog checkpoint */
1964 mbuf_watchdog();
1965
1966 /* We have nothing and cannot block; give up */
1967 if (wait & MCR_NOSLEEP) {
1968 if (!(wait & MCR_TRYHARD)) {
1969 m_fail_cnt(class)++;
1970 mbstat.m_drops++;
1971 break;
1972 }
1973 }
1974
1975 /*
1976 * If the freelist is still empty and the caller is
1977 * willing to be blocked, sleep on the wait channel
1978 * until an element is available. Otherwise, if
1979 * MCR_TRYHARD is set, do our best to satisfy the
1980 * request without having to go to sleep.
1981 */
1982 if (mbuf_worker_ready &&
1983 mbuf_sleep(class, need, wait))
1984 break;
1985
1986 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1987 }
1988 }
1989
1990 m_alloc_cnt(class) += num - need;
1991 lck_mtx_unlock(mbuf_mlock);
1992
1993 return (num - need);
1994 }
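
/*
 * Illustrative caller-side sketch (hypothetical, not from this file):
 * the CPU cache layer passes a pointer to a tail pointer, and on return
 * the chain built via obj_next holds however many raw objects were
 * actually obtained, with *plist advanced past the last element:
 *
 *	mcache_obj_t *head = NULL, **tail = &head;
 *	unsigned int got;
 *
 *	got = mbuf_slab_alloc((void *)(uintptr_t)MC_MBUF, &tail, 4,
 *	    MCR_NOSLEEP);
 *
 * Afterwards "head" chains "got" (0..4) mbufs and "tail" points to the
 * obj_next field of the last one, ready for further appends.
 */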
1995
1996 /*
1997 * Common de-allocator for rudimentary objects called by the CPU cache
1998 * layer when one or more elements need to be returned to the appropriate
1999 * global freelist.
2000 */
2001 static void
2002 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2003 {
2004 mbuf_class_t class = (mbuf_class_t)arg;
2005 mcache_obj_t *nlist;
2006 unsigned int num = 0;
2007 int w;
2008
2009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2010
2011 lck_mtx_lock(mbuf_mlock);
2012
2013 for (;;) {
2014 nlist = list->obj_next;
2015 list->obj_next = NULL;
2016 slab_free(class, list);
2017 ++num;
2018 if ((list = nlist) == NULL)
2019 break;
2020 }
2021 m_free_cnt(class) += num;
2022
2023 if ((w = mb_waiters) > 0)
2024 mb_waiters = 0;
2025
2026 lck_mtx_unlock(mbuf_mlock);
2027
2028 if (w != 0)
2029 wakeup(mb_waitchan);
2030 }
2031
2032 /*
2033 * Common auditor for rudimentary objects called by the CPU cache layer
2034 * during an allocation or free request. For the former, this is called
2035 * after the objects are obtained from either the bucket or slab layer
2036 * and before they are returned to the caller. For the latter, this is
2037 * called immediately during free and before placing the objects into
2038 * the bucket or slab layer.
2039 */
2040 static void
2041 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2042 {
2043 mbuf_class_t class = (mbuf_class_t)arg;
2044 mcache_audit_t *mca;
2045
2046 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2047
2048 while (list != NULL) {
2049 lck_mtx_lock(mbuf_mlock);
2050 mca = mcl_audit_buf2mca(class, list);
2051
2052 /* Do the sanity checks */
2053 if (class == MC_MBUF) {
2054 mcl_audit_mbuf(mca, list, FALSE, alloc);
2055 ASSERT(mca->mca_uflags & MB_SCVALID);
2056 } else {
2057 mcl_audit_cluster(mca, list, m_maxsize(class),
2058 alloc, TRUE);
2059 ASSERT(!(mca->mca_uflags & MB_SCVALID));
2060 }
2061 /* Record this transaction */
2062 if (mcltrace)
2063 mcache_buffer_log(mca, list, m_cache(class));
2064
2065 if (alloc)
2066 mca->mca_uflags |= MB_INUSE;
2067 else
2068 mca->mca_uflags &= ~MB_INUSE;
2069 /* Unpair the object (unconditionally) */
2070 mca->mca_uptr = NULL;
2071 lck_mtx_unlock(mbuf_mlock);
2072
2073 list = list->obj_next;
2074 }
2075 }
2076
2077 /*
2078 * Common notify routine for all caches. It is called by mcache when
2079 * one or more objects get freed. We use this indication to trigger
2080 * the wakeup of any sleeping threads so that they can retry their
2081 * allocation requests.
2082 */
2083 static void
2084 mbuf_slab_notify(void *arg, u_int32_t reason)
2085 {
2086 mbuf_class_t class = (mbuf_class_t)arg;
2087 int w;
2088
2089 ASSERT(MBUF_CLASS_VALID(class));
2090
2091 if (reason != MCN_RETRYALLOC)
2092 return;
2093
2094 lck_mtx_lock(mbuf_mlock);
2095 if ((w = mb_waiters) > 0) {
2096 m_notified(class)++;
2097 mb_waiters = 0;
2098 }
2099 lck_mtx_unlock(mbuf_mlock);
2100
2101 if (w != 0)
2102 wakeup(mb_waitchan);
2103 }
2104
2105 /*
2106 * Obtain object(s) from the composite class's freelist.
2107 */
2108 static unsigned int
2109 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2110 {
2111 unsigned int need = num;
2112 mcl_slab_t *sp, *clsp, *nsp;
2113 struct mbuf *m;
2114 mcache_obj_t **list = *plist;
2115 void *cl;
2116
2117 VERIFY(need > 0);
2118 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2119 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2120
2121 /* Get what we can from the freelist */
2122 while ((*list = m_cobjlist(class)) != NULL) {
2123 MRANGE(*list);
2124
2125 m = (struct mbuf *)*list;
2126 sp = slab_get(m);
2127 cl = m->m_ext.ext_buf;
2128 clsp = slab_get(cl);
2129 VERIFY(m->m_flags == M_EXT && cl != NULL);
2130 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2131
2132 if (class == MC_MBUF_CL) {
2133 VERIFY(clsp->sl_refcnt >= 1 &&
2134 clsp->sl_refcnt <= NCLPBG);
2135 } else {
2136 VERIFY(clsp->sl_refcnt == 1);
2137 }
2138
2139 if (class == MC_MBUF_16KCL) {
2140 int k;
2141 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2142 nsp = nsp->sl_next;
2143 /* Next slab must already be present */
2144 VERIFY(nsp != NULL);
2145 VERIFY(nsp->sl_refcnt == 1);
2146 }
2147 }
2148
2149 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2150 !MBUF_IN_MAP(m_cobjlist(class))) {
2151 slab_nextptr_panic(sp, m_cobjlist(class));
2152 /* NOTREACHED */
2153 }
2154 (*list)->obj_next = NULL;
2155 list = *plist = &(*list)->obj_next;
2156
2157 if (--need == 0)
2158 break;
2159 }
2160 m_infree(class) -= (num - need);
2161
2162 return (num - need);
2163 }
2164
2165 /*
2166 * Place object(s) back into a composite class's freelist.
2167 */
2168 static unsigned int
2169 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2170 {
2171 mcache_obj_t *o, *tail;
2172 unsigned int num = 0;
2173 struct mbuf *m, *ms;
2174 mcache_audit_t *mca = NULL;
2175 mcache_obj_t *ref_list = NULL;
2176 mcl_slab_t *clsp, *nsp;
2177 void *cl;
2178 mbuf_class_t cl_class;
2179
2180 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2181 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2182 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2183
2184 if (class == MC_MBUF_CL) {
2185 cl_class = MC_CL;
2186 } else if (class == MC_MBUF_BIGCL) {
2187 cl_class = MC_BIGCL;
2188 } else {
2189 VERIFY(class == MC_MBUF_16KCL);
2190 cl_class = MC_16KCL;
2191 }
2192
2193 o = tail = list;
2194
2195 while ((m = ms = (struct mbuf *)o) != NULL) {
2196 mcache_obj_t *rfa, *nexto = o->obj_next;
2197
2198 /* Do the mbuf sanity checks */
2199 if (mclaudit != NULL) {
2200 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2201 if (mclverify) {
2202 mcache_audit_free_verify(mca, m, 0,
2203 m_maxsize(MC_MBUF));
2204 }
2205 ms = (struct mbuf *)mca->mca_contents;
2206 }
2207
2208 /* Do the cluster sanity checks */
2209 cl = ms->m_ext.ext_buf;
2210 clsp = slab_get(cl);
2211 if (mclverify) {
2212 size_t size = m_maxsize(cl_class);
2213 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2214 (mcache_obj_t *)cl), cl, 0, size);
2215 }
2216 VERIFY(ms->m_type == MT_FREE);
2217 VERIFY(ms->m_flags == M_EXT);
2218 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2219 if (cl_class == MC_CL) {
2220 VERIFY(clsp->sl_refcnt >= 1 &&
2221 clsp->sl_refcnt <= NCLPBG);
2222 } else {
2223 VERIFY(clsp->sl_refcnt == 1);
2224 }
2225 if (cl_class == MC_16KCL) {
2226 int k;
2227 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2228 nsp = nsp->sl_next;
2229 /* Next slab must already be present */
2230 VERIFY(nsp != NULL);
2231 VERIFY(nsp->sl_refcnt == 1);
2232 }
2233 }
2234
2235 /*
2236 * If we're asked to purge, restore the actual mbuf using the
2237 * contents of the shadow structure (if auditing is enabled)
2238 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2239 * about to free it and the attached cluster into their caches.
2240 */
2241 if (purged) {
2242 /* Restore constructed mbuf fields */
2243 if (mclaudit != NULL)
2244 mcl_audit_restore_mbuf(m, mca, TRUE);
2245
2246 MEXT_REF(m) = 0;
2247 MEXT_FLAGS(m) = 0;
2248
2249 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2250 rfa->obj_next = ref_list;
2251 ref_list = rfa;
2252 MEXT_RFA(m) = NULL;
2253
2254 m->m_type = MT_FREE;
2255 m->m_flags = m->m_len = 0;
2256 m->m_next = m->m_nextpkt = NULL;
2257
2258 /* Save mbuf fields and make auditing happy */
2259 if (mclaudit != NULL)
2260 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2261
2262 VERIFY(m_total(class) > 0);
2263 m_total(class)--;
2264
2265 /* Free the mbuf */
2266 o->obj_next = NULL;
2267 slab_free(MC_MBUF, o);
2268
2269 /* And free the cluster */
2270 ((mcache_obj_t *)cl)->obj_next = NULL;
2271 if (class == MC_MBUF_CL)
2272 slab_free(MC_CL, cl);
2273 else if (class == MC_MBUF_BIGCL)
2274 slab_free(MC_BIGCL, cl);
2275 else
2276 slab_free(MC_16KCL, cl);
2277 }
2278
2279 ++num;
2280 tail = o;
2281 o = nexto;
2282 }
2283
2284 if (!purged) {
2285 tail->obj_next = m_cobjlist(class);
2286 m_cobjlist(class) = list;
2287 m_infree(class) += num;
2288 } else if (ref_list != NULL) {
2289 mcache_free_ext(ref_cache, ref_list);
2290 }
2291
2292 return (num);
2293 }
2294
2295 /*
2296 * Common allocator for composite objects called by the CPU cache layer
2297 * during an allocation request whenever there is no available element in
2298 * the bucket layer. It returns one or more composite elements from the
2299 * appropriate global freelist. If the freelist is empty, it will attempt
2300 * to obtain the rudimentary objects from their caches and construct them
2301 * into composite mbuf + cluster objects.
2302 */
2303 static unsigned int
2304 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2305 int wait)
2306 {
2307 mbuf_class_t class = (mbuf_class_t)arg;
2308 mbuf_class_t cl_class = 0;
2309 unsigned int num = 0, cnum = 0, want = needed;
2310 mcache_obj_t *ref_list = NULL;
2311 mcache_obj_t *mp_list = NULL;
2312 mcache_obj_t *clp_list = NULL;
2313 mcache_obj_t **list;
2314 struct ext_ref *rfa;
2315 struct mbuf *m;
2316 void *cl;
2317
2318 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2319 ASSERT(needed > 0);
2320
2321 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2322
2323 /* There should not be any slab for this class */
2324 VERIFY(m_slab_cnt(class) == 0 &&
2325 m_slablist(class).tqh_first == NULL &&
2326 m_slablist(class).tqh_last == NULL);
2327
2328 lck_mtx_lock(mbuf_mlock);
2329
2330 /* Try using the freelist first */
2331 num = cslab_alloc(class, plist, needed);
2332 list = *plist;
2333 if (num == needed) {
2334 m_alloc_cnt(class) += num;
2335 lck_mtx_unlock(mbuf_mlock);
2336 return (needed);
2337 }
2338
2339 lck_mtx_unlock(mbuf_mlock);
2340
2341 /*
2342 * We could not satisfy the request using the freelist alone;
2343 * allocate from the appropriate rudimentary caches and use
2344 * whatever we can get to construct the composite objects.
2345 */
2346 needed -= num;
2347
2348 /*
2349 * Mark these allocation requests as coming from a composite cache.
2350 * Also, if the caller is willing to be blocked, mark the request
2351 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2352 * slab layer waiting for the individual object when one or more
2353 * of the already-constructed composite objects are available.
2354 */
2355 wait |= MCR_COMP;
2356 if (!(wait & MCR_NOSLEEP))
2357 wait |= MCR_FAILOK;
2358
2359 /* allocate mbufs */
2360 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2361 if (needed == 0) {
2362 ASSERT(mp_list == NULL);
2363 goto fail;
2364 }
2365
2366 /* allocate clusters */
2367 if (class == MC_MBUF_CL) {
2368 cl_class = MC_CL;
2369 } else if (class == MC_MBUF_BIGCL) {
2370 cl_class = MC_BIGCL;
2371 } else {
2372 VERIFY(class == MC_MBUF_16KCL);
2373 cl_class = MC_16KCL;
2374 }
2375 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2376 if (needed == 0) {
2377 ASSERT(clp_list == NULL);
2378 goto fail;
2379 }
2380
2381 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2382 if (needed == 0) {
2383 ASSERT(ref_list == NULL);
2384 goto fail;
2385 }
2386
2387 /*
2388 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2389 * leftovers will be freed accordingly before we return to the caller.
2390 */
2391 for (cnum = 0; cnum < needed; cnum++) {
2392 struct mbuf *ms;
2393
2394 m = ms = (struct mbuf *)mp_list;
2395 mp_list = mp_list->obj_next;
2396
2397 cl = clp_list;
2398 clp_list = clp_list->obj_next;
2399 ((mcache_obj_t *)cl)->obj_next = NULL;
2400
2401 rfa = (struct ext_ref *)ref_list;
2402 ref_list = ref_list->obj_next;
2403 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2404
2405 /*
2406 * If auditing is enabled, construct the shadow mbuf
2407 * in the audit structure instead of in the actual one.
2408 * mbuf_cslab_audit() will take care of restoring the
2409 * contents after the integrity check.
2410 */
2411 if (mclaudit != NULL) {
2412 mcache_audit_t *mca, *cl_mca;
2413
2414 lck_mtx_lock(mbuf_mlock);
2415 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2416 ms = ((struct mbuf *)mca->mca_contents);
2417 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2418
2419 /*
2420 * Pair them up. Note that this is done at the time
2421 * the mbuf+cluster objects are constructed. This
2422 * information should be treated as "best effort"
2423 * debugging hint since more than one mbuf can refer
2424 * to a cluster. In that case, the cluster might not
2425 * be freed along with the mbuf it was paired with.
2426 */
2427 mca->mca_uptr = cl_mca;
2428 cl_mca->mca_uptr = mca;
2429
2430 ASSERT(mca->mca_uflags & MB_SCVALID);
2431 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2432 lck_mtx_unlock(mbuf_mlock);
2433
2434 /* Technically, they are in the freelist */
2435 if (mclverify) {
2436 size_t size;
2437
2438 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2439 m_maxsize(MC_MBUF));
2440
2441 if (class == MC_MBUF_CL)
2442 size = m_maxsize(MC_CL);
2443 else if (class == MC_MBUF_BIGCL)
2444 size = m_maxsize(MC_BIGCL);
2445 else
2446 size = m_maxsize(MC_16KCL);
2447
2448 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2449 size);
2450 }
2451 }
2452
2453 MBUF_INIT(ms, 0, MT_FREE);
2454 if (class == MC_MBUF_16KCL) {
2455 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2456 } else if (class == MC_MBUF_BIGCL) {
2457 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2458 } else {
2459 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2460 }
2461 VERIFY(ms->m_flags == M_EXT);
2462 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2463
2464 *list = (mcache_obj_t *)m;
2465 (*list)->obj_next = NULL;
2466 list = *plist = &(*list)->obj_next;
2467 }
2468
2469 fail:
2470 /*
2471 * Free up what's left of the above.
2472 */
2473 if (mp_list != NULL)
2474 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2475 if (clp_list != NULL)
2476 mcache_free_ext(m_cache(cl_class), clp_list);
2477 if (ref_list != NULL)
2478 mcache_free_ext(ref_cache, ref_list);
2479
2480 lck_mtx_lock(mbuf_mlock);
2481 if (num > 0 || cnum > 0) {
2482 m_total(class) += cnum;
2483 VERIFY(m_total(class) <= m_maxlimit(class));
2484 m_alloc_cnt(class) += num + cnum;
2485 }
2486 if ((num + cnum) < want)
2487 m_fail_cnt(class) += (want - (num + cnum));
2488 lck_mtx_unlock(mbuf_mlock);
2489
2490 return (num + cnum);
2491 }
2492
2493 /*
2494 * Common de-allocator for composite objects called by the CPU cache
2495 * layer when one or more elements need to be returned to the appropriate
2496 * global freelist.
2497 */
2498 static void
2499 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2500 {
2501 mbuf_class_t class = (mbuf_class_t)arg;
2502 unsigned int num;
2503 int w;
2504
2505 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2506
2507 lck_mtx_lock(mbuf_mlock);
2508
2509 num = cslab_free(class, list, purged);
2510 m_free_cnt(class) += num;
2511
2512 if ((w = mb_waiters) > 0)
2513 mb_waiters = 0;
2514
2515 lck_mtx_unlock(mbuf_mlock);
2516
2517 if (w != 0)
2518 wakeup(mb_waitchan);
2519 }
2520
2521 /*
2522 * Common auditor for composite objects called by the CPU cache layer
2523 * during an allocation or free request. For the former, this is called
2524 * after the objects are obtained from either the bucket or slab layer
2525 * and before they are returned to the caller. For the latter, this is
2526 * called immediately during free and before placing the objects into
2527 * the bucket or slab layer.
2528 */
2529 static void
2530 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2531 {
2532 mbuf_class_t class = (mbuf_class_t)arg;
2533 mcache_audit_t *mca;
2534 struct mbuf *m, *ms;
2535 mcl_slab_t *clsp, *nsp;
2536 size_t size;
2537 void *cl;
2538
2539 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2540
2541 while ((m = ms = (struct mbuf *)list) != NULL) {
2542 lck_mtx_lock(mbuf_mlock);
2543 /* Do the mbuf sanity checks and record its transaction */
2544 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2545 mcl_audit_mbuf(mca, m, TRUE, alloc);
2546 if (mcltrace)
2547 mcache_buffer_log(mca, m, m_cache(class));
2548
2549 if (alloc)
2550 mca->mca_uflags |= MB_COMP_INUSE;
2551 else
2552 mca->mca_uflags &= ~MB_COMP_INUSE;
2553
2554 /*
2555 * Use the shadow mbuf in the audit structure if we are
2556 * freeing, since the contents of the actual mbuf have been
2557 * pattern-filled by the above call to mcl_audit_mbuf().
2558 */
2559 if (!alloc && mclverify)
2560 ms = (struct mbuf *)mca->mca_contents;
2561
2562 /* Do the cluster sanity checks and record its transaction */
2563 cl = ms->m_ext.ext_buf;
2564 clsp = slab_get(cl);
2565 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2566 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2567 if (class == MC_MBUF_CL)
2568 VERIFY(clsp->sl_refcnt >= 1 &&
2569 clsp->sl_refcnt <= NCLPBG);
2570 else
2571 VERIFY(clsp->sl_refcnt == 1);
2572
2573 if (class == MC_MBUF_16KCL) {
2574 int k;
2575 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2576 nsp = nsp->sl_next;
2577 /* Next slab must already be present */
2578 VERIFY(nsp != NULL);
2579 VERIFY(nsp->sl_refcnt == 1);
2580 }
2581 }
2582
2583 mca = mcl_audit_buf2mca(MC_CL, cl);
2584 if (class == MC_MBUF_CL)
2585 size = m_maxsize(MC_CL);
2586 else if (class == MC_MBUF_BIGCL)
2587 size = m_maxsize(MC_BIGCL);
2588 else
2589 size = m_maxsize(MC_16KCL);
2590 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2591 if (mcltrace)
2592 mcache_buffer_log(mca, cl, m_cache(class));
2593
2594 if (alloc)
2595 mca->mca_uflags |= MB_COMP_INUSE;
2596 else
2597 mca->mca_uflags &= ~MB_COMP_INUSE;
2598 lck_mtx_unlock(mbuf_mlock);
2599
2600 list = list->obj_next;
2601 }
2602 }
2603
2604 /*
2605 * Allocate some number of mbuf clusters and place on cluster freelist.
2606 */
2607 static int
2608 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2609 {
2610 int i;
2611 vm_size_t size = 0;
2612 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2613 vm_offset_t page = 0;
2614 mcache_audit_t *mca_list = NULL;
2615 mcache_obj_t *con_list = NULL;
2616 mcl_slab_t *sp;
2617
2618 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2619 bufsize == m_maxsize(MC_16KCL));
2620
2621 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2622
2623 /*
2624 * Multiple threads may attempt to populate the cluster map one
2625 * after another. Since we drop the lock below prior to acquiring
2626 * the physical page(s), our view of the cluster map may no longer
2627 * be accurate, and we could end up over-committing the pages beyond
2628 * the maximum allowed for each class. To prevent that, this entire
2629 * operation (including the page mapping) is serialized.
2630 */
2631 while (mb_clalloc_busy) {
2632 mb_clalloc_waiters++;
2633 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2634 (PZERO-1), "m_clalloc", NULL);
2635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2636 }
2637
2638 /* We are busy now; tell everyone else to go away */
2639 mb_clalloc_busy = TRUE;
2640
2641 /*
2642 * Honor the caller's wish to block or not block. We have a way
2643 * to grow the pool asynchronously using the mbuf worker thread.
2644 */
2645 i = m_howmany(num, bufsize);
2646 if (i == 0 || (wait & M_DONTWAIT))
2647 goto out;
2648
2649 lck_mtx_unlock(mbuf_mlock);
2650
2651 size = round_page(i * bufsize);
2652 page = kmem_mb_alloc(mb_map, size, large_buffer);
2653
2654 /*
2655 * If we asked for "n" physically contiguous 16KB chunks
2656 * and didn't get them, try again without that
2657 * restriction.
2658 */
2659 if (large_buffer && page == 0)
2660 page = kmem_mb_alloc(mb_map, size, 0);
2661
2662 if (page == 0) {
2663 if (bufsize == m_maxsize(MC_BIGCL)) {
2664 /* That failed; fall back to a single-page (4KB) request */
2665 size = NBPG;
2666 page = kmem_mb_alloc(mb_map, size, 0);
2667 }
2668
2669 if (page == 0) {
2670 lck_mtx_lock(mbuf_mlock);
2671 goto out;
2672 }
2673 }
2674
2675 VERIFY(IS_P2ALIGNED(page, NBPG));
2676 numpages = size / NBPG;
2677
2678 /* If auditing is enabled, allocate the audit structures now */
2679 if (mclaudit != NULL) {
2680 int needed;
2681
2682 /*
2683 * Yes, I realize this is a waste of memory for clusters
2684 * that never get transformed into mbufs, as we may end
2685 * up with NMBPBG-1 unused audit structures per cluster.
2686 * But doing so tremendously simplifies the allocation
2687 * strategy, since at this point we are not holding the
2688 * mbuf lock and the caller is okay to be blocked.
2689 */
2690 if (bufsize == m_maxsize(MC_BIGCL)) {
2691 needed = numpages * NMBPBG;
2692
2693 i = mcache_alloc_ext(mcl_audit_con_cache,
2694 &con_list, needed, MCR_SLEEP);
2695
2696 VERIFY(con_list != NULL && i == needed);
2697 } else {
2698 needed = numpages / NSLABSP16KB;
2699 }
2700
2701 i = mcache_alloc_ext(mcache_audit_cache,
2702 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2703
2704 VERIFY(mca_list != NULL && i == needed);
2705 }
2706
2707 lck_mtx_lock(mbuf_mlock);
2708
2709 for (i = 0; i < numpages; i++, page += NBPG) {
2710 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2711 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2712
2713 /*
2714 * If no mapper is available, the following code is a no-op
2715 * and returns the input page; if there is a mapper, the
2716 * appropriate I/O page is returned.
2717 */
2718 VERIFY(offset < mcl_pages);
2719 if (mcl_paddr_base) {
2720 bzero((void *)(uintptr_t) page, page_size);
2721 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2722 }
2723 mcl_paddr[offset] = new_page << PGSHIFT;
2724
2725 /* Pattern-fill this fresh page */
2726 if (mclverify) {
2727 mcache_set_pattern(MCACHE_FREE_PATTERN,
2728 (caddr_t)page, NBPG);
2729 }
2730 if (bufsize == m_maxsize(MC_BIGCL)) {
2731 union mbigcluster *mbc = (union mbigcluster *)page;
2732
2733 /* One for the entire page */
2734 sp = slab_get(mbc);
2735 if (mclaudit != NULL) {
2736 mcl_audit_init(mbc, &mca_list, &con_list,
2737 AUDIT_CONTENTS_SIZE, NMBPBG);
2738 }
2739 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2740 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2741 mbc, mbc, bufsize, 0, 1);
2742
2743 /* Insert this slab */
2744 slab_insert(sp, MC_BIGCL);
2745
2746 /* Update stats now since slab_get() drops the lock */
2747 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2748 m_infree(MC_MBUF_BIGCL);
2749 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2750 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2751 } else if ((i % NSLABSP16KB) == 0) {
2752 union m16kcluster *m16kcl = (union m16kcluster *)page;
2753 mcl_slab_t *nsp;
2754 int k;
2755
2756 VERIFY(njcl > 0);
2757 /* One for the entire 16KB */
2758 sp = slab_get(m16kcl);
2759 if (mclaudit != NULL)
2760 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2761
2762 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2763 slab_init(sp, MC_16KCL, SLF_MAPPED,
2764 m16kcl, m16kcl, bufsize, 0, 1);
2765
2766 /*
2767 * 2nd-Nth page's slab is part of the first one,
2768 * where N is NSLABSP16KB.
2769 */
2770 for (k = 1; k < NSLABSP16KB; k++) {
2771 nsp = slab_get(((union mbigcluster *)page) + k);
2772 VERIFY(nsp->sl_refcnt == 0 &&
2773 nsp->sl_flags == 0);
2774 slab_init(nsp, MC_16KCL,
2775 SLF_MAPPED | SLF_PARTIAL,
2776 m16kcl, NULL, 0, 0, 0);
2777 }
2778
2779 /* Insert this slab */
2780 slab_insert(sp, MC_16KCL);
2781
2782 /* Update stats now since slab_get() drops the lock */
2783 m_infree(MC_16KCL)++;
2784 m_total(MC_16KCL)++;
2785 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2786 }
2787 }
2788 VERIFY(mca_list == NULL && con_list == NULL);
2789
2790 /* We're done; let others enter */
2791 mb_clalloc_busy = FALSE;
2792 if (mb_clalloc_waiters > 0) {
2793 mb_clalloc_waiters = 0;
2794 wakeup(mb_clalloc_waitchan);
2795 }
2796
2797 if (bufsize == m_maxsize(MC_BIGCL))
2798 return (numpages);
2799
2800 VERIFY(bufsize == m_maxsize(MC_16KCL));
2801 return (numpages / NSLABSP16KB);
2802
2803 out:
2804 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2805
2806 /* We're done; let others enter */
2807 mb_clalloc_busy = FALSE;
2808 if (mb_clalloc_waiters > 0) {
2809 mb_clalloc_waiters = 0;
2810 wakeup(mb_clalloc_waitchan);
2811 }
2812
2813 /*
2814 * In the non-blocking case, kick the worker thread if we have to
2815 * grow the pool or if the number of free clusters is less than requested.
2816 */
2817 if (bufsize == m_maxsize(MC_BIGCL)) {
2818 if (i > 0) {
2819 /*
2820 * Remember total number of 4KB clusters needed
2821 * at this time.
2822 */
2823 i += m_total(MC_BIGCL);
2824 if (i > mbuf_expand_big) {
2825 mbuf_expand_big = i;
2826 if (mbuf_worker_ready)
2827 wakeup((caddr_t)&mbuf_worker_run);
2828 }
2829 }
2830
2831 if (m_infree(MC_BIGCL) >= num)
2832 return (1);
2833 } else {
2834 if (i > 0) {
2835 /*
2836 * Remember total number of 16KB clusters needed
2837 * at this time.
2838 */
2839 i += m_total(MC_16KCL);
2840 if (i > mbuf_expand_16k) {
2841 mbuf_expand_16k = i;
2842 if (mbuf_worker_ready)
2843 wakeup((caddr_t)&mbuf_worker_run);
2844 }
2845 }
2846
2847 if (m_infree(MC_16KCL) >= num)
2848 return (1);
2849 }
2850 return (0);
2851 }
2852
2853 /*
2854 * Populate the global freelist of the corresponding buffer class.
2855 */
2856 static int
2857 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2858 {
2859 mcache_obj_t *o = NULL;
2860 int i, numpages = 0, count;
2861
2862 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2863 class == MC_16KCL);
2864
2865 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2866
2867 switch (class) {
2868 case MC_MBUF:
2869 case MC_CL:
2870 case MC_BIGCL:
2871 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
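/*
 * Worked example (illustrative sketch, assuming a 4KB page and a
 * 256-byte MSIZE): a request for 100 mbufs needs
 * (100 * 256 + 4095) / 4096 == 7 pages, each of which is later cut
 * into NMBPBG mbufs (or NCLPBG 2KB clusters for MC_CL) below.
 */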
2872 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2873
2874 /* Respect the minimum limit on 4KB clusters */
2875 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2876 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2877 if (class != MC_BIGCL || (wait & MCR_COMP))
2878 return (0);
2879 }
2880 if (class == MC_BIGCL)
2881 return (i != 0);
2882 break;
2883
2884 case MC_16KCL:
2885 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2886 /* NOTREACHED */
2887
2888 default:
2889 VERIFY(0);
2890 /* NOTREACHED */
2891 }
2892
2893 VERIFY(class == MC_MBUF || class == MC_CL);
2894
2895 /* how many objects will we cut the page into? */
2896 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2897
2898 for (count = 0; count < numpages; count++) {
2899
2900 /* respect totals, minlimit, maxlimit */
2901 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2902 m_total(class) >= m_maxlimit(class))
2903 break;
2904
2905 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2906 break;
2907
2908 struct mbuf *m = (struct mbuf *)o;
2909 union mcluster *c = (union mcluster *)o;
2910 mcl_slab_t *sp = slab_get(o);
2911 mcache_audit_t *mca = NULL;
2912
2913 VERIFY(slab_is_detached(sp) &&
2914 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2915
2916 /*
2917 * Make sure that the cluster is unmolested
2918 * while in the freelist
2919 */
2920 if (mclverify) {
2921 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2922 mcache_audit_free_verify(mca, o, 0,
2923 m_maxsize(MC_BIGCL));
2924 }
2925
2926 /* Reinitialize it as an mbuf or 2K slab */
2927 slab_init(sp, class, sp->sl_flags,
2928 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2929
2930 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2931 VERIFY(sp->sl_head == NULL);
2932
2933 VERIFY(m_total(MC_BIGCL) > 0);
2934 m_total(MC_BIGCL)--;
2935 mbstat.m_bigclusters = m_total(MC_BIGCL);
2936
2937 m_total(class) += numobj;
2938 m_infree(class) += numobj;
2939
2940 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2941 VERIFY(m_total(class) <= m_maxlimit(class));
2942
2943 i = numobj;
2944 if (class == MC_MBUF) {
2945 mbstat.m_mbufs = m_total(MC_MBUF);
2946 mtype_stat_add(MT_FREE, NMBPBG);
2947 while (i--) {
2948 /*
2949 * If auditing is enabled, construct the
2950 * shadow mbuf in the audit structure
2951 * instead of the actual one.
2952 * mbuf_slab_audit() will take care of
2953 * restoring the contents after the
2954 * integrity check.
2955 */
2956 if (mclaudit != NULL) {
2957 struct mbuf *ms;
2958 mca = mcl_audit_buf2mca(MC_MBUF,
2959 (mcache_obj_t *)m);
2960 ms = ((struct mbuf *)
2961 mca->mca_contents);
2962 ms->m_type = MT_FREE;
2963 } else {
2964 m->m_type = MT_FREE;
2965 }
2966 m->m_next = sp->sl_head;
2967 sp->sl_head = (void *)m++;
2968 }
2969 } else { /* MC_CL */
2970 mbstat.m_clfree =
2971 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2972 mbstat.m_clusters = m_total(MC_CL);
2973 while (i--) {
2974 c->mcl_next = sp->sl_head;
2975 sp->sl_head = (void *)c++;
2976 }
2977 }
2978
2979 /* Insert into the mbuf or 2k slab list */
2980 slab_insert(sp, class);
2981
2982 if ((i = mb_waiters) > 0)
2983 mb_waiters = 0;
2984 if (i != 0)
2985 wakeup(mb_waitchan);
2986 }
2987 return (count != 0);
2988 }
2989
2990 /*
2991 * For each class, initialize the freelist to hold m_minlimit() objects.
2992 */
2993 static void
2994 freelist_init(mbuf_class_t class)
2995 {
2996 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2997
2998 VERIFY(class == MC_CL || class == MC_BIGCL);
2999 VERIFY(m_total(class) == 0);
3000 VERIFY(m_minlimit(class) > 0);
3001
3002 while (m_total(class) < m_minlimit(class))
3003 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
3004
3005 VERIFY(m_total(class) >= m_minlimit(class));
3006 }
3007
3008 /*
3009 * (Inaccurately) check if it might be worth a trip back to the
3010 * mcache layer due to the availability of objects there. We'll
3011 * end up back here if there's nothing up there.
3012 */
3013 static boolean_t
3014 mbuf_cached_above(mbuf_class_t class, int wait)
3015 {
3016 switch (class) {
3017 case MC_MBUF:
3018 if (wait & MCR_COMP)
3019 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3020 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3021 break;
3022
3023 case MC_CL:
3024 if (wait & MCR_COMP)
3025 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3026 break;
3027
3028 case MC_BIGCL:
3029 if (wait & MCR_COMP)
3030 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3031 break;
3032
3033 case MC_16KCL:
3034 if (wait & MCR_COMP)
3035 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3036 break;
3037
3038 case MC_MBUF_CL:
3039 case MC_MBUF_BIGCL:
3040 case MC_MBUF_16KCL:
3041 break;
3042
3043 default:
3044 VERIFY(0);
3045 /* NOTREACHED */
3046 }
3047
3048 return (!mcache_bkt_isempty(m_cache(class)));
3049 }
3050
3051 /*
3052 * If possible, convert constructed objects to raw ones.
3053 */
3054 static boolean_t
3055 mbuf_steal(mbuf_class_t class, unsigned int num)
3056 {
3057 mcache_obj_t *top = NULL;
3058 mcache_obj_t **list = &top;
3059 unsigned int tot = 0;
3060
3061 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3062
3063 switch (class) {
3064 case MC_MBUF:
3065 case MC_CL:
3066 case MC_BIGCL:
3067 case MC_16KCL:
3068 return (FALSE);
3069
3070 case MC_MBUF_CL:
3071 case MC_MBUF_BIGCL:
3072 case MC_MBUF_16KCL:
3073 /* Get the required number of constructed objects if possible */
3074 if (m_infree(class) > m_minlimit(class)) {
3075 tot = cslab_alloc(class, &list,
3076 MIN(num, m_infree(class)));
3077 }
3078
3079 /* And destroy them to get back the raw objects */
3080 if (top != NULL)
3081 (void) cslab_free(class, top, 1);
3082 break;
3083
3084 default:
3085 VERIFY(0);
3086 /* NOTREACHED */
3087 }
3088
3089 return (tot == num);
3090 }
3091
3092 static void
3093 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3094 {
3095 int m, bmap = 0;
3096
3097 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3098
3099 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3100 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3101 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3102
3103 /*
3104 * This logic can be made smarter; for now, simply mark
3105 * all other related classes as potential victims.
3106 */
3107 switch (class) {
3108 case MC_MBUF:
3109 m_wantpurge(MC_CL)++;
3110 m_wantpurge(MC_BIGCL)++;
3111 m_wantpurge(MC_MBUF_CL)++;
3112 m_wantpurge(MC_MBUF_BIGCL)++;
3113 break;
3114
3115 case MC_CL:
3116 m_wantpurge(MC_MBUF)++;
3117 m_wantpurge(MC_BIGCL)++;
3118 m_wantpurge(MC_MBUF_BIGCL)++;
3119 if (!comp)
3120 m_wantpurge(MC_MBUF_CL)++;
3121 break;
3122
3123 case MC_BIGCL:
3124 m_wantpurge(MC_MBUF)++;
3125 m_wantpurge(MC_CL)++;
3126 m_wantpurge(MC_MBUF_CL)++;
3127 if (!comp)
3128 m_wantpurge(MC_MBUF_BIGCL)++;
3129 break;
3130
3131 case MC_16KCL:
3132 if (!comp)
3133 m_wantpurge(MC_MBUF_16KCL)++;
3134 break;
3135
3136 default:
3137 VERIFY(0);
3138 /* NOTREACHED */
3139 }
3140
3141 /*
3142 * Run through each marked class and check if we really need to
3143 * purge (and therefore temporarily disable) the per-CPU cache
3144 * layer used by the class. If so, remember the classes since
3145 * we are going to drop the lock below prior to purging.
3146 */
3147 for (m = 0; m < NELEM(mbuf_table); m++) {
3148 if (m_wantpurge(m) > 0) {
3149 m_wantpurge(m) = 0;
3150 /*
3151 * Try hard to steal the required number of objects
3152 * from the freelist of other mbuf classes. Only
3153 * purge and disable the per-CPU cache layer when
3154 * we don't have enough; it's the last resort.
3155 */
3156 if (!mbuf_steal(m, num))
3157 bmap |= (1 << m);
3158 }
3159 }
3160
3161 lck_mtx_unlock(mbuf_mlock);
3162
3163 if (bmap != 0) {
3164 /* drain is performed in pfslowtimo(), to avoid deadlocks */
3165 do_reclaim = 1;
3166
3167 /* Sigh; we have no other choices but to ask mcache to purge */
3168 for (m = 0; m < NELEM(mbuf_table); m++) {
3169 if ((bmap & (1 << m)) &&
3170 mcache_purge_cache(m_cache(m))) {
3171 lck_mtx_lock(mbuf_mlock);
3172 m_purge_cnt(m)++;
3173 mbstat.m_drain++;
3174 lck_mtx_unlock(mbuf_mlock);
3175 }
3176 }
3177 } else {
3178 /*
3179 * Request mcache to reap extra elements from all of its caches;
3180 * note that all reaps are serialized and happen only at a fixed
3181 * interval.
3182 */
3183 mcache_reap();
3184 }
3185 lck_mtx_lock(mbuf_mlock);
3186 }
3187
3188 static inline struct mbuf *
3189 m_get_common(int wait, short type, int hdr)
3190 {
3191 struct mbuf *m;
3192 int mcflags = MSLEEPF(wait);
3193
3194 /* Is this due to a non-blocking retry? If so, then try harder */
3195 if (mcflags & MCR_NOSLEEP)
3196 mcflags |= MCR_TRYHARD;
3197
3198 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3199 if (m != NULL) {
3200 MBUF_INIT(m, hdr, type);
3201 mtype_stat_inc(type);
3202 mtype_stat_dec(MT_FREE);
3203 #if CONFIG_MACF_NET
3204 if (hdr && mac_init_mbuf(m, wait) != 0) {
3205 m_free(m);
3206 return (NULL);
3207 }
3208 #endif /* CONFIG_MACF_NET */
3209 }
3210 return (m);
3211 }
3212
3213 /*
3214 * Space allocation routines; these are also available as macros
3215 * for critical paths.
3216 */
3217 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3218 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3219 #define _M_RETRY(wait, type) _M_GET(wait, type)
3220 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3221 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3222 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
3223
3224 struct mbuf *
3225 m_get(int wait, int type)
3226 {
3227 return (_M_GET(wait, type));
3228 }
3229
3230 struct mbuf *
3231 m_gethdr(int wait, int type)
3232 {
3233 return (_M_GETHDR(wait, type));
3234 }
3235
3236 struct mbuf *
3237 m_retry(int wait, int type)
3238 {
3239 return (_M_RETRY(wait, type));
3240 }
3241
3242 struct mbuf *
3243 m_retryhdr(int wait, int type)
3244 {
3245 return (_M_RETRYHDR(wait, type));
3246 }
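
/*
 * Illustrative usage of the allocators above (hypothetical sketch):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return (ENOBUFS);
 *	...fill in m->m_data, set m->m_len and m->m_pkthdr.len...
 *	m_freem(m);
 */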
3247
3248 struct mbuf *
3249 m_getclr(int wait, int type)
3250 {
3251 struct mbuf *m;
3252
3253 _MGET(m, wait, type);
3254 if (m != NULL)
3255 bzero(MTOD(m, caddr_t), MLEN);
3256 return (m);
3257 }
3258
3259 struct mbuf *
3260 m_free(struct mbuf *m)
3261 {
3262 struct mbuf *n = m->m_next;
3263
3264 if (m->m_type == MT_FREE)
3265 panic("m_free: freeing an already freed mbuf");
3266
3267 /* Free the aux data and tags if there are any */
3268 if (m->m_flags & M_PKTHDR) {
3269 m_tag_delete_chain(m, NULL);
3270 }
3271
3272 if (m->m_flags & M_EXT) {
3273 u_int32_t refcnt;
3274 u_int32_t composite;
3275
3276 refcnt = m_decref(m);
3277 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3278 if (refcnt == 0 && !composite) {
3279 if (m->m_ext.ext_free == NULL) {
3280 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3281 } else if (m->m_ext.ext_free == m_bigfree) {
3282 mcache_free(m_cache(MC_BIGCL),
3283 m->m_ext.ext_buf);
3284 } else if (m->m_ext.ext_free == m_16kfree) {
3285 mcache_free(m_cache(MC_16KCL),
3286 m->m_ext.ext_buf);
3287 } else {
3288 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3289 m->m_ext.ext_size, m->m_ext.ext_arg);
3290 }
3291 mcache_free(ref_cache, MEXT_RFA(m));
3292 MEXT_RFA(m) = NULL;
3293 } else if (refcnt == 0 && composite) {
3294 VERIFY(m->m_type != MT_FREE);
3295
3296 mtype_stat_dec(m->m_type);
3297 mtype_stat_inc(MT_FREE);
3298
3299 m->m_type = MT_FREE;
3300 m->m_flags = M_EXT;
3301 m->m_len = 0;
3302 m->m_next = m->m_nextpkt = NULL;
3303
3304 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3305
3306 /* "Free" into the intermediate cache */
3307 if (m->m_ext.ext_free == NULL) {
3308 mcache_free(m_cache(MC_MBUF_CL), m);
3309 } else if (m->m_ext.ext_free == m_bigfree) {
3310 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3311 } else {
3312 VERIFY(m->m_ext.ext_free == m_16kfree);
3313 mcache_free(m_cache(MC_MBUF_16KCL), m);
3314 }
3315 return (n);
3316 }
3317 }
3318
3319 if (m->m_type != MT_FREE) {
3320 mtype_stat_dec(m->m_type);
3321 mtype_stat_inc(MT_FREE);
3322 }
3323
3324 m->m_type = MT_FREE;
3325 m->m_flags = m->m_len = 0;
3326 m->m_next = m->m_nextpkt = NULL;
3327
3328 mcache_free(m_cache(MC_MBUF), m);
3329
3330 return (n);
3331 }
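
/*
 * Illustrative usage (hypothetical sketch): since m_free() releases a
 * single mbuf and returns its successor, a caller can drain a whole
 * chain with:
 *
 *	while (m != NULL)
 *		m = m_free(m);
 */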
3332
3333 __private_extern__ struct mbuf *
3334 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3335 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3336 int wait)
3337 {
3338 struct ext_ref *rfa = NULL;
3339
3340 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3341 return (NULL);
3342
3343 if (m->m_flags & M_EXT) {
3344 u_int32_t refcnt;
3345 u_int32_t composite;
3346
3347 refcnt = m_decref(m);
3348 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3349 if (refcnt == 0 && !composite) {
3350 if (m->m_ext.ext_free == NULL) {
3351 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3352 } else if (m->m_ext.ext_free == m_bigfree) {
3353 mcache_free(m_cache(MC_BIGCL),
3354 m->m_ext.ext_buf);
3355 } else if (m->m_ext.ext_free == m_16kfree) {
3356 mcache_free(m_cache(MC_16KCL),
3357 m->m_ext.ext_buf);
3358 } else {
3359 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3360 m->m_ext.ext_size, m->m_ext.ext_arg);
3361 }
3362 /* Re-use the reference structure */
3363 rfa = MEXT_RFA(m);
3364 } else if (refcnt == 0 && composite) {
3365 VERIFY(m->m_type != MT_FREE);
3366
3367 mtype_stat_dec(m->m_type);
3368 mtype_stat_inc(MT_FREE);
3369
3370 m->m_type = MT_FREE;
3371 m->m_flags = M_EXT;
3372 m->m_len = 0;
3373 m->m_next = m->m_nextpkt = NULL;
3374
3375 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3376
3377 /* "Free" into the intermediate cache */
3378 if (m->m_ext.ext_free == NULL) {
3379 mcache_free(m_cache(MC_MBUF_CL), m);
3380 } else if (m->m_ext.ext_free == m_bigfree) {
3381 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3382 } else {
3383 VERIFY(m->m_ext.ext_free == m_16kfree);
3384 mcache_free(m_cache(MC_MBUF_16KCL), m);
3385 }
3386 /*
3387 * Allocate a new mbuf, since we didn't divorce
3388 * the composite mbuf + cluster pair above.
3389 */
3390 if ((m = _M_GETHDR(wait, type)) == NULL)
3391 return (NULL);
3392 }
3393 }
3394
3395 if (rfa == NULL &&
3396 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3397 m_free(m);
3398 return (NULL);
3399 }
3400
3401 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3402
3403 return (m);
3404 }
3405
3406 /*
3407 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3408 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3409 */
3410 struct mbuf *
3411 m_getcl(int wait, int type, int flags)
3412 {
3413 struct mbuf *m;
3414 int mcflags = MSLEEPF(wait);
3415 int hdr = (flags & M_PKTHDR);
3416
3417 /* Is this due to a non-blocking retry? If so, then try harder */
3418 if (mcflags & MCR_NOSLEEP)
3419 mcflags |= MCR_TRYHARD;
3420
3421 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3422 if (m != NULL) {
3423 u_int32_t flag;
3424 struct ext_ref *rfa;
3425 void *cl;
3426
3427 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3428 cl = m->m_ext.ext_buf;
3429 rfa = MEXT_RFA(m);
3430
3431 ASSERT(cl != NULL && rfa != NULL);
3432 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3433
3434 flag = MEXT_FLAGS(m);
3435
3436 MBUF_INIT(m, hdr, type);
3437 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3438
3439 mtype_stat_inc(type);
3440 mtype_stat_dec(MT_FREE);
3441 #if CONFIG_MACF_NET
3442 if (hdr && mac_init_mbuf(m, wait) != 0) {
3443 m_freem(m);
3444 return (NULL);
3445 }
3446 #endif /* CONFIG_MACF_NET */
3447 }
3448 return (m);
3449 }
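
/*
 * Illustrative usage (hypothetical sketch): allocate a packet-header mbuf
 * with a 2KB cluster already attached:
 *
 *	struct mbuf *m = m_getcl(M_WAIT, MT_DATA, M_PKTHDR);
 *	if (m != NULL)
 *		m->m_pkthdr.len = m->m_len = 0;
 */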
3450
3451 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3452 struct mbuf *
3453 m_mclget(struct mbuf *m, int wait)
3454 {
3455 struct ext_ref *rfa;
3456
3457 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3458 return (m);
3459
3460 m->m_ext.ext_buf = m_mclalloc(wait);
3461 if (m->m_ext.ext_buf != NULL) {
3462 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3463 } else {
3464 mcache_free(ref_cache, rfa);
3465 }
3466 return (m);
3467 }
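
/*
 * Illustrative usage (hypothetical sketch): callers must check that the
 * cluster actually got attached, since m_mclget() returns the mbuf
 * unchanged on failure:
 *
 *	m = m_mclget(m, M_DONTWAIT);
 *	if (!(m->m_flags & M_EXT)) {
 *		m_free(m);
 *		return (NULL);
 *	}
 */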
3468
3469 /* Allocate an mbuf cluster */
3470 caddr_t
3471 m_mclalloc(int wait)
3472 {
3473 int mcflags = MSLEEPF(wait);
3474
3475 /* Is this due to a non-blocking retry? If so, then try harder */
3476 if (mcflags & MCR_NOSLEEP)
3477 mcflags |= MCR_TRYHARD;
3478
3479 return (mcache_alloc(m_cache(MC_CL), mcflags));
3480 }
3481
3482 /* Free an mbuf cluster */
3483 void
3484 m_mclfree(caddr_t p)
3485 {
3486 mcache_free(m_cache(MC_CL), p);
3487 }
3488
3489 /*
3490 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
3491 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3492 */
3493 int
3494 m_mclhasreference(struct mbuf *m)
3495 {
3496 if (!(m->m_flags & M_EXT))
3497 return (0);
3498
3499 ASSERT(MEXT_RFA(m) != NULL);
3500
3501 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3502 }
3503
3504 __private_extern__ caddr_t
3505 m_bigalloc(int wait)
3506 {
3507 int mcflags = MSLEEPF(wait);
3508
3509 /* Is this due to a non-blocking retry? If so, then try harder */
3510 if (mcflags & MCR_NOSLEEP)
3511 mcflags |= MCR_TRYHARD;
3512
3513 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3514 }
3515
3516 __private_extern__ void
3517 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3518 {
3519 mcache_free(m_cache(MC_BIGCL), p);
3520 }
3521
3522 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3523 __private_extern__ struct mbuf *
3524 m_mbigget(struct mbuf *m, int wait)
3525 {
3526 struct ext_ref *rfa;
3527
3528 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3529 return (m);
3530
3531 m->m_ext.ext_buf = m_bigalloc(wait);
3532 if (m->m_ext.ext_buf != NULL) {
3533 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3534 } else {
3535 mcache_free(ref_cache, rfa);
3536 }
3537 return (m);
3538 }
3539
3540 __private_extern__ caddr_t
3541 m_16kalloc(int wait)
3542 {
3543 int mcflags = MSLEEPF(wait);
3544
3545 /* Is this due to a non-blocking retry? If so, then try harder */
3546 if (mcflags & MCR_NOSLEEP)
3547 mcflags |= MCR_TRYHARD;
3548
3549 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3550 }
3551
3552 __private_extern__ void
3553 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3554 {
3555 mcache_free(m_cache(MC_16KCL), p);
3556 }
3557
3558 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3559 __private_extern__ struct mbuf *
3560 m_m16kget(struct mbuf *m, int wait)
3561 {
3562 struct ext_ref *rfa;
3563
3564 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3565 return (m);
3566
3567 m->m_ext.ext_buf = m_16kalloc(wait);
3568 if (m->m_ext.ext_buf != NULL) {
3569 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3570 } else {
3571 mcache_free(ref_cache, rfa);
3572 }
3573 return (m);
3574 }
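/*
 * Added illustrative sketch (not part of the original file): the common
 * calling pattern for the cluster-attach routines above (m_mclget,
 * m_mbigget, m_m16kget).  Each returns the same mbuf whether or not the
 * cluster could be attached, so callers must test M_EXT afterwards.
 * Kept under #if 0; the helper name is hypothetical.
 */
#if 0
static struct mbuf *
example_attach_bigcl(int how)
{
	struct mbuf *m;

	if ((m = m_gethdr(how, MT_DATA)) == NULL)
		return (NULL);

	m = m_mbigget(m, how);		/* try to attach a 4KB cluster */
	if (!(m->m_flags & M_EXT)) {	/* attach failed; mbuf still valid */
		m_freem(m);
		return (NULL);
	}
	return (m);
}
#endif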
3575
3576 /*
3577 * "Move" mbuf pkthdr from "from" to "to".
3578 * "from" must have M_PKTHDR set, and "to" must be empty.
3579 */
3580 void
3581 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3582 {
3583 /* We will be taking over the tags of 'to' */
3584 if (to->m_flags & M_PKTHDR)
3585 m_tag_delete_chain(to, NULL);
3586 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3587 m_tag_init(from); /* purge tags from src */
3588 m_service_class_init(from); /* reset svc class from src */
3589 from->m_pkthdr.aux_flags = 0; /* clear aux flags from src */
3590 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3591 if ((to->m_flags & M_EXT) == 0)
3592 to->m_data = to->m_pktdat;
3593 }
3594
3595 /*
3596 * Duplicate "from"'s mbuf pkthdr in "to".
3597 * "from" must have M_PKTHDR set, and "to" must be empty.
3598 * In particular, this does a deep copy of the packet tags.
3599 */
3600 static int
3601 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3602 {
3603 if (to->m_flags & M_PKTHDR)
3604 m_tag_delete_chain(to, NULL);
3605 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3606 if ((to->m_flags & M_EXT) == 0)
3607 to->m_data = to->m_pktdat;
3608 to->m_pkthdr = from->m_pkthdr;
3609 m_tag_init(to);
3610 return (m_tag_copy_chain(to, from, how));
3611 }
3612
3613 void
3614 m_copy_pftag(struct mbuf *to, struct mbuf *from)
3615 {
3616 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3617 to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3618 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3619 }
3620
3621 /*
3622 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3623 * if wantall is not set, return whatever number was available. Set up the
3624 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3625 * are chained on the m_nextpkt field. Any packets requested beyond this
3626 * are chained onto the last packet header's m_next field. The size of
3627 * the cluster is controlled by the parameter bufsize.
3628 */
3629 __private_extern__ struct mbuf *
3630 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3631 int wait, int wantall, size_t bufsize)
3632 {
3633 struct mbuf *m;
3634 struct mbuf **np, *top;
3635 unsigned int pnum, needed = *num_needed;
3636 mcache_obj_t *mp_list = NULL;
3637 int mcflags = MSLEEPF(wait);
3638 u_int32_t flag;
3639 struct ext_ref *rfa;
3640 mcache_t *cp;
3641 void *cl;
3642
3643 ASSERT(bufsize == m_maxsize(MC_CL) ||
3644 bufsize == m_maxsize(MC_BIGCL) ||
3645 bufsize == m_maxsize(MC_16KCL));
3646
3647 /*
3648 * Caller must first check for njcl because this
3649 * routine is internal and not exposed/used via KPI.
3650 */
3651 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3652
3653 top = NULL;
3654 np = &top;
3655 pnum = 0;
3656
3657 /*
3658 * The caller doesn't want all the requested buffers; only some.
3659 * Try hard to get what we can, but don't block. This effectively
3660 * overrides MCR_SLEEP, since this thread will not go to sleep
3661 * if we can't get all the buffers.
3662 */
3663 if (!wantall || (mcflags & MCR_NOSLEEP))
3664 mcflags |= MCR_TRYHARD;
3665
3666 /* Allocate the composite mbuf + cluster elements from the cache */
3667 if (bufsize == m_maxsize(MC_CL))
3668 cp = m_cache(MC_MBUF_CL);
3669 else if (bufsize == m_maxsize(MC_BIGCL))
3670 cp = m_cache(MC_MBUF_BIGCL);
3671 else
3672 cp = m_cache(MC_MBUF_16KCL);
3673 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3674
3675 for (pnum = 0; pnum < needed; pnum++) {
3676 m = (struct mbuf *)mp_list;
3677 mp_list = mp_list->obj_next;
3678
3679 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3680 cl = m->m_ext.ext_buf;
3681 rfa = MEXT_RFA(m);
3682
3683 ASSERT(cl != NULL && rfa != NULL);
3684 VERIFY(MBUF_IS_COMPOSITE(m));
3685
3686 flag = MEXT_FLAGS(m);
3687
3688 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3689 if (bufsize == m_maxsize(MC_16KCL)) {
3690 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3691 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3692 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3693 } else {
3694 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3695 }
3696
3697 if (num_with_pkthdrs > 0) {
3698 --num_with_pkthdrs;
3699 #if CONFIG_MACF_NET
3700 if (mac_mbuf_label_init(m, wait) != 0) {
3701 m_freem(m);
3702 break;
3703 }
3704 #endif /* MAC_NET */
3705 }
3706
3707 *np = m;
3708 if (num_with_pkthdrs > 0)
3709 np = &m->m_nextpkt;
3710 else
3711 np = &m->m_next;
3712 }
3713 ASSERT(pnum != *num_needed || mp_list == NULL);
3714 if (mp_list != NULL)
3715 mcache_free_ext(cp, mp_list);
3716
3717 if (pnum > 0) {
3718 mtype_stat_add(MT_DATA, pnum);
3719 mtype_stat_sub(MT_FREE, pnum);
3720 }
3721
3722 if (wantall && (pnum != *num_needed)) {
3723 if (top != NULL)
3724 m_freem_list(top);
3725 return (NULL);
3726 }
3727
3728 if (pnum > *num_needed) {
3729 printf("%s: File a radar related to <rdar://10146739>. \
3730 needed = %u, pnum = %u, num_needed = %u \n",
3731 __func__, needed, pnum, *num_needed);
3732 }
3733
3734 *num_needed = pnum;
3735 return (top);
3736 }
3737
3738 /*
3739 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3740 * wantall is not set, return whatever number was available. The size of
3741 * each mbuf in the list is controlled by the parameter packetlen. Each
3742 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3743 * in the chain is called a segment. If maxsegments is not NULL and the
3744 * value pointed to is not zero, it specifies the maximum number of segments
3745 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3746 * is zero, the caller does not have any restriction on the number of segments.
3747 * The actual number of segments of an mbuf chain is returned in the value
3748 * pointed to by maxsegments.
3749 */
3750 __private_extern__ struct mbuf *
3751 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3752 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3753 {
3754 struct mbuf **np, *top, *first = NULL;
3755 size_t bufsize, r_bufsize;
3756 unsigned int num = 0;
3757 unsigned int nsegs = 0;
3758 unsigned int needed, resid;
3759 int mcflags = MSLEEPF(wait);
3760 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3761 mcache_t *cp = NULL, *rcp = NULL;
3762
3763 if (*numlist == 0)
3764 return (NULL);
3765
3766 top = NULL;
3767 np = &top;
3768
3769 if (wantsize == 0) {
3770 if (packetlen <= MINCLSIZE) {
3771 bufsize = packetlen;
3772 } else if (packetlen > m_maxsize(MC_CL)) {
3773 /* Use 4KB if jumbo cluster pool isn't available */
3774 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3775 bufsize = m_maxsize(MC_BIGCL);
3776 else
3777 bufsize = m_maxsize(MC_16KCL);
3778 } else {
3779 bufsize = m_maxsize(MC_CL);
3780 }
3781 } else if (wantsize == m_maxsize(MC_CL) ||
3782 wantsize == m_maxsize(MC_BIGCL) ||
3783 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3784 bufsize = wantsize;
3785 } else {
3786 return (NULL);
3787 }
3788
3789 if (bufsize <= MHLEN) {
3790 nsegs = 1;
3791 } else if (bufsize <= MINCLSIZE) {
3792 if (maxsegments != NULL && *maxsegments == 1) {
3793 bufsize = m_maxsize(MC_CL);
3794 nsegs = 1;
3795 } else {
3796 nsegs = 2;
3797 }
3798 } else if (bufsize == m_maxsize(MC_16KCL)) {
3799 VERIFY(njcl > 0);
3800 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3801 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3802 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3803 } else {
3804 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3805 }
3806 if (maxsegments != NULL) {
3807 if (*maxsegments && nsegs > *maxsegments) {
3808 *maxsegments = nsegs;
3809 return (NULL);
3810 }
3811 *maxsegments = nsegs;
3812 }
3813
3814 /*
3815 * The caller doesn't want all the requested buffers; only some.
3816 * Try hard to get what we can, but don't block. This effectively
3817 * overrides MCR_SLEEP, since this thread will not go to sleep
3818 * if we can't get all the buffers.
3819 */
3820 if (!wantall || (mcflags & MCR_NOSLEEP))
3821 mcflags |= MCR_TRYHARD;
3822
3823 /*
3824 * Simple case where all elements in the lists/chains are mbufs.
3825 * Unless bufsize is greater than MHLEN, each segment chain is made
3826 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3827 * of 2 mbufs; the second one is used for the residual data, i.e.
3828 * the remaining data that cannot fit into the first mbuf.
3829 */
3830 if (bufsize <= MINCLSIZE) {
3831 /* Allocate the elements in one shot from the mbuf cache */
3832 ASSERT(bufsize <= MHLEN || nsegs == 2);
3833 cp = m_cache(MC_MBUF);
3834 needed = mcache_alloc_ext(cp, &mp_list,
3835 (*numlist) * nsegs, mcflags);
3836
3837 /*
3838 * The number of elements must be even if we are to use an
3839 * mbuf (instead of a cluster) to store the residual data.
3840 * If we couldn't allocate the requested number of mbufs,
3841 * trim the number down (if it's odd) in order to avoid
3842 * creating a partial segment chain.
3843 */
3844 if (bufsize > MHLEN && (needed & 0x1))
3845 needed--;
3846
3847 while (num < needed) {
3848 struct mbuf *m;
3849
3850 m = (struct mbuf *)mp_list;
3851 mp_list = mp_list->obj_next;
3852 ASSERT(m != NULL);
3853
3854 MBUF_INIT(m, 1, MT_DATA);
3855 #if CONFIG_MACF_NET
3856 if (mac_init_mbuf(m, wait) != 0) {
3857 m_free(m);
3858 break;
3859 }
3860 #endif /* MAC_NET */
3861 num++;
3862 if (bufsize > MHLEN) {
3863 /* A second mbuf for this segment chain */
3864 m->m_next = (struct mbuf *)mp_list;
3865 mp_list = mp_list->obj_next;
3866 ASSERT(m->m_next != NULL);
3867
3868 MBUF_INIT(m->m_next, 0, MT_DATA);
3869 num++;
3870 }
3871 *np = m;
3872 np = &m->m_nextpkt;
3873 }
3874 ASSERT(num != *numlist || mp_list == NULL);
3875
3876 if (num > 0) {
3877 mtype_stat_add(MT_DATA, num);
3878 mtype_stat_sub(MT_FREE, num);
3879 }
3880 num /= nsegs;
3881
3882 /* We've got them all; return to caller */
3883 if (num == *numlist)
3884 return (top);
3885
3886 goto fail;
3887 }
3888
3889 /*
3890 * Complex cases where elements are made up of one or more composite
3891 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3892 * be illustrated as follows:
3893 *
3894 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3895 *
3896 * Every composite mbuf + cluster element comes from the intermediate
3897 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3898 * the last composite element will come from the MC_MBUF_CL cache,
3899 * unless the residual data is larger than 2KB, in which case we use the
3900 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3901 * data is defined as extra data beyond the first element that cannot
3902 * fit into the previous element, i.e. there is no residual data if
3903 * the chain only has 1 segment.
3904 */
3905 r_bufsize = bufsize;
3906 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3907 if (resid > 0) {
3908 /* There is residual data; figure out the cluster size */
3909 if (wantsize == 0 && packetlen > MINCLSIZE) {
3910 /*
3911 * Caller didn't request that all of the segments
3912 * in the chain use the same cluster size; use the
3913 * smaller of the cluster sizes.
3914 */
3915 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3916 r_bufsize = m_maxsize(MC_16KCL);
3917 else if (resid > m_maxsize(MC_CL))
3918 r_bufsize = m_maxsize(MC_BIGCL);
3919 else
3920 r_bufsize = m_maxsize(MC_CL);
3921 } else {
3922 /* Use the same cluster size as the other segments */
3923 resid = 0;
3924 }
3925 }
3926
3927 needed = *numlist;
3928 if (resid > 0) {
3929 /*
3930 * Attempt to allocate composite mbuf + cluster elements for
3931 * the residual data in each chain; record the number of such
3932 * elements that can be allocated so that we know how many
3933 * segment chains we can afford to create.
3934 */
3935 if (r_bufsize <= m_maxsize(MC_CL))
3936 rcp = m_cache(MC_MBUF_CL);
3937 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3938 rcp = m_cache(MC_MBUF_BIGCL);
3939 else
3940 rcp = m_cache(MC_MBUF_16KCL);
3941 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3942
3943 if (needed == 0)
3944 goto fail;
3945
3946 /* This is temporarily reduced for calculation */
3947 ASSERT(nsegs > 1);
3948 nsegs--;
3949 }
3950
3951 /*
3952 * Attempt to allocate the rest of the composite mbuf + cluster
3953 * elements for the number of segment chains that we need.
3954 */
3955 if (bufsize <= m_maxsize(MC_CL))
3956 cp = m_cache(MC_MBUF_CL);
3957 else if (bufsize <= m_maxsize(MC_BIGCL))
3958 cp = m_cache(MC_MBUF_BIGCL);
3959 else
3960 cp = m_cache(MC_MBUF_16KCL);
3961 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3962
3963 /* Round it down to avoid creating a partial segment chain */
3964 needed = (needed / nsegs) * nsegs;
3965 if (needed == 0)
3966 goto fail;
3967
3968 if (resid > 0) {
3969 /*
3970 * We're about to construct the chain(s); take into account
3971 * the number of segments we have created above to hold the
3972 * residual data for each chain, as well as restore the
3973 * original count of segments per chain.
3974 */
3975 ASSERT(nsegs > 0);
3976 needed += needed / nsegs;
3977 nsegs++;
3978 }
3979
3980 for (;;) {
3981 struct mbuf *m;
3982 u_int32_t flag;
3983 struct ext_ref *rfa;
3984 void *cl;
3985 int pkthdr;
3986
3987 ++num;
3988 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3989 m = (struct mbuf *)mp_list;
3990 mp_list = mp_list->obj_next;
3991 } else {
3992 m = (struct mbuf *)rmp_list;
3993 rmp_list = rmp_list->obj_next;
3994 }
3995 ASSERT(m != NULL);
3996 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3997 VERIFY(m->m_ext.ext_free == NULL ||
3998 m->m_ext.ext_free == m_bigfree ||
3999 m->m_ext.ext_free == m_16kfree);
4000
4001 cl = m->m_ext.ext_buf;
4002 rfa = MEXT_RFA(m);
4003
4004 ASSERT(cl != NULL && rfa != NULL);
4005 VERIFY(MBUF_IS_COMPOSITE(m));
4006
4007 flag = MEXT_FLAGS(m);
4008
4009 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4010 if (pkthdr)
4011 first = m;
4012 MBUF_INIT(m, pkthdr, MT_DATA);
4013 if (m->m_ext.ext_free == m_16kfree) {
4014 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4015 } else if (m->m_ext.ext_free == m_bigfree) {
4016 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4017 } else {
4018 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4019 }
4020 #if CONFIG_MACF_NET
4021 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4022 --num;
4023 m_freem(m);
4024 break;
4025 }
4026 #endif /* MAC_NET */
4027
4028 *np = m;
4029 if ((num % nsegs) == 0)
4030 np = &first->m_nextpkt;
4031 else
4032 np = &m->m_next;
4033
4034 if (num == needed)
4035 break;
4036 }
4037
4038 if (num > 0) {
4039 mtype_stat_add(MT_DATA, num);
4040 mtype_stat_sub(MT_FREE, num);
4041 }
4042
4043 num /= nsegs;
4044
4045 /* We've got them all; return to caller */
4046 if (num == *numlist) {
4047 ASSERT(mp_list == NULL && rmp_list == NULL);
4048 return (top);
4049 }
4050
4051 fail:
4052 /* Free up what's left of the above */
4053 if (mp_list != NULL)
4054 mcache_free_ext(cp, mp_list);
4055 if (rmp_list != NULL)
4056 mcache_free_ext(rcp, rmp_list);
4057 if (wantall && top != NULL) {
4058 m_freem(top);
4059 return (NULL);
4060 }
4061 *numlist = num;
4062 return (top);
4063 }
4064
4065 /*
4066 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4067 * packets on the receive ring.
4068 */
4069 __private_extern__ struct mbuf *
4070 m_getpacket_how(int wait)
4071 {
4072 unsigned int num_needed = 1;
4073
4074 return (m_getpackets_internal(&num_needed, 1, wait, 1,
4075 m_maxsize(MC_CL)));
4076 }
4077
4078 /*
4079 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
4080 * packets on the receive ring.
4081 */
4082 struct mbuf *
4083 m_getpacket(void)
4084 {
4085 unsigned int num_needed = 1;
4086
4087 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4088 m_maxsize(MC_CL)));
4089 }
4090
4091 /*
4092 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4093 * if this can't be met, return whatever number was available. Set up the
4094 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4095 * are chained on the m_nextpkt field. Any packets requested beyond this are
4096 * chained onto the last packet header's m_next field.
4097 */
4098 struct mbuf *
4099 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4100 {
4101 unsigned int n = num_needed;
4102
4103 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4104 m_maxsize(MC_CL)));
4105 }
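/*
 * Added illustrative sketch (not part of the original file): how a
 * driver might use m_getpackets() to refill a receive ring with
 * cluster-backed packets, walking the returned list via m_nextpkt.
 * The ring array and helper name are hypothetical; kept under #if 0.
 */
#if 0
static int
example_rx_refill(struct mbuf **ring, int slots)
{
	struct mbuf *list, *m;
	int filled = 0;

	/* Ask for `slots' packets, all configured with packet headers */
	list = m_getpackets(slots, slots, M_DONTWAIT);

	while (list != NULL && filled < slots) {
		m = list;
		list = list->m_nextpkt;
		m->m_nextpkt = NULL;
		ring[filled++] = m;
	}
	if (list != NULL)
		m_freem_list(list);	/* free anything not placed on the ring */
	return (filled);
}
#endif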
4106
4107 /*
4108 * Return a list of mbuf hdrs set up as packet hdrs chained together
4109 * on the m_nextpkt field
4110 */
4111 struct mbuf *
4112 m_getpackethdrs(int num_needed, int how)
4113 {
4114 struct mbuf *m;
4115 struct mbuf **np, *top;
4116
4117 top = NULL;
4118 np = &top;
4119
4120 while (num_needed--) {
4121 m = _M_RETRYHDR(how, MT_DATA);
4122 if (m == NULL)
4123 break;
4124
4125 *np = m;
4126 np = &m->m_nextpkt;
4127 }
4128
4129 return (top);
4130 }
4131
4132 /*
4133 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4134 * of packets freed. Used by the drivers.
4135 */
4136 int
4137 m_freem_list(struct mbuf *m)
4138 {
4139 struct mbuf *nextpkt;
4140 mcache_obj_t *mp_list = NULL;
4141 mcache_obj_t *mcl_list = NULL;
4142 mcache_obj_t *mbc_list = NULL;
4143 mcache_obj_t *m16k_list = NULL;
4144 mcache_obj_t *m_mcl_list = NULL;
4145 mcache_obj_t *m_mbc_list = NULL;
4146 mcache_obj_t *m_m16k_list = NULL;
4147 mcache_obj_t *ref_list = NULL;
4148 int pktcount = 0;
4149 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4150
4151 while (m != NULL) {
4152 pktcount++;
4153
4154 nextpkt = m->m_nextpkt;
4155 m->m_nextpkt = NULL;
4156
4157 while (m != NULL) {
4158 struct mbuf *next = m->m_next;
4159 mcache_obj_t *o, *rfa;
4160 u_int32_t refcnt, composite;
4161
4162 if (m->m_type == MT_FREE)
4163 panic("m_free: freeing an already freed mbuf");
4164
4165 if (m->m_type != MT_FREE)
4166 mt_free++;
4167
4168 if (m->m_flags & M_PKTHDR) {
4169 m_tag_delete_chain(m, NULL);
4170 }
4171
4172 if (!(m->m_flags & M_EXT))
4173 goto simple_free;
4174
4175 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4176 refcnt = m_decref(m);
4177 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4178 if (refcnt == 0 && !composite) {
4179 if (m->m_ext.ext_free == NULL) {
4180 o->obj_next = mcl_list;
4181 mcl_list = o;
4182 } else if (m->m_ext.ext_free == m_bigfree) {
4183 o->obj_next = mbc_list;
4184 mbc_list = o;
4185 } else if (m->m_ext.ext_free == m_16kfree) {
4186 o->obj_next = m16k_list;
4187 m16k_list = o;
4188 } else {
4189 (*(m->m_ext.ext_free))((caddr_t)o,
4190 m->m_ext.ext_size,
4191 m->m_ext.ext_arg);
4192 }
4193 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4194 rfa->obj_next = ref_list;
4195 ref_list = rfa;
4196 MEXT_RFA(m) = NULL;
4197 } else if (refcnt == 0 && composite) {
4198 VERIFY(m->m_type != MT_FREE);
4199 /*
4200 * Amortize the costs of atomic operations
4201 * by doing them at the end, if possible.
4202 */
4203 if (m->m_type == MT_DATA)
4204 mt_data++;
4205 else if (m->m_type == MT_HEADER)
4206 mt_header++;
4207 else if (m->m_type == MT_SONAME)
4208 mt_soname++;
4209 else if (m->m_type == MT_TAG)
4210 mt_tag++;
4211 else
4212 mtype_stat_dec(m->m_type);
4213
4214 m->m_type = MT_FREE;
4215 m->m_flags = M_EXT;
4216 m->m_len = 0;
4217 m->m_next = m->m_nextpkt = NULL;
4218
4219 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4220
4221 /* "Free" into the intermediate cache */
4222 o = (mcache_obj_t *)m;
4223 if (m->m_ext.ext_free == NULL) {
4224 o->obj_next = m_mcl_list;
4225 m_mcl_list = o;
4226 } else if (m->m_ext.ext_free == m_bigfree) {
4227 o->obj_next = m_mbc_list;
4228 m_mbc_list = o;
4229 } else {
4230 VERIFY(m->m_ext.ext_free == m_16kfree);
4231 o->obj_next = m_m16k_list;
4232 m_m16k_list = o;
4233 }
4234 m = next;
4235 continue;
4236 }
4237 simple_free:
4238 /*
4239 * Amortize the costs of atomic operations
4240 * by doing them at the end, if possible.
4241 */
4242 if (m->m_type == MT_DATA)
4243 mt_data++;
4244 else if (m->m_type == MT_HEADER)
4245 mt_header++;
4246 else if (m->m_type == MT_SONAME)
4247 mt_soname++;
4248 else if (m->m_type == MT_TAG)
4249 mt_tag++;
4250 else if (m->m_type != MT_FREE)
4251 mtype_stat_dec(m->m_type);
4252
4253 m->m_type = MT_FREE;
4254 m->m_flags = m->m_len = 0;
4255 m->m_next = m->m_nextpkt = NULL;
4256
4257 ((mcache_obj_t *)m)->obj_next = mp_list;
4258 mp_list = (mcache_obj_t *)m;
4259
4260 m = next;
4261 }
4262
4263 m = nextpkt;
4264 }
4265
4266 if (mt_free > 0)
4267 mtype_stat_add(MT_FREE, mt_free);
4268 if (mt_data > 0)
4269 mtype_stat_sub(MT_DATA, mt_data);
4270 if (mt_header > 0)
4271 mtype_stat_sub(MT_HEADER, mt_header);
4272 if (mt_soname > 0)
4273 mtype_stat_sub(MT_SONAME, mt_soname);
4274 if (mt_tag > 0)
4275 mtype_stat_sub(MT_TAG, mt_tag);
4276
4277 if (mp_list != NULL)
4278 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4279 if (mcl_list != NULL)
4280 mcache_free_ext(m_cache(MC_CL), mcl_list);
4281 if (mbc_list != NULL)
4282 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4283 if (m16k_list != NULL)
4284 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4285 if (m_mcl_list != NULL)
4286 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4287 if (m_mbc_list != NULL)
4288 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4289 if (m_m16k_list != NULL)
4290 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4291 if (ref_list != NULL)
4292 mcache_free_ext(ref_cache, ref_list);
4293
4294 return (pktcount);
4295 }
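/*
 * Added illustrative sketch (not part of the original file): batch-freeing
 * a completed transmit queue with m_freem_list() instead of freeing one
 * packet at a time, so the deferred cache/statistics updates above are
 * amortized over the whole list.  Kept under #if 0.
 */
#if 0
static void
example_tx_complete(struct mbuf *done_q)
{
	int freed;

	/* `done_q' is a list of packets linked through m_nextpkt */
	freed = m_freem_list(done_q);
	printf("example: freed %d completed packets\n", freed);
}
#endif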
4296
4297 void
4298 m_freem(struct mbuf *m)
4299 {
4300 while (m != NULL)
4301 m = m_free(m);
4302 }
4303
4304 /*
4305 * Mbuffer utility routines.
4306 */
4307
4308 /*
4309 * Compute the amount of space available before the current start
4310 * of data in an mbuf.
4311 */
4312 int
4313 m_leadingspace(struct mbuf *m)
4314 {
4315 if (m->m_flags & M_EXT) {
4316 if (MCLHASREFERENCE(m))
4317 return (0);
4318 return (m->m_data - m->m_ext.ext_buf);
4319 }
4320 if (m->m_flags & M_PKTHDR)
4321 return (m->m_data - m->m_pktdat);
4322 return (m->m_data - m->m_dat);
4323 }
4324
4325 /*
4326 * Compute the amount of space available after the end of data in an mbuf.
4327 */
4328 int
4329 m_trailingspace(struct mbuf *m)
4330 {
4331 if (m->m_flags & M_EXT) {
4332 if (MCLHASREFERENCE(m))
4333 return (0);
4334 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4335 (m->m_data + m->m_len));
4336 }
4337 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4338 }
4339
4340 /*
4341 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4342 * copy junk along. Does not adjust packet header length.
4343 */
4344 struct mbuf *
4345 m_prepend(struct mbuf *m, int len, int how)
4346 {
4347 struct mbuf *mn;
4348
4349 _MGET(mn, how, m->m_type);
4350 if (mn == NULL) {
4351 m_freem(m);
4352 return (NULL);
4353 }
4354 if (m->m_flags & M_PKTHDR) {
4355 M_COPY_PKTHDR(mn, m);
4356 m->m_flags &= ~M_PKTHDR;
4357 }
4358 mn->m_next = m;
4359 m = mn;
4360 if (len < MHLEN)
4361 MH_ALIGN(m, len);
4362 m->m_len = len;
4363 return (m);
4364 }
4365
4366 /*
4367 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4368 * chain, copy junk along, and adjust length.
4369 */
4370 struct mbuf *
4371 m_prepend_2(struct mbuf *m, int len, int how)
4372 {
4373 if (M_LEADINGSPACE(m) >= len) {
4374 m->m_data -= len;
4375 m->m_len += len;
4376 } else {
4377 m = m_prepend(m, len, how);
4378 }
4379 if ((m) && (m->m_flags & M_PKTHDR))
4380 m->m_pkthdr.len += len;
4381 return (m);
4382 }
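/*
 * Added illustrative sketch (not part of the original file): prepending a
 * small protocol header with m_prepend_2(), which reuses leading space
 * when available and otherwise allocates a new mbuf.  The header buffer
 * and length are hypothetical; kept under #if 0.
 */
#if 0
static struct mbuf *
example_prepend_hdr(struct mbuf *m, const void *hdr, int hdrlen)
{
	m = m_prepend_2(m, hdrlen, M_DONTWAIT);
	if (m == NULL)
		return (NULL);		/* original chain was already freed */

	bcopy(hdr, mtod(m, caddr_t), hdrlen);
	return (m);
}
#endif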
4383
4384 /*
4385 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4386 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4387 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4388 */
4389 int MCFail;
4390
4391 struct mbuf *
4392 m_copym(struct mbuf *m, int off0, int len, int wait)
4393 {
4394 struct mbuf *n, *mhdr = NULL, **np;
4395 int off = off0;
4396 struct mbuf *top;
4397 int copyhdr = 0;
4398
4399 if (off < 0 || len < 0)
4400 panic("m_copym: invalid offset %d or len %d", off, len);
4401
4402 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4403 mhdr = m;
4404 copyhdr = 1;
4405 }
4406
4407 while (off >= m->m_len) {
4408 if (m->m_next == NULL)
4409 panic("m_copym: invalid mbuf chain");
4410 off -= m->m_len;
4411 m = m->m_next;
4412 }
4413 np = &top;
4414 top = NULL;
4415
4416 while (len > 0) {
4417 if (m == NULL) {
4418 if (len != M_COPYALL)
4419 panic("m_copym: len != M_COPYALL");
4420 break;
4421 }
4422
4423 n = _M_RETRY(wait, m->m_type);
4424 *np = n;
4425
4426 if (n == NULL)
4427 goto nospace;
4428
4429 if (copyhdr != 0) {
4430 M_COPY_PKTHDR(n, mhdr);
4431 if (len == M_COPYALL)
4432 n->m_pkthdr.len -= off0;
4433 else
4434 n->m_pkthdr.len = len;
4435 copyhdr = 0;
4436 }
4437 if (len == M_COPYALL) {
4438 if (MIN(len, (m->m_len - off)) == len) {
4439 printf("m->m_len %d - off %d = %d, %d\n",
4440 m->m_len, off, m->m_len - off,
4441 MIN(len, (m->m_len - off)));
4442 }
4443 }
4444 n->m_len = MIN(len, (m->m_len - off));
4445 if (n->m_len == M_COPYALL) {
4446 printf("n->m_len == M_COPYALL, fixing\n");
4447 n->m_len = MHLEN;
4448 }
4449 if (m->m_flags & M_EXT) {
4450 n->m_ext = m->m_ext;
4451 m_incref(m);
4452 n->m_data = m->m_data + off;
4453 n->m_flags |= M_EXT;
4454 } else {
4455 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4456 (unsigned)n->m_len);
4457 }
4458 if (len != M_COPYALL)
4459 len -= n->m_len;
4460 off = 0;
4461 m = m->m_next;
4462 np = &n->m_next;
4463 }
4464
4465 if (top == NULL)
4466 MCFail++;
4467
4468 return (top);
4469 nospace:
4470
4471 m_freem(top);
4472 MCFail++;
4473 return (NULL);
4474 }
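/*
 * Added illustrative sketch (not part of the original file): taking a
 * reference-sharing copy of an entire packet with m_copym(), as a
 * retransmission-style path might.  Cluster data is shared rather than
 * duplicated (compare m_dup() further below).  Kept under #if 0.
 */
#if 0
static struct mbuf *
example_copy_packet(struct mbuf *m)
{
	struct mbuf *n;

	n = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
	if (n == NULL)
		return (NULL);		/* allocation failed; MCFail was bumped */

	/* `n' shares any M_EXT clusters with `m'; both must be freed */
	return (n);
}
#endif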
4475
4476 /*
4477 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4478 * within this routine. Also, the last mbuf and offset accessed are passed
4479 * out and can be passed back in to avoid having to rescan the entire mbuf
4480 * list (normally hung off of the socket).
4481 */
4482 struct mbuf *
4483 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4484 struct mbuf **m_lastm, int *m_off)
4485 {
4486 struct mbuf *n, **np = NULL;
4487 int off = off0, len = len0;
4488 struct mbuf *top = NULL;
4489 int mcflags = MSLEEPF(wait);
4490 int copyhdr = 0;
4491 int type = 0;
4492 mcache_obj_t *list = NULL;
4493 int needed = 0;
4494
4495 if (off == 0 && (m->m_flags & M_PKTHDR))
4496 copyhdr = 1;
4497
4498 if (*m_lastm != NULL) {
4499 m = *m_lastm;
4500 off = *m_off;
4501 } else {
4502 while (off >= m->m_len) {
4503 off -= m->m_len;
4504 m = m->m_next;
4505 }
4506 }
4507
4508 n = m;
4509 while (len > 0) {
4510 needed++;
4511 ASSERT(n != NULL);
4512 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4513 n = n->m_next;
4514 }
4515 needed++;
4516 len = len0;
4517
4518 /*
4519 * If the caller doesn't want to be put to sleep, mark it with
4520 * MCR_TRYHARD so that we may reclaim buffers from other places
4521 * before giving up.
4522 */
4523 if (mcflags & MCR_NOSLEEP)
4524 mcflags |= MCR_TRYHARD;
4525
4526 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4527 mcflags) != needed)
4528 goto nospace;
4529
4530 needed = 0;
4531 while (len > 0) {
4532 n = (struct mbuf *)list;
4533 list = list->obj_next;
4534 ASSERT(n != NULL && m != NULL);
4535
4536 type = (top == NULL) ? MT_HEADER : m->m_type;
4537 MBUF_INIT(n, (top == NULL), type);
4538 #if CONFIG_MACF_NET
4539 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4540 mtype_stat_inc(MT_HEADER);
4541 mtype_stat_dec(MT_FREE);
4542 m_free(n);
4543 goto nospace;
4544 }
4545 #endif /* MAC_NET */
4546
4547 if (top == NULL) {
4548 top = n;
4549 np = &top->m_next;
4550 continue;
4551 } else {
4552 needed++;
4553 *np = n;
4554 }
4555
4556 if (copyhdr) {
4557 M_COPY_PKTHDR(n, m);
4558 n->m_pkthdr.len = len;
4559 copyhdr = 0;
4560 }
4561 n->m_len = MIN(len, (m->m_len - off));
4562
4563 if (m->m_flags & M_EXT) {
4564 n->m_ext = m->m_ext;
4565 m_incref(m);
4566 n->m_data = m->m_data + off;
4567 n->m_flags |= M_EXT;
4568 } else {
4569 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4570 (unsigned)n->m_len);
4571 }
4572 len -= n->m_len;
4573
4574 if (len == 0) {
4575 if ((off + n->m_len) == m->m_len) {
4576 *m_lastm = m->m_next;
4577 *m_off = 0;
4578 } else {
4579 *m_lastm = m;
4580 *m_off = off + n->m_len;
4581 }
4582 break;
4583 }
4584 off = 0;
4585 m = m->m_next;
4586 np = &n->m_next;
4587 }
4588
4589 mtype_stat_inc(MT_HEADER);
4590 mtype_stat_add(type, needed);
4591 mtype_stat_sub(MT_FREE, needed + 1);
4592
4593 ASSERT(list == NULL);
4594 return (top);
4595
4596 nospace:
4597 if (list != NULL)
4598 mcache_free_ext(m_cache(MC_MBUF), list);
4599 if (top != NULL)
4600 m_freem(top);
4601 MCFail++;
4602 return (NULL);
4603 }
4604
4605 /*
4606 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4607 * continuing for "len" bytes, into the indicated buffer.
4608 */
4609 void
4610 m_copydata(struct mbuf *m, int off, int len, void *vp)
4611 {
4612 unsigned count;
4613 char *cp = vp;
4614
4615 if (off < 0 || len < 0)
4616 panic("m_copydata: invalid offset %d or len %d", off, len);
4617
4618 while (off > 0) {
4619 if (m == NULL)
4620 panic("m_copydata: invalid mbuf chain");
4621 if (off < m->m_len)
4622 break;
4623 off -= m->m_len;
4624 m = m->m_next;
4625 }
4626 while (len > 0) {
4627 if (m == NULL)
4628 panic("m_copydata: invalid mbuf chain");
4629 count = MIN(m->m_len - off, len);
4630 bcopy(MTOD(m, caddr_t) + off, cp, count);
4631 len -= count;
4632 cp += count;
4633 off = 0;
4634 m = m->m_next;
4635 }
4636 }
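/*
 * Added illustrative sketch (not part of the original file): pulling a
 * fixed-size header out of a possibly fragmented chain into a local
 * structure with m_copydata().  `struct example_hdr' is hypothetical;
 * kept under #if 0.
 */
#if 0
struct example_hdr {
	u_int16_t	eh_type;
	u_int16_t	eh_len;
};

static int
example_read_hdr(struct mbuf *m, int off, struct example_hdr *hp)
{
	if (m_length(m) < off + sizeof (*hp))
		return (EINVAL);

	m_copydata(m, off, sizeof (*hp), hp);
	return (0);
}
#endif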
4637
4638 /*
4639 * Concatenate mbuf chain n to m. Both chains must be of the same type
4640 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4641 */
4642 void
4643 m_cat(struct mbuf *m, struct mbuf *n)
4644 {
4645 while (m->m_next)
4646 m = m->m_next;
4647 while (n) {
4648 if ((m->m_flags & M_EXT) ||
4649 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4650 /* just join the two chains */
4651 m->m_next = n;
4652 return;
4653 }
4654 /* splat the data from one into the other */
4655 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4656 (u_int)n->m_len);
4657 m->m_len += n->m_len;
4658 n = m_free(n);
4659 }
4660 }
4661
4662 void
4663 m_adj(struct mbuf *mp, int req_len)
4664 {
4665 int len = req_len;
4666 struct mbuf *m;
4667 int count;
4668
4669 if ((m = mp) == NULL)
4670 return;
4671 if (len >= 0) {
4672 /*
4673 * Trim from head.
4674 */
4675 while (m != NULL && len > 0) {
4676 if (m->m_len <= len) {
4677 len -= m->m_len;
4678 m->m_len = 0;
4679 m = m->m_next;
4680 } else {
4681 m->m_len -= len;
4682 m->m_data += len;
4683 len = 0;
4684 }
4685 }
4686 m = mp;
4687 if (m->m_flags & M_PKTHDR)
4688 m->m_pkthdr.len -= (req_len - len);
4689 } else {
4690 /*
4691 * Trim from tail. Scan the mbuf chain,
4692 * calculating its length and finding the last mbuf.
4693 * If the adjustment only affects this mbuf, then just
4694 * adjust and return. Otherwise, rescan and truncate
4695 * after the remaining size.
4696 */
4697 len = -len;
4698 count = 0;
4699 for (;;) {
4700 count += m->m_len;
4701 if (m->m_next == (struct mbuf *)0)
4702 break;
4703 m = m->m_next;
4704 }
4705 if (m->m_len >= len) {
4706 m->m_len -= len;
4707 m = mp;
4708 if (m->m_flags & M_PKTHDR)
4709 m->m_pkthdr.len -= len;
4710 return;
4711 }
4712 count -= len;
4713 if (count < 0)
4714 count = 0;
4715 /*
4716 * Correct length for chain is "count".
4717 * Find the mbuf with last data, adjust its length,
4718 * and toss data from remaining mbufs on chain.
4719 */
4720 m = mp;
4721 if (m->m_flags & M_PKTHDR)
4722 m->m_pkthdr.len = count;
4723 for (; m; m = m->m_next) {
4724 if (m->m_len >= count) {
4725 m->m_len = count;
4726 break;
4727 }
4728 count -= m->m_len;
4729 }
4730 while ((m = m->m_next))
4731 m->m_len = 0;
4732 }
4733 }
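/*
 * Added illustrative sketch (not part of the original file): the two
 * common m_adj() calls -- stripping a link-layer header from the front
 * of a packet and trimming a trailing CRC from the tail.  The 14/4 byte
 * sizes are made up for the example; kept under #if 0.
 */
#if 0
static void
example_strip(struct mbuf *m)
{
	m_adj(m, 14);		/* positive len: trim 14 bytes from the head */
	m_adj(m, -4);		/* negative len: trim 4 bytes from the tail */
	/* m_pkthdr.len (when M_PKTHDR is set) is updated by m_adj() itself */
}
#endif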
4734
4735 /*
4736 * Rearrange an mbuf chain so that len bytes are contiguous
4737 * and in the data area of an mbuf (so that mtod and dtom
4738 * will work for a structure of size len). Returns the resulting
4739 * mbuf chain on success, frees it and returns null on failure.
4740 * If there is room, it will add up to max_protohdr-len extra bytes to the
4741 * contiguous region in an attempt to avoid being called next time.
4742 */
4743 int MPFail;
4744
4745 struct mbuf *
4746 m_pullup(struct mbuf *n, int len)
4747 {
4748 struct mbuf *m;
4749 int count;
4750 int space;
4751
4752 /*
4753 * If first mbuf has no cluster, and has room for len bytes
4754 * without shifting current data, pullup into it,
4755 * otherwise allocate a new mbuf to prepend to the chain.
4756 */
4757 if ((n->m_flags & M_EXT) == 0 &&
4758 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4759 if (n->m_len >= len)
4760 return (n);
4761 m = n;
4762 n = n->m_next;
4763 len -= m->m_len;
4764 } else {
4765 if (len > MHLEN)
4766 goto bad;
4767 _MGET(m, M_DONTWAIT, n->m_type);
4768 if (m == 0)
4769 goto bad;
4770 m->m_len = 0;
4771 if (n->m_flags & M_PKTHDR) {
4772 M_COPY_PKTHDR(m, n);
4773 n->m_flags &= ~M_PKTHDR;
4774 }
4775 }
4776 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4777 do {
4778 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4779 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4780 (unsigned)count);
4781 len -= count;
4782 m->m_len += count;
4783 n->m_len -= count;
4784 space -= count;
4785 if (n->m_len)
4786 n->m_data += count;
4787 else
4788 n = m_free(n);
4789 } while (len > 0 && n);
4790 if (len > 0) {
4791 (void) m_free(m);
4792 goto bad;
4793 }
4794 m->m_next = n;
4795 return (m);
4796 bad:
4797 m_freem(n);
4798 MPFail++;
4799 return (0);
4800 }
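/*
 * Added illustrative sketch (not part of the original file): the classic
 * m_pullup() pattern used by protocol input routines -- ensure the first
 * `hdrlen' bytes are contiguous before treating m_data as a header
 * structure.  Kept under #if 0.
 */
#if 0
static struct mbuf *
example_parse(struct mbuf *m, int hdrlen)
{
	if (m->m_len < hdrlen) {
		m = m_pullup(m, hdrlen);
		if (m == NULL)
			return (NULL);	/* chain was freed; MPFail was bumped */
	}
	/* mtod(m, some header pointer type) is now valid for `hdrlen' bytes */
	return (m);
}
#endif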
4801
4802 /*
4803 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4804 * the amount of empty space before the data in the new mbuf to be specified
4805 * (in the event that the caller expects to prepend later).
4806 */
4807 __private_extern__ int MSFail = 0;
4808
4809 __private_extern__ struct mbuf *
4810 m_copyup(struct mbuf *n, int len, int dstoff)
4811 {
4812 struct mbuf *m;
4813 int count, space;
4814
4815 if (len > (MHLEN - dstoff))
4816 goto bad;
4817 MGET(m, M_DONTWAIT, n->m_type);
4818 if (m == NULL)
4819 goto bad;
4820 m->m_len = 0;
4821 if (n->m_flags & M_PKTHDR) {
4822 m_copy_pkthdr(m, n);
4823 n->m_flags &= ~M_PKTHDR;
4824 }
4825 m->m_data += dstoff;
4826 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4827 do {
4828 count = min(min(max(len, max_protohdr), space), n->m_len);
4829 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4830 (unsigned)count);
4831 len -= count;
4832 m->m_len += count;
4833 n->m_len -= count;
4834 space -= count;
4835 if (n->m_len)
4836 n->m_data += count;
4837 else
4838 n = m_free(n);
4839 } while (len > 0 && n);
4840 if (len > 0) {
4841 (void) m_free(m);
4842 goto bad;
4843 }
4844 m->m_next = n;
4845 return (m);
4846 bad:
4847 m_freem(n);
4848 MSFail++;
4849 return (NULL);
4850 }
4851
4852 /*
4853 * Partition an mbuf chain in two pieces, returning the tail --
4854 * all but the first len0 bytes. In case of failure, it returns NULL and
4855 * attempts to restore the chain to its original state.
4856 */
4857 struct mbuf *
4858 m_split(struct mbuf *m0, int len0, int wait)
4859 {
4860 return (m_split0(m0, len0, wait, 1));
4861 }
4862
4863 static struct mbuf *
4864 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4865 {
4866 struct mbuf *m, *n;
4867 unsigned len = len0, remain;
4868
4869 for (m = m0; m && len > m->m_len; m = m->m_next)
4870 len -= m->m_len;
4871 if (m == NULL)
4872 return (NULL);
4873 remain = m->m_len - len;
4874 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4875 _MGETHDR(n, wait, m0->m_type);
4876 if (n == NULL)
4877 return (NULL);
4878 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4879 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4880 m0->m_pkthdr.len = len0;
4881 if (m->m_flags & M_EXT)
4882 goto extpacket;
4883 if (remain > MHLEN) {
4884 /* m can't be the lead packet */
4885 MH_ALIGN(n, 0);
4886 n->m_next = m_split(m, len, wait);
4887 if (n->m_next == NULL) {
4888 (void) m_free(n);
4889 return (NULL);
4890 } else
4891 return (n);
4892 } else
4893 MH_ALIGN(n, remain);
4894 } else if (remain == 0) {
4895 n = m->m_next;
4896 m->m_next = NULL;
4897 return (n);
4898 } else {
4899 _MGET(n, wait, m->m_type);
4900 if (n == NULL)
4901 return (NULL);
4902 M_ALIGN(n, remain);
4903 }
4904 extpacket:
4905 if (m->m_flags & M_EXT) {
4906 n->m_flags |= M_EXT;
4907 n->m_ext = m->m_ext;
4908 m_incref(m);
4909 n->m_data = m->m_data + len;
4910 } else {
4911 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4912 }
4913 n->m_len = remain;
4914 m->m_len = len;
4915 n->m_next = m->m_next;
4916 m->m_next = NULL;
4917 return (n);
4918 }
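/*
 * Added illustrative sketch (not part of the original file): splitting a
 * packet at a byte boundary with m_split(), e.g. to segment a large send
 * into `mss'-sized pieces.  On success the original chain keeps the first
 * `mss' bytes and the return value holds the remainder; on failure the
 * original chain is left intact.  Kept under #if 0.
 */
#if 0
static struct mbuf *
example_segment(struct mbuf *m, int mss)
{
	struct mbuf *tail;

	tail = m_split(m, mss, M_DONTWAIT);
	if (tail == NULL)
		return (NULL);		/* `m' is unchanged */

	/* if `m' had M_PKTHDR, both chains now carry adjusted pkthdr lengths */
	return (tail);
}
#endif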
4919
4920 /*
4921 * Routine to copy from device local memory into mbufs.
4922 */
4923 struct mbuf *
4924 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4925 void (*copy)(const void *, void *, size_t))
4926 {
4927 struct mbuf *m;
4928 struct mbuf *top = NULL, **mp = &top;
4929 int off = off0, len;
4930 char *cp;
4931 char *epkt;
4932
4933 cp = buf;
4934 epkt = cp + totlen;
4935 if (off) {
4936 /*
4937 * If 'off' is non-zero, packet is trailer-encapsulated,
4938 * so we have to skip the type and length fields.
4939 */
4940 cp += off + 2 * sizeof (u_int16_t);
4941 totlen -= 2 * sizeof (u_int16_t);
4942 }
4943 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4944 if (m == NULL)
4945 return (NULL);
4946 m->m_pkthdr.rcvif = ifp;
4947 m->m_pkthdr.len = totlen;
4948 m->m_len = MHLEN;
4949
4950 while (totlen > 0) {
4951 if (top != NULL) {
4952 _MGET(m, M_DONTWAIT, MT_DATA);
4953 if (m == NULL) {
4954 m_freem(top);
4955 return (NULL);
4956 }
4957 m->m_len = MLEN;
4958 }
4959 len = MIN(totlen, epkt - cp);
4960 if (len >= MINCLSIZE) {
4961 MCLGET(m, M_DONTWAIT);
4962 if (m->m_flags & M_EXT) {
4963 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4964 } else {
4965 /* give up when it's out of cluster mbufs */
4966 if (top != NULL)
4967 m_freem(top);
4968 m_freem(m);
4969 return (NULL);
4970 }
4971 } else {
4972 /*
4973 * Place initial small packet/header at end of mbuf.
4974 */
4975 if (len < m->m_len) {
4976 if (top == NULL &&
4977 len + max_linkhdr <= m->m_len)
4978 m->m_data += max_linkhdr;
4979 m->m_len = len;
4980 } else {
4981 len = m->m_len;
4982 }
4983 }
4984 if (copy)
4985 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4986 else
4987 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4988 cp += len;
4989 *mp = m;
4990 mp = &m->m_next;
4991 totlen -= len;
4992 if (cp == epkt)
4993 cp = buf;
4994 }
4995 return (top);
4996 }
4997
4998 #ifndef MBUF_GROWTH_NORMAL_THRESH
4999 #define MBUF_GROWTH_NORMAL_THRESH 25
5000 #endif
5001
5002 /*
5003 * Cluster freelist allocation check.
5004 */
5005 static int
5006 m_howmany(int num, size_t bufsize)
5007 {
5008 int i = 0, j = 0;
5009 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5010 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5011 u_int32_t sumclusters, freeclusters;
5012 u_int32_t percent_pool, percent_kmem;
5013 u_int32_t mb_growth, mb_growth_thresh;
5014
5015 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5016 bufsize == m_maxsize(MC_16KCL));
5017
5018 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5019
5020 /* Numbers in 2K cluster units */
5021 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5022 m_clusters = m_total(MC_CL);
5023 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5024 m_16kclusters = m_total(MC_16KCL);
5025 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5026
5027 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5028 m_clfree = m_infree(MC_CL);
5029 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5030 m_16kclfree = m_infree(MC_16KCL);
5031 freeclusters = m_mbfree + m_clfree + m_bigclfree;
5032
5033 /* Bail if we've maxed out the mbuf memory map */
5034 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5035 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5036 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5037 return (0);
5038 }
5039
5040 if (bufsize == m_maxsize(MC_BIGCL)) {
5041 /* Under minimum */
5042 if (m_bigclusters < m_minlimit(MC_BIGCL))
5043 return (m_minlimit(MC_BIGCL) - m_bigclusters);
5044
5045 percent_pool =
5046 ((sumclusters - freeclusters) * 100) / sumclusters;
5047 percent_kmem = (sumclusters * 100) / nclusters;
5048
5049 /*
5050 * If a light/normal user, grow conservatively (75%)
5051 * If a heavy user, grow aggressively (50%)
5052 */
5053 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5054 mb_growth = MB_GROWTH_NORMAL;
5055 else
5056 mb_growth = MB_GROWTH_AGGRESSIVE;
5057
5058 if (percent_kmem < 5) {
5059 /* For initial allocations */
5060 i = num;
5061 } else {
5062 /* Return if >= MBIGCL_LOWAT clusters available */
5063 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5064 m_total(MC_BIGCL) >=
5065 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5066 return (0);
5067
5068 /* Ensure at least num clusters are accessible */
5069 if (num >= m_infree(MC_BIGCL))
5070 i = num - m_infree(MC_BIGCL);
5071 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5072 j = num - (m_total(MC_BIGCL) -
5073 m_minlimit(MC_BIGCL));
5074
5075 i = MAX(i, j);
5076
5077 /*
5078 * Grow pool if percent_pool > 75 (normal growth)
5079 * or percent_pool > 50 (aggressive growth).
5080 */
5081 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5082 if (percent_pool > mb_growth_thresh)
5083 j = ((sumclusters + num) >> mb_growth) -
5084 freeclusters;
5085 i = MAX(i, j);
5086 }
5087
5088 /* Check to ensure we didn't go over limits */
5089 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5090 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5091 if ((i << 1) + sumclusters >= nclusters)
5092 i = (nclusters - sumclusters) >> 1;
5093 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5094 VERIFY(sumclusters + (i << 1) <= nclusters);
5095
5096 } else { /* 16K CL */
5097 VERIFY(njcl > 0);
5098 /* Under minimum */
5099 if (m_16kclusters < MIN16KCL)
5100 return (MIN16KCL - m_16kclusters);
5101 if (m_16kclfree >= M16KCL_LOWAT)
5102 return (0);
5103
5104 /* Ensure at least num clusters are available */
5105 if (num >= m_16kclfree)
5106 i = num - m_16kclfree;
5107
5108 /* Always grow 16KCL pool aggressively */
5109 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5110 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5111 i = MAX(i, j);
5112
5113 /* Check to ensure we don't go over limit */
5114 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5115 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5116 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5117 }
5118 return (i);
5119 }
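/*
 * Added note (not part of the original file): worked example for the
 * growth threshold computed above.  With mb_growth_thresh given by
 * 100 - (100 / (1 << mb_growth)), a growth shift of 2 yields
 * 100 - 100/4 = 75 and a shift of 1 yields 100 - 100/2 = 50, matching
 * the "normal" (75%) and "aggressive" (50%) cases described in the
 * comment inside m_howmany().
 */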
5120 /*
5121 * Return the number of bytes in the mbuf chain, m.
5122 */
5123 unsigned int
5124 m_length(struct mbuf *m)
5125 {
5126 struct mbuf *m0;
5127 unsigned int pktlen;
5128
5129 if (m->m_flags & M_PKTHDR)
5130 return (m->m_pkthdr.len);
5131
5132 pktlen = 0;
5133 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5134 pktlen += m0->m_len;
5135 return (pktlen);
5136 }
5137
5138 /*
5139 * Copy data from a buffer back into the indicated mbuf chain,
5140 * starting "off" bytes from the beginning, extending the mbuf
5141 * chain if necessary.
5142 */
5143 void
5144 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5145 {
5146 #if DEBUG
5147 struct mbuf *origm = m0;
5148 int error;
5149 #endif /* DEBUG */
5150
5151 if (m0 == NULL)
5152 return;
5153
5154 #if DEBUG
5155 error =
5156 #endif /* DEBUG */
5157 m_copyback0(&m0, off, len, cp,
5158 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5159
5160 #if DEBUG
5161 if (error != 0 || (m0 != NULL && origm != m0))
5162 panic("m_copyback");
5163 #endif /* DEBUG */
5164 }
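/*
 * Added illustrative sketch (not part of the original file): overwriting
 * a 2-byte field inside an existing chain with m_copyback().  Since
 * m_copyback() returns void, an allocation failure while extending the
 * chain cannot be reported here; callers that must know the outcome use
 * m_copyback_cow() or pre-size the chain.  Kept under #if 0.
 */
#if 0
static void
example_patch_field(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof (val), &val);
}
#endif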
5165
5166 struct mbuf *
5167 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5168 {
5169 int error;
5170
5171 /* don't support chain expansion */
5172 VERIFY(off + len <= m_length(m0));
5173
5174 error = m_copyback0(&m0, off, len, cp,
5175 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5176 if (error) {
5177 /*
5178 * no way to recover from partial success.
5179 * just free the chain.
5180 */
5181 m_freem(m0);
5182 return (NULL);
5183 }
5184 return (m0);
5185 }
5186
5187 /*
5188 * m_makewritable: ensure the specified range is writable.
5189 */
5190 int
5191 m_makewritable(struct mbuf **mp, int off, int len, int how)
5192 {
5193 int error;
5194 #if DEBUG
5195 struct mbuf *n;
5196 int origlen, reslen;
5197
5198 origlen = m_length(*mp);
5199 #endif /* DEBUG */
5200
5201 #if 0 /* M_COPYALL is large enough */
5202 if (len == M_COPYALL)
5203 len = m_length(*mp) - off; /* XXX */
5204 #endif
5205
5206 error = m_copyback0(mp, off, len, NULL,
5207 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5208
5209 #if DEBUG
5210 reslen = 0;
5211 for (n = *mp; n; n = n->m_next)
5212 reslen += n->m_len;
5213 if (origlen != reslen)
5214 panic("m_makewritable: length changed");
5215 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5216 panic("m_makewritable: inconsist");
5217 #endif /* DEBUG */
5218
5219 return (error);
5220 }
5221
5222 static int
5223 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5224 int how)
5225 {
5226 int mlen;
5227 struct mbuf *m, *n;
5228 struct mbuf **mp;
5229 int totlen = 0;
5230 const char *cp = vp;
5231
5232 VERIFY(mp0 != NULL);
5233 VERIFY(*mp0 != NULL);
5234 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5235 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5236
5237 /*
5238 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5239 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5240 */
5241
5242 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5243
5244 mp = mp0;
5245 m = *mp;
5246 while (off > (mlen = m->m_len)) {
5247 off -= mlen;
5248 totlen += mlen;
5249 if (m->m_next == NULL) {
5250 int tspace;
5251 extend:
5252 if (!(flags & M_COPYBACK0_EXTEND))
5253 goto out;
5254
5255 /*
5256 * try to make some space at the end of "m".
5257 */
5258
5259 mlen = m->m_len;
5260 if (off + len >= MINCLSIZE &&
5261 !(m->m_flags & M_EXT) && m->m_len == 0) {
5262 MCLGET(m, how);
5263 }
5264 tspace = M_TRAILINGSPACE(m);
5265 if (tspace > 0) {
5266 tspace = MIN(tspace, off + len);
5267 VERIFY(tspace > 0);
5268 bzero(mtod(m, char *) + m->m_len,
5269 MIN(off, tspace));
5270 m->m_len += tspace;
5271 off += mlen;
5272 totlen -= mlen;
5273 continue;
5274 }
5275
5276 /*
5277 * need to allocate an mbuf.
5278 */
5279
5280 if (off + len >= MINCLSIZE) {
5281 n = m_getcl(how, m->m_type, 0);
5282 } else {
5283 n = _M_GET(how, m->m_type);
5284 }
5285 if (n == NULL) {
5286 goto out;
5287 }
5288 n->m_len = 0;
5289 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5290 bzero(mtod(n, char *), MIN(n->m_len, off));
5291 m->m_next = n;
5292 }
5293 mp = &m->m_next;
5294 m = m->m_next;
5295 }
5296 while (len > 0) {
5297 mlen = m->m_len - off;
5298 if (mlen != 0 && m_mclhasreference(m)) {
5299 char *datap;
5300 int eatlen;
5301
5302 /*
5303 * this mbuf is read-only.
5304 * allocate a new writable mbuf and try again.
5305 */
5306
5307 #if defined(DIAGNOSTIC)
5308 if (!(flags & M_COPYBACK0_COW))
5309 panic("m_copyback0: read-only");
5310 #endif /* defined(DIAGNOSTIC) */
5311
5312 /*
5313 * if we're going to write into the middle of
5314 * a mbuf, split it first.
5315 */
5316 if (off > 0 && len < mlen) {
5317 n = m_split0(m, off, how, 0);
5318 if (n == NULL)
5319 goto enobufs;
5320 m->m_next = n;
5321 mp = &m->m_next;
5322 m = n;
5323 off = 0;
5324 continue;
5325 }
5326
5327 /*
5328 * XXX TODO coalesce into the trailingspace of
5329 * the previous mbuf when possible.
5330 */
5331
5332 /*
5333 * allocate a new mbuf. copy packet header if needed.
5334 */
5335 n = _M_GET(how, m->m_type);
5336 if (n == NULL)
5337 goto enobufs;
5338 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5339 M_COPY_PKTHDR(n, m);
5340 n->m_len = MHLEN;
5341 } else {
5342 if (len >= MINCLSIZE)
5343 MCLGET(n, M_DONTWAIT);
5344 n->m_len =
5345 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5346 }
5347 if (n->m_len > len)
5348 n->m_len = len;
5349
5350 /*
5351 * free the region which has been overwritten.
5352 * copying data from old mbufs if requested.
5353 */
5354 if (flags & M_COPYBACK0_PRESERVE)
5355 datap = mtod(n, char *);
5356 else
5357 datap = NULL;
5358 eatlen = n->m_len;
5359 VERIFY(off == 0 || eatlen >= mlen);
5360 if (off > 0) {
5361 VERIFY(len >= mlen);
5362 m->m_len = off;
5363 m->m_next = n;
5364 if (datap) {
5365 m_copydata(m, off, mlen, datap);
5366 datap += mlen;
5367 }
5368 eatlen -= mlen;
5369 mp = &m->m_next;
5370 m = m->m_next;
5371 }
5372 while (m != NULL && m_mclhasreference(m) &&
5373 n->m_type == m->m_type && eatlen > 0) {
5374 mlen = MIN(eatlen, m->m_len);
5375 if (datap) {
5376 m_copydata(m, 0, mlen, datap);
5377 datap += mlen;
5378 }
5379 m->m_data += mlen;
5380 m->m_len -= mlen;
5381 eatlen -= mlen;
5382 if (m->m_len == 0)
5383 *mp = m = m_free(m);
5384 }
5385 if (eatlen > 0)
5386 n->m_len -= eatlen;
5387 n->m_next = m;
5388 *mp = m = n;
5389 continue;
5390 }
5391 mlen = MIN(mlen, len);
5392 if (flags & M_COPYBACK0_COPYBACK) {
5393 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5394 cp += mlen;
5395 }
5396 len -= mlen;
5397 mlen += off;
5398 off = 0;
5399 totlen += mlen;
5400 if (len == 0)
5401 break;
5402 if (m->m_next == NULL) {
5403 goto extend;
5404 }
5405 mp = &m->m_next;
5406 m = m->m_next;
5407 }
5408 out:
5409 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5410 VERIFY(flags & M_COPYBACK0_EXTEND);
5411 m->m_pkthdr.len = totlen;
5412 }
5413
5414 return (0);
5415
5416 enobufs:
5417 return (ENOBUFS);
5418 }
5419
5420 char *
5421 mcl_to_paddr(char *addr)
5422 {
5423 vm_offset_t base_phys;
5424
5425 if (!MBUF_IN_MAP(addr))
5426 return (NULL);
5427 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5428
5429 if (base_phys == 0)
5430 return (NULL);
5431 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5432 }
5433
5434 /*
5435 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5436 * And really copy the thing. That way, we don't "precompute" checksums
5437 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5438 * small packets, don't dup into a cluster. That way received packets
5439 * don't take up too much room in the sockbuf (cf. sbspace()).
5440 */
5441 int MDFail;
5442
5443 struct mbuf *
5444 m_dup(struct mbuf *m, int how)
5445 {
5446 struct mbuf *n, **np;
5447 struct mbuf *top;
5448 int copyhdr = 0;
5449
5450 np = &top;
5451 top = NULL;
5452 if (m->m_flags & M_PKTHDR)
5453 copyhdr = 1;
5454
5455 /*
5456 * Quick check: if we have one mbuf and its data fits in an
5457 * mbuf with packet header, just copy and go.
5458 */
5459 if (m->m_next == NULL) {
5460 /* Then just move the data into an mbuf and be done... */
5461 if (copyhdr) {
5462 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5463 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5464 return (NULL);
5465 n->m_len = m->m_len;
5466 m_dup_pkthdr(n, m, how);
5467 bcopy(m->m_data, n->m_data, m->m_len);
5468 return (n);
5469 }
5470 } else if (m->m_len <= MLEN) {
5471 if ((n = _M_GET(how, m->m_type)) == NULL)
5472 return (NULL);
5473 bcopy(m->m_data, n->m_data, m->m_len);
5474 n->m_len = m->m_len;
5475 return (n);
5476 }
5477 }
5478 while (m != NULL) {
5479 #if BLUE_DEBUG
5480 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5481 m->m_data);
5482 #endif
5483 if (copyhdr)
5484 n = _M_GETHDR(how, m->m_type);
5485 else
5486 n = _M_GET(how, m->m_type);
5487 if (n == NULL)
5488 goto nospace;
5489 if (m->m_flags & M_EXT) {
5490 if (m->m_len <= m_maxsize(MC_CL))
5491 MCLGET(n, how);
5492 else if (m->m_len <= m_maxsize(MC_BIGCL))
5493 n = m_mbigget(n, how);
5494 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5495 n = m_m16kget(n, how);
5496 if (!(n->m_flags & M_EXT)) {
5497 (void) m_free(n);
5498 goto nospace;
5499 }
5500 }
5501 *np = n;
5502 if (copyhdr) {
5503 /* Don't use M_COPY_PKTHDR: preserve m_data */
5504 m_dup_pkthdr(n, m, how);
5505 copyhdr = 0;
5506 if (!(n->m_flags & M_EXT))
5507 n->m_data = n->m_pktdat;
5508 }
5509 n->m_len = m->m_len;
5510 /*
5511 * Get the dup on the same boundary as the original.
5512 * Assume that the two mbufs have the same offset to the data area
5513 * (up to word boundaries).
5514 */
5515 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5516 m = m->m_next;
5517 np = &n->m_next;
5518 #if BLUE_DEBUG
5519 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5520 n->m_data);
5521 #endif
5522 }
5523
5524 if (top == NULL)
5525 MDFail++;
5526 return (top);
5527
5528 nospace:
5529 m_freem(top);
5530 MDFail++;
5531 return (NULL);
5532 }
5533
5534 #define MBUF_MULTIPAGES(m) \
5535 (((m)->m_flags & M_EXT) && \
5536 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5537 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5538 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5539
5540 static struct mbuf *
5541 m_expand(struct mbuf *m, struct mbuf **last)
5542 {
5543 struct mbuf *top = NULL;
5544 struct mbuf **nm = &top;
5545 uintptr_t data0, data;
5546 unsigned int len0, len;
5547
5548 VERIFY(MBUF_MULTIPAGES(m));
5549 VERIFY(m->m_next == NULL);
5550 data0 = (uintptr_t)m->m_data;
5551 len0 = m->m_len;
5552 *last = top;
5553
5554 for (;;) {
5555 struct mbuf *n;
5556
5557 data = data0;
5558 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5559 len = NBPG;
5560 else if (!IS_P2ALIGNED(data, NBPG) &&
5561 P2ROUNDUP(data, NBPG) < (data + len0))
5562 len = P2ROUNDUP(data, NBPG) - data;
5563 else
5564 len = len0;
5565
5566 VERIFY(len > 0);
5567 VERIFY(m->m_flags & M_EXT);
5568 m->m_data = (void *)data;
5569 m->m_len = len;
5570
5571 *nm = *last = m;
5572 nm = &m->m_next;
5573 m->m_next = NULL;
5574
5575 data0 += len;
5576 len0 -= len;
5577 if (len0 == 0)
5578 break;
5579
5580 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5581 if (n == NULL) {
5582 m_freem(top);
5583 top = *last = NULL;
5584 break;
5585 }
5586
5587 n->m_ext = m->m_ext;
5588 m_incref(m);
5589 n->m_flags |= M_EXT;
5590 m = n;
5591 }
5592 return (top);
5593 }
5594
5595 struct mbuf *
5596 m_normalize(struct mbuf *m)
5597 {
5598 struct mbuf *top = NULL;
5599 struct mbuf **nm = &top;
5600 boolean_t expanded = FALSE;
5601
5602 while (m != NULL) {
5603 struct mbuf *n;
5604
5605 n = m->m_next;
5606 m->m_next = NULL;
5607
5608 /* Does the data cross one or more page boundaries? */
5609 if (MBUF_MULTIPAGES(m)) {
5610 struct mbuf *last;
5611 if ((m = m_expand(m, &last)) == NULL) {
5612 m_freem(n);
5613 m_freem(top);
5614 top = NULL;
5615 break;
5616 }
5617 *nm = m;
5618 nm = &last->m_next;
5619 expanded = TRUE;
5620 } else {
5621 *nm = m;
5622 nm = &m->m_next;
5623 }
5624 m = n;
5625 }
5626 if (expanded)
5627 atomic_add_32(&mb_normalized, 1);
5628 return (top);
5629 }
5630
5631 /*
5632 * Append the specified data to the indicated mbuf chain.
5633 * Extend the mbuf chain if the new data does not fit in
5634 * existing space.
5635 *
5636 * Return 1 if able to complete the job; otherwise 0.
5637 */
5638 int
5639 m_append(struct mbuf *m0, int len, caddr_t cp)
5640 {
5641 struct mbuf *m, *n;
5642 int remainder, space;
5643
5644 for (m = m0; m->m_next != NULL; m = m->m_next)
5645 ;
5646 remainder = len;
5647 space = M_TRAILINGSPACE(m);
5648 if (space > 0) {
5649 /*
5650 * Copy into available space.
5651 */
5652 if (space > remainder)
5653 space = remainder;
5654 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5655 m->m_len += space;
5656 cp += space, remainder -= space;
5657 }
5658 while (remainder > 0) {
5659 /*
5660 * Allocate a new mbuf; could check space
5661 * and allocate a cluster instead.
5662 */
5663 n = m_get(M_WAITOK, m->m_type);
5664 if (n == NULL)
5665 break;
5666 n->m_len = min(MLEN, remainder);
5667 bcopy(cp, mtod(n, caddr_t), n->m_len);
5668 cp += n->m_len;
5669 remainder -= n->m_len;
5670 m->m_next = n;
5671 m = n;
5672 }
5673 if (m0->m_flags & M_PKTHDR)
5674 m0->m_pkthdr.len += len - remainder;
5675 return (remainder == 0);
5676 }
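
/*
 * Illustrative sketch: appending a small trailer to an existing packet
 * with m_append().  The function returns 1 on success and 0 if it ran
 * out of mbufs partway through, in which case the chain may have grown
 * by less than the requested length.  The helper name and trailer
 * contents are hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *pkt)
{
	char trailer[] = "TRLR";

	/* m_pkthdr.len is updated for us if pkt has M_PKTHDR set */
	return (m_append(pkt, (int)(sizeof (trailer) - 1), (caddr_t)trailer));
}
#endif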
5677
5678 struct mbuf *
5679 m_last(struct mbuf *m)
5680 {
5681 while (m->m_next != NULL)
5682 m = m->m_next;
5683 return (m);
5684 }
5685
5686 unsigned int
5687 m_fixhdr(struct mbuf *m0)
5688 {
5689 u_int len;
5690
5691 len = m_length2(m0, NULL);
5692 m0->m_pkthdr.len = len;
5693 return (len);
5694 }
5695
5696 unsigned int
5697 m_length2(struct mbuf *m0, struct mbuf **last)
5698 {
5699 struct mbuf *m;
5700 u_int len;
5701
5702 len = 0;
5703 for (m = m0; m != NULL; m = m->m_next) {
5704 len += m->m_len;
5705 if (m->m_next == NULL)
5706 break;
5707 }
5708 if (last != NULL)
5709 *last = m;
5710 return (len);
5711 }
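
/*
 * Illustrative sketch: m_length2() walks the chain summing m_len and
 * optionally reports the tail mbuf, so recomputing a packet header
 * length by hand is equivalent to m_fixhdr() above.  The helper name
 * is hypothetical.
 */
#if 0
static unsigned int
example_fixhdr(struct mbuf *m0)
{
	struct mbuf *last;
	unsigned int len;

	len = m_length2(m0, &last);	/* total bytes; "last" is the tail */
	m0->m_pkthdr.len = len;		/* assumes m0 has M_PKTHDR set */
	VERIFY(last == m_last(m0));	/* same tail that m_last() finds */
	return (len);
}
#endif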
5712
5713 /*
5714 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5715 * and clusters. If allocation fails and this cannot be completed, NULL will
5716 * be returned, but the passed in chain will be unchanged. Upon success,
5717 * the original chain will be freed, and the new chain will be returned.
5718 *
5719 * If a non-packet-header mbuf is passed in, the original mbuf chain will
5720 * be returned unharmed.
5721 *
5722 * If an offset is specified, the first mbuf in the chain will have a leading
5723 * space of the amount stated by the "off" parameter.
5724 *
5725 * This routine requires that the m_pkthdr.header field of the original
5726 * mbuf chain is cleared by the caller.
5727 */
5728 struct mbuf *
5729 m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5730 {
5731 struct mbuf *m_new = NULL, *m_final = NULL;
5732 int progress = 0, length, pktlen;
5733
5734 if (!(m0->m_flags & M_PKTHDR))
5735 return (m0);
5736
5737 VERIFY(off < MHLEN);
5738 m_fixhdr(m0); /* Needed sanity check */
5739
5740 pktlen = m0->m_pkthdr.len + off;
5741 if (pktlen > MHLEN)
5742 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5743 else
5744 m_final = m_gethdr(how, MT_DATA);
5745
5746 if (m_final == NULL)
5747 goto nospace;
5748
5749 if (off > 0) {
5750 pktlen -= off;
5751 m_final->m_len -= off;
5752 m_final->m_data += off;
5753 }
5754
5755 /*
5756 * Caller must have handled the contents pointed to by this
5757 * pointer before coming here, as otherwise it will point to
5758 * the original mbuf which will get freed upon success.
5759 */
5760 VERIFY(m0->m_pkthdr.header == NULL);
5761
5762 if (m_dup_pkthdr(m_final, m0, how) == 0)
5763 goto nospace;
5764
5765 m_new = m_final;
5766
5767 while (progress < pktlen) {
5768 length = pktlen - progress;
5769 if (length > MCLBYTES)
5770 length = MCLBYTES;
5771
5772 if (m_new == NULL) {
5773 if (length > MLEN)
5774 m_new = m_getcl(how, MT_DATA, 0);
5775 else
5776 m_new = m_get(how, MT_DATA);
5777 if (m_new == NULL)
5778 goto nospace;
5779 }
5780
5781 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5782 progress += length;
5783 m_new->m_len = length;
5784 if (m_new != m_final)
5785 m_cat(m_final, m_new);
5786 m_new = NULL;
5787 }
5788 m_freem(m0);
5789 m0 = m_final;
5790 return (m0);
5791 nospace:
5792 if (m_final)
5793 m_freem(m_final);
5794 return (NULL);
5795 }
5796
5797 struct mbuf *
5798 m_defrag(struct mbuf *m0, int how)
5799 {
5800 return (m_defrag_offset(m0, 0, how));
5801 }
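
/*
 * Illustrative sketch: collapsing a long chain into as few mbufs and
 * clusters as possible before passing it on.  On failure the original
 * chain is untouched, so the caller keeps using (or frees) it; on
 * success the original chain has already been freed.  The helper name
 * is hypothetical.
 */
#if 0
static struct mbuf *
example_collapse(struct mbuf *m)
{
	struct mbuf *n;

	/* m->m_pkthdr.header must have been cleared by the caller (see above) */
	n = m_defrag(m, M_DONTWAIT);
	if (n == NULL)
		return (m);	/* defrag failed; original chain is intact */
	return (n);		/* original chain was freed by m_defrag() */
}
#endif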
5802
5803 void
5804 m_mchtype(struct mbuf *m, int t)
5805 {
5806 mtype_stat_inc(t);
5807 mtype_stat_dec(m->m_type);
5808 (m)->m_type = t;
5809 }
5810
5811 void *
5812 m_mtod(struct mbuf *m)
5813 {
5814 return (MTOD(m, void *));
5815 }
5816
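/*
 * m_dtom() relies on mbufs being carved from MSIZE-aligned memory, so
 * masking off the low bits of a pointer into an mbuf's internal data
 * area recovers the mbuf header; it is not valid for pointers into
 * external cluster storage.
 */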
5817 struct mbuf *
5818 m_dtom(void *x)
5819 {
5820 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5821 }
5822
5823 void
5824 m_mcheck(struct mbuf *m)
5825 {
5826 _MCHECK(m);
5827 }
5828
5829 /*
5830 * Return a pointer to mbuf/offset of location in mbuf chain.
5831 */
5832 struct mbuf *
5833 m_getptr(struct mbuf *m, int loc, int *off)
5834 {
5835
5836 while (loc >= 0) {
5837 /* Normal end of search. */
5838 if (m->m_len > loc) {
5839 *off = loc;
5840 return (m);
5841 } else {
5842 loc -= m->m_len;
5843 if (m->m_next == NULL) {
5844 if (loc == 0) {
5845 /* Point at the end of valid data. */
5846 *off = m->m_len;
5847 return (m);
5848 }
5849 return (NULL);
5850 }
5851 m = m->m_next;
5852 }
5853 }
5854 return (NULL);
5855 }
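
/*
 * Illustrative sketch: locating the mbuf and intra-mbuf offset that
 * hold a given byte of the packet, e.g. to examine a field at a known
 * offset without copying.  The helper name is hypothetical.
 */
#if 0
static int
example_peek_byte(struct mbuf *m, int loc, u_char *out)
{
	struct mbuf *n;
	int off;

	n = m_getptr(m, loc, &off);
	if (n == NULL || off >= n->m_len)
		return (ENOENT);	/* offset is beyond the chain */
	*out = *(mtod(n, u_char *) + off);
	return (0);
}
#endif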
5856
5857 /*
5858 * Inform the corresponding mcache(s) that there's a waiter below.
5859 */
5860 static void
5861 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5862 {
5863 mcache_waiter_inc(m_cache(class));
5864 if (comp) {
5865 if (class == MC_CL) {
5866 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5867 } else if (class == MC_BIGCL) {
5868 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5869 } else if (class == MC_16KCL) {
5870 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5871 } else {
5872 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5873 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5874 }
5875 }
5876 }
5877
5878 /*
5879 * Inform the corresponding mcache(s) that there are no more waiters below.
5880 */
5881 static void
5882 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5883 {
5884 mcache_waiter_dec(m_cache(class));
5885 if (comp) {
5886 if (class == MC_CL) {
5887 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5888 } else if (class == MC_BIGCL) {
5889 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5890 } else if (class == MC_16KCL) {
5891 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5892 } else {
5893 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5894 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5895 }
5896 }
5897 }
5898
5899 /*
5900 * Called during slab (blocking and non-blocking) allocation. If there
5901 * is at least one waiter, and the time for which the first waiter has been
5902 * blocked exceeds the watchdog timeout, panic the system.
5903 */
5904 static void
5905 mbuf_watchdog(void)
5906 {
5907 struct timeval now;
5908 unsigned int since;
5909
5910 if (mb_waiters == 0 || !mb_watchdog)
5911 return;
5912
5913 microuptime(&now);
5914 since = now.tv_sec - mb_wdtstart.tv_sec;
5915 if (since >= MB_WDT_MAXTIME) {
5916 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5917 mb_waiters, since, mbuf_dump());
5918 /* NOTREACHED */
5919 }
5920 }
5921
5922 /*
5923 * Called during blocking allocation. Returns TRUE if one or more objects
5924 * are available at the per-CPU cache layer and the allocation should be
5925 * retried at that level.
5926 */
5927 static boolean_t
5928 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5929 {
5930 boolean_t mcache_retry = FALSE;
5931
5932 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5933
5934 /* Check if there's anything at the cache layer */
5935 if (mbuf_cached_above(class, wait)) {
5936 mcache_retry = TRUE;
5937 goto done;
5938 }
5939
5940 /* Nothing? Then try hard to get it from somewhere */
5941 m_reclaim(class, num, (wait & MCR_COMP));
5942
5943 /* We tried hard and got something? */
5944 if (m_infree(class) > 0) {
5945 mbstat.m_wait++;
5946 goto done;
5947 } else if (mbuf_cached_above(class, wait)) {
5948 mbstat.m_wait++;
5949 mcache_retry = TRUE;
5950 goto done;
5951 } else if (wait & MCR_TRYHARD) {
5952 mcache_retry = TRUE;
5953 goto done;
5954 }
5955
5956 /*
5957 * There's really nothing for us right now; inform the
5958 * cache(s) that there is a waiter below and go to sleep.
5959 */
5960 mbuf_waiter_inc(class, (wait & MCR_COMP));
5961
5962 VERIFY(!(wait & MCR_NOSLEEP));
5963
5964 /*
5965 * If this is the first waiter, arm the watchdog timer. Otherwise
5966 * check if we need to panic the system due to watchdog timeout.
5967 */
5968 if (mb_waiters == 0)
5969 microuptime(&mb_wdtstart);
5970 else
5971 mbuf_watchdog();
5972
5973 mb_waiters++;
5974 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5975
5976 /* We are now up; stop getting notified until next round */
5977 mbuf_waiter_dec(class, (wait & MCR_COMP));
5978
5979 /* We waited and got something */
5980 if (m_infree(class) > 0) {
5981 mbstat.m_wait++;
5982 goto done;
5983 } else if (mbuf_cached_above(class, wait)) {
5984 mbstat.m_wait++;
5985 mcache_retry = TRUE;
5986 }
5987 done:
5988 return (mcache_retry);
5989 }
5990
5991 static void
5992 mbuf_worker_thread(void)
5993 {
5994 int mbuf_expand;
5995
5996 while (1) {
5997 lck_mtx_lock(mbuf_mlock);
5998
5999 mbuf_expand = 0;
6000 if (mbuf_expand_mcl) {
6001 int n;
6002
6003 /* Adjust to current number of 2 KB clusters in use */
6004 n = mbuf_expand_mcl -
6005 (m_total(MC_CL) - m_infree(MC_CL));
6006 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6007 n = m_maxlimit(MC_CL) - m_total(MC_CL);
6008 mbuf_expand_mcl = 0;
6009
6010 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6011 mbuf_expand++;
6012 }
6013 if (mbuf_expand_big) {
6014 int n;
6015
6016 /* Adjust to current number of 4 KB clusters in use */
6017 n = mbuf_expand_big -
6018 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6019 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6020 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6021 mbuf_expand_big = 0;
6022
6023 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6024 mbuf_expand++;
6025 }
6026 if (mbuf_expand_16k) {
6027 int n;
6028
6029 /* Adjust to current number of 16 KB clusters in use */
6030 n = mbuf_expand_16k -
6031 (m_total(MC_16KCL) - m_infree(MC_16KCL));
6032 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6033 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6034 mbuf_expand_16k = 0;
6035
6036 if (n > 0)
6037 (void) freelist_populate(MC_16KCL, n, M_WAIT);
6038 }
6039
6040 /*
6041 * Because we can run out of memory before filling the mbuf
6042 * map, we should not allocate more clusters than there are
6043 * mbufs -- otherwise we could have a large number of useless
6044 * clusters allocated.
6045 */
6046 if (mbuf_expand) {
6047 while (m_total(MC_MBUF) <
6048 (m_total(MC_BIGCL) + m_total(MC_CL))) {
6049 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6050 break;
6051 }
6052 }
6053
6054 lck_mtx_unlock(mbuf_mlock);
6055
6056 assert_wait(&mbuf_worker_run, THREAD_UNINT);
6057 (void) thread_block((thread_continue_t)mbuf_worker_thread);
6058 }
6059 }
6060
6061 static void
6062 mbuf_worker_thread_init(void)
6063 {
6064 mbuf_worker_ready++;
6065 mbuf_worker_thread();
6066 }
6067
6068 static mcl_slab_t *
6069 slab_get(void *buf)
6070 {
6071 mcl_slabg_t *slg;
6072 unsigned int ix, k;
6073
6074 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6075
6076 VERIFY(MBUF_IN_MAP(buf));
6077 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6078 VERIFY(ix < maxslabgrp);
6079
6080 if ((slg = slabstbl[ix]) == NULL) {
6081 /*
6082 * In the current implementation, we never shrink the memory
6083 * pool (hence the cluster map); if we attempt to reallocate
6084 * a cluster group when it's already allocated, panic since
6085 * this is a sign of memory corruption (slabstbl[ix] got
6086 * nullified). This also means that there shouldn't be any
6087 * hole in the kernel sub-map for the mbuf pool.
6088 */
6089 ++slabgrp;
6090 VERIFY(ix < slabgrp);
6091 /*
6092 * Slabs expansion can only be done single threaded; when
6093 * we get here, it must be as a result of m_clalloc() which
6094 * is serialized and therefore mb_clalloc_busy must be set.
6095 */
6096 VERIFY(mb_clalloc_busy);
6097 lck_mtx_unlock(mbuf_mlock);
6098
6099 /* This is a new buffer; create the slabs group for it */
6100 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6101 M_WAITOK | M_ZERO);
6102 VERIFY(slg != NULL);
6103
6104 lck_mtx_lock(mbuf_mlock);
6105 /*
6106 * No other thread could have gone into m_clalloc() after
6107 * we dropped the lock above, so verify that it's true.
6108 */
6109 VERIFY(mb_clalloc_busy);
6110
6111 slabstbl[ix] = slg;
6112
6113 /* Chain each slab in the group to its forward neighbor */
6114 for (k = 1; k < NSLABSPMB; k++)
6115 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6116 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6117
6118 /* And chain the last slab in the previous group to this */
6119 if (ix > 0) {
6120 VERIFY(slabstbl[ix - 1]->
6121 slg_slab[NSLABSPMB - 1].sl_next == NULL);
6122 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6123 &slg->slg_slab[0];
6124 }
6125 }
6126
6127 ix = MTOBG(buf) % NSLABSPMB;
6128 VERIFY(ix < NSLABSPMB);
6129
6130 return (&slg->slg_slab[ix]);
6131 }
6132
6133 static void
6134 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6135 void *base, void *head, unsigned int len, int refcnt, int chunks)
6136 {
6137 sp->sl_class = class;
6138 sp->sl_flags = flags;
6139 sp->sl_base = base;
6140 sp->sl_head = head;
6141 sp->sl_len = len;
6142 sp->sl_refcnt = refcnt;
6143 sp->sl_chunks = chunks;
6144 slab_detach(sp);
6145 }
6146
6147 static void
6148 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6149 {
6150 VERIFY(slab_is_detached(sp));
6151 m_slab_cnt(class)++;
6152 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6153 sp->sl_flags &= ~SLF_DETACHED;
6154 if (class == MC_16KCL) {
6155 int k;
6156 for (k = 1; k < NSLABSP16KB; k++) {
6157 sp = sp->sl_next;
6158 /* Next slab must already be present */
6159 VERIFY(sp != NULL);
6160 VERIFY(slab_is_detached(sp));
6161 sp->sl_flags &= ~SLF_DETACHED;
6162 }
6163 }
6164 }
6165
6166 static void
6167 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6168 {
6169 VERIFY(!slab_is_detached(sp));
6170 VERIFY(m_slab_cnt(class) > 0);
6171 m_slab_cnt(class)--;
6172 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6173 slab_detach(sp);
6174 if (class == MC_16KCL) {
6175 int k;
6176 for (k = 1; k < NSLABSP16KB; k++) {
6177 sp = sp->sl_next;
6178 /* Next slab must already be present */
6179 VERIFY(sp != NULL);
6180 VERIFY(!slab_is_detached(sp));
6181 slab_detach(sp);
6182 }
6183 }
6184 }
6185
6186 static boolean_t
6187 slab_inrange(mcl_slab_t *sp, void *buf)
6188 {
6189 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6190 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6191 }
6192
6193 #undef panic
6194
6195 static void
6196 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6197 {
6198 int i;
6199 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6200 uintptr_t buf = (uintptr_t)sp->sl_base;
6201
6202 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6203 void *next = ((mcache_obj_t *)buf)->obj_next;
6204 if (next != addr)
6205 continue;
6206 if (!mclverify) {
6207 if (next != NULL && !MBUF_IN_MAP(next)) {
6208 mcache_t *cp = m_cache(sp->sl_class);
6209 panic("%s: %s buffer %p in slab %p modified "
6210 "after free at offset 0: %p out of range "
6211 "[%p-%p)\n", __func__, cp->mc_name,
6212 (void *)buf, sp, next, mbutl, embutl);
6213 /* NOTREACHED */
6214 }
6215 } else {
6216 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6217 (mcache_obj_t *)buf);
6218 mcl_audit_verify_nextptr(next, mca);
6219 }
6220 }
6221 }
6222
6223 static void
6224 slab_detach(mcl_slab_t *sp)
6225 {
6226 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6227 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6228 sp->sl_flags |= SLF_DETACHED;
6229 }
6230
6231 static boolean_t
6232 slab_is_detached(mcl_slab_t *sp)
6233 {
6234 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6235 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6236 (sp->sl_flags & SLF_DETACHED));
6237 }
6238
6239 static void
6240 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6241 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6242 {
6243 mcache_audit_t *mca, *mca_tail;
6244 mcache_obj_t *con = NULL;
6245 boolean_t save_contents = (con_list != NULL);
6246 unsigned int i, ix;
6247
6248 ASSERT(num <= NMBPBG);
6249 ASSERT(con_list == NULL || con_size != 0);
6250
6251 ix = MTOBG(buf);
6252 VERIFY(ix < maxclaudit);
6253
6254 /* Make sure we haven't been here before */
6255 for (i = 0; i < NMBPBG; i++)
6256 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6257
6258 mca = mca_tail = *mca_list;
6259 if (save_contents)
6260 con = *con_list;
6261
6262 for (i = 0; i < num; i++) {
6263 mcache_audit_t *next;
6264
6265 next = mca->mca_next;
6266 bzero(mca, sizeof (*mca));
6267 mca->mca_next = next;
6268 mclaudit[ix].cl_audit[i] = mca;
6269
6270 /* Attach the contents buffer if requested */
6271 if (save_contents) {
6272 VERIFY(con != NULL);
6273 mca->mca_contents_size = con_size;
6274 mca->mca_contents = con;
6275 con = con->obj_next;
6276 bzero(mca->mca_contents, mca->mca_contents_size);
6277 }
6278
6279 mca_tail = mca;
6280 mca = mca->mca_next;
6281 }
6282
6283 if (save_contents)
6284 *con_list = con;
6285
6286 *mca_list = mca_tail->mca_next;
6287 mca_tail->mca_next = NULL;
6288 }
6289
6290 /*
6291 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6292 * the corresponding audit structure for that buffer.
6293 */
6294 static mcache_audit_t *
6295 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6296 {
6297 mcache_audit_t *mca = NULL;
6298 int ix = MTOBG(o);
6299
6300 VERIFY(ix < maxclaudit);
6301 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6302
6303 switch (class) {
6304 case MC_MBUF:
6305 /*
6306 * For the mbuf case, find the index of the page
6307 * used by the mbuf and use that index to locate the
6308 * base address of the page. Then find out the
6309 * mbuf index relative to the page base and use
6310 * it to locate the audit structure.
6311 */
6312 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6313 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6314 break;
6315
6316 case MC_CL:
6317 /*
6318 * Same thing as above, but for 2KB clusters in a page.
6319 */
6320 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6321 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6322 break;
6323
6324 case MC_BIGCL:
6325 case MC_16KCL:
6326 /*
6327 * Same as above, but only return the first element.
6328 */
6329 mca = mclaudit[ix].cl_audit[0];
6330 break;
6331
6332 default:
6333 VERIFY(0);
6334 /* NOTREACHED */
6335 }
6336
6337 return (mca);
6338 }
6339
6340 static void
6341 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6342 boolean_t alloc)
6343 {
6344 struct mbuf *m = addr;
6345 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6346
6347 VERIFY(mca->mca_contents != NULL &&
6348 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6349
6350 if (mclverify)
6351 mcl_audit_verify_nextptr(next, mca);
6352
6353 if (!alloc) {
6354 /* Save constructed mbuf fields */
6355 mcl_audit_save_mbuf(m, mca);
6356 if (mclverify) {
6357 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6358 m_maxsize(MC_MBUF));
6359 }
6360 ((mcache_obj_t *)m)->obj_next = next;
6361 return;
6362 }
6363
6364 /* Check if the buffer has been corrupted while in freelist */
6365 if (mclverify) {
6366 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6367 }
6368 /* Restore constructed mbuf fields */
6369 mcl_audit_restore_mbuf(m, mca, composite);
6370 }
6371
6372 static void
6373 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6374 {
6375 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6376
6377 if (composite) {
6378 struct mbuf *next = m->m_next;
6379 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6380 MBUF_IS_COMPOSITE(ms));
6381 /*
6382 * We could have hand-picked the mbuf fields and restored
6383 * them individually, but that would be a maintenance
6384 * headache. Instead, restore everything that was saved;
6385 * the mbuf layer will recheck and reinitialize anyway.
6386 */
6387 bcopy(ms, m, mca->mca_contents_size);
6388 m->m_next = next;
6389 } else {
6390 /*
6391 * For a regular mbuf (no cluster attached) there's nothing
6392 * to restore other than the type field, which is expected
6393 * to be MT_FREE.
6394 */
6395 m->m_type = ms->m_type;
6396 }
6397 _MCHECK(m);
6398 }
6399
6400 static void
6401 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6402 {
6403 _MCHECK(m);
6404 bcopy(m, mca->mca_contents, mca->mca_contents_size);
6405 }
6406
6407 static void
6408 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6409 boolean_t save_next)
6410 {
6411 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6412
6413 if (!alloc) {
6414 if (mclverify) {
6415 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6416 }
6417 if (save_next) {
6418 mcl_audit_verify_nextptr(next, mca);
6419 ((mcache_obj_t *)addr)->obj_next = next;
6420 }
6421 } else if (mclverify) {
6422 /* Check if the buffer has been corrupted while in freelist */
6423 mcl_audit_verify_nextptr(next, mca);
6424 mcache_audit_free_verify_set(mca, addr, 0, size);
6425 }
6426 }
6427
6428 static void
6429 mcl_audit_mcheck_panic(struct mbuf *m)
6430 {
6431 mcache_audit_t *mca;
6432
6433 MRANGE(m);
6434 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6435
6436 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6437 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6438 /* NOTREACHED */
6439 }
6440
6441 static void
6442 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6443 {
6444 if (next != NULL && !MBUF_IN_MAP(next) &&
6445 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6446 panic("mcl_audit: buffer %p modified after free at offset 0: "
6447 "%p out of range [%p-%p)\n%s\n",
6448 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6449 /* NOTREACHED */
6450 }
6451 }
6452
6453 /* This function turns on mbuf leak detection */
6454 static void
6455 mleak_activate(void)
6456 {
6457 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6458 PE_parse_boot_argn("mleak_sample_factor",
6459 &mleak_table.mleak_sample_factor,
6460 sizeof (mleak_table.mleak_sample_factor));
6461
6462 if (mleak_table.mleak_sample_factor == 0)
6463 mclfindleak = 0;
6464
6465 if (mclfindleak == 0)
6466 return;
6467
6468 vm_size_t alloc_size =
6469 mleak_alloc_buckets * sizeof (struct mallocation);
6470 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6471
6472 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6473 M_TEMP, M_WAITOK | M_ZERO);
6474 VERIFY(mleak_allocations != NULL);
6475
6476 MALLOC(mleak_traces, struct mtrace *, trace_size,
6477 M_TEMP, M_WAITOK | M_ZERO);
6478 VERIFY(mleak_traces != NULL);
6479
6480 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6481 M_TEMP, M_WAITOK | M_ZERO);
6482 VERIFY(mleak_stat != NULL);
6483 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6484 #ifdef __LP64__
6485 mleak_stat->ml_isaddr64 = 1;
6486 #endif /* __LP64__ */
6487 }
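
/*
 * The sampling rate is controlled by the "mleak_sample_factor" boot-arg
 * parsed above: a factor of N records roughly one out of every N
 * allocations, and a factor of 0 disables leak detection entirely by
 * clearing mclfindleak.
 */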
6488
6489 static void
6490 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6491 {
6492 int temp;
6493
6494 if (mclfindleak == 0)
6495 return;
6496
6497 if (!alloc)
6498 return (mleak_free(addr));
6499
6500 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6501
6502 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6503 uintptr_t bt[MLEAK_STACK_DEPTH];
6504 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6505 mleak_log(bt, addr, logged, num);
6506 }
6507 }
6508
6509 /*
6510 * This function records the allocation in the mleak_allocations table
6511 * and the backtrace in the mleak_traces table; if the allocation slot is
6512 * in use, the old allocation is replaced with the new one; if the trace
6513 * slot is in use, return (or increment the refcount if it is the same trace).
6514 */
6515 static boolean_t
6516 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6517 {
6518 struct mallocation *allocation;
6519 struct mtrace *trace;
6520 uint32_t trace_index;
6521
6522 /* Quit if someone else is modifying the tables */
6523 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6524 mleak_table.total_conflicts++;
6525 return (FALSE);
6526 }
6527
6528 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6529 mleak_alloc_buckets)];
6530 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6531 trace = &mleak_traces[trace_index];
6532
6533 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6534 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6535
6536 allocation->hitcount++;
6537 trace->hitcount++;
6538
6539 /*
6540 * If the allocation bucket we want is occupied
6541 * and the occupier has the same trace, just bail.
6542 */
6543 if (allocation->element != NULL &&
6544 trace_index == allocation->trace_index) {
6545 mleak_table.alloc_collisions++;
6546 lck_mtx_unlock(mleak_lock);
6547 return (TRUE);
6548 }
6549
6550 /*
6551 * Store the backtrace in the traces array;
6552 * Size of zero = trace bucket is free.
6553 */
6554 if (trace->allocs > 0 &&
6555 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6556 /* Different, unique trace, but the same hash! Bail out. */
6557 trace->collisions++;
6558 mleak_table.trace_collisions++;
6559 lck_mtx_unlock(mleak_lock);
6560 return (TRUE);
6561 } else if (trace->allocs > 0) {
6562 /* Same trace, already added, so increment refcount */
6563 trace->allocs++;
6564 } else {
6565 /* Found an unused trace bucket, so record the trace here */
6566 if (trace->depth != 0) {
6567 /* This slot was previously used but is not currently in use */
6568 mleak_table.trace_overwrites++;
6569 }
6570 mleak_table.trace_recorded++;
6571 trace->allocs = 1;
6572 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6573 trace->depth = depth;
6574 trace->collisions = 0;
6575 }
6576
6577 /* Step 2: Store the allocation record in the allocations array */
6578 if (allocation->element != NULL) {
6579 /*
6580 * Replace an existing allocation. No need to preserve
6581 * because only a subset of the allocations are being
6582 * recorded anyway.
6583 */
6584 mleak_table.alloc_collisions++;
6585 } else if (allocation->trace_index != 0) {
6586 mleak_table.alloc_overwrites++;
6587 }
6588 allocation->element = addr;
6589 allocation->trace_index = trace_index;
6590 allocation->count = num;
6591 mleak_table.alloc_recorded++;
6592 mleak_table.outstanding_allocs++;
6593
6594 lck_mtx_unlock(mleak_lock);
6595 return (TRUE);
6596 }
6597
6598 static void
6599 mleak_free(mcache_obj_t *addr)
6600 {
6601 while (addr != NULL) {
6602 struct mallocation *allocation = &mleak_allocations
6603 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6604
6605 if (allocation->element == addr &&
6606 allocation->trace_index < mleak_trace_buckets) {
6607 lck_mtx_lock_spin(mleak_lock);
6608 if (allocation->element == addr &&
6609 allocation->trace_index < mleak_trace_buckets) {
6610 struct mtrace *trace;
6611 trace = &mleak_traces[allocation->trace_index];
6612 /* allocs = 0 means trace bucket is unused */
6613 if (trace->allocs > 0)
6614 trace->allocs--;
6615 if (trace->allocs == 0)
6616 trace->depth = 0;
6617 /* NULL element means alloc bucket is unused */
6618 allocation->element = NULL;
6619 mleak_table.outstanding_allocs--;
6620 }
6621 lck_mtx_unlock(mleak_lock);
6622 }
6623 addr = addr->obj_next;
6624 }
6625 }
6626
6627 static void
6628 mleak_sort_traces(void)
6629 {
6630 int i, j, k;
6631 struct mtrace *swap;
6632
6633 for (i = 0; i < MLEAK_NUM_TRACES; i++)
6634 mleak_top_trace[i] = NULL;
6635
6636 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6637 {
6638 if (mleak_traces[i].allocs <= 0)
6639 continue;
6640
6641 mleak_top_trace[j] = &mleak_traces[i];
6642 for (k = j; k > 0; k--) {
6643 if (mleak_top_trace[k]->allocs <=
6644 mleak_top_trace[k-1]->allocs)
6645 break;
6646
6647 swap = mleak_top_trace[k-1];
6648 mleak_top_trace[k-1] = mleak_top_trace[k];
6649 mleak_top_trace[k] = swap;
6650 }
6651 j++;
6652 }
6653
6654 j--;
6655 for (; i < mleak_trace_buckets; i++) {
6656 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6657 continue;
6658
6659 mleak_top_trace[j] = &mleak_traces[i];
6660
6661 for (k = j; k > 0; k--) {
6662 if (mleak_top_trace[k]->allocs <=
6663 mleak_top_trace[k-1]->allocs)
6664 break;
6665
6666 swap = mleak_top_trace[k-1];
6667 mleak_top_trace[k-1] = mleak_top_trace[k];
6668 mleak_top_trace[k] = swap;
6669 }
6670 }
6671 }
6672
6673 static void
6674 mleak_update_stats(void)
6675 {
6676 mleak_trace_stat_t *mltr;
6677 int i;
6678
6679 VERIFY(mleak_stat != NULL);
6680 #ifdef __LP64__
6681 VERIFY(mleak_stat->ml_isaddr64);
6682 #else
6683 VERIFY(!mleak_stat->ml_isaddr64);
6684 #endif /* !__LP64__ */
6685 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6686
6687 mleak_sort_traces();
6688
6689 mltr = &mleak_stat->ml_trace[0];
6690 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6691 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6692 int j;
6693
6694 if (mleak_top_trace[i] == NULL ||
6695 mleak_top_trace[i]->allocs == 0)
6696 continue;
6697
6698 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
6699 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
6700 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
6701 mltr->mltr_depth = mleak_top_trace[i]->depth;
6702
6703 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6704 for (j = 0; j < mltr->mltr_depth; j++)
6705 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6706
6707 mltr++;
6708 }
6709 }
6710
6711 static struct mbtypes {
6712 int mt_type;
6713 const char *mt_name;
6714 } mbtypes[] = {
6715 { MT_DATA, "data" },
6716 { MT_OOBDATA, "oob data" },
6717 { MT_CONTROL, "ancillary data" },
6718 { MT_HEADER, "packet headers" },
6719 { MT_SOCKET, "socket structures" },
6720 { MT_PCB, "protocol control blocks" },
6721 { MT_RTABLE, "routing table entries" },
6722 { MT_HTABLE, "IMP host table entries" },
6723 { MT_ATABLE, "address resolution tables" },
6724 { MT_FTABLE, "fragment reassembly queue headers" },
6725 { MT_SONAME, "socket names and addresses" },
6726 { MT_SOOPTS, "socket options" },
6727 { MT_RIGHTS, "access rights" },
6728 { MT_IFADDR, "interface addresses" },
6729 { MT_TAG, "packet tags" },
6730 { 0, NULL }
6731 };
6732
6733 #define MBUF_DUMP_BUF_CHK() { \
6734 clen -= k; \
6735 if (clen < 1) \
6736 goto done; \
6737 c += k; \
6738 }
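
/*
 * The macro above assumes the local variables of mbuf_dump() below: "k"
 * holds the length just written by snprintf(), "c" is the current write
 * position in mbuf_dump_buf, and "clen" is the space remaining; it bails
 * out to the "done" label once the buffer is exhausted.
 */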
6739
6740 static char *
6741 mbuf_dump(void)
6742 {
6743 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6744 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6745 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6746 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6747 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6748 uint8_t seen[256];
6749 struct mbtypes *mp;
6750 mb_class_stat_t *sp;
6751 mleak_trace_stat_t *mltr;
6752 char *c = mbuf_dump_buf;
6753 int i, k, clen = MBUF_DUMP_BUF_SIZE;
6754
6755 mbuf_dump_buf[0] = '\0';
6756
6757 /* synchronize all statistics in the mbuf table */
6758 mbuf_stat_sync();
6759 mbuf_mtypes_sync(TRUE);
6760
6761 sp = &mb_stat->mbs_class[0];
6762 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6763 u_int32_t mem;
6764
6765 if (m_class(i) == MC_MBUF) {
6766 m_mbufs = sp->mbcl_active;
6767 } else if (m_class(i) == MC_CL) {
6768 m_clfree = sp->mbcl_total - sp->mbcl_active;
6769 } else if (m_class(i) == MC_BIGCL) {
6770 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6771 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6772 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6773 m_16kclusters = sp->mbcl_total;
6774 } else if (m_class(i) == MC_MBUF_CL) {
6775 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6776 } else if (m_class(i) == MC_MBUF_BIGCL) {
6777 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6778 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6779 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6780 }
6781
6782 mem = sp->mbcl_ctotal * sp->mbcl_size;
6783 totmem += mem;
6784 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6785 sp->mbcl_size;
6786
6787 }
6788
6789 /* adjust free counts to include composite caches */
6790 m_clfree += m_mbufclfree;
6791 m_bigclfree += m_mbufbigclfree;
6792 m_16kclfree += m_mbuf16kclfree;
6793
6794 totmbufs = 0;
6795 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6796 totmbufs += mbstat.m_mtypes[mp->mt_type];
6797 if (totmbufs > m_mbufs)
6798 totmbufs = m_mbufs;
6799 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6800 MBUF_DUMP_BUF_CHK();
6801
6802 bzero(&seen, sizeof (seen));
6803 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6804 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6805 seen[mp->mt_type] = 1;
6806 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6807 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6808 MBUF_DUMP_BUF_CHK();
6809 }
6810 }
6811 seen[MT_FREE] = 1;
6812 for (i = 0; i < nmbtypes; i++)
6813 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6814 k = snprintf(c, clen, "\t%u mbufs allocated to "
6815 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6816 MBUF_DUMP_BUF_CHK();
6817 }
6818 if ((m_mbufs - totmbufs) > 0) {
6819 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6820 m_mbufs - totmbufs);
6821 MBUF_DUMP_BUF_CHK();
6822 }
6823 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6824 "%u/%u mbuf 4KB clusters in use\n",
6825 (unsigned int)(mbstat.m_clusters - m_clfree),
6826 (unsigned int)mbstat.m_clusters,
6827 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6828 (unsigned int)mbstat.m_bigclusters);
6829 MBUF_DUMP_BUF_CHK();
6830
6831 if (njcl > 0) {
6832 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6833 m_16kclusters - m_16kclfree, m_16kclusters,
6834 njclbytes / 1024);
6835 MBUF_DUMP_BUF_CHK();
6836 }
6837 totused = totmem - totfree;
6838 if (totmem == 0) {
6839 totpct = 0;
6840 } else if (totused < (ULONG_MAX / 100)) {
6841 totpct = (totused * 100) / totmem;
6842 } else {
6843 u_long totmem1 = totmem / 100;
6844 u_long totused1 = totused / 100;
6845 totpct = (totused1 * 100) / totmem1;
6846 }
6847 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6848 "in use)\n", totmem / 1024, totpct);
6849 MBUF_DUMP_BUF_CHK();
6850
6851 /* mbuf leak detection statistics */
6852 mleak_update_stats();
6853
6854 k = snprintf(c, clen, "\nmbuf leak detection table:\n");
6855 MBUF_DUMP_BUF_CHK();
6856 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
6857 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
6858 mleak_table.mleak_sample_factor);
6859 MBUF_DUMP_BUF_CHK();
6860 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
6861 mleak_table.outstanding_allocs);
6862 MBUF_DUMP_BUF_CHK();
6863 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
6864 mleak_table.alloc_recorded, mleak_table.trace_recorded);
6865 MBUF_DUMP_BUF_CHK();
6866 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
6867 mleak_table.alloc_collisions, mleak_table.trace_collisions);
6868 MBUF_DUMP_BUF_CHK();
6869 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
6870 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
6871 MBUF_DUMP_BUF_CHK();
6872 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
6873 mleak_table.total_conflicts);
6874 MBUF_DUMP_BUF_CHK();
6875
6876 k = snprintf(c, clen, "top %d outstanding traces:\n",
6877 mleak_stat->ml_cnt);
6878 MBUF_DUMP_BUF_CHK();
6879 for (i = 0; i < mleak_stat->ml_cnt; i++) {
6880 mltr = &mleak_stat->ml_trace[i];
6881 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
6882 "%llu hit(s), %llu collision(s)\n", (i + 1),
6883 mltr->mltr_allocs, mltr->mltr_hitcount,
6884 mltr->mltr_collisions);
6885 MBUF_DUMP_BUF_CHK();
6886 }
6887
6888 if (mleak_stat->ml_isaddr64)
6889 k = snprintf(c, clen, MB_LEAK_HDR_64);
6890 else
6891 k = snprintf(c, clen, MB_LEAK_HDR_32);
6892 MBUF_DUMP_BUF_CHK();
6893
6894 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
6895 int j;
6896 k = snprintf(c, clen, "%2d: ", (i + 1));
6897 MBUF_DUMP_BUF_CHK();
6898 for (j = 0; j < mleak_stat->ml_cnt; j++) {
6899 mltr = &mleak_stat->ml_trace[j];
6900 if (i < mltr->mltr_depth) {
6901 if (mleak_stat->ml_isaddr64) {
6902 k = snprintf(c, clen, "0x%0llx ",
6903 mltr->mltr_addr[i]);
6904 } else {
6905 k = snprintf(c, clen,
6906 "0x%08x ",
6907 (u_int32_t)mltr->mltr_addr[i]);
6908 }
6909 } else {
6910 if (mleak_stat->ml_isaddr64)
6911 k = snprintf(c, clen,
6912 MB_LEAK_SPACING_64);
6913 else
6914 k = snprintf(c, clen,
6915 MB_LEAK_SPACING_32);
6916 }
6917 MBUF_DUMP_BUF_CHK();
6918 }
6919 k = snprintf(c, clen, "\n");
6920 MBUF_DUMP_BUF_CHK();
6921 }
6922 done:
6923 return (mbuf_dump_buf);
6924 }
6925
6926 #undef MBUF_DUMP_BUF_CHK
6927
6928 SYSCTL_DECL(_kern_ipc);
6929 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6930 CTLFLAG_RD | CTLFLAG_LOCKED,
6931 0, 0, mbstat_sysctl, "S,mbstat", "");
6932 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6933 CTLFLAG_RD | CTLFLAG_LOCKED,
6934 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6935 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6936 CTLFLAG_RD | CTLFLAG_LOCKED,
6937 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6938 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6939 CTLFLAG_RD | CTLFLAG_LOCKED,
6940 0, 0, mleak_table_sysctl, "S,mleak_table", "");
6941 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6942 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6943 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6944 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6945 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6946 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
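
/*
 * Illustrative usage: the nodes above appear under kern.ipc, so the
 * integer ones can be read or tuned from user space with sysctl(8),
 * for example:
 *
 *	sysctl kern.ipc.mb_watchdog
 *	sysctl -w kern.ipc.mleak_sample_factor=1
 *
 * The struct-typed nodes (mbstat, mb_stat and the mleak ones) export
 * binary structures intended for tools such as netstat(1) rather than
 * for direct printing.
 */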