apple/xnu (xnu-1699.26.8) - bsd/kern/uipc_mbuf.c
1 /*
2 * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80 #include <sys/proc.h>
81
82 #include <kern/kern_types.h>
83 #include <kern/simple_lock.h>
84 #include <kern/queue.h>
85 #include <kern/sched_prim.h>
86 #include <kern/cpu_number.h>
87 #include <kern/zalloc.h>
88
89 #include <libkern/OSAtomic.h>
90 #include <libkern/libkern.h>
91
92 #include <IOKit/IOMapper.h>
93
94 #include <machine/limits.h>
95 #include <machine/machine_routines.h>
96
97 #if CONFIG_MACF_NET
98 #include <security/mac_framework.h>
99 #endif /* CONFIG_MACF_NET */
100
101 #include <sys/mcache.h>
102
103 /*
104 * MBUF IMPLEMENTATION NOTES.
105 *
106 * There is a total of 5 per-CPU caches:
107 *
108 * MC_MBUF:
109 * This is a cache of rudimentary objects of MSIZE in size; each
110 * object represents an mbuf structure. This cache preserves only
111 * the m_type field of the mbuf during its transactions.
112 *
113 * MC_CL:
114 * This is a cache of rudimentary objects of MCLBYTES in size; each
115 * object represents an mcluster structure. This cache does not
116 * preserve the contents of the objects during its transactions.
117 *
118 * MC_BIGCL:
119 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
120 * object represents an mbigcluster structure. This cache does not
121 * preserve the contents of the objects during its transactions.
122 *
123 * MC_MBUF_CL:
124 * This is a cache of mbufs each having a cluster attached to it.
125 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
126 * fields of the mbuf related to the external cluster are preserved
127 * during transactions.
128 *
129 * MC_MBUF_BIGCL:
130 * This is a cache of mbufs each having a big cluster attached to it.
131 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
132 * fields of the mbuf related to the external cluster are preserved
133 * during transactions.
134 *
135 * OBJECT ALLOCATION:
136 *
137 * Allocation requests are handled first at the per-CPU (mcache) layer
138 * before falling back to the slab layer. Performance is optimal when
139 * the request is satisfied at the CPU layer because global data/lock
140 * never gets accessed. When the slab layer is entered for allocation,
141 * the slab freelist will be checked first for available objects before
142 * the VM backing store is invoked. Slab layer operations are serialized
143 * for all of the caches as the mbuf global lock is held most of the time.
144 * Allocation paths are different depending on the class of objects:
145 *
146 * a. Rudimentary object:
147 *
148 * { m_get_common(), m_clattach(), m_mclget(),
149 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
150 * composite object allocation }
151 * | ^
152 * | |
153 * | +-----------------------+
154 * v |
155 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
156 * | ^
157 * v |
158 * [CPU cache] -------> (found?) -------+
159 * | |
160 * v |
161 * mbuf_slab_alloc() |
162 * | |
163 * v |
164 * +---------> [freelist] -------> (found?) -------+
165 * | |
166 * | v
167 * | m_clalloc()
168 * | |
169 * | v
170 * +---<<---- kmem_mb_alloc()
171 *
172 * b. Composite object:
173 *
174 * { m_getpackets_internal(), m_allocpacket_internal() }
175 * | ^
176 * | |
177 * | +------ (done) ---------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_cslab_alloc() |
186 * | |
187 * v |
188 * [freelist] -------> (found?) -------+
189 * | |
190 * v |
191 * (rudimentary object) |
192 * mcache_alloc/mcache_alloc_ext() ------>>-----+
193 *
194 * Auditing notes: If auditing is enabled, buffers will be subjected to
195 * integrity checks by the audit routine. This is done by verifying their
196 * contents against the DEADBEEF (free) pattern before returning them to the caller.
197 * As part of this step, the routine will also record the transaction and
198 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
199 * also restore any constructed data structure fields if necessary.
200 *
201 * OBJECT DEALLOCATION:
202 *
203 * Freeing an object simply involves placing it into the CPU cache; this
204 * pollutes the cache to benefit subsequent allocations. The slab layer
205 * will only be entered if the object is to be purged out of the cache.
206 * During normal operations, this happens only when the CPU layer resizes
207 * its bucket while it's adjusting to the allocation load. Deallocation
208 * paths are different depending on the class of objects:
209 *
210 * a. Rudimentary object:
211 *
212 * { m_free(), m_freem_list(), composite object deallocation }
213 * | ^
214 * | |
215 * | +------ (done) ---------+
216 * v |
217 * mcache_free/mcache_free_ext() |
218 * | |
219 * v |
220 * mbuf_slab_audit() |
221 * | |
222 * v |
223 * [CPU cache] ---> (not purging?) -----+
224 * | |
225 * v |
226 * mbuf_slab_free() |
227 * | |
228 * v |
229 * [freelist] ----------->>------------+
230 * (objects never get purged to VM)
231 *
232 * b. Composite object:
233 *
234 * { m_free(), m_freem_list() }
235 * | ^
236 * | |
237 * | +------ (done) ---------+
238 * v |
239 * mcache_free/mcache_free_ext() |
240 * | |
241 * v |
242 * mbuf_cslab_audit() |
243 * | |
244 * v |
245 * [CPU cache] ---> (not purging?) -----+
246 * | |
247 * v |
248 * mbuf_cslab_free() |
249 * | |
250 * v |
251 * [freelist] ---> (not purging?) -----+
252 * | |
253 * v |
254 * (rudimentary object) |
255 * mcache_free/mcache_free_ext() ------->>------+
256 *
257 * Auditing notes: If auditing is enabled, the audit routine will save
258 * any constructed data structure fields (if necessary) before filling the
259 * contents of the buffers with DEADBEEF (free) pattern and recording the
260 * transaction. Buffers that are freed (whether at CPU or slab layer) are
261 * expected to contain the free pattern.
262 *
263 * DEBUGGING:
264 *
265 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
266 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
267 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
268 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
269 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
270 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
271 *
272 * Each object is associated with exactly one mcache_audit_t structure that
273 * contains the information related to its last buffer transaction. Given
274 * an address of an object, the audit structure can be retrieved by finding
275 * the position of the object relative to the base address of the cluster:
276 *
277 * +------------+ +=============+
278 * | mbuf addr | | mclaudit[i] |
279 * +------------+ +=============+
280 * | | cl_audit[0] |
281 * i = MTOBG(addr) +-------------+
282 * | +-----> | cl_audit[1] | -----> mcache_audit_t
283 * b = BGTOM(i) | +-------------+
284 * | | | ... |
285 * x = MCLIDX(b, addr) | +-------------+
286 * | | | cl_audit[7] |
287 * +-----------------+ +-------------+
288 * (e.g. x == 1)
289 *
290 * The mclaudit[] array is allocated at initialization time, but its contents
291 * get populated when the corresponding cluster is created. Because a page
292 * can be turned into NMBPBG mbufs, we preserve enough space for the mbufs
293 * so that there is a 1-to-1 mapping between them. A page that has never
294 * (or not yet) been turned into mbufs will use only cl_audit[0], with the
295 * remaining entries unused. For a 16KB cluster, only one entry from the first
296 * page is allocated and used for the entire object.
297 */
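/*
 * Editor's illustration (not part of the original source): the lookup in
 * the diagram above amounts to the following, which is essentially what
 * mcl_audit_buf2mca() further below is expected to do for objects backed
 * by a 4KB page:
 *
 *	unsigned int i = MTOBG(addr);		(index of the 4KB cluster)
 *	union mbigcluster *b = BGTOM(i);	(base address of that cluster)
 *	unsigned int x = MCLIDX(b, addr);	(slot of the object within it)
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
 *
 * MTOBG(), BGTOM() and MCLIDX() are defined later in this file.
 */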
298
299 /* TODO: should be in header file */
300 /* kernel translator */
301 extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
302 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
303 extern vm_map_t mb_map; /* special map */
304
305 /* Global lock */
306 static lck_mtx_t *mbuf_mlock;
307 static lck_attr_t *mbuf_mlock_attr;
308 static lck_grp_t *mbuf_mlock_grp;
309 static lck_grp_attr_t *mbuf_mlock_grp_attr;
310
311 /* Back-end (common) layer */
312 static void *mbuf_worker_run; /* wait channel for worker thread */
313 static int mbuf_worker_ready; /* worker thread is runnable */
314 static int mbuf_expand_mcl; /* number of cluster creation requests */
315 static int mbuf_expand_big; /* number of big cluster creation requests */
316 static int mbuf_expand_16k; /* number of 16KB cluster creation requests */
317 static int ncpu; /* number of CPUs */
318 static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
319 static ppnum_t mcl_pages; /* Size of array (# physical pages) */
320 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
321 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
322 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
323 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
324 static unsigned int mb_normalized; /* number of packets "normalized" */
325
326 #define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
327 #define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
328
329 typedef enum {
330 MC_MBUF = 0, /* Regular mbuf */
331 MC_CL, /* Cluster */
332 MC_BIGCL, /* Large (4KB) cluster */
333 MC_16KCL, /* Jumbo (16KB) cluster */
334 MC_MBUF_CL, /* mbuf + cluster */
335 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
336 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
337 } mbuf_class_t;
338
339 #define MBUF_CLASS_MIN MC_MBUF
340 #define MBUF_CLASS_MAX MC_MBUF_16KCL
341 #define MBUF_CLASS_LAST MC_16KCL
342 #define MBUF_CLASS_VALID(c) \
343 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
344 #define MBUF_CLASS_COMPOSITE(c) \
345 ((int)(c) > MBUF_CLASS_LAST)
346
347
348 /*
349 * mbuf specific mcache allocation request flags.
350 */
351 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
352
353 /*
354 * Per-cluster slab structure.
355 *
356 * A slab is a cluster control structure that contains one or more object
357 * chunks; the available chunks are chained in the slab's freelist (sl_head).
358 * Each time a chunk is taken out of the slab, the slab's reference count
359 * gets incremented. When all chunks have been taken out, the empty slab
360 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
361 * returned to a slab causes the slab's reference count to be decremented;
362 * it also causes the slab to be reinserted into the class's slab list, if
363 * it is not already there.
364 *
365 * Compartmentalizing the object chunks into slabs allows us to easily
366 * merge one or more slabs together when the adjacent slabs are idle, as
367 * well as to convert or move a slab from one class to another; e.g. the
368 * mbuf cluster slab can be converted to a regular cluster slab when all
369 * mbufs in the slab have been freed.
370 *
371 * A slab may also span multiple clusters for chunks larger than
372 * a cluster's size. In this case, only the slab of the first cluster is
373 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
374 * that they are part of the larger slab.
375 *
376 * Each slab controls a page of memory.
377 */
378 typedef struct mcl_slab {
379 struct mcl_slab *sl_next; /* neighboring slab */
380 u_int8_t sl_class; /* controlling mbuf class */
381 int8_t sl_refcnt; /* outstanding allocations */
382 int8_t sl_chunks; /* chunks (bufs) in this slab */
383 u_int16_t sl_flags; /* slab flags (see below) */
384 u_int16_t sl_len; /* slab length */
385 void *sl_base; /* base of allocated memory */
386 void *sl_head; /* first free buffer */
387 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
388 } mcl_slab_t;
389
390 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
391 #define SLF_PARTIAL 0x0002 /* part of another slab */
392 #define SLF_DETACHED 0x0004 /* not in slab freelist */
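/*
 * Editor's sketch of the lifecycle described above (illustration only; the
 * authoritative logic is in slab_alloc()/slab_free() below).  For classes
 * whose slabs hold several chunks (mbufs, 2KB clusters), taking a chunk
 * out looks roughly like:
 *
 *	buf = sp->sl_head;
 *	sp->sl_head = buf->obj_next;	(advance the slab's freelist)
 *	sp->sl_refcnt++;		(one more outstanding chunk)
 *	if (sp->sl_head == NULL)
 *		slab_remove(sp, class);	(empty slab leaves the class list)
 *
 * Returning a chunk is the mirror image: the chunk is rechained onto
 * sl_head, sl_refcnt is decremented, and a detached slab is reinserted
 * into the class's slab list.
 */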
393
394 /*
395 * The array of slabs is broken into groups of arrays, one per 1MB of kernel
396 * memory to reduce the footprint. Each group is allocated on demand
397 * whenever a new piece of memory mapped in from the VM crosses the 1MB
398 * boundary.
399 */
400 #define NSLABSPMB ((1 << MBSHIFT) >> PGSHIFT) /* 256 slabs/grp */
401
402 typedef struct mcl_slabg {
403 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
404 } mcl_slabg_t;
405
406 /*
407 * Number of slabs needed to control a 16KB cluster object.
408 */
409 #define NSLABSP16KB (M16KCLBYTES >> PGSHIFT)
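/*
 * Editor's note: assuming 4KB pages (PGSHIFT == 12) and 16KB jumbo
 * clusters, NSLABSP16KB evaluates to 16384 >> 12 == 4, i.e. one 16KB
 * cluster is described by four consecutive page slabs.
 */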
410
411 /*
412 * Per-cluster audit structure.
413 */
414 typedef struct {
415 mcache_audit_t *cl_audit[NMBPBG]; /* array of audits */
416 } mcl_audit_t;
417
418 /*
419 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
420 * and m_ext structures. If auditing is enabled, we allocate a shadow
421 * mbuf structure of this size inside each audit structure, and the
422 * contents of the real mbuf get copied into it when the mbuf is freed.
423 * This allows us to pattern-fill the mbuf for integrity check, and to
424 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
425 * Note that we don't save the contents of clusters when they are freed;
426 * we simply pattern-fill them.
427 */
428 #define AUDIT_CONTENTS_SIZE ((MSIZE - MHLEN) + sizeof (_m_ext_t))
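/*
 * Editor's note: assuming the usual definition of MHLEN in <sys/mbuf.h>
 * (MSIZE minus the m_hdr and pkthdr overhead), (MSIZE - MHLEN) works out
 * to sizeof (struct m_hdr) + sizeof (struct pkthdr), so the shadow area
 * covers exactly the m_hdr + pkthdr + m_ext portion described above.
 */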
429
430 /*
431 * mbuf specific mcache audit flags
432 */
433 #define MB_INUSE 0x01 /* object has not been returned to slab */
434 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
435 #define MB_SCVALID 0x04 /* object has valid saved contents */
436
437 /*
438 * Each of the following two arrays hold up to nmbclusters elements.
439 */
440 static mcl_audit_t *mclaudit; /* array of cluster audit information */
441 static unsigned int maxclaudit; /* max # of entries in audit table */
442 static mcl_slabg_t **slabstbl; /* cluster slabs table */
443 static unsigned int maxslabgrp; /* max # of entries in slabs table */
444 static unsigned int slabgrp; /* # of entries in slabs table */
445
446 /* Globals */
447 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
448 int njcl; /* # of clusters for jumbo sizes */
449 int njclbytes; /* size of a jumbo cluster */
450 union mbigcluster *mbutl; /* first mapped cluster address */
451 union mbigcluster *embutl; /* ending virtual address of mclusters */
452 int max_linkhdr; /* largest link-level header */
453 int max_protohdr; /* largest protocol header */
454 int max_hdr; /* largest link+protocol header */
455 int max_datalen; /* MHLEN - max_hdr */
456
457 static boolean_t mclverify; /* debug: pattern-checking */
458 static boolean_t mcltrace; /* debug: stack tracing */
459 static boolean_t mclfindleak; /* debug: leak detection */
460
461 /* mbuf leak detection variables */
462 static struct mleak_table mleak_table;
463 static mleak_stat_t *mleak_stat;
464
465 #define MLEAK_STAT_SIZE(n) \
466 ((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
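/*
 * Editor's note: this is the usual "address of the n-th trailing array
 * element in a NULL-based struct" idiom; it yields the number of bytes
 * needed for an mleak_stat_t carrying n ml_trace entries.  For example,
 * the sysctl handler below exports MLEAK_STAT_SIZE(MLEAK_NUM_TRACES)
 * bytes starting at mleak_stat.  MB_STAT_SIZE(), OMB_STAT_SIZE() and
 * MBUF_MTYPES_SIZE() further below use the same pattern.
 */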
467
468 struct mallocation {
469 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
470 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
471 u_int32_t count; /* How many objects were requested */
472 u_int64_t hitcount; /* for determining hash effectiveness */
473 };
474
475 struct mtrace {
476 u_int64_t collisions;
477 u_int64_t hitcount;
478 u_int64_t allocs;
479 u_int64_t depth;
480 uintptr_t addr[MLEAK_STACK_DEPTH];
481 };
482
483 /* Size must be a power of two for the zhash to be able to just mask off bits */
484 #define MLEAK_ALLOCATION_MAP_NUM 512
485 #define MLEAK_TRACE_MAP_NUM 256
486
487 /*
488 * Sample factor for how often to record a trace. This can be overridden
489 * via the boot-arg mleak_sample_factor.
490 */
491 #define MLEAK_SAMPLE_FACTOR 500
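/*
 * Editor's sketch (illustration only, with hypothetical local names; the
 * real logic lives in mleak_logger()/mleak_log()): because the table sizes
 * above are powers of two, a bucket can be picked with a simple mask, and
 * only every MLEAK_SAMPLE_FACTOR-th allocation needs to be recorded:
 *
 *	if ((nallocs++ % sample_factor) != 0)
 *		return;				(not sampled this time)
 *	i = hash & (mleak_alloc_buckets - 1);	(power-of-two mask)
 *	... file the allocation under mleak_allocations[i] ...
 */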
492
493 /*
494 * Number of top leakers recorded.
495 */
496 #define MLEAK_NUM_TRACES 5
497
498 static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
499 static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
500
501 /* Hashmaps of allocations and their corresponding traces */
502 static struct mallocation *mleak_allocations;
503 static struct mtrace *mleak_traces;
504 static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
505
506 /* Lock to protect mleak tables from concurrent modification */
507 static lck_mtx_t *mleak_lock;
508 static lck_attr_t *mleak_lock_attr;
509 static lck_grp_t *mleak_lock_grp;
510 static lck_grp_attr_t *mleak_lock_grp_attr;
511
512 extern u_int32_t high_sb_max;
513
514 /* TODO: should be in header file */
515 int do_reclaim = 0;
516
517 /* The minimum number of objects that are allocated, to start. */
518 #define MINCL 32
519 #define MINBIGCL (MINCL >> 1)
520 #define MIN16KCL (MINCL >> 2)
521
522 /* Low watermarks (only map in pages once free counts go below) */
523 #define MBIGCL_LOWAT MINBIGCL
524 #define M16KCL_LOWAT MIN16KCL
525
526 typedef struct {
527 mbuf_class_t mtbl_class; /* class type */
528 mcache_t *mtbl_cache; /* mcache for this buffer class */
529 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
530 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
531 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
532 u_int32_t mtbl_maxsize; /* maximum buffer size */
533 int mtbl_minlimit; /* minimum allowed */
534 int mtbl_maxlimit; /* maximum allowed */
535 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
536 } mbuf_table_t;
537
538 #define m_class(c) mbuf_table[c].mtbl_class
539 #define m_cache(c) mbuf_table[c].mtbl_cache
540 #define m_slablist(c) mbuf_table[c].mtbl_slablist
541 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
542 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
543 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
544 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
545 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
546 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
547 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
548 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
549 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
550 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
551 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
552 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
553 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
554 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
555 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
556 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
557 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
558
559 static mbuf_table_t mbuf_table[] = {
560 /*
561 * The caches for mbufs, regular clusters and big clusters.
562 */
563 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
564 NULL, NULL, 0, 0, 0, 0 },
565 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
566 NULL, NULL, 0, 0, 0, 0 },
567 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
568 NULL, NULL, 0, 0, 0, 0 },
569 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
570 NULL, NULL, 0, 0, 0, 0 },
571 /*
572 * The following are special caches; they serve as intermediate
573 * caches backed by the above rudimentary caches. Each object
574 * in the cache is an mbuf with a cluster attached to it. Unlike
575 * the above caches, these intermediate caches do not directly
576 * deal with the slab structures; instead, the constructed
577 * cached elements are simply stored in the freelists.
578 */
579 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
580 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
581 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
582 };
583
584 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
585
586 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
587 static int mb_waiters; /* number of waiters */
588
589 #define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
590 static struct timeval mb_wdtstart; /* watchdog start timestamp */
591 static char mbuf_dump_buf[256];
592
593 /*
594 * The mbuf watchdog is enabled by default on embedded platforms. It is
595 * also toggleable via the kern.ipc.mb_watchdog sysctl.
596 */
597 #if CONFIG_EMBEDDED
598 static unsigned int mb_watchdog = 1;
599 #else
600 static unsigned int mb_watchdog = 0;
601 #endif /* CONFIG_EMBEDDED */
602
603 /* The following are used to serialize m_clalloc() */
604 static boolean_t mb_clalloc_busy;
605 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
606 static int mb_clalloc_waiters;
607
608 static void mbuf_mtypes_sync(boolean_t);
609 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
610 static void mbuf_stat_sync(void);
611 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
612 static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
613 static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
614 static char *mbuf_dump(void);
615 static void mbuf_table_init(void);
616 static inline void m_incref(struct mbuf *);
617 static inline u_int32_t m_decref(struct mbuf *);
618 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
619 static void mbuf_worker_thread_init(void);
620 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
621 static void slab_free(mbuf_class_t, mcache_obj_t *);
622 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
623 unsigned int, int);
624 static void mbuf_slab_free(void *, mcache_obj_t *, int);
625 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
626 static void mbuf_slab_notify(void *, u_int32_t);
627 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
628 unsigned int);
629 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
630 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
631 unsigned int, int);
632 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
633 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
634 static int freelist_populate(mbuf_class_t, unsigned int, int);
635 static void freelist_init(mbuf_class_t);
636 static boolean_t mbuf_cached_above(mbuf_class_t, int);
637 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
638 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
639 static int m_howmany(int, size_t);
640 static void mbuf_worker_thread(void);
641 static void mbuf_watchdog(void);
642 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
643
644 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
645 size_t, unsigned int);
646 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
647 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
648 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
649 boolean_t);
650 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
651 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
652 static void mcl_audit_mcheck_panic(struct mbuf *);
653 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
654
655 static void mleak_activate(void);
656 static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
657 static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
658 static void mleak_free(mcache_obj_t *);
659
660 static mcl_slab_t *slab_get(void *);
661 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
662 void *, void *, unsigned int, int, int);
663 static void slab_insert(mcl_slab_t *, mbuf_class_t);
664 static void slab_remove(mcl_slab_t *, mbuf_class_t);
665 static boolean_t slab_inrange(mcl_slab_t *, void *);
666 static void slab_nextptr_panic(mcl_slab_t *, void *);
667 static void slab_detach(mcl_slab_t *);
668 static boolean_t slab_is_detached(mcl_slab_t *);
669
670 static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
671 static struct mbuf *m_split0(struct mbuf *, int, int, int);
672
673 /* flags for m_copyback0 */
674 #define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
675 #define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
676 #define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
677 #define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
678
679 /*
680 * This flag is set for all mbufs that come out of and into the composite
681 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
682 * are marked with such a flag have clusters attached to them, and will be
683 * treated differently when they are freed; instead of being placed back
684 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
685 * are placed back into the appropriate composite cache's freelist, and the
686 * actual freeing is deferred until the composite objects are purged. At
687 * such a time, this flag will be cleared from the mbufs and the objects
688 * will be freed into their own separate freelists.
689 */
690 #define EXTF_COMPOSITE 0x1
691
692 /*
693 * This flag indicates that the external cluster is read-only, i.e. it is
694 * or was referred to by more than one mbuf. Once set, this flag is never
695 * cleared.
696 */
697 #define EXTF_READONLY 0x2
698 #define EXTF_MASK (EXTF_COMPOSITE | EXTF_READONLY)
699
700 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
701 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
702 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
703 #define MBUF_IS_COMPOSITE(m) \
704 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
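/*
 * Editor's sketch of the free path implied by the comments above
 * (illustration only; the authoritative logic is in m_free() and the
 * cslab routines):
 *
 *	if (m_decref(m) == 0) {
 *		if (MBUF_IS_COMPOSITE(m))
 *			... mbuf and cluster go back, still attached, to
 *			    their composite cache's freelist ...
 *		else
 *			... mbuf and cluster are freed separately into
 *			    their own caches ...
 *	}
 */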
705
706 /*
707 * Macros used to verify the integrity of the mbuf.
708 */
709 #define _MCHECK(m) { \
710 if ((m)->m_type != MT_FREE) { \
711 if (mclaudit == NULL) \
712 panic("MCHECK: m_type=%d m=%p", \
713 (u_int16_t)(m)->m_type, m); \
714 else \
715 mcl_audit_mcheck_panic(m); \
716 } \
717 }
718
719 #define MBUF_IN_MAP(addr) \
720 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
721
722 #define MRANGE(addr) { \
723 if (!MBUF_IN_MAP(addr)) \
724 panic("MRANGE: address out of range 0x%p", addr); \
725 }
726
727 /*
728 * Macro version of mtod.
729 */
730 #define MTOD(m, t) ((t)((m)->m_data))
731
732 /*
733 * Macros to obtain (4KB) cluster index and base cluster address.
734 */
735
736 #define MTOBG(x) (((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
737 #define BGTOM(x) ((union mbigcluster *)(mbutl + (x)))
738
739 /*
740 * Macro to find the mbuf index relative to a base.
741 */
742 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> MSIZESHIFT)
743
744 /*
745 * Same thing for 2KB cluster index.
746 */
747 #define CLBGIDX(c, m) (((char *)(m) - (char *)(c)) >> MCLSHIFT)
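/*
 * Editor's worked example (assumes MSIZE == 256, MCLBYTES == 2048 and 4KB
 * big clusters): for an mbuf sitting 1536 bytes into its 4KB cluster,
 * MCLIDX() yields 1536 >> MSIZESHIFT == 6 (the 7th of the NMBPBG mbuf
 * slots), while CLBGIDX() yields 1536 >> MCLSHIFT == 0 (the first of the
 * two 2KB halves).
 */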
748
749 /*
750 * Macros used during mbuf and cluster initialization.
751 */
752 #define MBUF_INIT(m, pkthdr, type) { \
753 _MCHECK(m); \
754 (m)->m_next = (m)->m_nextpkt = NULL; \
755 (m)->m_len = 0; \
756 (m)->m_type = type; \
757 if ((pkthdr) == 0) { \
758 (m)->m_data = (m)->m_dat; \
759 (m)->m_flags = 0; \
760 } else { \
761 (m)->m_data = (m)->m_pktdat; \
762 (m)->m_flags = M_PKTHDR; \
763 (m)->m_pkthdr.rcvif = NULL; \
764 (m)->m_pkthdr.len = 0; \
765 (m)->m_pkthdr.header = NULL; \
766 (m)->m_pkthdr.csum_flags = 0; \
767 (m)->m_pkthdr.csum_data = 0; \
768 (m)->m_pkthdr.tso_segsz = 0; \
769 (m)->m_pkthdr.vlan_tag = 0; \
770 (m)->m_pkthdr.socket_id = 0; \
771 (m)->m_pkthdr.vt_nrecs = 0; \
772 m_tag_init(m); \
773 m_prio_init(m); \
774 } \
775 }
776
777 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
778 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
779 (m)->m_flags |= M_EXT; \
780 (m)->m_ext.ext_size = (size); \
781 (m)->m_ext.ext_free = (free); \
782 (m)->m_ext.ext_arg = (arg); \
783 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
784 &(m)->m_ext.ext_refs; \
785 MEXT_RFA(m) = (rfa); \
786 MEXT_REF(m) = (ref); \
787 MEXT_FLAGS(m) = (flag); \
788 }
789
790 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
791 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
792
793 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
794 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
795
796 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
797 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
798
799 /*
800 * Macro to convert BSD malloc sleep flag to mcache's
801 */
802 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
803
804 /*
805 * The structure that holds all mbuf class statistics exportable via sysctl.
806 * Similar to mbstat structure, the mb_stat structure is protected by the
807 * global mbuf lock. It contains additional information about the classes
808 * that allows for a more accurate view of the state of the allocator.
809 */
810 struct mb_stat *mb_stat;
811 struct omb_stat *omb_stat; /* For backwards compatibility */
812
813 #define MB_STAT_SIZE(n) \
814 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
815 #define OMB_STAT_SIZE(n) \
816 ((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
817
818 /*
819 * The legacy structure holding all of the mbuf allocation statistics.
820 * The actual statistics used by the kernel are stored in the mbuf_table
821 * instead, and are updated atomically while the global mbuf lock is held.
822 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
823 * Unlike before, the kernel no longer relies on the contents of mbstat for
824 * its operations (e.g. cluster expansion) because the structure is exposed
825 * to outside and could possibly be modified, therefore making it unsafe.
826 * With the exception of the mbstat.m_mtypes array (see below), all of the
827 * statistics are updated as they change.
828 */
829 struct mbstat mbstat;
830
831 #define MBSTAT_MTYPES_MAX \
832 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
833
834 /*
835 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
836 * atomically and stored in a per-CPU structure which is lock-free; this is
837 * done in order to avoid writing to the global mbstat data structure which
838 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
839 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
840 * array and returned to the application. Any updates for types greater than
841 * or equal to MT_MAX are done atomically on mbstat itself; this slows down
842 * performance but is okay since the kernel uses only up to MT_MAX-1 while
843 * anything beyond that (up to type 255) is considered a corner case.
844 */
845 typedef struct {
846 unsigned int cpu_mtypes[MT_MAX];
847 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
848
849 typedef struct {
850 mtypes_cpu_t mbs_cpu[1];
851 } mbuf_mtypes_t;
852
853 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
854
855 #define MBUF_MTYPES_SIZE(n) \
856 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
857
858 #define MTYPES_CPU(p) \
859 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
860
861 #define mtype_stat_add(type, n) { \
862 if ((unsigned)(type) < MT_MAX) { \
863 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
864 atomic_add_32(&mbs->cpu_mtypes[type], n); \
865 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
866 atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n); \
867 } \
868 }
869
870 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
871 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
872 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
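/*
 * Editor's note, usage illustration only: an allocation path typically
 * bumps the per-CPU counter for its mbuf type and the free path undoes
 * it, e.g. roughly:
 *
 *	mtype_stat_inc(MT_DATA);	(when handing out an MT_DATA mbuf)
 *	...
 *	mtype_stat_dec(MT_DATA);	(in m_free(), before marking MT_FREE)
 *	mtype_stat_inc(MT_FREE);
 */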
873
874 static void
875 mbuf_mtypes_sync(boolean_t locked)
876 {
877 int m, n;
878 mtypes_cpu_t mtc;
879
880 if (locked)
881 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
882
883 bzero(&mtc, sizeof (mtc));
884 for (m = 0; m < ncpu; m++) {
885 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
886 mtypes_cpu_t temp;
887
888 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
889 sizeof (temp.cpu_mtypes));
890
891 for (n = 0; n < MT_MAX; n++)
892 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
893 }
894 if (!locked)
895 lck_mtx_lock(mbuf_mlock);
896 for (n = 0; n < MT_MAX; n++)
897 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
898 if (!locked)
899 lck_mtx_unlock(mbuf_mlock);
900 }
901
902 static int
903 mbstat_sysctl SYSCTL_HANDLER_ARGS
904 {
905 #pragma unused(oidp, arg1, arg2)
906 mbuf_mtypes_sync(FALSE);
907
908 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
909 }
910
911 static void
912 mbuf_stat_sync(void)
913 {
914 mb_class_stat_t *sp;
915 mcache_cpu_t *ccp;
916 mcache_t *cp;
917 int k, m, bktsize;
918
919 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
920
921 for (k = 0; k < NELEM(mbuf_table); k++) {
922 cp = m_cache(k);
923 ccp = &cp->mc_cpu[0];
924 bktsize = ccp->cc_bktsize;
925 sp = mbuf_table[k].mtbl_stats;
926
927 if (cp->mc_flags & MCF_NOCPUCACHE)
928 sp->mbcl_mc_state = MCS_DISABLED;
929 else if (cp->mc_purge_cnt > 0)
930 sp->mbcl_mc_state = MCS_PURGING;
931 else if (bktsize == 0)
932 sp->mbcl_mc_state = MCS_OFFLINE;
933 else
934 sp->mbcl_mc_state = MCS_ONLINE;
935
936 sp->mbcl_mc_cached = 0;
937 for (m = 0; m < ncpu; m++) {
938 ccp = &cp->mc_cpu[m];
939 if (ccp->cc_objs > 0)
940 sp->mbcl_mc_cached += ccp->cc_objs;
941 if (ccp->cc_pobjs > 0)
942 sp->mbcl_mc_cached += ccp->cc_pobjs;
943 }
944 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
945 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
946 sp->mbcl_infree;
947
948 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
949 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
950 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
951
952 /* Calculate total count specific to each class */
953 sp->mbcl_ctotal = sp->mbcl_total;
954 switch (m_class(k)) {
955 case MC_MBUF:
956 /* Deduct mbufs used in composite caches */
957 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
958 m_total(MC_MBUF_BIGCL));
959 break;
960
961 case MC_CL:
962 /* Deduct clusters used in composite cache */
963 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
964 break;
965
966 case MC_BIGCL:
967 /* Deduct clusters used in composite cache */
968 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
969 break;
970
971 case MC_16KCL:
972 /* Deduct clusters used in composite cache */
973 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
974 break;
975
976 default:
977 break;
978 }
979 }
980 }
981
982 static int
983 mb_stat_sysctl SYSCTL_HANDLER_ARGS
984 {
985 #pragma unused(oidp, arg1, arg2)
986 void *statp;
987 int k, statsz, proc64 = proc_is64bit(req->p);
988
989 lck_mtx_lock(mbuf_mlock);
990 mbuf_stat_sync();
991
992 if (!proc64) {
993 struct omb_class_stat *oc;
994 struct mb_class_stat *c;
995
996 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
997 oc = &omb_stat->mbs_class[0];
998 c = &mb_stat->mbs_class[0];
999 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1000 (void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1001 "%s", c->mbcl_cname);
1002 oc->mbcl_size = c->mbcl_size;
1003 oc->mbcl_total = c->mbcl_total;
1004 oc->mbcl_active = c->mbcl_active;
1005 oc->mbcl_infree = c->mbcl_infree;
1006 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1007 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1008 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1009 oc->mbcl_notified = c->mbcl_notified;
1010 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1011 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1012 oc->mbcl_ctotal = c->mbcl_ctotal;
1013 oc->mbcl_mc_state = c->mbcl_mc_state;
1014 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1015 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1016 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1017 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1018 }
1019 statp = omb_stat;
1020 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1021 } else {
1022 statp = mb_stat;
1023 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1024 }
1025
1026 lck_mtx_unlock(mbuf_mlock);
1027
1028 return (SYSCTL_OUT(req, statp, statsz));
1029 }
1030
1031 static int
1032 mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1033 {
1034 #pragma unused(oidp, arg1, arg2)
1035 mleak_trace_stat_t *mltr;
1036 int i;
1037
1038 /* Ensure leak tracing turned on */
1039 if (!mclfindleak)
1040 return (ENXIO);
1041
1042 VERIFY(mleak_stat != NULL);
1043 #ifdef __LP64__
1044 VERIFY(mleak_stat->ml_isaddr64);
1045 #else
1046 VERIFY(!mleak_stat->ml_isaddr64);
1047 #endif /* !__LP64__ */
1048 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
1049
1050 lck_mtx_lock(mleak_lock);
1051 mltr = &mleak_stat->ml_trace[0];
1052 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
1053 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
1054 int j;
1055
1056 if (mleak_top_trace[i] == NULL ||
1057 mleak_top_trace[i]->allocs == 0)
1058 continue;
1059
1060 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
1061 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
1062 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
1063 mltr->mltr_depth = mleak_top_trace[i]->depth;
1064
1065 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
1066 for (j = 0; j < mltr->mltr_depth; j++)
1067 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
1068
1069 mltr++;
1070 }
1071 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1072 lck_mtx_unlock(mleak_lock);
1073
1074 return (i);
1075 }
1076
1077 static int
1078 mleak_table_sysctl SYSCTL_HANDLER_ARGS
1079 {
1080 #pragma unused(oidp, arg1, arg2)
1081 int i = 0;
1082
1083 /* Ensure leak tracing turned on */
1084 if (!mclfindleak)
1085 return (ENXIO);
1086
1087 lck_mtx_lock(mleak_lock);
1088 i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1089 lck_mtx_unlock(mleak_lock);
1090
1091 return (i);
1092 }
1093
1094 static inline void
1095 m_incref(struct mbuf *m)
1096 {
1097 UInt32 old, new;
1098 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1099
1100 do {
1101 old = *addr;
1102 new = old + 1;
1103 ASSERT(new != 0);
1104 } while (!OSCompareAndSwap(old, new, addr));
1105
1106 /*
1107 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1108 * we don't clear the flag when the refcount goes back to 1
1109 * to simplify code calling m_mclhasreference().
1110 */
1111 if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1112 (void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1113 }
1114
1115 static inline u_int32_t
1116 m_decref(struct mbuf *m)
1117 {
1118 UInt32 old, new;
1119 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1120
1121 do {
1122 old = *addr;
1123 new = old - 1;
1124 ASSERT(old != 0);
1125 } while (!OSCompareAndSwap(old, new, addr));
1126
1127 return (new);
1128 }
1129
1130 static void
1131 mbuf_table_init(void)
1132 {
1133 unsigned int b, c, s;
1134 int m;
1135
1136 MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1137 M_TEMP, M_WAITOK | M_ZERO);
1138 VERIFY(omb_stat != NULL);
1139
1140 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1141 M_TEMP, M_WAITOK | M_ZERO);
1142 VERIFY(mb_stat != NULL);
1143
1144 mb_stat->mbs_cnt = NELEM(mbuf_table);
1145 for (m = 0; m < NELEM(mbuf_table); m++)
1146 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1147
1148 #if CONFIG_MBUF_JUMBO
1149 /*
1150 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1151 * this only on platforms where the jumbo cluster pool is enabled.
1152 */
1153 njcl = nmbclusters / 3;
1154 njclbytes = M16KCLBYTES;
1155 #endif /* CONFIG_MBUF_JUMBO */
1156
1157 /*
1158 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1159 * a multiple of 4KB clusters.
1160 */
1161 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1162 if (njcl > 0) {
1163 /*
1164 * Each jumbo cluster takes 8 2KB clusters, so make
1165 * sure that the pool size is evenly divisible by 8;
1166 * njcl is in 2KB units, hence treated as such.
1167 */
1168 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1169
1170 /* Update nclusters with rounded down value of njcl */
1171 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1172 }
1173
1174 /*
1175 * njcl is valid only on platforms with 16KB jumbo clusters, where
1176 * it is configured to 1/3 of the pool size. On these platforms,
1177 * the remainder is used for 2KB and 4KB clusters. On platforms
1178 * without 16KB jumbo clusters, the entire pool is used for both
1179 * 2KB and 4KB clusters. A 4KB cluster can either be split into
1180 * 16 mbufs, or into 2 2KB clusters.
1181 *
1182 * +---+---+------------ ... -----------+------- ... -------+
1183 * | c | b | s | njcl |
1184 * +---+---+------------ ... -----------+------- ... -------+
1185 *
1186 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
1187 * clusters (1/64th each.)
1188 */
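/*
 * Editor's worked example (illustration only; assumes nmbclusters == 32768,
 * i.e. a 64MB pool, no jumbo pool and NCLPBG == 2): c becomes 512 2KB
 * clusters (1MB), b becomes 256 4KB clusters (1MB), and s becomes
 * 32768 - (512 + 512) == 31744 2KB-cluster equivalents (62MB), matching
 * the 1/64 + 1/64 + 31/32 split described above.
 */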
1189 c = P2ROUNDDOWN((nclusters >> 6), 2); /* in 2KB unit */
1190 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1191 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
1192
1193 /*
1194 * 1/64th (c) is reserved for 2KB clusters.
1195 */
1196 m_minlimit(MC_CL) = c;
1197 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
1198 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1199 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1200
1201 /*
1202 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1203 * It cannot be turned into 2KB clusters or mbufs.
1204 */
1205 m_minlimit(MC_BIGCL) = b;
1206 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
1207 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1208 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1209
1210 /*
1211 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB)
1212 */
1213 m_minlimit(MC_MBUF) = 0;
1214 m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT); /* in mbuf unit */
1215 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1216 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1217
1218 /*
1219 * Set limits for the composite classes.
1220 */
1221 m_minlimit(MC_MBUF_CL) = 0;
1222 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1223 m_maxsize(MC_MBUF_CL) = MCLBYTES;
1224 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1225 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1226
1227 m_minlimit(MC_MBUF_BIGCL) = 0;
1228 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1229 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1230 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1231 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1232
1233 /*
1234 * And for jumbo classes.
1235 */
1236 m_minlimit(MC_16KCL) = 0;
1237 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
1238 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1239 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1240
1241 m_minlimit(MC_MBUF_16KCL) = 0;
1242 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1243 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1244 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1245 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1246
1247 /*
1248 * Initialize the legacy mbstat structure.
1249 */
1250 bzero(&mbstat, sizeof (mbstat));
1251 mbstat.m_msize = m_maxsize(MC_MBUF);
1252 mbstat.m_mclbytes = m_maxsize(MC_CL);
1253 mbstat.m_minclsize = MINCLSIZE;
1254 mbstat.m_mlen = MLEN;
1255 mbstat.m_mhlen = MHLEN;
1256 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1257 }
1258
1259 #if defined(__LP64__)
1260 typedef struct ncl_tbl {
1261 uint64_t nt_maxmem; /* memory (sane) size */
1262 uint32_t nt_mbpool; /* mbuf pool size */
1263 } ncl_tbl_t;
1264
1265 /* Non-server */
1266 static ncl_tbl_t ncl_table[] = {
1267 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
1268 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (96 << MBSHIFT) /* 96 MB */ },
1269 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (128 << MBSHIFT) /* 128 MB */ },
1270 { 0, 0 }
1271 };
1272
1273 /* Server */
1274 static ncl_tbl_t ncl_table_srv[] = {
1275 { (1ULL << GBSHIFT) /* 1 GB */, (96 << MBSHIFT) /* 96 MB */ },
1276 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (128 << MBSHIFT) /* 128 MB */ },
1277 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (160 << MBSHIFT) /* 160 MB */ },
1278 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (192 << MBSHIFT) /* 192 MB */ },
1279 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (256 << MBSHIFT) /* 256 MB */ },
1280 { (1ULL << (GBSHIFT + 6)) /* 64 GB */, (384 << MBSHIFT) /* 384 MB */ },
1281 { 0, 0 }
1282 };
1283 #endif /* __LP64__ */
1284
1285 __private_extern__ unsigned int
1286 mbuf_default_ncl(int server, uint64_t mem)
1287 {
1288 #if !defined(__LP64__)
1289 #pragma unused(server)
1290 unsigned int n;
1291 /*
1292 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1293 */
1294 if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1295 n = 32768;
1296 #else
1297 unsigned int n, i;
1298 ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1299 /*
1300 * 64-bit kernel (mbuf pool size based on table).
1301 */
1302 n = tbl[0].nt_mbpool;
1303 for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1304 if (mem < tbl[i].nt_maxmem)
1305 break;
1306 n = tbl[i].nt_mbpool;
1307 }
1308 n >>= MCLSHIFT;
1309 #endif /* !__LP64__ */
1310 return (n);
1311 }
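/*
 * Editor's worked example (illustration only): on a 64-bit non-server
 * configuration with 12GB of memory, the table walk above stops at the
 * 16GB row, leaving n at the 8GB row's 96MB; shifting right by MCLSHIFT
 * (2KB clusters) then returns 96MB / 2KB == 49152 clusters.
 */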
1312
1313 __private_extern__ void
1314 mbinit(void)
1315 {
1316 unsigned int m;
1317 unsigned int initmcl = 0;
1318 void *buf;
1319 thread_t thread = THREAD_NULL;
1320
1321 if (nmbclusters == 0)
1322 nmbclusters = NMBCLUSTERS;
1323
1324 /* This should be a sane (at least even) value by now */
1325 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1326
1327 /* Setup the mbuf table */
1328 mbuf_table_init();
1329
1330 /* Global lock for common layer */
1331 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1332 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1333 mbuf_mlock_attr = lck_attr_alloc_init();
1334 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1335
1336 /*
1337 * Allocate cluster slabs table:
1338 *
1339 * maxslabgrp = (N * 2048) / (1024 * 1024)
1340 *
1341 * Where N is nmbclusters rounded up to the nearest 512. This yields
1342 * mcl_slabg_t units, each one representing 1MB of memory.
1343 */
1344 maxslabgrp =
1345 (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1346 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1347 M_TEMP, M_WAITOK | M_ZERO);
1348 VERIFY(slabstbl != NULL);
1349
1350 /*
1351 * Allocate audit structures, if needed:
1352 *
1353 * maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1354 *
1355 * This yields mcl_audit_t units, each one representing a page.
1356 */
1357 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1358 mbuf_debug |= mcache_getflags();
1359 if (mbuf_debug & MCF_DEBUG) {
1360 maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1361 MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1362 M_TEMP, M_WAITOK | M_ZERO);
1363 VERIFY(mclaudit != NULL);
1364
1365 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1366 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1367 VERIFY(mcl_audit_con_cache != NULL);
1368 }
1369 mclverify = (mbuf_debug & MCF_VERIFY);
1370 mcltrace = (mbuf_debug & MCF_TRACE);
1371 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1372
1373 /* Enable mbuf leak logging, with a lock to protect the tables */
1374
1375 mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1376 mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1377 mleak_lock_attr = lck_attr_alloc_init();
1378 mleak_lock = lck_mtx_alloc_init(mleak_lock_grp, mleak_lock_attr);
1379
1380 mleak_activate();
1381
1382 /* Calculate the number of pages assigned to the cluster pool */
1383 mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1384 MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1385 M_TEMP, M_WAITOK);
1386 VERIFY(mcl_paddr != NULL);
1387
1388 /* Register with the I/O Bus mapper */
1389 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1390 bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1391
1392 embutl = (union mbigcluster *)
1393 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1394 VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1395
1396 /* Prime up the freelist */
1397 PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1398 if (initmcl != 0) {
1399 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
1400 if (initmcl > m_maxlimit(MC_BIGCL))
1401 initmcl = m_maxlimit(MC_BIGCL);
1402 }
1403 if (initmcl < m_minlimit(MC_BIGCL))
1404 initmcl = m_minlimit(MC_BIGCL);
1405
1406 lck_mtx_lock(mbuf_mlock);
1407
1408 /*
1409 * For classes with non-zero minimum limits, populate their freelists
1410 * so that m_total(class) is at least m_minlimit(class).
1411 */
1412 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1413 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1414 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1415 freelist_init(m_class(MC_CL));
1416
1417 for (m = 0; m < NELEM(mbuf_table); m++) {
1418 /* Make sure we didn't miss any */
1419 VERIFY(m_minlimit(m_class(m)) == 0 ||
1420 m_total(m_class(m)) >= m_minlimit(m_class(m)));
1421 }
1422
1423 lck_mtx_unlock(mbuf_mlock);
1424
1425 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1426 NULL, &thread);
1427 thread_deallocate(thread);
1428
1429 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1430 0, 0, MCR_SLEEP);
1431
1432 /* Create the cache for each class */
1433 for (m = 0; m < NELEM(mbuf_table); m++) {
1434 void *allocfunc, *freefunc, *auditfunc, *logfunc;
1435 u_int32_t flags;
1436
1437 flags = mbuf_debug;
1438 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1439 m_class(m) == MC_MBUF_16KCL) {
1440 allocfunc = mbuf_cslab_alloc;
1441 freefunc = mbuf_cslab_free;
1442 auditfunc = mbuf_cslab_audit;
1443 logfunc = mleak_logger;
1444 } else {
1445 allocfunc = mbuf_slab_alloc;
1446 freefunc = mbuf_slab_free;
1447 auditfunc = mbuf_slab_audit;
1448 logfunc = mleak_logger;
1449 }
1450
1451 /*
1452 * Disable per-CPU caches for jumbo classes if there
1453 * is no jumbo cluster pool available in the system.
1454 * The cache itself is still created (but will never
1455 * be populated) since it simplifies the code.
1456 */
1457 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1458 njcl == 0)
1459 flags |= MCF_NOCPUCACHE;
1460
1461 if (!mclfindleak)
1462 flags |= MCF_NOLEAKLOG;
1463
1464 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1465 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1466 (void *)(uintptr_t)m, flags, MCR_SLEEP);
1467 }
1468
1469 /*
1470 * Allocate structure for per-CPU statistics that's aligned
1471 * on the CPU cache boundary; this code assumes that we never
1472 * uninitialize this framework, since the original address
1473 * before alignment is not saved.
1474 */
1475 ncpu = ml_get_max_cpus();
1476 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1477 M_TEMP, M_WAITOK);
1478 VERIFY(buf != NULL);
1479
1480 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1481 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1482
1483 /*
1484 * Set the max limit on sb_max to be 1/16th of the size of
1485 * memory allocated for mbuf clusters.
1486 */
1487 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1488 if (high_sb_max < sb_max) {
1489 /* sb_max is too large for this configuration, scale it down */
1490 if (high_sb_max > (1 << MBSHIFT)) {
1491 /* We have at least 16MB of mbuf pool */
1492 sb_max = high_sb_max;
1493 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1494 /*
1495 * If we have more than 1MB of mbuf pool, cap the size of
1496 * the max sock buf at 1MB
1497 */
1498 sb_max = high_sb_max = (1 << MBSHIFT);
1499 } else {
1500 sb_max = high_sb_max;
1501 }
1502 }
1503
1504 printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1505 (nmbclusters << MCLSHIFT) >> MBSHIFT,
1506 (nclusters << MCLSHIFT) >> MBSHIFT,
1507 (njcl << MCLSHIFT) >> MBSHIFT);
1508 }
1509
1510 /*
1511 * Obtain a slab of object(s) from the class's freelist.
1512 */
1513 static mcache_obj_t *
1514 slab_alloc(mbuf_class_t class, int wait)
1515 {
1516 mcl_slab_t *sp;
1517 mcache_obj_t *buf;
1518
1519 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1520
1521 VERIFY(class != MC_16KCL || njcl > 0);
1522
1523 /* This should always be NULL for us */
1524 VERIFY(m_cobjlist(class) == NULL);
1525
1526 /*
1527 * Treat composite objects as having a longer lifespan by using
1528 * a slab from the reverse direction, in the hope that this could
1529 * reduce the probability of fragmentation for slabs that hold
1530 * more than one buffer chunk (e.g. mbuf slabs). For other
1531 * slabs, this probably doesn't make much of a difference.
1532 */
1533 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1534 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1535 else
1536 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1537
1538 if (sp == NULL) {
1539 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1540 /* The slab list for this class is empty */
1541 return (NULL);
1542 }
1543
1544 VERIFY(m_infree(class) > 0);
1545 VERIFY(!slab_is_detached(sp));
1546 VERIFY(sp->sl_class == class &&
1547 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1548 buf = sp->sl_head;
1549 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1550
1551 if (class == MC_MBUF) {
1552 sp->sl_head = buf->obj_next;
1553 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1554 } else if (class == MC_CL) {
1555 sp->sl_head = buf->obj_next;
1556 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1557 } else {
1558 sp->sl_head = NULL;
1559 }
1560 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1561 slab_nextptr_panic(sp, sp->sl_head);
1562 /* In case sl_head is in the map but not in the slab */
1563 VERIFY(slab_inrange(sp, sp->sl_head));
1564 /* NOTREACHED */
1565 }
1566
1567 /* Increment slab reference */
1568 sp->sl_refcnt++;
1569
1570 if (mclaudit != NULL) {
1571 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1572 mca->mca_uflags = 0;
1573 /* Save contents on mbuf objects only */
1574 if (class == MC_MBUF)
1575 mca->mca_uflags |= MB_SCVALID;
1576 }
1577
1578 if (class == MC_CL) {
1579 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1580 /*
1581 * A 2K cluster slab can have at most NCLPBG references.
1582 */
1583 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1584 sp->sl_chunks == NCLPBG &&
1585 sp->sl_len == m_maxsize(MC_BIGCL));
1586 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1587 } else if (class == MC_BIGCL) {
1588 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1589 m_infree(MC_MBUF_BIGCL);
1590 /*
1591 * A 4K cluster slab can have at most 1 reference.
1592 */
1593 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1594 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1595 } else if (class == MC_16KCL) {
1596 mcl_slab_t *nsp;
1597 int k;
1598
1599 --m_infree(MC_16KCL);
1600 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1601 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1602 /*
1603 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1604 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1605 * most 1 reference.
1606 */
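/*
 * Concretely (a sketch assuming 4KB pages, so NSLABSP16KB == 4): a
 * single 16KB cluster spans 4 page-sized slabs.  Only the first slab
 * carries the real metadata (SLF_MAPPED, sl_len covering the whole
 * cluster); the trailing slabs are SLF_MAPPED|SLF_PARTIAL placeholders
 * whose sl_base points back at the same cluster, and the loop below
 * bumps each of their reference counts from 0 to 1.
 */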
1607 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1608 nsp = nsp->sl_next;
1609 /* Next slab must already be present */
1610 VERIFY(nsp != NULL);
1611 nsp->sl_refcnt++;
1612 VERIFY(!slab_is_detached(nsp));
1613 VERIFY(nsp->sl_class == MC_16KCL &&
1614 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1615 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1616 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1617 nsp->sl_head == NULL);
1618 }
1619 } else {
1620 VERIFY(class == MC_MBUF);
1621 --m_infree(MC_MBUF);
1622 /*
1623 * If auditing is turned on, this check is
1624 * deferred until later in mbuf_slab_audit().
1625 */
1626 if (mclaudit == NULL)
1627 _MCHECK((struct mbuf *)buf);
1628 /*
1629 * Since we have incremented the reference count above,
1630 * an mbuf slab (formerly a 4KB cluster slab that was cut
1631 * up into mbufs) must have a reference count between 1
1632 * and NMBPBG at this point.
1633 */
1634 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1635 sp->sl_chunks == NMBPBG &&
1636 sp->sl_len == m_maxsize(MC_BIGCL));
1637 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1638 }
1639
1640 /* If empty, remove this slab from the class's freelist */
1641 if (sp->sl_head == NULL) {
1642 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1643 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1644 slab_remove(sp, class);
1645 }
1646
1647 return (buf);
1648 }
1649
1650 /*
1651 * Place a slab of object(s) back into a class's slab list.
1652 */
1653 static void
1654 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1655 {
1656 mcl_slab_t *sp;
1657
1658 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1659
1660 VERIFY(class != MC_16KCL || njcl > 0);
1661 VERIFY(buf->obj_next == NULL);
1662 sp = slab_get(buf);
1663 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1664 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1665
1666 /* Decrement slab reference */
1667 sp->sl_refcnt--;
1668
1669 if (class == MC_CL) {
1670 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1671 /*
1672 * A slab that has been split into 2KB clusters can have at
1673 * most NCLPBG - 1 outstanding references at this point.
1674 */
1675 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1676 sp->sl_chunks == NCLPBG &&
1677 sp->sl_len == m_maxsize(MC_BIGCL));
1678 VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1679 (slab_is_detached(sp) && sp->sl_head == NULL));
1680 } else if (class == MC_BIGCL) {
1681 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1682 /*
1683 * A 4KB cluster slab can have at most 1 reference
1684 * which must be 0 at this point.
1685 */
1686 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1687 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1688 VERIFY(slab_is_detached(sp));
1689 } else if (class == MC_16KCL) {
1690 mcl_slab_t *nsp;
1691 int k;
1692 /*
1693 * A 16KB cluster takes NSLABSP16KB slabs, all of which
1694 * must now have a reference count of 0.
1695 */
1696 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1697 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1698 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1699 VERIFY(slab_is_detached(sp));
1700 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1701 nsp = nsp->sl_next;
1702 /* Next slab must already be present */
1703 VERIFY(nsp != NULL);
1704 nsp->sl_refcnt--;
1705 VERIFY(slab_is_detached(nsp));
1706 VERIFY(nsp->sl_class == MC_16KCL &&
1707 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1708 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1709 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1710 nsp->sl_head == NULL);
1711 }
1712 } else {
1713 /*
1714 * A slab that has been split into mbufs can have at most NMBPBG
1715 * references. Since we have decremented one reference
1716 * above, it must now be between 0 and NMBPBG-1.
1717 */
1718 VERIFY(class == MC_MBUF);
1719 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1720 sp->sl_chunks == NMBPBG &&
1721 sp->sl_len == m_maxsize(MC_BIGCL));
1722 VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1723 (slab_is_detached(sp) && sp->sl_head == NULL));
1724 }
1725
1726 /*
1727 * When auditing is enabled, ensure that the buffer still
1728 * contains the free pattern. Otherwise it got corrupted
1729 * while at the CPU cache layer.
1730 */
1731 if (mclaudit != NULL) {
1732 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1733 if (mclverify) {
1734 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1735 }
1736 mca->mca_uflags &= ~MB_SCVALID;
1737 }
1738
1739 if (class == MC_CL) {
1740 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1741 buf->obj_next = sp->sl_head;
1742 } else if (class == MC_BIGCL) {
1743 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1744 m_infree(MC_MBUF_BIGCL);
1745 } else if (class == MC_16KCL) {
1746 ++m_infree(MC_16KCL);
1747 } else {
1748 ++m_infree(MC_MBUF);
1749 buf->obj_next = sp->sl_head;
1750 }
1751 sp->sl_head = buf;
1752
1753 /*
1754 * If a slab has been split into either 2KB clusters or mbufs,
1755 * turn it back into one that holds a single 4KB
1756 * cluster.
1757 */
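/*
 * For scale (a sketch assuming 4KB pages, 256-byte mbufs and 2KB
 * clusters, i.e. NMBPBG == 16 and NCLPBG == 2): the conversion below
 * only fires once every mbuf (or every 2KB cluster) carved out of the
 * page is back on the slab, at which point the page is coalesced into
 * a single free 4KB cluster and the per-class totals are moved from
 * MC_MBUF/MC_CL over to MC_BIGCL.
 */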
1758 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1759 m_total(class) > m_minlimit(class) &&
1760 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1761 int i = NMBPBG;
1762
1763 m_total(MC_BIGCL)++;
1764 mbstat.m_bigclusters = m_total(MC_BIGCL);
1765 m_total(MC_MBUF) -= NMBPBG;
1766 mbstat.m_mbufs = m_total(MC_MBUF);
1767 m_infree(MC_MBUF) -= NMBPBG;
1768 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1769
1770 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1771 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1772
1773 while (i--) {
1774 struct mbuf *m = sp->sl_head;
1775 VERIFY(m != NULL);
1776 sp->sl_head = m->m_next;
1777 m->m_next = NULL;
1778 }
1779 VERIFY(sp->sl_head == NULL);
1780
1781 /* Remove the slab from the mbuf class's slab list */
1782 slab_remove(sp, class);
1783
1784 /* Reinitialize it as a 4KB cluster slab */
1785 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1786 sp->sl_len, 0, 1);
1787
1788 if (mclverify) {
1789 mcache_set_pattern(MCACHE_FREE_PATTERN,
1790 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1791 }
1792 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1793 m_infree(MC_MBUF_BIGCL);
1794
1795 VERIFY(slab_is_detached(sp));
1796 /* And finally switch class */
1797 class = MC_BIGCL;
1798 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
1799 m_total(class) > m_minlimit(class) &&
1800 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1801 int i = NCLPBG;
1802
1803 m_total(MC_BIGCL)++;
1804 mbstat.m_bigclusters = m_total(MC_BIGCL);
1805 m_total(MC_CL) -= NCLPBG;
1806 mbstat.m_clusters = m_total(MC_CL);
1807 m_infree(MC_CL) -= NCLPBG;
1808 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1809 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1810
1811 while (i--) {
1812 union mcluster *c = sp->sl_head;
1813 VERIFY(c != NULL);
1814 sp->sl_head = c->mcl_next;
1815 c->mcl_next = NULL;
1816 }
1817 VERIFY(sp->sl_head == NULL);
1818
1819 /* Remove the slab from the 2KB cluster class's slab list */
1820 slab_remove(sp, class);
1821
1822 /* Reinitialize it as a 4KB cluster slab */
1823 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1824 sp->sl_len, 0, 1);
1825
1826 if (mclverify) {
1827 mcache_set_pattern(MCACHE_FREE_PATTERN,
1828 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1829 }
1830 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1831 m_infree(MC_MBUF_BIGCL);
1832
1833 VERIFY(slab_is_detached(sp));
1834 /* And finally switch class */
1835 class = MC_BIGCL;
1836 }
1837
1838 /* Reinsert the slab to the class's slab list */
1839 if (slab_is_detached(sp))
1840 slab_insert(sp, class);
1841 }
1842
1843 /*
1844 * Common allocator for rudimentary objects called by the CPU cache layer
1845 * during an allocation request whenever there is no available element in the
1846 * bucket layer. It returns one or more elements from the appropriate global
1847 * freelist. If the freelist is empty, it will attempt to populate it and
1848 * retry the allocation.
1849 */
1850 static unsigned int
1851 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1852 {
1853 mbuf_class_t class = (mbuf_class_t)arg;
1854 unsigned int need = num;
1855 mcache_obj_t **list = *plist;
1856
1857 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1858 ASSERT(need > 0);
1859
1860 lck_mtx_lock(mbuf_mlock);
1861
1862 for (;;) {
1863 if ((*list = slab_alloc(class, wait)) != NULL) {
1864 (*list)->obj_next = NULL;
1865 list = *plist = &(*list)->obj_next;
1866
1867 if (--need == 0) {
1868 /*
1869 * If the number of elements in the freelist has
1870 * dropped below the low watermark, asynchronously
1871 * populate the freelist now rather than doing
1872 * it later when we run out of elements.
1873 */
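/*
 * Illustrative numbers only (assumed, not from any real
 * configuration): with m_total(class) == 4096, the low
 * watermark given by the >> 5 below is 4096 >> 5 == 128,
 * so dropping under 128 free objects triggers an
 * asynchronous freelist_populate().
 */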
1874 if (!mbuf_cached_above(class, wait) &&
1875 m_infree(class) < m_total(class) >> 5) {
1876 (void) freelist_populate(class, 1,
1877 M_DONTWAIT);
1878 }
1879 break;
1880 }
1881 } else {
1882 VERIFY(m_infree(class) == 0 || class == MC_CL);
1883
1884 (void) freelist_populate(class, 1,
1885 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1886
1887 if (m_infree(class) > 0)
1888 continue;
1889
1890 /* Check if there's anything at the cache layer */
1891 if (mbuf_cached_above(class, wait))
1892 break;
1893
1894 /* watchdog checkpoint */
1895 mbuf_watchdog();
1896
1897 /* We have nothing and cannot block; give up */
1898 if (wait & MCR_NOSLEEP) {
1899 if (!(wait & MCR_TRYHARD)) {
1900 m_fail_cnt(class)++;
1901 mbstat.m_drops++;
1902 break;
1903 }
1904 }
1905
1906 /*
1907 * If the freelist is still empty and the caller is
1908 * willing to be blocked, sleep on the wait channel
1909 * until an element is available. Otherwise, if
1910 * MCR_TRYHARD is set, do our best to satisfy the
1911 * request without having to go to sleep.
1912 */
1913 if (mbuf_worker_ready &&
1914 mbuf_sleep(class, need, wait))
1915 break;
1916
1917 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1918 }
1919 }
1920
1921 m_alloc_cnt(class) += num - need;
1922 lck_mtx_unlock(mbuf_mlock);
1923
1924 return (num - need);
1925 }
1926
1927 /*
1928 * Common de-allocator for rudimentary objects called by the CPU cache
1929 * layer when one or more elements need to be returned to the appropriate
1930 * global freelist.
1931 */
1932 static void
1933 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1934 {
1935 mbuf_class_t class = (mbuf_class_t)arg;
1936 mcache_obj_t *nlist;
1937 unsigned int num = 0;
1938 int w;
1939
1940 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1941
1942 lck_mtx_lock(mbuf_mlock);
1943
1944 for (;;) {
1945 nlist = list->obj_next;
1946 list->obj_next = NULL;
1947 slab_free(class, list);
1948 ++num;
1949 if ((list = nlist) == NULL)
1950 break;
1951 }
1952 m_free_cnt(class) += num;
1953
1954 if ((w = mb_waiters) > 0)
1955 mb_waiters = 0;
1956
1957 lck_mtx_unlock(mbuf_mlock);
1958
1959 if (w != 0)
1960 wakeup(mb_waitchan);
1961 }
1962
1963 /*
1964 * Common auditor for rudimentary objects called by the CPU cache layer
1965 * during an allocation or free request. For the former, this is called
1966 * after the objects are obtained from either the bucket or slab layer
1967 * and before they are returned to the caller. For the latter, this is
1968 * called immediately during free and before placing the objects into
1969 * the bucket or slab layer.
1970 */
1971 static void
1972 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1973 {
1974 mbuf_class_t class = (mbuf_class_t)arg;
1975 mcache_audit_t *mca;
1976
1977 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1978
1979 while (list != NULL) {
1980 lck_mtx_lock(mbuf_mlock);
1981 mca = mcl_audit_buf2mca(class, list);
1982
1983 /* Do the sanity checks */
1984 if (class == MC_MBUF) {
1985 mcl_audit_mbuf(mca, list, FALSE, alloc);
1986 ASSERT(mca->mca_uflags & MB_SCVALID);
1987 } else {
1988 mcl_audit_cluster(mca, list, m_maxsize(class),
1989 alloc, TRUE);
1990 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1991 }
1992 /* Record this transaction */
1993 if (mcltrace)
1994 mcache_buffer_log(mca, list, m_cache(class));
1995
1996 if (alloc)
1997 mca->mca_uflags |= MB_INUSE;
1998 else
1999 mca->mca_uflags &= ~MB_INUSE;
2000 /* Unpair the object (unconditionally) */
2001 mca->mca_uptr = NULL;
2002 lck_mtx_unlock(mbuf_mlock);
2003
2004 list = list->obj_next;
2005 }
2006 }
2007
2008 /*
2009 * Common notify routine for all caches. It is called by mcache when
2010 * one or more objects get freed. We use this indication to trigger
2011 * the wakeup of any sleeping threads so that they can retry their
2012 * allocation requests.
2013 */
2014 static void
2015 mbuf_slab_notify(void *arg, u_int32_t reason)
2016 {
2017 mbuf_class_t class = (mbuf_class_t)arg;
2018 int w;
2019
2020 ASSERT(MBUF_CLASS_VALID(class));
2021
2022 if (reason != MCN_RETRYALLOC)
2023 return;
2024
2025 lck_mtx_lock(mbuf_mlock);
2026 if ((w = mb_waiters) > 0) {
2027 m_notified(class)++;
2028 mb_waiters = 0;
2029 }
2030 lck_mtx_unlock(mbuf_mlock);
2031
2032 if (w != 0)
2033 wakeup(mb_waitchan);
2034 }
2035
2036 /*
2037 * Obtain object(s) from the composite class's freelist.
2038 */
2039 static unsigned int
2040 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2041 {
2042 unsigned int need = num;
2043 mcl_slab_t *sp, *clsp, *nsp;
2044 struct mbuf *m;
2045 mcache_obj_t **list = *plist;
2046 void *cl;
2047
2048 VERIFY(need > 0);
2049 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2050 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2051
2052 /* Get what we can from the freelist */
2053 while ((*list = m_cobjlist(class)) != NULL) {
2054 MRANGE(*list);
2055
2056 m = (struct mbuf *)*list;
2057 sp = slab_get(m);
2058 cl = m->m_ext.ext_buf;
2059 clsp = slab_get(cl);
2060 VERIFY(m->m_flags == M_EXT && cl != NULL);
2061 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2062
2063 if (class == MC_MBUF_CL) {
2064 VERIFY(clsp->sl_refcnt >= 1 &&
2065 clsp->sl_refcnt <= NCLPBG);
2066 } else {
2067 VERIFY(clsp->sl_refcnt == 1);
2068 }
2069
2070 if (class == MC_MBUF_16KCL) {
2071 int k;
2072 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2073 nsp = nsp->sl_next;
2074 /* Next slab must already be present */
2075 VERIFY(nsp != NULL);
2076 VERIFY(nsp->sl_refcnt == 1);
2077 }
2078 }
2079
2080 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2081 !MBUF_IN_MAP(m_cobjlist(class))) {
2082 slab_nextptr_panic(sp, m_cobjlist(class));
2083 /* NOTREACHED */
2084 }
2085 (*list)->obj_next = NULL;
2086 list = *plist = &(*list)->obj_next;
2087
2088 if (--need == 0)
2089 break;
2090 }
2091 m_infree(class) -= (num - need);
2092
2093 return (num - need);
2094 }
2095
2096 /*
2097 * Place object(s) back into a composite class's freelist.
2098 */
2099 static unsigned int
2100 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2101 {
2102 mcache_obj_t *o, *tail;
2103 unsigned int num = 0;
2104 struct mbuf *m, *ms;
2105 mcache_audit_t *mca = NULL;
2106 mcache_obj_t *ref_list = NULL;
2107 mcl_slab_t *clsp, *nsp;
2108 void *cl;
2109 mbuf_class_t cl_class;
2110
2111 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2112 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2113 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2114
2115 if (class == MC_MBUF_CL) {
2116 cl_class = MC_CL;
2117 } else if (class == MC_MBUF_BIGCL) {
2118 cl_class = MC_BIGCL;
2119 } else {
2120 VERIFY(class == MC_MBUF_16KCL);
2121 cl_class = MC_16KCL;
2122 }
2123
2124 o = tail = list;
2125
2126 while ((m = ms = (struct mbuf *)o) != NULL) {
2127 mcache_obj_t *rfa, *nexto = o->obj_next;
2128
2129 /* Do the mbuf sanity checks */
2130 if (mclaudit != NULL) {
2131 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2132 if (mclverify) {
2133 mcache_audit_free_verify(mca, m, 0,
2134 m_maxsize(MC_MBUF));
2135 }
2136 ms = (struct mbuf *)mca->mca_contents;
2137 }
2138
2139 /* Do the cluster sanity checks */
2140 cl = ms->m_ext.ext_buf;
2141 clsp = slab_get(cl);
2142 if (mclverify) {
2143 size_t size = m_maxsize(cl_class);
2144 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2145 (mcache_obj_t *)cl), cl, 0, size);
2146 }
2147 VERIFY(ms->m_type == MT_FREE);
2148 VERIFY(ms->m_flags == M_EXT);
2149 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2150 if (cl_class == MC_CL) {
2151 VERIFY(clsp->sl_refcnt >= 1 &&
2152 clsp->sl_refcnt <= NCLPBG);
2153 } else {
2154 VERIFY(clsp->sl_refcnt == 1);
2155 }
2156 if (cl_class == MC_16KCL) {
2157 int k;
2158 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2159 nsp = nsp->sl_next;
2160 /* Next slab must already be present */
2161 VERIFY(nsp != NULL);
2162 VERIFY(nsp->sl_refcnt == 1);
2163 }
2164 }
2165
2166 /*
2167 * If we're asked to purge, restore the actual mbuf using the
2168 * contents of the shadow structure (if auditing is enabled)
2169 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
2170 * about to return it and the attached cluster to their caches.
2171 */
2172 if (purged) {
2173 /* Restore constructed mbuf fields */
2174 if (mclaudit != NULL)
2175 mcl_audit_restore_mbuf(m, mca, TRUE);
2176
2177 MEXT_REF(m) = 0;
2178 MEXT_FLAGS(m) = 0;
2179
2180 rfa = (mcache_obj_t *)MEXT_RFA(m);
2181 rfa->obj_next = ref_list;
2182 ref_list = rfa;
2183 MEXT_RFA(m) = NULL;
2184
2185 m->m_type = MT_FREE;
2186 m->m_flags = m->m_len = 0;
2187 m->m_next = m->m_nextpkt = NULL;
2188
2189 /* Save mbuf fields and make auditing happy */
2190 if (mclaudit != NULL)
2191 mcl_audit_mbuf(mca, o, FALSE, FALSE);
2192
2193 VERIFY(m_total(class) > 0);
2194 m_total(class)--;
2195
2196 /* Free the mbuf */
2197 o->obj_next = NULL;
2198 slab_free(MC_MBUF, o);
2199
2200 /* And free the cluster */
2201 ((mcache_obj_t *)cl)->obj_next = NULL;
2202 if (class == MC_MBUF_CL)
2203 slab_free(MC_CL, cl);
2204 else if (class == MC_MBUF_BIGCL)
2205 slab_free(MC_BIGCL, cl);
2206 else
2207 slab_free(MC_16KCL, cl);
2208 }
2209
2210 ++num;
2211 tail = o;
2212 o = nexto;
2213 }
2214
2215 if (!purged) {
2216 tail->obj_next = m_cobjlist(class);
2217 m_cobjlist(class) = list;
2218 m_infree(class) += num;
2219 } else if (ref_list != NULL) {
2220 mcache_free_ext(ref_cache, ref_list);
2221 }
2222
2223 return (num);
2224 }
2225
2226 /*
2227 * Common allocator for composite objects called by the CPU cache layer
2228 * during an allocation request whenever there is no available element in
2229 * the bucket layer. It returns one or more composite elements from the
2230 * appropriate global freelist. If the freelist is empty, it will attempt
2231 * to obtain the rudimentary objects from their caches and construct them
2232 * into composite mbuf + cluster objects.
2233 */
2234 static unsigned int
2235 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2236 int wait)
2237 {
2238 mbuf_class_t class = (mbuf_class_t)arg;
2239 mbuf_class_t cl_class = 0;
2240 unsigned int num = 0, cnum = 0, want = needed;
2241 mcache_obj_t *ref_list = NULL;
2242 mcache_obj_t *mp_list = NULL;
2243 mcache_obj_t *clp_list = NULL;
2244 mcache_obj_t **list;
2245 struct ext_ref *rfa;
2246 struct mbuf *m;
2247 void *cl;
2248
2249 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2250 ASSERT(needed > 0);
2251
2252 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2253
2254 /* There should not be any slab for this class */
2255 VERIFY(m_slab_cnt(class) == 0 &&
2256 m_slablist(class).tqh_first == NULL &&
2257 m_slablist(class).tqh_last == NULL);
2258
2259 lck_mtx_lock(mbuf_mlock);
2260
2261 /* Try using the freelist first */
2262 num = cslab_alloc(class, plist, needed);
2263 list = *plist;
2264 if (num == needed) {
2265 m_alloc_cnt(class) += num;
2266 lck_mtx_unlock(mbuf_mlock);
2267 return (needed);
2268 }
2269
2270 lck_mtx_unlock(mbuf_mlock);
2271
2272 /*
2273 * We could not satisfy the request using the freelist alone;
2274 * allocate from the appropriate rudimentary caches and use
2275 * whatever we can get to construct the composite objects.
2276 */
2277 needed -= num;
2278
2279 /*
2280 * Mark these allocation requests as coming from a composite cache.
2281 * Also, if the caller is willing to be blocked, mark the request
2282 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2283 * slab layer waiting for the individual object when one or more
2284 * of the already-constructed composite objects are available.
2285 */
2286 wait |= MCR_COMP;
2287 if (!(wait & MCR_NOSLEEP))
2288 wait |= MCR_FAILOK;
2289
2290 /* allocate mbufs */
2291 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2292 if (needed == 0) {
2293 ASSERT(mp_list == NULL);
2294 goto fail;
2295 }
2296
2297 /* allocate clusters */
2298 if (class == MC_MBUF_CL) {
2299 cl_class = MC_CL;
2300 } else if (class == MC_MBUF_BIGCL) {
2301 cl_class = MC_BIGCL;
2302 } else {
2303 VERIFY(class == MC_MBUF_16KCL);
2304 cl_class = MC_16KCL;
2305 }
2306 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2307 if (needed == 0) {
2308 ASSERT(clp_list == NULL);
2309 goto fail;
2310 }
2311
2312 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2313 if (needed == 0) {
2314 ASSERT(ref_list == NULL);
2315 goto fail;
2316 }
2317
2318 /*
2319 * By this time "needed" is MIN(mbuf, cluster, ref). Any
2320 * leftovers will get freed accordingly before we return to the caller.
2321 */
2322 for (cnum = 0; cnum < needed; cnum++) {
2323 struct mbuf *ms;
2324
2325 m = ms = (struct mbuf *)mp_list;
2326 mp_list = mp_list->obj_next;
2327
2328 cl = clp_list;
2329 clp_list = clp_list->obj_next;
2330 ((mcache_obj_t *)cl)->obj_next = NULL;
2331
2332 rfa = (struct ext_ref *)ref_list;
2333 ref_list = ref_list->obj_next;
2334 ((mcache_obj_t *)rfa)->obj_next = NULL;
2335
2336 /*
2337 * If auditing is enabled, construct the shadow mbuf
2338 * in the audit structure instead of in the actual one.
2339 * mbuf_cslab_audit() will take care of restoring the
2340 * contents after the integrity check.
2341 */
2342 if (mclaudit != NULL) {
2343 mcache_audit_t *mca, *cl_mca;
2344
2345 lck_mtx_lock(mbuf_mlock);
2346 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2347 ms = ((struct mbuf *)mca->mca_contents);
2348 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2349
2350 /*
2351 * Pair them up. Note that this is done at the time
2352 * the mbuf+cluster objects are constructed. This
2353 * information should be treated as a "best effort"
2354 * debugging hint since more than one mbuf can refer
2355 * to a cluster. In that case, the cluster might not
2356 * be freed along with the mbuf it was paired with.
2357 */
2358 mca->mca_uptr = cl_mca;
2359 cl_mca->mca_uptr = mca;
2360
2361 ASSERT(mca->mca_uflags & MB_SCVALID);
2362 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2363 lck_mtx_unlock(mbuf_mlock);
2364
2365 /* Technically, they are in the freelist */
2366 if (mclverify) {
2367 size_t size;
2368
2369 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2370 m_maxsize(MC_MBUF));
2371
2372 if (class == MC_MBUF_CL)
2373 size = m_maxsize(MC_CL);
2374 else if (class == MC_MBUF_BIGCL)
2375 size = m_maxsize(MC_BIGCL);
2376 else
2377 size = m_maxsize(MC_16KCL);
2378
2379 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2380 size);
2381 }
2382 }
2383
2384 MBUF_INIT(ms, 0, MT_FREE);
2385 if (class == MC_MBUF_16KCL) {
2386 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2387 } else if (class == MC_MBUF_BIGCL) {
2388 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2389 } else {
2390 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2391 }
2392 VERIFY(ms->m_flags == M_EXT);
2393 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2394
2395 *list = (mcache_obj_t *)m;
2396 (*list)->obj_next = NULL;
2397 list = *plist = &(*list)->obj_next;
2398 }
2399
2400 fail:
2401 /*
2402 * Free up what's left of the above.
2403 */
2404 if (mp_list != NULL)
2405 mcache_free_ext(m_cache(MC_MBUF), mp_list);
2406 if (clp_list != NULL)
2407 mcache_free_ext(m_cache(cl_class), clp_list);
2408 if (ref_list != NULL)
2409 mcache_free_ext(ref_cache, ref_list);
2410
2411 lck_mtx_lock(mbuf_mlock);
2412 if (num > 0 || cnum > 0) {
2413 m_total(class) += cnum;
2414 VERIFY(m_total(class) <= m_maxlimit(class));
2415 m_alloc_cnt(class) += num + cnum;
2416 }
2417 if ((num + cnum) < want)
2418 m_fail_cnt(class) += (want - (num + cnum));
2419 lck_mtx_unlock(mbuf_mlock);
2420
2421 return (num + cnum);
2422 }
2423
2424 /*
2425 * Common de-allocator for composite objects called by the CPU cache
2426 * layer when one or more elements need to be returned to the appropriate
2427 * global freelist.
2428 */
2429 static void
2430 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2431 {
2432 mbuf_class_t class = (mbuf_class_t)arg;
2433 unsigned int num;
2434 int w;
2435
2436 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2437
2438 lck_mtx_lock(mbuf_mlock);
2439
2440 num = cslab_free(class, list, purged);
2441 m_free_cnt(class) += num;
2442
2443 if ((w = mb_waiters) > 0)
2444 mb_waiters = 0;
2445
2446 lck_mtx_unlock(mbuf_mlock);
2447
2448 if (w != 0)
2449 wakeup(mb_waitchan);
2450 }
2451
2452 /*
2453 * Common auditor for composite objects called by the CPU cache layer
2454 * during an allocation or free request. For the former, this is called
2455 * after the objects are obtained from either the bucket or slab layer
2456 * and before they are returned to the caller. For the latter, this is
2457 * called immediately during free and before placing the objects into
2458 * the bucket or slab layer.
2459 */
2460 static void
2461 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2462 {
2463 mbuf_class_t class = (mbuf_class_t)arg;
2464 mcache_audit_t *mca;
2465 struct mbuf *m, *ms;
2466 mcl_slab_t *clsp, *nsp;
2467 size_t size;
2468 void *cl;
2469
2470 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2471
2472 while ((m = ms = (struct mbuf *)list) != NULL) {
2473 lck_mtx_lock(mbuf_mlock);
2474 /* Do the mbuf sanity checks and record its transaction */
2475 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2476 mcl_audit_mbuf(mca, m, TRUE, alloc);
2477 if (mcltrace)
2478 mcache_buffer_log(mca, m, m_cache(class));
2479
2480 if (alloc)
2481 mca->mca_uflags |= MB_COMP_INUSE;
2482 else
2483 mca->mca_uflags &= ~MB_COMP_INUSE;
2484
2485 /*
2486 * Use the shadow mbuf in the audit structure if we are
2487 * freeing, since the contents of the actual mbuf have been
2488 * pattern-filled by the above call to mcl_audit_mbuf().
2489 */
2490 if (!alloc && mclverify)
2491 ms = (struct mbuf *)mca->mca_contents;
2492
2493 /* Do the cluster sanity checks and record its transaction */
2494 cl = ms->m_ext.ext_buf;
2495 clsp = slab_get(cl);
2496 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2497 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2498 if (class == MC_MBUF_CL)
2499 VERIFY(clsp->sl_refcnt >= 1 &&
2500 clsp->sl_refcnt <= NCLPBG);
2501 else
2502 VERIFY(clsp->sl_refcnt == 1);
2503
2504 if (class == MC_MBUF_16KCL) {
2505 int k;
2506 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2507 nsp = nsp->sl_next;
2508 /* Next slab must already be present */
2509 VERIFY(nsp != NULL);
2510 VERIFY(nsp->sl_refcnt == 1);
2511 }
2512 }
2513
2514 mca = mcl_audit_buf2mca(MC_CL, cl);
2515 if (class == MC_MBUF_CL)
2516 size = m_maxsize(MC_CL);
2517 else if (class == MC_MBUF_BIGCL)
2518 size = m_maxsize(MC_BIGCL);
2519 else
2520 size = m_maxsize(MC_16KCL);
2521 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2522 if (mcltrace)
2523 mcache_buffer_log(mca, cl, m_cache(class));
2524
2525 if (alloc)
2526 mca->mca_uflags |= MB_COMP_INUSE;
2527 else
2528 mca->mca_uflags &= ~MB_COMP_INUSE;
2529 lck_mtx_unlock(mbuf_mlock);
2530
2531 list = list->obj_next;
2532 }
2533 }
2534
2535 /*
2536 * Allocate some number of mbuf clusters and place on cluster freelist.
2537 */
2538 static int
2539 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2540 {
2541 int i;
2542 vm_size_t size = 0;
2543 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2544 vm_offset_t page = 0;
2545 mcache_audit_t *mca_list = NULL;
2546 mcache_obj_t *con_list = NULL;
2547 mcl_slab_t *sp;
2548
2549 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2550 bufsize == m_maxsize(MC_16KCL));
2551
2552 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2553
2554 /*
2555 * Multiple threads may attempt to populate the cluster map one
2556 * after another. Since we drop the lock below prior to acquiring
2557 * the physical page(s), our view of the cluster map may no longer
2558 * be accurate, and we could end up over-committing the pages beyond
2559 * the maximum allowed for each class. To prevent it, this entire
2560 * operation (including the page mapping) is serialized.
2561 */
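/*
 * In outline, the code below uses the classic busy-flag handshake
 * (a paraphrase of what follows, not additional logic):
 *
 *	while (mb_clalloc_busy)
 *		msleep(mb_clalloc_waitchan, mbuf_mlock, ...);
 *	mb_clalloc_busy = TRUE;
 *	... drop mbuf_mlock, grab and map the pages, retake the lock ...
 *	mb_clalloc_busy = FALSE;
 *	wakeup(mb_clalloc_waitchan);
 */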
2562 while (mb_clalloc_busy) {
2563 mb_clalloc_waiters++;
2564 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2565 (PZERO-1), "m_clalloc", NULL);
2566 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2567 }
2568
2569 /* We are busy now; tell everyone else to go away */
2570 mb_clalloc_busy = TRUE;
2571
2572 /*
2573 * Honor the caller's wish to block or not block. We have a way
2574 * to grow the pool asynchronously using the mbuf worker thread.
2575 */
2576 i = m_howmany(num, bufsize);
2577 if (i == 0 || (wait & M_DONTWAIT))
2578 goto out;
2579
2580 lck_mtx_unlock(mbuf_mlock);
2581
2582 size = round_page(i * bufsize);
2583 page = kmem_mb_alloc(mb_map, size, large_buffer);
2584
2585 /*
2586 * If we asked for "n" 16KB physically contiguous chunks
2587 * and didn't get them, try again without this
2588 * restriction.
2589 */
2590 if (large_buffer && page == 0)
2591 page = kmem_mb_alloc(mb_map, size, 0);
2592
2593 if (page == 0) {
2594 if (bufsize == m_maxsize(MC_BIGCL)) {
2595 /* If that failed, retry with a single page (4KB request only) */
2596 size = NBPG;
2597 page = kmem_mb_alloc(mb_map, size, 0);
2598 }
2599
2600 if (page == 0) {
2601 lck_mtx_lock(mbuf_mlock);
2602 goto out;
2603 }
2604 }
2605
2606 VERIFY(IS_P2ALIGNED(page, NBPG));
2607 numpages = size / NBPG;
2608
2609 /* If auditing is enabled, allocate the audit structures now */
2610 if (mclaudit != NULL) {
2611 int needed;
2612
2613 /*
2614 * Yes, I realize this is a waste of memory for clusters
2615 * that never get transformed into mbufs, as we may end
2616 * up with NMBPBG-1 unused audit structures per cluster.
2617 * But doing so tremendously simplifies the allocation
2618 * strategy, since at this point we are not holding the
2619 * mbuf lock and the caller is okay to be blocked.
2620 */
2621 if (bufsize == m_maxsize(MC_BIGCL)) {
2622 needed = numpages * NMBPBG;
2623
2624 i = mcache_alloc_ext(mcl_audit_con_cache,
2625 &con_list, needed, MCR_SLEEP);
2626
2627 VERIFY(con_list != NULL && i == needed);
2628 } else {
2629 needed = numpages / NSLABSP16KB;
2630 }
2631
2632 i = mcache_alloc_ext(mcache_audit_cache,
2633 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2634
2635 VERIFY(mca_list != NULL && i == needed);
2636 }
2637
2638 lck_mtx_lock(mbuf_mlock);
2639
2640 for (i = 0; i < numpages; i++, page += NBPG) {
2641 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2642 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2643 (vm_offset_t)page);
2644
2645 /*
2646 * If no mapper is available, the following code is a no-op
2647 * and returns the input page; if there is a mapper, the
2648 * appropriate I/O page is returned.
2649 */
2650 VERIFY(offset < mcl_pages);
2651 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2652 mcl_paddr[offset] = new_page << PGSHIFT;
2653
2654 /* Pattern-fill this fresh page */
2655 if (mclverify) {
2656 mcache_set_pattern(MCACHE_FREE_PATTERN,
2657 (caddr_t)page, NBPG);
2658 }
2659 if (bufsize == m_maxsize(MC_BIGCL)) {
2660 union mbigcluster *mbc = (union mbigcluster *)page;
2661
2662 /* One for the entire page */
2663 sp = slab_get(mbc);
2664 if (mclaudit != NULL) {
2665 mcl_audit_init(mbc, &mca_list, &con_list,
2666 AUDIT_CONTENTS_SIZE, NMBPBG);
2667 }
2668 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2669 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2670 mbc, mbc, bufsize, 0, 1);
2671
2672 /* Insert this slab */
2673 slab_insert(sp, MC_BIGCL);
2674
2675 /* Update stats now since slab_get() drops the lock */
2676 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2677 m_infree(MC_MBUF_BIGCL);
2678 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2679 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2680 } else if ((i % NSLABSP16KB) == 0) {
2681 union m16kcluster *m16kcl = (union m16kcluster *)page;
2682 mcl_slab_t *nsp;
2683 int k;
2684
2685 VERIFY(njcl > 0);
2686 /* One for the entire 16KB */
2687 sp = slab_get(m16kcl);
2688 if (mclaudit != NULL)
2689 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2690
2691 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2692 slab_init(sp, MC_16KCL, SLF_MAPPED,
2693 m16kcl, m16kcl, bufsize, 0, 1);
2694
2695 /*
2696 * 2nd-Nth page's slab is part of the first one,
2697 * where N is NSLABSP16KB.
2698 */
2699 for (k = 1; k < NSLABSP16KB; k++) {
2700 nsp = slab_get(((union mbigcluster *)page) + k);
2701 VERIFY(nsp->sl_refcnt == 0 &&
2702 nsp->sl_flags == 0);
2703 slab_init(nsp, MC_16KCL,
2704 SLF_MAPPED | SLF_PARTIAL,
2705 m16kcl, NULL, 0, 0, 0);
2706 }
2707
2708 /* Insert this slab */
2709 slab_insert(sp, MC_16KCL);
2710
2711 /* Update stats now since slab_get() drops the lock */
2712 m_infree(MC_16KCL)++;
2713 m_total(MC_16KCL)++;
2714 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2715 }
2716 }
2717 VERIFY(mca_list == NULL && con_list == NULL);
2718
2719 /* We're done; let others enter */
2720 mb_clalloc_busy = FALSE;
2721 if (mb_clalloc_waiters > 0) {
2722 mb_clalloc_waiters = 0;
2723 wakeup(mb_clalloc_waitchan);
2724 }
2725
2726 if (bufsize == m_maxsize(MC_BIGCL))
2727 return (numpages);
2728
2729 VERIFY(bufsize == m_maxsize(MC_16KCL));
2730 return (numpages / NSLABSP16KB);
2731
2732 out:
2733 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2734
2735 /* We're done; let others enter */
2736 mb_clalloc_busy = FALSE;
2737 if (mb_clalloc_waiters > 0) {
2738 mb_clalloc_waiters = 0;
2739 wakeup(mb_clalloc_waitchan);
2740 }
2741
2742 /*
2743 * When non-blocking, we kick the worker thread if we have to grow the
2744 * pool or if the number of free clusters is less than requested.
2745 */
2746 if (bufsize == m_maxsize(MC_BIGCL)) {
2747 if (i > 0) {
2748 /*
2749 * Remember total number of 4KB clusters needed
2750 * at this time.
2751 */
2752 i += m_total(MC_BIGCL);
2753 if (i > mbuf_expand_big) {
2754 mbuf_expand_big = i;
2755 if (mbuf_worker_ready)
2756 wakeup((caddr_t)&mbuf_worker_run);
2757 }
2758 }
2759
2760 if (m_infree(MC_BIGCL) >= num)
2761 return (1);
2762 } else {
2763 if (i > 0) {
2764 /*
2765 * Remember total number of 16KB clusters needed
2766 * at this time.
2767 */
2768 i += m_total(MC_16KCL);
2769 if (i > mbuf_expand_16k) {
2770 mbuf_expand_16k = i;
2771 if (mbuf_worker_ready)
2772 wakeup((caddr_t)&mbuf_worker_run);
2773 }
2774 }
2775
2776 if (m_infree(MC_16KCL) >= num)
2777 return (1);
2778 }
2779 return (0);
2780 }
2781
2782 /*
2783 * Populate the global freelist of the corresponding buffer class.
2784 */
2785 static int
2786 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2787 {
2788 mcache_obj_t *o = NULL;
2789 int i, numpages = 0, count;
2790
2791 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2792 class == MC_16KCL);
2793
2794 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2795
2796 switch (class) {
2797 case MC_MBUF:
2798 case MC_CL:
2799 case MC_BIGCL:
2800 numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2801 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2802
2803 /* Respect the 4KB clusters minimum limit */
2804 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2805 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2806 if (class != MC_BIGCL || (wait & MCR_COMP))
2807 return (0);
2808 }
2809 if (class == MC_BIGCL)
2810 return (i != 0);
2811 break;
2812
2813 case MC_16KCL:
2814 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2815 /* NOTREACHED */
2816
2817 default:
2818 VERIFY(0);
2819 /* NOTREACHED */
2820 }
2821
2822 VERIFY(class == MC_MBUF || class == MC_CL);
2823
2824 /* how many objects will we cut the page into? */
2825 int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2826
2827 for (count = 0; count < numpages; count++) {
2828
2829 /* respect totals, minlimit, maxlimit */
2830 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2831 m_total(class) >= m_maxlimit(class))
2832 break;
2833
2834 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2835 break;
2836
2837 struct mbuf *m = (struct mbuf *)o;
2838 union mcluster *c = (union mcluster *)o;
2839 mcl_slab_t *sp = slab_get(o);
2840 mcache_audit_t *mca = NULL;
2841
2842 VERIFY(slab_is_detached(sp) &&
2843 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2844
2845 /*
2846 * Make sure that the cluster is unmolested
2847 * while it is in the freelist
2848 */
2849 if (mclverify) {
2850 mca = mcl_audit_buf2mca(MC_BIGCL, o);
2851 mcache_audit_free_verify(mca, o, 0,
2852 m_maxsize(MC_BIGCL));
2853 }
2854
2855 /* Reinitialize it as an mbuf or 2K slab */
2856 slab_init(sp, class, sp->sl_flags,
2857 sp->sl_base, NULL, sp->sl_len, 0, numobj);
2858
2859 VERIFY(o == (mcache_obj_t *)sp->sl_base);
2860 VERIFY(sp->sl_head == NULL);
2861
2862 VERIFY(m_total(MC_BIGCL) > 0);
2863 m_total(MC_BIGCL)--;
2864 mbstat.m_bigclusters = m_total(MC_BIGCL);
2865
2866 m_total(class) += numobj;
2867 m_infree(class) += numobj;
2868
2869 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2870 VERIFY(m_total(class) <= m_maxlimit(class));
2871
2872 i = numobj;
2873 if (class == MC_MBUF) {
2874 mbstat.m_mbufs = m_total(MC_MBUF);
2875 mtype_stat_add(MT_FREE, NMBPBG);
2876 while (i--) {
2877 /*
2878 * If auditing is enabled, construct the
2879 * shadow mbuf in the audit structure
2880 * instead of the actual one.
2881 * mbuf_slab_audit() will take care of
2882 * restoring the contents after the
2883 * integrity check.
2884 */
2885 if (mclaudit != NULL) {
2886 struct mbuf *ms;
2887 mca = mcl_audit_buf2mca(MC_MBUF,
2888 (mcache_obj_t *)m);
2889 ms = ((struct mbuf *)
2890 mca->mca_contents);
2891 ms->m_type = MT_FREE;
2892 } else {
2893 m->m_type = MT_FREE;
2894 }
2895 m->m_next = sp->sl_head;
2896 sp->sl_head = (void *)m++;
2897 }
2898 } else { /* MC_CL */
2899 mbstat.m_clfree =
2900 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2901 mbstat.m_clusters = m_total(MC_CL);
2902 while (i--) {
2903 c->mcl_next = sp->sl_head;
2904 sp->sl_head = (void *)c++;
2905 }
2906 }
2907
2908 /* Insert into the mbuf or 2k slab list */
2909 slab_insert(sp, class);
2910
2911 if ((i = mb_waiters) > 0)
2912 mb_waiters = 0;
2913 if (i != 0)
2914 wakeup(mb_waitchan);
2915 }
2916 return (count != 0);
2917 }
2918
2919 /*
2920 * For each class, initialize the freelist to hold m_minlimit() objects.
2921 */
2922 static void
2923 freelist_init(mbuf_class_t class)
2924 {
2925 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2926
2927 VERIFY(class == MC_CL || class == MC_BIGCL);
2928 VERIFY(m_total(class) == 0);
2929 VERIFY(m_minlimit(class) > 0);
2930
2931 while (m_total(class) < m_minlimit(class))
2932 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
2933
2934 VERIFY(m_total(class) >= m_minlimit(class));
2935 }
2936
2937 /*
2938 * (Inaccurately) check if it might be worth a trip back to the
2939 * mcache layer due to the availability of objects there. We'll
2940 * end up back here if there's nothing up there.
2941 */
2942 static boolean_t
2943 mbuf_cached_above(mbuf_class_t class, int wait)
2944 {
2945 switch (class) {
2946 case MC_MBUF:
2947 if (wait & MCR_COMP)
2948 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2949 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2950 break;
2951
2952 case MC_CL:
2953 if (wait & MCR_COMP)
2954 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2955 break;
2956
2957 case MC_BIGCL:
2958 if (wait & MCR_COMP)
2959 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2960 break;
2961
2962 case MC_16KCL:
2963 if (wait & MCR_COMP)
2964 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2965 break;
2966
2967 case MC_MBUF_CL:
2968 case MC_MBUF_BIGCL:
2969 case MC_MBUF_16KCL:
2970 break;
2971
2972 default:
2973 VERIFY(0);
2974 /* NOTREACHED */
2975 }
2976
2977 return (!mcache_bkt_isempty(m_cache(class)));
2978 }
2979
2980 /*
2981 * If possible, convert constructed objects to raw ones.
2982 */
2983 static boolean_t
2984 mbuf_steal(mbuf_class_t class, unsigned int num)
2985 {
2986 mcache_obj_t *top = NULL;
2987 mcache_obj_t **list = &top;
2988 unsigned int tot = 0;
2989
2990 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2991
2992 switch (class) {
2993 case MC_MBUF:
2994 case MC_CL:
2995 case MC_BIGCL:
2996 case MC_16KCL:
2997 return (FALSE);
2998
2999 case MC_MBUF_CL:
3000 case MC_MBUF_BIGCL:
3001 case MC_MBUF_16KCL:
3002 /* Get the required number of constructed objects if possible */
3003 if (m_infree(class) > m_minlimit(class)) {
3004 tot = cslab_alloc(class, &list,
3005 MIN(num, m_infree(class)));
3006 }
3007
3008 /* And destroy them to get back the raw objects */
3009 if (top != NULL)
3010 (void) cslab_free(class, top, 1);
3011 break;
3012
3013 default:
3014 VERIFY(0);
3015 /* NOTREACHED */
3016 }
3017
3018 return (tot == num);
3019 }
3020
3021 static void
3022 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3023 {
3024 int m, bmap = 0;
3025
3026 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3027
3028 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3029 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3030 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3031
3032 /*
3033 * This logic can be made smarter; for now, simply mark
3034 * all other related classes as potential victims.
3035 */
3036 switch (class) {
3037 case MC_MBUF:
3038 m_wantpurge(MC_CL)++;
3039 m_wantpurge(MC_BIGCL)++;
3040 m_wantpurge(MC_MBUF_CL)++;
3041 m_wantpurge(MC_MBUF_BIGCL)++;
3042 break;
3043
3044 case MC_CL:
3045 m_wantpurge(MC_MBUF)++;
3046 m_wantpurge(MC_BIGCL)++;
3047 m_wantpurge(MC_MBUF_BIGCL)++;
3048 if (!comp)
3049 m_wantpurge(MC_MBUF_CL)++;
3050 break;
3051
3052 case MC_BIGCL:
3053 m_wantpurge(MC_MBUF)++;
3054 m_wantpurge(MC_CL)++;
3055 m_wantpurge(MC_MBUF_CL)++;
3056 if (!comp)
3057 m_wantpurge(MC_MBUF_BIGCL)++;
3058 break;
3059
3060 case MC_16KCL:
3061 if (!comp)
3062 m_wantpurge(MC_MBUF_16KCL)++;
3063 break;
3064
3065 default:
3066 VERIFY(0);
3067 /* NOTREACHED */
3068 }
3069
3070 /*
3071 * Run through each marked class and check if we really need to
3072 * purge (and therefore temporarily disable) the per-CPU caches
3073 * layer used by the class. If so, remember the classes since
3074 * we are going to drop the lock below prior to purging.
3075 */
3076 for (m = 0; m < NELEM(mbuf_table); m++) {
3077 if (m_wantpurge(m) > 0) {
3078 m_wantpurge(m) = 0;
3079 /*
3080 * Try hard to steal the required number of objects
3081 * from the freelist of other mbuf classes. Only
3082 * purge and disable the per-CPU caches layer when
3083 * we don't have enough; it's the last resort.
3084 */
3085 if (!mbuf_steal(m, num))
3086 bmap |= (1 << m);
3087 }
3088 }
3089
3090 lck_mtx_unlock(mbuf_mlock);
3091
3092 if (bmap != 0) {
3093 /* drain is performed in pfslowtimo(), to avoid deadlocks */
3094 do_reclaim = 1;
3095
3096 /* Sigh; we have no other choices but to ask mcache to purge */
3097 for (m = 0; m < NELEM(mbuf_table); m++) {
3098 if ((bmap & (1 << m)) &&
3099 mcache_purge_cache(m_cache(m))) {
3100 lck_mtx_lock(mbuf_mlock);
3101 m_purge_cnt(m)++;
3102 mbstat.m_drain++;
3103 lck_mtx_unlock(mbuf_mlock);
3104 }
3105 }
3106 } else {
3107 /*
3108 * Request mcache to reap extra elements from all of its caches;
3109 * note that all reaps are serialized and happen only at a fixed
3110 * interval.
3111 */
3112 mcache_reap();
3113 }
3114 lck_mtx_lock(mbuf_mlock);
3115 }
3116
3117 static inline struct mbuf *
3118 m_get_common(int wait, short type, int hdr)
3119 {
3120 struct mbuf *m;
3121 int mcflags = MSLEEPF(wait);
3122
3123 /* Is this due to a non-blocking retry? If so, then try harder */
3124 if (mcflags & MCR_NOSLEEP)
3125 mcflags |= MCR_TRYHARD;
3126
3127 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3128 if (m != NULL) {
3129 MBUF_INIT(m, hdr, type);
3130 mtype_stat_inc(type);
3131 mtype_stat_dec(MT_FREE);
3132 #if CONFIG_MACF_NET
3133 if (hdr && mac_init_mbuf(m, wait) != 0) {
3134 m_free(m);
3135 return (NULL);
3136 }
3137 #endif /* MAC_NET */
3138 }
3139 return (m);
3140 }
3141
3142 /*
3143 * Space allocation routines; these are also available as macros
3144 * for critical paths.
3145 */
3146 #define _M_GET(wait, type) m_get_common(wait, type, 0)
3147 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
3148 #define _M_RETRY(wait, type) _M_GET(wait, type)
3149 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
3150 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
3151 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
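/*
 * Typical usage from an in-kernel caller (a hypothetical sketch, not
 * code from this file):
 *
 *	struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *	if (m != NULL) {
 *		... copy data in via MTOD(m, caddr_t) and set m_len ...
 *		m_freem(m);	(or hand the chain off to the stack)
 *	}
 *
 * A non-blocking (M_DONTWAIT) allocation may fail and must be checked;
 * M_WAIT callers may block instead.
 */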
3152
3153 struct mbuf *
3154 m_get(int wait, int type)
3155 {
3156 return (_M_GET(wait, type));
3157 }
3158
3159 struct mbuf *
3160 m_gethdr(int wait, int type)
3161 {
3162 return (_M_GETHDR(wait, type));
3163 }
3164
3165 struct mbuf *
3166 m_retry(int wait, int type)
3167 {
3168 return (_M_RETRY(wait, type));
3169 }
3170
3171 struct mbuf *
3172 m_retryhdr(int wait, int type)
3173 {
3174 return (_M_RETRYHDR(wait, type));
3175 }
3176
3177 struct mbuf *
3178 m_getclr(int wait, int type)
3179 {
3180 struct mbuf *m;
3181
3182 _MGET(m, wait, type);
3183 if (m != NULL)
3184 bzero(MTOD(m, caddr_t), MLEN);
3185 return (m);
3186 }
3187
3188 struct mbuf *
3189 m_free(struct mbuf *m)
3190 {
3191 struct mbuf *n = m->m_next;
3192
3193 if (m->m_type == MT_FREE)
3194 panic("m_free: freeing an already freed mbuf");
3195
3196 /* Free the aux data and tags if there is any */
3197 if (m->m_flags & M_PKTHDR) {
3198 m_tag_delete_chain(m, NULL);
3199 }
3200
3201 if (m->m_flags & M_EXT) {
3202 u_int32_t refcnt;
3203 u_int32_t composite;
3204
3205 refcnt = m_decref(m);
3206 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3207 if (refcnt == 0 && !composite) {
3208 if (m->m_ext.ext_free == NULL) {
3209 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3210 } else if (m->m_ext.ext_free == m_bigfree) {
3211 mcache_free(m_cache(MC_BIGCL),
3212 m->m_ext.ext_buf);
3213 } else if (m->m_ext.ext_free == m_16kfree) {
3214 mcache_free(m_cache(MC_16KCL),
3215 m->m_ext.ext_buf);
3216 } else {
3217 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3218 m->m_ext.ext_size, m->m_ext.ext_arg);
3219 }
3220 mcache_free(ref_cache, MEXT_RFA(m));
3221 MEXT_RFA(m) = NULL;
3222 } else if (refcnt == 0 && composite) {
3223 VERIFY(m->m_type != MT_FREE);
3224
3225 mtype_stat_dec(m->m_type);
3226 mtype_stat_inc(MT_FREE);
3227
3228 m->m_type = MT_FREE;
3229 m->m_flags = M_EXT;
3230 m->m_len = 0;
3231 m->m_next = m->m_nextpkt = NULL;
3232
3233 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3234
3235 /* "Free" into the intermediate cache */
3236 if (m->m_ext.ext_free == NULL) {
3237 mcache_free(m_cache(MC_MBUF_CL), m);
3238 } else if (m->m_ext.ext_free == m_bigfree) {
3239 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3240 } else {
3241 VERIFY(m->m_ext.ext_free == m_16kfree);
3242 mcache_free(m_cache(MC_MBUF_16KCL), m);
3243 }
3244 return (n);
3245 }
3246 }
3247
3248 if (m->m_type != MT_FREE) {
3249 mtype_stat_dec(m->m_type);
3250 mtype_stat_inc(MT_FREE);
3251 }
3252
3253 m->m_type = MT_FREE;
3254 m->m_flags = m->m_len = 0;
3255 m->m_next = m->m_nextpkt = NULL;
3256
3257 mcache_free(m_cache(MC_MBUF), m);
3258
3259 return (n);
3260 }
3261
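/*
 * m_clattach() attaches a caller-supplied external buffer to an mbuf,
 * allocating the mbuf and/or the ext_ref structure as needed.  A
 * hypothetical usage sketch (driver_buf, driver_buf_free and
 * DRIVER_BUFSZ are illustrative names, not part of this file):
 *
 *	static void
 *	driver_buf_free(caddr_t buf, u_int size, caddr_t arg)
 *	{
 *		... return buf to the driver's private pool ...
 *	}
 *
 *	m = m_clattach(NULL, MT_DATA, driver_buf, driver_buf_free,
 *	    DRIVER_BUFSZ, NULL, M_DONTWAIT);
 *
 * When handed an mbuf that already carries a cluster, the code below
 * first drops that reference (reusing the ext_ref structure when it
 * can) before wiring up the new buffer via MEXT_INIT().
 */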
3262 __private_extern__ struct mbuf *
3263 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3264 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3265 int wait)
3266 {
3267 struct ext_ref *rfa = NULL;
3268
3269 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3270 return (NULL);
3271
3272 if (m->m_flags & M_EXT) {
3273 u_int32_t refcnt;
3274 u_int32_t composite;
3275
3276 refcnt = m_decref(m);
3277 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3278 if (refcnt == 0 && !composite) {
3279 if (m->m_ext.ext_free == NULL) {
3280 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3281 } else if (m->m_ext.ext_free == m_bigfree) {
3282 mcache_free(m_cache(MC_BIGCL),
3283 m->m_ext.ext_buf);
3284 } else if (m->m_ext.ext_free == m_16kfree) {
3285 mcache_free(m_cache(MC_16KCL),
3286 m->m_ext.ext_buf);
3287 } else {
3288 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3289 m->m_ext.ext_size, m->m_ext.ext_arg);
3290 }
3291 /* Re-use the reference structure */
3292 rfa = MEXT_RFA(m);
3293 } else if (refcnt == 0 && composite) {
3294 VERIFY(m->m_type != MT_FREE);
3295
3296 mtype_stat_dec(m->m_type);
3297 mtype_stat_inc(MT_FREE);
3298
3299 m->m_type = MT_FREE;
3300 m->m_flags = M_EXT;
3301 m->m_len = 0;
3302 m->m_next = m->m_nextpkt = NULL;
3303
3304 MEXT_FLAGS(m) &= ~EXTF_READONLY;
3305
3306 /* "Free" into the intermediate cache */
3307 if (m->m_ext.ext_free == NULL) {
3308 mcache_free(m_cache(MC_MBUF_CL), m);
3309 } else if (m->m_ext.ext_free == m_bigfree) {
3310 mcache_free(m_cache(MC_MBUF_BIGCL), m);
3311 } else {
3312 VERIFY(m->m_ext.ext_free == m_16kfree);
3313 mcache_free(m_cache(MC_MBUF_16KCL), m);
3314 }
3315 /*
3316 * Allocate a new mbuf, since we didn't divorce
3317 * the composite mbuf + cluster pair above.
3318 */
3319 if ((m = _M_GETHDR(wait, type)) == NULL)
3320 return (NULL);
3321 }
3322 }
3323
3324 if (rfa == NULL &&
3325 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3326 m_free(m);
3327 return (NULL);
3328 }
3329
3330 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3331
3332 return (m);
3333 }
3334
3335 /*
3336 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
3337 * clusters. (If the cache is empty, new clusters are allocated en masse.)
3338 */
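/*
 * A minimal usage sketch (hypothetical caller, not code from this file):
 *
 *	struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *	if (m != NULL) {
 *		... up to MCLBYTES of data fit at MTOD(m, caddr_t) ...
 *		m_freem(m);
 *	}
 *
 * Because the mbuf and cluster come from the composite MC_MBUF_CL cache
 * as a pre-constructed pair, this is cheaper than calling m_gethdr()
 * followed by m_mclget().
 */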
3339 struct mbuf *
3340 m_getcl(int wait, int type, int flags)
3341 {
3342 struct mbuf *m;
3343 int mcflags = MSLEEPF(wait);
3344 int hdr = (flags & M_PKTHDR);
3345
3346 /* Is this due to a non-blocking retry? If so, then try harder */
3347 if (mcflags & MCR_NOSLEEP)
3348 mcflags |= MCR_TRYHARD;
3349
3350 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3351 if (m != NULL) {
3352 u_int32_t flag;
3353 struct ext_ref *rfa;
3354 void *cl;
3355
3356 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3357 cl = m->m_ext.ext_buf;
3358 rfa = MEXT_RFA(m);
3359
3360 ASSERT(cl != NULL && rfa != NULL);
3361 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3362
3363 flag = MEXT_FLAGS(m);
3364
3365 MBUF_INIT(m, hdr, type);
3366 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3367
3368 mtype_stat_inc(type);
3369 mtype_stat_dec(MT_FREE);
3370 #if CONFIG_MACF_NET
3371 if (hdr && mac_init_mbuf(m, wait) != 0) {
3372 m_freem(m);
3373 return (NULL);
3374 }
3375 #endif /* MAC_NET */
3376 }
3377 return (m);
3378 }
3379
3380 /* m_mclget() adds an mbuf cluster to a normal mbuf */
3381 struct mbuf *
3382 m_mclget(struct mbuf *m, int wait)
3383 {
3384 struct ext_ref *rfa;
3385
3386 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3387 return (m);
3388
3389 m->m_ext.ext_buf = m_mclalloc(wait);
3390 if (m->m_ext.ext_buf != NULL) {
3391 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3392 } else {
3393 mcache_free(ref_cache, rfa);
3394 }
3395 return (m);
3396 }
3397
3398 /* Allocate an mbuf cluster */
3399 caddr_t
3400 m_mclalloc(int wait)
3401 {
3402 int mcflags = MSLEEPF(wait);
3403
3404 /* Is this due to a non-blocking retry? If so, then try harder */
3405 if (mcflags & MCR_NOSLEEP)
3406 mcflags |= MCR_TRYHARD;
3407
3408 return (mcache_alloc(m_cache(MC_CL), mcflags));
3409 }
3410
3411 /* Free an mbuf cluster */
3412 void
3413 m_mclfree(caddr_t p)
3414 {
3415 mcache_free(m_cache(MC_CL), p);
3416 }
3417
3418 /*
3419 * m_mclhasreference() checks if a cluster of an mbuf is referenced by
3420 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3421 */
3422 int
3423 m_mclhasreference(struct mbuf *m)
3424 {
3425 if (!(m->m_flags & M_EXT))
3426 return (0);
3427
3428 ASSERT(MEXT_RFA(m) != NULL);
3429
3430 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3431 }
3432
3433 __private_extern__ caddr_t
3434 m_bigalloc(int wait)
3435 {
3436 int mcflags = MSLEEPF(wait);
3437
3438 /* Is this due to a non-blocking retry? If so, then try harder */
3439 if (mcflags & MCR_NOSLEEP)
3440 mcflags |= MCR_TRYHARD;
3441
3442 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3443 }
3444
3445 __private_extern__ void
3446 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3447 {
3448 mcache_free(m_cache(MC_BIGCL), p);
3449 }
3450
3451 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
3452 __private_extern__ struct mbuf *
3453 m_mbigget(struct mbuf *m, int wait)
3454 {
3455 struct ext_ref *rfa;
3456
3457 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3458 return (m);
3459
3460 m->m_ext.ext_buf = m_bigalloc(wait);
3461 if (m->m_ext.ext_buf != NULL) {
3462 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3463 } else {
3464 mcache_free(ref_cache, rfa);
3465 }
3466 return (m);
3467 }
3468
3469 __private_extern__ caddr_t
3470 m_16kalloc(int wait)
3471 {
3472 int mcflags = MSLEEPF(wait);
3473
3474 /* Is this due to a non-blocking retry? If so, then try harder */
3475 if (mcflags & MCR_NOSLEEP)
3476 mcflags |= MCR_TRYHARD;
3477
3478 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3479 }
3480
3481 __private_extern__ void
3482 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3483 {
3484 mcache_free(m_cache(MC_16KCL), p);
3485 }
3486
3487 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
3488 __private_extern__ struct mbuf *
3489 m_m16kget(struct mbuf *m, int wait)
3490 {
3491 struct ext_ref *rfa;
3492
3493 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3494 return (m);
3495
3496 m->m_ext.ext_buf = m_16kalloc(wait);
3497 if (m->m_ext.ext_buf != NULL) {
3498 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3499 } else {
3500 mcache_free(ref_cache, rfa);
3501 }
3502 return (m);
3503 }
3504
3505 /*
3506 * "Move" mbuf pkthdr from "from" to "to".
3507 * "from" must have M_PKTHDR set, and "to" must be empty.
3508 */
3509 void
3510 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3511 {
3512 /* We will be taking over the tags of 'to' */
3513 if (to->m_flags & M_PKTHDR)
3514 m_tag_delete_chain(to, NULL);
3515 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3516 m_tag_init(from); /* purge tags from src */
3517 m_prio_init(from); /* reset priority from src */
3518 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3519 if ((to->m_flags & M_EXT) == 0)
3520 to->m_data = to->m_pktdat;
3521 }
3522
3523 /*
3524 * Duplicate "from"'s mbuf pkthdr in "to".
3525 * "from" must have M_PKTHDR set, and "to" must be empty.
3526 * In particular, this does a deep copy of the packet tags.
3527 */
3528 static int
3529 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3530 {
3531 if (to->m_flags & M_PKTHDR)
3532 m_tag_delete_chain(to, NULL);
3533 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3534 if ((to->m_flags & M_EXT) == 0)
3535 to->m_data = to->m_pktdat;
3536 to->m_pkthdr = from->m_pkthdr;
3537 m_tag_init(to);
3538 return (m_tag_copy_chain(to, from, how));
3539 }
3540
3541 /*
3542 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3543 * if wantall is not set, return whatever number was available. Set up the
3544 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3545 * are chained on the m_nextpkt field. Any packets requested beyond this
3546 * are chained onto the last packet header's m_next field. The size of
3547 * the cluster is controlled by the parameter bufsize.
3548 */
3549 __private_extern__ struct mbuf *
3550 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3551 int wait, int wantall, size_t bufsize)
3552 {
3553 struct mbuf *m;
3554 struct mbuf **np, *top;
3555 unsigned int pnum, needed = *num_needed;
3556 mcache_obj_t *mp_list = NULL;
3557 int mcflags = MSLEEPF(wait);
3558 u_int32_t flag;
3559 struct ext_ref *rfa;
3560 mcache_t *cp;
3561 void *cl;
3562
3563 ASSERT(bufsize == m_maxsize(MC_CL) ||
3564 bufsize == m_maxsize(MC_BIGCL) ||
3565 bufsize == m_maxsize(MC_16KCL));
3566
3567 /*
3568 * Caller must first check for njcl because this
3569 * routine is internal and not exposed/used via KPI.
3570 */
3571 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3572
3573 top = NULL;
3574 np = &top;
3575 pnum = 0;
3576
3577 /*
3578 * If the caller doesn't insist on getting all the requested buffers,
3579 * or doesn't want to block, try hard to get what we can without
3580 * sleeping. This effectively overrides MCR_SLEEP, since this thread
3581 * will not go to sleep if we can't get all the buffers.
3582 */
3583 if (!wantall || (mcflags & MCR_NOSLEEP))
3584 mcflags |= MCR_TRYHARD;
3585
3586 /* Allocate the composite mbuf + cluster elements from the cache */
3587 if (bufsize == m_maxsize(MC_CL))
3588 cp = m_cache(MC_MBUF_CL);
3589 else if (bufsize == m_maxsize(MC_BIGCL))
3590 cp = m_cache(MC_MBUF_BIGCL);
3591 else
3592 cp = m_cache(MC_MBUF_16KCL);
3593 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3594
3595 for (pnum = 0; pnum < needed; pnum++) {
3596 m = (struct mbuf *)mp_list;
3597 mp_list = mp_list->obj_next;
3598
3599 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3600 cl = m->m_ext.ext_buf;
3601 rfa = MEXT_RFA(m);
3602
3603 ASSERT(cl != NULL && rfa != NULL);
3604 VERIFY(MBUF_IS_COMPOSITE(m));
3605
3606 flag = MEXT_FLAGS(m);
3607
3608 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3609 if (bufsize == m_maxsize(MC_16KCL)) {
3610 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3611 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3612 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3613 } else {
3614 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3615 }
3616
3617 if (num_with_pkthdrs > 0) {
3618 --num_with_pkthdrs;
3619 #if CONFIG_MACF_NET
3620 if (mac_mbuf_label_init(m, wait) != 0) {
3621 m_freem(m);
3622 break;
3623 }
3624 #endif /* CONFIG_MACF_NET */
3625 }
3626
3627 *np = m;
3628 if (num_with_pkthdrs > 0)
3629 np = &m->m_nextpkt;
3630 else
3631 np = &m->m_next;
3632 }
3633 ASSERT(pnum != *num_needed || mp_list == NULL);
3634 if (mp_list != NULL)
3635 mcache_free_ext(cp, mp_list);
3636
3637 if (pnum > 0) {
3638 mtype_stat_add(MT_DATA, pnum);
3639 mtype_stat_sub(MT_FREE, pnum);
3640 }
3641
3642 if (wantall && (pnum != *num_needed)) {
3643 if (top != NULL)
3644 m_freem_list(top);
3645 return (NULL);
3646 }
3647
3648 *num_needed = pnum;
3649 return (top);
3650 }
3651
3652 /*
3653 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3654 * wantall is not set, return whatever number were available. The size of
3655 * each mbuf in the list is controlled by the parameter packetlen. Each
3656 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3657 * in the chain is called a segment. If maxsegments is not null and the
3658 * value pointed to is not zero, it specifies the maximum number of
3659 * segments for a chain of mbufs. If maxsegments is null or the value
3660 * pointed to is zero, the caller has no restriction on the number of
3661 * segments. The actual number of segments of an mbuf chain is returned
3662 * in the value pointed to by maxsegments.
3663 */
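/*
 * Illustrative sketch (hypothetical values, not from the original source):
 * a caller might ask m_allocpacket_internal() for 16 packets of 3000 bytes
 * each, capped at 4 segments per chain. Both npkt and maxseg are in/out:
 * on return they hold the number of packets built and the segments used
 * per chain, respectively.
 *
 *	unsigned int npkt = 16;
 *	unsigned int maxseg = 4;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&npkt, 3000, &maxseg,
 *	    M_DONTWAIT, 0, 0);
 *	if (list == NULL)
 *		return (ENOBUFS);
 */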
3664 __private_extern__ struct mbuf *
3665 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3666 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3667 {
3668 struct mbuf **np, *top, *first = NULL;
3669 size_t bufsize, r_bufsize;
3670 unsigned int num = 0;
3671 unsigned int nsegs = 0;
3672 unsigned int needed, resid;
3673 int mcflags = MSLEEPF(wait);
3674 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3675 mcache_t *cp = NULL, *rcp = NULL;
3676
3677 if (*numlist == 0)
3678 return (NULL);
3679
3680 top = NULL;
3681 np = &top;
3682
3683 if (wantsize == 0) {
3684 if (packetlen <= MINCLSIZE) {
3685 bufsize = packetlen;
3686 } else if (packetlen > m_maxsize(MC_CL)) {
3687 /* Use 4KB if jumbo cluster pool isn't available */
3688 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3689 bufsize = m_maxsize(MC_BIGCL);
3690 else
3691 bufsize = m_maxsize(MC_16KCL);
3692 } else {
3693 bufsize = m_maxsize(MC_CL);
3694 }
3695 } else if (wantsize == m_maxsize(MC_CL) ||
3696 wantsize == m_maxsize(MC_BIGCL) ||
3697 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3698 bufsize = wantsize;
3699 } else {
3700 return (NULL);
3701 }
3702
3703 if (bufsize <= MHLEN) {
3704 nsegs = 1;
3705 } else if (bufsize <= MINCLSIZE) {
3706 if (maxsegments != NULL && *maxsegments == 1) {
3707 bufsize = m_maxsize(MC_CL);
3708 nsegs = 1;
3709 } else {
3710 nsegs = 2;
3711 }
3712 } else if (bufsize == m_maxsize(MC_16KCL)) {
3713 VERIFY(njcl > 0);
3714 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3715 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3716 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3717 } else {
3718 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3719 }
3720 if (maxsegments != NULL) {
3721 if (*maxsegments && nsegs > *maxsegments) {
3722 *maxsegments = nsegs;
3723 return (NULL);
3724 }
3725 *maxsegments = nsegs;
3726 }
3727
3728 /*
3729 * If the caller doesn't insist on getting all the requested buffers,
3730 * or doesn't want to block, try hard to get what we can without
3731 * sleeping. This effectively overrides MCR_SLEEP, since this thread
3732 * will not go to sleep if we can't get all the buffers.
3733 */
3734 if (!wantall || (mcflags & MCR_NOSLEEP))
3735 mcflags |= MCR_TRYHARD;
3736
3737 /*
3738 * Simple case where all elements in the lists/chains are mbufs.
3739 * Unless bufsize is greater than MHLEN, each segment chain is made
3740 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3741 * of 2 mbufs; the second one is used for the residual data, i.e.
3742 * the remaining data that cannot fit into the first mbuf.
3743 */
3744 if (bufsize <= MINCLSIZE) {
3745 /* Allocate the elements in one shot from the mbuf cache */
3746 ASSERT(bufsize <= MHLEN || nsegs == 2);
3747 cp = m_cache(MC_MBUF);
3748 needed = mcache_alloc_ext(cp, &mp_list,
3749 (*numlist) * nsegs, mcflags);
3750
3751 /*
3752 * The number of elements must be even if we are to use an
3753 * mbuf (instead of a cluster) to store the residual data.
3754 * If we couldn't allocate the requested number of mbufs,
3755 * trim the number down (if it's odd) in order to avoid
3756 * creating a partial segment chain.
3757 */
3758 if (bufsize > MHLEN && (needed & 0x1))
3759 needed--;
3760
3761 while (num < needed) {
3762 struct mbuf *m;
3763
3764 m = (struct mbuf *)mp_list;
3765 mp_list = mp_list->obj_next;
3766 ASSERT(m != NULL);
3767
3768 MBUF_INIT(m, 1, MT_DATA);
3769 #if CONFIG_MACF_NET
3770 if (mac_init_mbuf(m, wait) != 0) {
3771 m_free(m);
3772 break;
3773 }
3774 #endif /* CONFIG_MACF_NET */
3775 num++;
3776 if (bufsize > MHLEN) {
3777 /* A second mbuf for this segment chain */
3778 m->m_next = (struct mbuf *)mp_list;
3779 mp_list = mp_list->obj_next;
3780 ASSERT(m->m_next != NULL);
3781
3782 MBUF_INIT(m->m_next, 0, MT_DATA);
3783 num++;
3784 }
3785 *np = m;
3786 np = &m->m_nextpkt;
3787 }
3788 ASSERT(num != *numlist || mp_list == NULL);
3789
3790 if (num > 0) {
3791 mtype_stat_add(MT_DATA, num);
3792 mtype_stat_sub(MT_FREE, num);
3793 }
3794 num /= nsegs;
3795
3796 /* We've got them all; return to caller */
3797 if (num == *numlist)
3798 return (top);
3799
3800 goto fail;
3801 }
3802
3803 /*
3804 * Complex cases where elements are made up of one or more composite
3805 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3806 * be illustrated as follows:
3807 *
3808 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3809 *
3810 * Every composite mbuf + cluster element comes from the intermediate
3811 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3812 * the last composite element will come from the MC_MBUF_CL cache,
3813 * unless the residual data is larger than 2KB, in which case we use the
3814 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3815 * data is defined as extra data beyond the first element that cannot
3816 * fit into the previous element, i.e. there is no residual data if
3817 * the chain only has 1 segment.
3818 */
3819 r_bufsize = bufsize;
3820 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3821 if (resid > 0) {
3822 /* There is residual data; figure out the cluster size */
3823 if (wantsize == 0 && packetlen > MINCLSIZE) {
3824 /*
3825 * Caller didn't request that all of the segments
3826 * in the chain use the same cluster size; pick the
3827 * smallest cluster size that can hold the residual data.
3828 */
3829 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3830 r_bufsize = m_maxsize(MC_16KCL);
3831 else if (resid > m_maxsize(MC_CL))
3832 r_bufsize = m_maxsize(MC_BIGCL);
3833 else
3834 r_bufsize = m_maxsize(MC_CL);
3835 } else {
3836 /* Use the same cluster size as the other segments */
3837 resid = 0;
3838 }
3839 }
3840
3841 needed = *numlist;
3842 if (resid > 0) {
3843 /*
3844 * Attempt to allocate composite mbuf + cluster elements for
3845 * the residual data in each chain; record the number of such
3846 * elements that can be allocated so that we know how many
3847 * segment chains we can afford to create.
3848 */
3849 if (r_bufsize <= m_maxsize(MC_CL))
3850 rcp = m_cache(MC_MBUF_CL);
3851 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3852 rcp = m_cache(MC_MBUF_BIGCL);
3853 else
3854 rcp = m_cache(MC_MBUF_16KCL);
3855 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3856
3857 if (needed == 0)
3858 goto fail;
3859
3860 /* This is temporarily reduced for calculation */
3861 ASSERT(nsegs > 1);
3862 nsegs--;
3863 }
3864
3865 /*
3866 * Attempt to allocate the rest of the composite mbuf + cluster
3867 * elements for the number of segment chains that we need.
3868 */
3869 if (bufsize <= m_maxsize(MC_CL))
3870 cp = m_cache(MC_MBUF_CL);
3871 else if (bufsize <= m_maxsize(MC_BIGCL))
3872 cp = m_cache(MC_MBUF_BIGCL);
3873 else
3874 cp = m_cache(MC_MBUF_16KCL);
3875 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3876
3877 /* Round it down to avoid creating a partial segment chain */
3878 needed = (needed / nsegs) * nsegs;
3879 if (needed == 0)
3880 goto fail;
3881
3882 if (resid > 0) {
3883 /*
3884 * We're about to construct the chain(s); take into account
3885 * the number of segments we have created above to hold the
3886 * residual data for each chain, as well as restore the
3887 * original count of segments per chain.
3888 */
3889 ASSERT(nsegs > 0);
3890 needed += needed / nsegs;
3891 nsegs++;
3892 }
3893
3894 for (;;) {
3895 struct mbuf *m;
3896 u_int32_t flag;
3897 struct ext_ref *rfa;
3898 void *cl;
3899 int pkthdr;
3900
3901 ++num;
3902 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3903 m = (struct mbuf *)mp_list;
3904 mp_list = mp_list->obj_next;
3905 } else {
3906 m = (struct mbuf *)rmp_list;
3907 rmp_list = rmp_list->obj_next;
3908 }
3909 ASSERT(m != NULL);
3910 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3911 VERIFY(m->m_ext.ext_free == NULL ||
3912 m->m_ext.ext_free == m_bigfree ||
3913 m->m_ext.ext_free == m_16kfree);
3914
3915 cl = m->m_ext.ext_buf;
3916 rfa = MEXT_RFA(m);
3917
3918 ASSERT(cl != NULL && rfa != NULL);
3919 VERIFY(MBUF_IS_COMPOSITE(m));
3920
3921 flag = MEXT_FLAGS(m);
3922
3923 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3924 if (pkthdr)
3925 first = m;
3926 MBUF_INIT(m, pkthdr, MT_DATA);
3927 if (m->m_ext.ext_free == m_16kfree) {
3928 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3929 } else if (m->m_ext.ext_free == m_bigfree) {
3930 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3931 } else {
3932 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3933 }
3934 #if CONFIG_MACF_NET
3935 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3936 --num;
3937 m_freem(m);
3938 break;
3939 }
3940 #endif /* CONFIG_MACF_NET */
3941
3942 *np = m;
3943 if ((num % nsegs) == 0)
3944 np = &first->m_nextpkt;
3945 else
3946 np = &m->m_next;
3947
3948 if (num == needed)
3949 break;
3950 }
3951
3952 if (num > 0) {
3953 mtype_stat_add(MT_DATA, num);
3954 mtype_stat_sub(MT_FREE, num);
3955 }
3956
3957 num /= nsegs;
3958
3959 /* We've got them all; return to caller */
3960 if (num == *numlist) {
3961 ASSERT(mp_list == NULL && rmp_list == NULL);
3962 return (top);
3963 }
3964
3965 fail:
3966 /* Free up what's left of the above */
3967 if (mp_list != NULL)
3968 mcache_free_ext(cp, mp_list);
3969 if (rmp_list != NULL)
3970 mcache_free_ext(rcp, rmp_list);
3971 if (wantall && top != NULL) {
3972 m_freem(top);
3973 return (NULL);
3974 }
3975 *numlist = num;
3976 return (top);
3977 }
3978
3979 /*
3980 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3981 * packets on the receive ring.
3982 */
3983 __private_extern__ struct mbuf *
3984 m_getpacket_how(int wait)
3985 {
3986 unsigned int num_needed = 1;
3987
3988 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3989 m_maxsize(MC_CL)));
3990 }
3991
3992 /*
3993 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3994 * packets on the receive ring.
3995 */
3996 struct mbuf *
3997 m_getpacket(void)
3998 {
3999 unsigned int num_needed = 1;
4000
4001 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4002 m_maxsize(MC_CL)));
4003 }
4004
4005 /*
4006 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
4007 * if this can't be met, return whatever number were available. Set up the
4008 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
4009 * are chained on the m_nextpkt field. Any packets requested beyond this are
4010 * chained onto the last packet header's m_next field.
4011 */
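/*
 * Illustrative sketch (not taken from any particular driver): refill a
 * receive ring with a batch of 2KB-cluster packets in one call; the count
 * of 32 and the descriptor-attach step are hypothetical.
 *
 *	int nwanted = 32;
 *	struct mbuf *m, *next;
 *
 *	m = m_getpackets(nwanted, nwanted, M_DONTWAIT);
 *	while (m != NULL) {
 *		next = m->m_nextpkt;
 *		m->m_nextpkt = NULL;
 *		(attach m to a receive descriptor here)
 *		m = next;
 *	}
 */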
4012 struct mbuf *
4013 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4014 {
4015 unsigned int n = num_needed;
4016
4017 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4018 m_maxsize(MC_CL)));
4019 }
4020
4021 /*
4022 * Return a list of mbuf hdrs set up as packet hdrs chained together
4023 * on the m_nextpkt field.
4024 */
4025 struct mbuf *
4026 m_getpackethdrs(int num_needed, int how)
4027 {
4028 struct mbuf *m;
4029 struct mbuf **np, *top;
4030
4031 top = NULL;
4032 np = &top;
4033
4034 while (num_needed--) {
4035 m = _M_RETRYHDR(how, MT_DATA);
4036 if (m == NULL)
4037 break;
4038
4039 *np = m;
4040 np = &m->m_nextpkt;
4041 }
4042
4043 return (top);
4044 }
4045
4046 /*
4047 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
4048 * of packets freed. Used by the drivers.
4049 */
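/*
 * Illustrative sketch (hypothetical names): a driver reclaiming completed
 * transmit buffers can link them through m_nextpkt and release the whole
 * batch, including each packet's m_next chain, with a single call.
 *
 *	int ntxdone;
 *
 *	ntxdone = m_freem_list(done_head);
 *	done_head = NULL;
 */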
4050 int
4051 m_freem_list(struct mbuf *m)
4052 {
4053 struct mbuf *nextpkt;
4054 mcache_obj_t *mp_list = NULL;
4055 mcache_obj_t *mcl_list = NULL;
4056 mcache_obj_t *mbc_list = NULL;
4057 mcache_obj_t *m16k_list = NULL;
4058 mcache_obj_t *m_mcl_list = NULL;
4059 mcache_obj_t *m_mbc_list = NULL;
4060 mcache_obj_t *m_m16k_list = NULL;
4061 mcache_obj_t *ref_list = NULL;
4062 int pktcount = 0;
4063 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4064
4065 while (m != NULL) {
4066 pktcount++;
4067
4068 nextpkt = m->m_nextpkt;
4069 m->m_nextpkt = NULL;
4070
4071 while (m != NULL) {
4072 struct mbuf *next = m->m_next;
4073 mcache_obj_t *o, *rfa;
4074 u_int32_t refcnt, composite;
4075
4076 if (m->m_type == MT_FREE)
4077 panic("m_free: freeing an already freed mbuf");
4078
4079 if (m->m_type != MT_FREE)
4080 mt_free++;
4081
4082 if (m->m_flags & M_PKTHDR) {
4083 m_tag_delete_chain(m, NULL);
4084 }
4085
4086 if (!(m->m_flags & M_EXT))
4087 goto simple_free;
4088
4089 o = (mcache_obj_t *)m->m_ext.ext_buf;
4090 refcnt = m_decref(m);
4091 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4092 if (refcnt == 0 && !composite) {
4093 if (m->m_ext.ext_free == NULL) {
4094 o->obj_next = mcl_list;
4095 mcl_list = o;
4096 } else if (m->m_ext.ext_free == m_bigfree) {
4097 o->obj_next = mbc_list;
4098 mbc_list = o;
4099 } else if (m->m_ext.ext_free == m_16kfree) {
4100 o->obj_next = m16k_list;
4101 m16k_list = o;
4102 } else {
4103 (*(m->m_ext.ext_free))((caddr_t)o,
4104 m->m_ext.ext_size,
4105 m->m_ext.ext_arg);
4106 }
4107 rfa = (mcache_obj_t *)MEXT_RFA(m);
4108 rfa->obj_next = ref_list;
4109 ref_list = rfa;
4110 MEXT_RFA(m) = NULL;
4111 } else if (refcnt == 0 && composite) {
4112 VERIFY(m->m_type != MT_FREE);
4113 /*
4114 * Amortize the costs of atomic operations
4115 * by doing them at the end, if possible.
4116 */
4117 if (m->m_type == MT_DATA)
4118 mt_data++;
4119 else if (m->m_type == MT_HEADER)
4120 mt_header++;
4121 else if (m->m_type == MT_SONAME)
4122 mt_soname++;
4123 else if (m->m_type == MT_TAG)
4124 mt_tag++;
4125 else
4126 mtype_stat_dec(m->m_type);
4127
4128 m->m_type = MT_FREE;
4129 m->m_flags = M_EXT;
4130 m->m_len = 0;
4131 m->m_next = m->m_nextpkt = NULL;
4132
4133 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4134
4135 /* "Free" into the intermediate cache */
4136 o = (mcache_obj_t *)m;
4137 if (m->m_ext.ext_free == NULL) {
4138 o->obj_next = m_mcl_list;
4139 m_mcl_list = o;
4140 } else if (m->m_ext.ext_free == m_bigfree) {
4141 o->obj_next = m_mbc_list;
4142 m_mbc_list = o;
4143 } else {
4144 VERIFY(m->m_ext.ext_free == m_16kfree);
4145 o->obj_next = m_m16k_list;
4146 m_m16k_list = o;
4147 }
4148 m = next;
4149 continue;
4150 }
4151 simple_free:
4152 /*
4153 * Amortize the costs of atomic operations
4154 * by doing them at the end, if possible.
4155 */
4156 if (m->m_type == MT_DATA)
4157 mt_data++;
4158 else if (m->m_type == MT_HEADER)
4159 mt_header++;
4160 else if (m->m_type == MT_SONAME)
4161 mt_soname++;
4162 else if (m->m_type == MT_TAG)
4163 mt_tag++;
4164 else if (m->m_type != MT_FREE)
4165 mtype_stat_dec(m->m_type);
4166
4167 m->m_type = MT_FREE;
4168 m->m_flags = m->m_len = 0;
4169 m->m_next = m->m_nextpkt = NULL;
4170
4171 ((mcache_obj_t *)m)->obj_next = mp_list;
4172 mp_list = (mcache_obj_t *)m;
4173
4174 m = next;
4175 }
4176
4177 m = nextpkt;
4178 }
4179
4180 if (mt_free > 0)
4181 mtype_stat_add(MT_FREE, mt_free);
4182 if (mt_data > 0)
4183 mtype_stat_sub(MT_DATA, mt_data);
4184 if (mt_header > 0)
4185 mtype_stat_sub(MT_HEADER, mt_header);
4186 if (mt_soname > 0)
4187 mtype_stat_sub(MT_SONAME, mt_soname);
4188 if (mt_tag > 0)
4189 mtype_stat_sub(MT_TAG, mt_tag);
4190
4191 if (mp_list != NULL)
4192 mcache_free_ext(m_cache(MC_MBUF), mp_list);
4193 if (mcl_list != NULL)
4194 mcache_free_ext(m_cache(MC_CL), mcl_list);
4195 if (mbc_list != NULL)
4196 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4197 if (m16k_list != NULL)
4198 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4199 if (m_mcl_list != NULL)
4200 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4201 if (m_mbc_list != NULL)
4202 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4203 if (m_m16k_list != NULL)
4204 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4205 if (ref_list != NULL)
4206 mcache_free_ext(ref_cache, ref_list);
4207
4208 return (pktcount);
4209 }
4210
4211 void
4212 m_freem(struct mbuf *m)
4213 {
4214 while (m != NULL)
4215 m = m_free(m);
4216 }
4217
4218 /*
4219 * Mbuffer utility routines.
4220 */
4221
4222 /*
4223 * Compute the amount of space available before the current start
4224 * of data in an mbuf.
4225 */
4226 int
4227 m_leadingspace(struct mbuf *m)
4228 {
4229 if (m->m_flags & M_EXT) {
4230 if (MCLHASREFERENCE(m))
4231 return (0);
4232 return (m->m_data - m->m_ext.ext_buf);
4233 }
4234 if (m->m_flags & M_PKTHDR)
4235 return (m->m_data - m->m_pktdat);
4236 return (m->m_data - m->m_dat);
4237 }
4238
4239 /*
4240 * Compute the amount of space available after the end of data in an mbuf.
4241 */
4242 int
4243 m_trailingspace(struct mbuf *m)
4244 {
4245 if (m->m_flags & M_EXT) {
4246 if (MCLHASREFERENCE(m))
4247 return (0);
4248 return (m->m_ext.ext_buf + m->m_ext.ext_size -
4249 (m->m_data + m->m_len));
4250 }
4251 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4252 }
4253
4254 /*
4255 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4256 * copy junk along. Does not adjust packet header length.
4257 */
4258 struct mbuf *
4259 m_prepend(struct mbuf *m, int len, int how)
4260 {
4261 struct mbuf *mn;
4262
4263 _MGET(mn, how, m->m_type);
4264 if (mn == NULL) {
4265 m_freem(m);
4266 return (NULL);
4267 }
4268 if (m->m_flags & M_PKTHDR) {
4269 M_COPY_PKTHDR(mn, m);
4270 m->m_flags &= ~M_PKTHDR;
4271 }
4272 mn->m_next = m;
4273 m = mn;
4274 if (len < MHLEN)
4275 MH_ALIGN(m, len);
4276 m->m_len = len;
4277 return (m);
4278 }
4279
4280 /*
4281 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4282 * chain, copy junk along, and adjust length.
4283 */
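/*
 * Illustrative sketch: the usual prepend pattern, here making room for a
 * hypothetical 14-byte link-layer header. If the prepend fails the chain
 * has already been freed, so the caller must not touch it again.
 *
 *	m = m_prepend_2(m, 14, M_DONTWAIT);
 *	if (m == NULL)
 *		return;
 */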
4284 struct mbuf *
4285 m_prepend_2(struct mbuf *m, int len, int how)
4286 {
4287 if (M_LEADINGSPACE(m) >= len) {
4288 m->m_data -= len;
4289 m->m_len += len;
4290 } else {
4291 m = m_prepend(m, len, how);
4292 }
4293 if ((m) && (m->m_flags & M_PKTHDR))
4294 m->m_pkthdr.len += len;
4295 return (m);
4296 }
4297
4298 /*
4299 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4300 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
4301 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4302 */
4303 int MCFail;
4304
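/*
 * Illustrative sketch: take a shallow, reference-counted copy of a whole
 * packet before handing the original down a path that consumes it.
 * Cluster-backed data is shared via m_incref(), not duplicated.
 *
 *	struct mbuf *copy;
 *
 *	copy = m_copym(m, 0, M_COPYALL, M_DONTWAIT);
 *	if (copy == NULL)
 *		return (ENOBUFS);
 */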
4305 struct mbuf *
4306 m_copym(struct mbuf *m, int off0, int len, int wait)
4307 {
4308 struct mbuf *n, *mhdr = NULL, **np;
4309 int off = off0;
4310 struct mbuf *top;
4311 int copyhdr = 0;
4312
4313 if (off < 0 || len < 0)
4314 panic("m_copym: invalid offset %d or len %d", off, len);
4315
4316 if (off == 0 && (m->m_flags & M_PKTHDR)) {
4317 mhdr = m;
4318 copyhdr = 1;
4319 }
4320
4321 while (off >= m->m_len) {
4322 if (m->m_next == NULL)
4323 panic("m_copym: invalid mbuf chain");
4324 off -= m->m_len;
4325 m = m->m_next;
4326 }
4327 np = &top;
4328 top = NULL;
4329
4330 while (len > 0) {
4331 if (m == NULL) {
4332 if (len != M_COPYALL)
4333 panic("m_copym: len != M_COPYALL");
4334 break;
4335 }
4336
4337 n = _M_RETRY(wait, m->m_type);
4338 *np = n;
4339
4340 if (n == NULL)
4341 goto nospace;
4342
4343 if (copyhdr != 0) {
4344 M_COPY_PKTHDR(n, mhdr);
4345 if (len == M_COPYALL)
4346 n->m_pkthdr.len -= off0;
4347 else
4348 n->m_pkthdr.len = len;
4349 copyhdr = 0;
4350 }
4351 if (len == M_COPYALL) {
4352 if (MIN(len, (m->m_len - off)) == len) {
4353 printf("m->m_len %d - off %d = %d, %d\n",
4354 m->m_len, off, m->m_len - off,
4355 MIN(len, (m->m_len - off)));
4356 }
4357 }
4358 n->m_len = MIN(len, (m->m_len - off));
4359 if (n->m_len == M_COPYALL) {
4360 printf("n->m_len == M_COPYALL, fixing\n");
4361 n->m_len = MHLEN;
4362 }
4363 if (m->m_flags & M_EXT) {
4364 n->m_ext = m->m_ext;
4365 m_incref(m);
4366 n->m_data = m->m_data + off;
4367 n->m_flags |= M_EXT;
4368 } else {
4369 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4370 (unsigned)n->m_len);
4371 }
4372 if (len != M_COPYALL)
4373 len -= n->m_len;
4374 off = 0;
4375 m = m->m_next;
4376 np = &n->m_next;
4377 }
4378
4379 if (top == NULL)
4380 MCFail++;
4381
4382 return (top);
4383 nospace:
4384
4385 m_freem(top);
4386 MCFail++;
4387 return (NULL);
4388 }
4389
4390 /*
4391 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4392 * within this routine. Also, the last mbuf and offset accessed are passed
4393 * out and can be passed back in to avoid having to rescan the entire mbuf
4394 * list (normally hung off of the socket).
4395 */
4396 struct mbuf *
4397 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4398 struct mbuf **m_lastm, int *m_off)
4399 {
4400 struct mbuf *n, **np = NULL;
4401 int off = off0, len = len0;
4402 struct mbuf *top = NULL;
4403 int mcflags = MSLEEPF(wait);
4404 int copyhdr = 0;
4405 int type = 0;
4406 mcache_obj_t *list = NULL;
4407 int needed = 0;
4408
4409 if (off == 0 && (m->m_flags & M_PKTHDR))
4410 copyhdr = 1;
4411
4412 if (*m_lastm != NULL) {
4413 m = *m_lastm;
4414 off = *m_off;
4415 } else {
4416 while (off >= m->m_len) {
4417 off -= m->m_len;
4418 m = m->m_next;
4419 }
4420 }
4421
4422 n = m;
4423 while (len > 0) {
4424 needed++;
4425 ASSERT(n != NULL);
4426 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4427 n = n->m_next;
4428 }
4429 needed++;
4430 len = len0;
4431
4432 /*
4433 * If the caller doesn't want to be put to sleep, mark it with
4434 * MCR_TRYHARD so that we may reclaim buffers from other places
4435 * before giving up.
4436 */
4437 if (mcflags & MCR_NOSLEEP)
4438 mcflags |= MCR_TRYHARD;
4439
4440 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4441 mcflags) != needed)
4442 goto nospace;
4443
4444 needed = 0;
4445 while (len > 0) {
4446 n = (struct mbuf *)list;
4447 list = list->obj_next;
4448 ASSERT(n != NULL && m != NULL);
4449
4450 type = (top == NULL) ? MT_HEADER : m->m_type;
4451 MBUF_INIT(n, (top == NULL), type);
4452 #if CONFIG_MACF_NET
4453 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4454 mtype_stat_inc(MT_HEADER);
4455 mtype_stat_dec(MT_FREE);
4456 m_free(n);
4457 goto nospace;
4458 }
4459 #endif /* CONFIG_MACF_NET */
4460
4461 if (top == NULL) {
4462 top = n;
4463 np = &top->m_next;
4464 continue;
4465 } else {
4466 needed++;
4467 *np = n;
4468 }
4469
4470 if (copyhdr) {
4471 M_COPY_PKTHDR(n, m);
4472 n->m_pkthdr.len = len;
4473 copyhdr = 0;
4474 }
4475 n->m_len = MIN(len, (m->m_len - off));
4476
4477 if (m->m_flags & M_EXT) {
4478 n->m_ext = m->m_ext;
4479 m_incref(m);
4480 n->m_data = m->m_data + off;
4481 n->m_flags |= M_EXT;
4482 } else {
4483 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4484 (unsigned)n->m_len);
4485 }
4486 len -= n->m_len;
4487
4488 if (len == 0) {
4489 if ((off + n->m_len) == m->m_len) {
4490 *m_lastm = m->m_next;
4491 *m_off = 0;
4492 } else {
4493 *m_lastm = m;
4494 *m_off = off + n->m_len;
4495 }
4496 break;
4497 }
4498 off = 0;
4499 m = m->m_next;
4500 np = &n->m_next;
4501 }
4502
4503 mtype_stat_inc(MT_HEADER);
4504 mtype_stat_add(type, needed);
4505 mtype_stat_sub(MT_FREE, needed + 1);
4506
4507 ASSERT(list == NULL);
4508 return (top);
4509
4510 nospace:
4511 if (list != NULL)
4512 mcache_free_ext(m_cache(MC_MBUF), list);
4513 if (top != NULL)
4514 m_freem(top);
4515 MCFail++;
4516 return (NULL);
4517 }
4518
4519 /*
4520 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4521 * continuing for "len" bytes, into the indicated buffer.
4522 */
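/*
 * Illustrative sketch (hypothetical 20-byte header): copy a fixed-size
 * header out of a chain into a local buffer without modifying the chain.
 * The length check matters because m_copydata() panics on a short chain.
 *
 *	char hdr[20];
 *
 *	if (m_length(m) >= sizeof (hdr))
 *		m_copydata(m, 0, sizeof (hdr), hdr);
 */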
4523 void
4524 m_copydata(struct mbuf *m, int off, int len, void *vp)
4525 {
4526 unsigned count;
4527 char *cp = vp;
4528
4529 if (off < 0 || len < 0)
4530 panic("m_copydata: invalid offset %d or len %d", off, len);
4531
4532 while (off > 0) {
4533 if (m == NULL)
4534 panic("m_copydata: invalid mbuf chain");
4535 if (off < m->m_len)
4536 break;
4537 off -= m->m_len;
4538 m = m->m_next;
4539 }
4540 while (len > 0) {
4541 if (m == NULL)
4542 panic("m_copydata: invalid mbuf chain");
4543 count = MIN(m->m_len - off, len);
4544 bcopy(MTOD(m, caddr_t) + off, cp, count);
4545 len -= count;
4546 cp += count;
4547 off = 0;
4548 m = m->m_next;
4549 }
4550 }
4551
4552 /*
4553 * Concatenate mbuf chain n to m. Both chains must be of the same type
4554 * (e.g. MT_DATA). Any m_pkthdr is not updated.
4555 */
4556 void
4557 m_cat(struct mbuf *m, struct mbuf *n)
4558 {
4559 while (m->m_next)
4560 m = m->m_next;
4561 while (n) {
4562 if ((m->m_flags & M_EXT) ||
4563 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4564 /* just join the two chains */
4565 m->m_next = n;
4566 return;
4567 }
4568 /* splat the data from one into the other */
4569 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4570 (u_int)n->m_len);
4571 m->m_len += n->m_len;
4572 n = m_free(n);
4573 }
4574 }
4575
4576 void
4577 m_adj(struct mbuf *mp, int req_len)
4578 {
4579 int len = req_len;
4580 struct mbuf *m;
4581 int count;
4582
4583 if ((m = mp) == NULL)
4584 return;
4585 if (len >= 0) {
4586 /*
4587 * Trim from head.
4588 */
4589 while (m != NULL && len > 0) {
4590 if (m->m_len <= len) {
4591 len -= m->m_len;
4592 m->m_len = 0;
4593 m = m->m_next;
4594 } else {
4595 m->m_len -= len;
4596 m->m_data += len;
4597 len = 0;
4598 }
4599 }
4600 m = mp;
4601 if (m->m_flags & M_PKTHDR)
4602 m->m_pkthdr.len -= (req_len - len);
4603 } else {
4604 /*
4605 * Trim from tail. Scan the mbuf chain,
4606 * calculating its length and finding the last mbuf.
4607 * If the adjustment only affects this mbuf, then just
4608 * adjust and return. Otherwise, rescan and truncate
4609 * after the remaining size.
4610 */
4611 len = -len;
4612 count = 0;
4613 for (;;) {
4614 count += m->m_len;
4615 if (m->m_next == (struct mbuf *)0)
4616 break;
4617 m = m->m_next;
4618 }
4619 if (m->m_len >= len) {
4620 m->m_len -= len;
4621 m = mp;
4622 if (m->m_flags & M_PKTHDR)
4623 m->m_pkthdr.len -= len;
4624 return;
4625 }
4626 count -= len;
4627 if (count < 0)
4628 count = 0;
4629 /*
4630 * Correct length for chain is "count".
4631 * Find the mbuf with last data, adjust its length,
4632 * and toss data from remaining mbufs on chain.
4633 */
4634 m = mp;
4635 if (m->m_flags & M_PKTHDR)
4636 m->m_pkthdr.len = count;
4637 for (; m; m = m->m_next) {
4638 if (m->m_len >= count) {
4639 m->m_len = count;
4640 break;
4641 }
4642 count -= m->m_len;
4643 }
4644 while ((m = m->m_next))
4645 m->m_len = 0;
4646 }
4647 }
4648
4649 /*
4650 * Rearrange an mbuf chain so that len bytes are contiguous
4651 * and in the data area of an mbuf (so that mtod and dtom
4652 * will work for a structure of size len). Returns the resulting
4653 * mbuf chain on success, frees it and returns null on failure.
4654 * If there is room, it will add up to max_protohdr-len extra bytes to the
4655 * contiguous region in an attempt to avoid being called next time.
4656 */
4657 int MPFail;
4658
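/*
 * Illustrative sketch of the classic pullup idiom, using a hypothetical
 * 20-byte fixed-size header: make the header contiguous before casting
 * m_data to a structure pointer. On failure the chain has been freed.
 *
 *	if (m->m_len < 20 && (m = m_pullup(m, 20)) == NULL)
 *		return;
 */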
4659 struct mbuf *
4660 m_pullup(struct mbuf *n, int len)
4661 {
4662 struct mbuf *m;
4663 int count;
4664 int space;
4665
4666 /*
4667 * If first mbuf has no cluster, and has room for len bytes
4668 * without shifting current data, pullup into it,
4669 * otherwise allocate a new mbuf to prepend to the chain.
4670 */
4671 if ((n->m_flags & M_EXT) == 0 &&
4672 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4673 if (n->m_len >= len)
4674 return (n);
4675 m = n;
4676 n = n->m_next;
4677 len -= m->m_len;
4678 } else {
4679 if (len > MHLEN)
4680 goto bad;
4681 _MGET(m, M_DONTWAIT, n->m_type);
4682 if (m == 0)
4683 goto bad;
4684 m->m_len = 0;
4685 if (n->m_flags & M_PKTHDR) {
4686 M_COPY_PKTHDR(m, n);
4687 n->m_flags &= ~M_PKTHDR;
4688 }
4689 }
4690 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4691 do {
4692 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4693 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4694 (unsigned)count);
4695 len -= count;
4696 m->m_len += count;
4697 n->m_len -= count;
4698 space -= count;
4699 if (n->m_len)
4700 n->m_data += count;
4701 else
4702 n = m_free(n);
4703 } while (len > 0 && n);
4704 if (len > 0) {
4705 (void) m_free(m);
4706 goto bad;
4707 }
4708 m->m_next = n;
4709 return (m);
4710 bad:
4711 m_freem(n);
4712 MPFail++;
4713 return (0);
4714 }
4715
4716 /*
4717 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4718 * the amount of empty space before the data in the new mbuf to be specified
4719 * (in the event that the caller expects to prepend later).
4720 */
4721 __private_extern__ int MSFail = 0;
4722
4723 __private_extern__ struct mbuf *
4724 m_copyup(struct mbuf *n, int len, int dstoff)
4725 {
4726 struct mbuf *m;
4727 int count, space;
4728
4729 if (len > (MHLEN - dstoff))
4730 goto bad;
4731 MGET(m, M_DONTWAIT, n->m_type);
4732 if (m == NULL)
4733 goto bad;
4734 m->m_len = 0;
4735 if (n->m_flags & M_PKTHDR) {
4736 m_copy_pkthdr(m, n);
4737 n->m_flags &= ~M_PKTHDR;
4738 }
4739 m->m_data += dstoff;
4740 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4741 do {
4742 count = min(min(max(len, max_protohdr), space), n->m_len);
4743 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4744 (unsigned)count);
4745 len -= count;
4746 m->m_len += count;
4747 n->m_len -= count;
4748 space -= count;
4749 if (n->m_len)
4750 n->m_data += count;
4751 else
4752 n = m_free(n);
4753 } while (len > 0 && n);
4754 if (len > 0) {
4755 (void) m_free(m);
4756 goto bad;
4757 }
4758 m->m_next = n;
4759 return (m);
4760 bad:
4761 m_freem(n);
4762 MSFail++;
4763 return (NULL);
4764 }
4765
4766 /*
4767 * Partition an mbuf chain in two pieces, returning the tail --
4768 * all but the first len0 bytes. In case of failure, it returns NULL and
4769 * attempts to restore the chain to its original state.
4770 */
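/*
 * Illustrative sketch: detach everything after a hypothetical 40-byte
 * header so the header and the payload can be processed as separate chains.
 *
 *	struct mbuf *payload;
 *
 *	payload = m_split(m, 40, M_DONTWAIT);
 *	if (payload == NULL)
 *		return (ENOBUFS);
 */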
4771 struct mbuf *
4772 m_split(struct mbuf *m0, int len0, int wait)
4773 {
4774 return (m_split0(m0, len0, wait, 1));
4775 }
4776
4777 static struct mbuf *
4778 m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4779 {
4780 struct mbuf *m, *n;
4781 unsigned len = len0, remain;
4782
4783 for (m = m0; m && len > m->m_len; m = m->m_next)
4784 len -= m->m_len;
4785 if (m == NULL)
4786 return (NULL);
4787 remain = m->m_len - len;
4788 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4789 _MGETHDR(n, wait, m0->m_type);
4790 if (n == NULL)
4791 return (NULL);
4792 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4793 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4794 m0->m_pkthdr.len = len0;
4795 if (m->m_flags & M_EXT)
4796 goto extpacket;
4797 if (remain > MHLEN) {
4798 /* m can't be the lead packet */
4799 MH_ALIGN(n, 0);
4800 n->m_next = m_split(m, len, wait);
4801 if (n->m_next == NULL) {
4802 (void) m_free(n);
4803 return (NULL);
4804 } else
4805 return (n);
4806 } else
4807 MH_ALIGN(n, remain);
4808 } else if (remain == 0) {
4809 n = m->m_next;
4810 m->m_next = NULL;
4811 return (n);
4812 } else {
4813 _MGET(n, wait, m->m_type);
4814 if (n == NULL)
4815 return (NULL);
4816 M_ALIGN(n, remain);
4817 }
4818 extpacket:
4819 if (m->m_flags & M_EXT) {
4820 n->m_flags |= M_EXT;
4821 n->m_ext = m->m_ext;
4822 m_incref(m);
4823 n->m_data = m->m_data + len;
4824 } else {
4825 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4826 }
4827 n->m_len = remain;
4828 m->m_len = len;
4829 n->m_next = m->m_next;
4830 m->m_next = NULL;
4831 return (n);
4832 }
4833
4834 /*
4835 * Routine to copy from device local memory into mbufs.
4836 */
4837 struct mbuf *
4838 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4839 void (*copy)(const void *, void *, size_t))
4840 {
4841 struct mbuf *m;
4842 struct mbuf *top = NULL, **mp = &top;
4843 int off = off0, len;
4844 char *cp;
4845 char *epkt;
4846
4847 cp = buf;
4848 epkt = cp + totlen;
4849 if (off) {
4850 /*
4851 * If 'off' is non-zero, the packet is trailer-encapsulated,
4852 * so we have to skip the type and length fields.
4853 */
4854 cp += off + 2 * sizeof (u_int16_t);
4855 totlen -= 2 * sizeof (u_int16_t);
4856 }
4857 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4858 if (m == NULL)
4859 return (NULL);
4860 m->m_pkthdr.rcvif = ifp;
4861 m->m_pkthdr.len = totlen;
4862 m->m_len = MHLEN;
4863
4864 while (totlen > 0) {
4865 if (top != NULL) {
4866 _MGET(m, M_DONTWAIT, MT_DATA);
4867 if (m == NULL) {
4868 m_freem(top);
4869 return (NULL);
4870 }
4871 m->m_len = MLEN;
4872 }
4873 len = MIN(totlen, epkt - cp);
4874 if (len >= MINCLSIZE) {
4875 MCLGET(m, M_DONTWAIT);
4876 if (m->m_flags & M_EXT) {
4877 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4878 } else {
4879 /* give up if no cluster mbufs are available */
4880 if (top != NULL)
4881 m_freem(top);
4882 m_freem(m);
4883 return (NULL);
4884 }
4885 } else {
4886 /*
4887 * Place initial small packet/header at end of mbuf.
4888 */
4889 if (len < m->m_len) {
4890 if (top == NULL &&
4891 len + max_linkhdr <= m->m_len)
4892 m->m_data += max_linkhdr;
4893 m->m_len = len;
4894 } else {
4895 len = m->m_len;
4896 }
4897 }
4898 if (copy)
4899 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4900 else
4901 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4902 cp += len;
4903 *mp = m;
4904 mp = &m->m_next;
4905 totlen -= len;
4906 if (cp == epkt)
4907 cp = buf;
4908 }
4909 return (top);
4910 }
4911
4912 #ifndef MBUF_GROWTH_NORMAL_THRESH
4913 #define MBUF_GROWTH_NORMAL_THRESH 25
4914 #endif
4915
4916 /*
4917 * Cluster freelist allocation check.
4918 */
4919 static int
4920 m_howmany(int num, size_t bufsize)
4921 {
4922 int i = 0, j = 0;
4923 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
4924 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
4925 u_int32_t sumclusters, freeclusters;
4926 u_int32_t percent_pool, percent_kmem;
4927 u_int32_t mb_growth, mb_growth_thresh;
4928
4929 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
4930 bufsize == m_maxsize(MC_16KCL));
4931
4932 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4933
4934 /* Numbers in 2K cluster units */
4935 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
4936 m_clusters = m_total(MC_CL);
4937 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
4938 m_16kclusters = m_total(MC_16KCL);
4939 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
4940
4941 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
4942 m_clfree = m_infree(MC_CL);
4943 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
4944 m_16kclfree = m_infree(MC_16KCL);
4945 freeclusters = m_mbfree + m_clfree + m_bigclfree;
4946
4947 /* Bail if we've maxed out the mbuf memory map */
4948 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
4949 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4950 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
4951 return (0);
4952 }
4953
4954 if (bufsize == m_maxsize(MC_BIGCL)) {
4955 /* Under minimum */
4956 if (m_bigclusters < m_minlimit(MC_BIGCL))
4957 return (m_minlimit(MC_BIGCL) - m_bigclusters);
4958
4959 percent_pool =
4960 ((sumclusters - freeclusters) * 100) / sumclusters;
4961 percent_kmem = (sumclusters * 100) / nclusters;
4962
4963 /*
4964 * If a light/normal user, grow conservatively (75%)
4965 * If a heavy user, grow aggressively (50%)
4966 */
4967 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
4968 mb_growth = MB_GROWTH_NORMAL;
4969 else
4970 mb_growth = MB_GROWTH_AGGRESSIVE;
4971
4972 if (percent_kmem < 5) {
4973 /* For initial allocations */
4974 i = num;
4975 } else {
4976 /* Return if >= MBIGCL_LOWAT clusters available */
4977 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
4978 m_total(MC_BIGCL) >=
4979 MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
4980 return (0);
4981
4982 /* Ensure at least num clusters are accessible */
4983 if (num >= m_infree(MC_BIGCL))
4984 i = num - m_infree(MC_BIGCL);
4985 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
4986 j = num - (m_total(MC_BIGCL) -
4987 m_minlimit(MC_BIGCL));
4988
4989 i = MAX(i, j);
4990
4991 /*
4992 * Grow pool if percent_pool > 75 (normal growth)
4993 * or percent_pool > 50 (aggressive growth).
4994 */
4995 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
4996 if (percent_pool > mb_growth_thresh)
4997 j = ((sumclusters + num) >> mb_growth) -
4998 freeclusters;
4999 i = MAX(i, j);
5000 }
5001
5002 /* Check to ensure we didn't go over limits */
5003 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5004 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5005 if ((i << 1) + sumclusters >= nclusters)
5006 i = (nclusters - sumclusters) >> 1;
5007 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5008 VERIFY(sumclusters + (i << 1) <= nclusters);
5009
5010 } else { /* 16K CL */
5011 VERIFY(njcl > 0);
5012 /* Under minimum */
5013 if (m_16kclusters < MIN16KCL)
5014 return (MIN16KCL - m_16kclusters);
5015 if (m_16kclfree >= M16KCL_LOWAT)
5016 return (0);
5017
5018 /* Ensure at least num clusters are available */
5019 if (num >= m_16kclfree)
5020 i = num - m_16kclfree;
5021
5022 /* Always grow 16KCL pool aggressively */
5023 if (((m_16kclusters + num) >> 1) > m_16kclfree)
5024 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5025 i = MAX(i, j);
5026
5027 /* Check to ensure we don't go over limit */
5028 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5029 i = m_maxlimit(MC_16KCL) - m_16kclusters;
5030 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5031 }
5032 return (i);
5033 }
5034 /*
5035 * Return the number of bytes in the mbuf chain, m.
5036 */
5037 unsigned int
5038 m_length(struct mbuf *m)
5039 {
5040 struct mbuf *m0;
5041 unsigned int pktlen;
5042
5043 if (m->m_flags & M_PKTHDR)
5044 return (m->m_pkthdr.len);
5045
5046 pktlen = 0;
5047 for (m0 = m; m0 != NULL; m0 = m0->m_next)
5048 pktlen += m0->m_len;
5049 return (pktlen);
5050 }
5051
5052 /*
5053 * Copy data from a buffer back into the indicated mbuf chain,
5054 * starting "off" bytes from the beginning, extending the mbuf
5055 * chain if necessary.
5056 */
5057 void
5058 m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5059 {
5060 #if DEBUG
5061 struct mbuf *origm = m0;
5062 int error;
5063 #endif /* DEBUG */
5064
5065 if (m0 == NULL)
5066 return;
5067
5068 #if DEBUG
5069 error =
5070 #endif /* DEBUG */
5071 m_copyback0(&m0, off, len, cp,
5072 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5073
5074 #if DEBUG
5075 if (error != 0 || (m0 != NULL && origm != m0))
5076 panic("m_copyback");
5077 #endif /* DEBUG */
5078 }
5079
5080 struct mbuf *
5081 m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5082 {
5083 int error;
5084
5085 /* don't support chain expansion */
5086 VERIFY(off + len <= m_length(m0));
5087
5088 error = m_copyback0(&m0, off, len, cp,
5089 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5090 if (error) {
5091 /*
5092 * no way to recover from partial success.
5093 * just free the chain.
5094 */
5095 m_freem(m0);
5096 return (NULL);
5097 }
5098 return (m0);
5099 }
5100
5101 /*
5102 * m_makewritable: ensure the specified range is writable.
5103 */
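/*
 * Illustrative sketch (hypothetical offset and length): before storing
 * into a possibly shared, read-only chain, make just the affected byte
 * range writable; on success bytes 16..19 may be modified in place.
 *
 *	if (m_makewritable(&m, 16, 4, M_DONTWAIT) != 0)
 *		return (ENOBUFS);
 */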
5104 int
5105 m_makewritable(struct mbuf **mp, int off, int len, int how)
5106 {
5107 int error;
5108 #if DEBUG
5109 struct mbuf *n;
5110 int origlen, reslen;
5111
5112 origlen = m_length(*mp);
5113 #endif /* DEBUG */
5114
5115 #if 0 /* M_COPYALL is large enough */
5116 if (len == M_COPYALL)
5117 len = m_length(*mp) - off; /* XXX */
5118 #endif
5119
5120 error = m_copyback0(mp, off, len, NULL,
5121 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5122
5123 #if DEBUG
5124 reslen = 0;
5125 for (n = *mp; n; n = n->m_next)
5126 reslen += n->m_len;
5127 if (origlen != reslen)
5128 panic("m_makewritable: length changed");
5129 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5130 panic("m_makewritable: inconsist");
5131 #endif /* DEBUG */
5132
5133 return (error);
5134 }
5135
5136 static int
5137 m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5138 int how)
5139 {
5140 int mlen;
5141 struct mbuf *m, *n;
5142 struct mbuf **mp;
5143 int totlen = 0;
5144 const char *cp = vp;
5145
5146 VERIFY(mp0 != NULL);
5147 VERIFY(*mp0 != NULL);
5148 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5149 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5150
5151 /*
5152 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5153 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5154 */
5155
5156 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5157
5158 mp = mp0;
5159 m = *mp;
5160 while (off > (mlen = m->m_len)) {
5161 off -= mlen;
5162 totlen += mlen;
5163 if (m->m_next == NULL) {
5164 int tspace;
5165 extend:
5166 if (!(flags & M_COPYBACK0_EXTEND))
5167 goto out;
5168
5169 /*
5170 * try to make some space at the end of "m".
5171 */
5172
5173 mlen = m->m_len;
5174 if (off + len >= MINCLSIZE &&
5175 !(m->m_flags & M_EXT) && m->m_len == 0) {
5176 MCLGET(m, how);
5177 }
5178 tspace = M_TRAILINGSPACE(m);
5179 if (tspace > 0) {
5180 tspace = MIN(tspace, off + len);
5181 VERIFY(tspace > 0);
5182 bzero(mtod(m, char *) + m->m_len,
5183 MIN(off, tspace));
5184 m->m_len += tspace;
5185 off += mlen;
5186 totlen -= mlen;
5187 continue;
5188 }
5189
5190 /*
5191 * need to allocate an mbuf.
5192 */
5193
5194 if (off + len >= MINCLSIZE) {
5195 n = m_getcl(how, m->m_type, 0);
5196 } else {
5197 n = _M_GET(how, m->m_type);
5198 }
5199 if (n == NULL) {
5200 goto out;
5201 }
5202 n->m_len = 0;
5203 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5204 bzero(mtod(n, char *), MIN(n->m_len, off));
5205 m->m_next = n;
5206 }
5207 mp = &m->m_next;
5208 m = m->m_next;
5209 }
5210 while (len > 0) {
5211 mlen = m->m_len - off;
5212 if (mlen != 0 && m_mclhasreference(m)) {
5213 char *datap;
5214 int eatlen;
5215
5216 /*
5217 * this mbuf is read-only.
5218 * allocate a new writable mbuf and try again.
5219 */
5220
5221 #if defined(DIAGNOSTIC)
5222 if (!(flags & M_COPYBACK0_COW))
5223 panic("m_copyback0: read-only");
5224 #endif /* defined(DIAGNOSTIC) */
5225
5226 /*
5227 * if we're going to write into the middle of
5228 * a mbuf, split it first.
5229 */
5230 if (off > 0 && len < mlen) {
5231 n = m_split0(m, off, how, 0);
5232 if (n == NULL)
5233 goto enobufs;
5234 m->m_next = n;
5235 mp = &m->m_next;
5236 m = n;
5237 off = 0;
5238 continue;
5239 }
5240
5241 /*
5242 * XXX TODO coalesce into the trailingspace of
5243 * the previous mbuf when possible.
5244 */
5245
5246 /*
5247 * allocate a new mbuf. copy packet header if needed.
5248 */
5249 n = _M_GET(how, m->m_type);
5250 if (n == NULL)
5251 goto enobufs;
5252 if (off == 0 && (m->m_flags & M_PKTHDR)) {
5253 M_COPY_PKTHDR(n, m);
5254 n->m_len = MHLEN;
5255 } else {
5256 if (len >= MINCLSIZE)
5257 MCLGET(n, M_DONTWAIT);
5258 n->m_len =
5259 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5260 }
5261 if (n->m_len > len)
5262 n->m_len = len;
5263
5264 /*
5265 * free the region which has been overwritten,
5266 * copying data from the old mbufs if requested.
5267 */
5268 if (flags & M_COPYBACK0_PRESERVE)
5269 datap = mtod(n, char *);
5270 else
5271 datap = NULL;
5272 eatlen = n->m_len;
5273 VERIFY(off == 0 || eatlen >= mlen);
5274 if (off > 0) {
5275 VERIFY(len >= mlen);
5276 m->m_len = off;
5277 m->m_next = n;
5278 if (datap) {
5279 m_copydata(m, off, mlen, datap);
5280 datap += mlen;
5281 }
5282 eatlen -= mlen;
5283 mp = &m->m_next;
5284 m = m->m_next;
5285 }
5286 while (m != NULL && m_mclhasreference(m) &&
5287 n->m_type == m->m_type && eatlen > 0) {
5288 mlen = MIN(eatlen, m->m_len);
5289 if (datap) {
5290 m_copydata(m, 0, mlen, datap);
5291 datap += mlen;
5292 }
5293 m->m_data += mlen;
5294 m->m_len -= mlen;
5295 eatlen -= mlen;
5296 if (m->m_len == 0)
5297 *mp = m = m_free(m);
5298 }
5299 if (eatlen > 0)
5300 n->m_len -= eatlen;
5301 n->m_next = m;
5302 *mp = m = n;
5303 continue;
5304 }
5305 mlen = MIN(mlen, len);
5306 if (flags & M_COPYBACK0_COPYBACK) {
5307 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5308 cp += mlen;
5309 }
5310 len -= mlen;
5311 mlen += off;
5312 off = 0;
5313 totlen += mlen;
5314 if (len == 0)
5315 break;
5316 if (m->m_next == NULL) {
5317 goto extend;
5318 }
5319 mp = &m->m_next;
5320 m = m->m_next;
5321 }
5322 out:
5323 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5324 VERIFY(flags & M_COPYBACK0_EXTEND);
5325 m->m_pkthdr.len = totlen;
5326 }
5327
5328 return (0);
5329
5330 enobufs:
5331 return (ENOBUFS);
5332 }
5333
5334 char *
5335 mcl_to_paddr(char *addr)
5336 {
5337 vm_offset_t base_phys;
5338
5339 if (!MBUF_IN_MAP(addr))
5340 return (NULL);
5341 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5342
5343 if (base_phys == 0)
5344 return (NULL);
5345 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5346 }
5347
5348 /*
5349 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
5350 * And really copy the thing. That way, we don't "precompute" checksums
5351 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
5352 * small packets, don't dup into a cluster. That way received packets
5353 * don't take up too much room in the sockbuf (cf. sbspace()).
5354 */
5355 int MDFail;
5356
5357 struct mbuf *
5358 m_dup(struct mbuf *m, int how)
5359 {
5360 struct mbuf *n, **np;
5361 struct mbuf *top;
5362 int copyhdr = 0;
5363
5364 np = &top;
5365 top = NULL;
5366 if (m->m_flags & M_PKTHDR)
5367 copyhdr = 1;
5368
5369 /*
5370 * Quick check: if we have one mbuf and its data fits in an
5371 * mbuf with packet header, just copy and go.
5372 */
5373 if (m->m_next == NULL) {
5374 /* Then just move the data into an mbuf and be done... */
5375 if (copyhdr) {
5376 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5377 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5378 return (NULL);
5379 n->m_len = m->m_len;
5380 m_dup_pkthdr(n, m, how);
5381 bcopy(m->m_data, n->m_data, m->m_len);
5382 return (n);
5383 }
5384 } else if (m->m_len <= MLEN) {
5385 if ((n = _M_GET(how, m->m_type)) == NULL)
5386 return (NULL);
5387 bcopy(m->m_data, n->m_data, m->m_len);
5388 n->m_len = m->m_len;
5389 return (n);
5390 }
5391 }
5392 while (m != NULL) {
5393 #if BLUE_DEBUG
5394 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5395 m->m_data);
5396 #endif
5397 if (copyhdr)
5398 n = _M_GETHDR(how, m->m_type);
5399 else
5400 n = _M_GET(how, m->m_type);
5401 if (n == NULL)
5402 goto nospace;
5403 if (m->m_flags & M_EXT) {
5404 if (m->m_len <= m_maxsize(MC_CL))
5405 MCLGET(n, how);
5406 else if (m->m_len <= m_maxsize(MC_BIGCL))
5407 n = m_mbigget(n, how);
5408 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5409 n = m_m16kget(n, how);
5410 if (!(n->m_flags & M_EXT)) {
5411 (void) m_free(n);
5412 goto nospace;
5413 }
5414 }
5415 *np = n;
5416 if (copyhdr) {
5417 /* Don't use M_COPY_PKTHDR: preserve m_data */
5418 m_dup_pkthdr(n, m, how);
5419 copyhdr = 0;
5420 if (!(n->m_flags & M_EXT))
5421 n->m_data = n->m_pktdat;
5422 }
5423 n->m_len = m->m_len;
5424 /*
5425 * Get the dup on the same boundary as the original.
5426 * Assume that the two mbufs have the same offset to the data area
5427 * (up to word boundaries).
5428 */
5429 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5430 m = m->m_next;
5431 np = &n->m_next;
5432 #if BLUE_DEBUG
5433 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5434 n->m_data);
5435 #endif
5436 }
5437
5438 if (top == NULL)
5439 MDFail++;
5440 return (top);
5441
5442 nospace:
5443 m_freem(top);
5444 MDFail++;
5445 return (NULL);
5446 }
5447
5448 #define MBUF_MULTIPAGES(m) \
5449 (((m)->m_flags & M_EXT) && \
5450 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
5451 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
5452 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5453
5454 static struct mbuf *
5455 m_expand(struct mbuf *m, struct mbuf **last)
5456 {
5457 struct mbuf *top = NULL;
5458 struct mbuf **nm = &top;
5459 uintptr_t data0, data;
5460 unsigned int len0, len;
5461
5462 VERIFY(MBUF_MULTIPAGES(m));
5463 VERIFY(m->m_next == NULL);
5464 data0 = (uintptr_t)m->m_data;
5465 len0 = m->m_len;
5466 *last = top;
5467
5468 for (;;) {
5469 struct mbuf *n;
5470
5471 data = data0;
5472 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5473 len = NBPG;
5474 else if (!IS_P2ALIGNED(data, NBPG) &&
5475 P2ROUNDUP(data, NBPG) < (data + len0))
5476 len = P2ROUNDUP(data, NBPG) - data;
5477 else
5478 len = len0;
5479
5480 VERIFY(len > 0);
5481 VERIFY(m->m_flags & M_EXT);
5482 m->m_data = (void *)data;
5483 m->m_len = len;
5484
5485 *nm = *last = m;
5486 nm = &m->m_next;
5487 m->m_next = NULL;
5488
5489 data0 += len;
5490 len0 -= len;
5491 if (len0 == 0)
5492 break;
5493
5494 n = _M_RETRY(M_DONTWAIT, MT_DATA);
5495 if (n == NULL) {
5496 m_freem(top);
5497 top = *last = NULL;
5498 break;
5499 }
5500
5501 n->m_ext = m->m_ext;
5502 m_incref(m);
5503 n->m_flags |= M_EXT;
5504 m = n;
5505 }
5506 return (top);
5507 }
5508
5509 struct mbuf *
5510 m_normalize(struct mbuf *m)
5511 {
5512 struct mbuf *top = NULL;
5513 struct mbuf **nm = &top;
5514 boolean_t expanded = FALSE;
5515
5516 while (m != NULL) {
5517 struct mbuf *n;
5518
5519 n = m->m_next;
5520 m->m_next = NULL;
5521
5522 /* Does the data cross one or more page boundaries? */
5523 if (MBUF_MULTIPAGES(m)) {
5524 struct mbuf *last;
5525 if ((m = m_expand(m, &last)) == NULL) {
5526 m_freem(n);
5527 m_freem(top);
5528 top = NULL;
5529 break;
5530 }
5531 *nm = m;
5532 nm = &last->m_next;
5533 expanded = TRUE;
5534 } else {
5535 *nm = m;
5536 nm = &m->m_next;
5537 }
5538 m = n;
5539 }
5540 if (expanded)
5541 atomic_add_32(&mb_normalized, 1);
5542 return (top);
5543 }
5544
5545 /*
5546 * Append the specified data to the indicated mbuf chain,
5547 * extending the mbuf chain if the new data does not fit in
5548 * the existing space.
5549 *
5550 * Return 1 if able to complete the job; otherwise 0.
5551 */
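/*
 * Illustrative sketch (hypothetical trailer): append a small trailer to a
 * packet; m_append() updates m_pkthdr.len itself when the chain has a
 * packet header.
 *
 *	char trailer[8];
 *
 *	if (m_append(m, sizeof (trailer), (caddr_t)trailer) == 0)
 *		return (ENOBUFS);
 */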
5552 int
5553 m_append(struct mbuf *m0, int len, caddr_t cp)
5554 {
5555 struct mbuf *m, *n;
5556 int remainder, space;
5557
5558 for (m = m0; m->m_next != NULL; m = m->m_next)
5559 ;
5560 remainder = len;
5561 space = M_TRAILINGSPACE(m);
5562 if (space > 0) {
5563 /*
5564 * Copy into available space.
5565 */
5566 if (space > remainder)
5567 space = remainder;
5568 bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5569 m->m_len += space;
5570 cp += space, remainder -= space;
5571 }
5572 while (remainder > 0) {
5573 /*
5574 * Allocate a new mbuf; could check space
5575 * and allocate a cluster instead.
5576 */
5577 n = m_get(M_WAITOK, m->m_type);
5578 if (n == NULL)
5579 break;
5580 n->m_len = min(MLEN, remainder);
5581 bcopy(cp, mtod(n, caddr_t), n->m_len);
5582 cp += n->m_len;
5583 remainder -= n->m_len;
5584 m->m_next = n;
5585 m = n;
5586 }
5587 if (m0->m_flags & M_PKTHDR)
5588 m0->m_pkthdr.len += len - remainder;
5589 return (remainder == 0);
5590 }
5591
5592 struct mbuf *
5593 m_last(struct mbuf *m)
5594 {
5595 while (m->m_next != NULL)
5596 m = m->m_next;
5597 return (m);
5598 }
5599
5600 void
5601 m_mchtype(struct mbuf *m, int t)
5602 {
5603 mtype_stat_inc(t);
5604 mtype_stat_dec(m->m_type);
5605 (m)->m_type = t;
5606 }
5607
5608 void *
5609 m_mtod(struct mbuf *m)
5610 {
5611 return (MTOD(m, void *));
5612 }
5613
5614 struct mbuf *
5615 m_dtom(void *x)
5616 {
5617 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5618 }
5619
5620 void
5621 m_mcheck(struct mbuf *m)
5622 {
5623 _MCHECK(m);
5624 }
5625
5626 /*
5627 * Return a pointer to mbuf/offset of location in mbuf chain.
5628 */
5629 struct mbuf *
5630 m_getptr(struct mbuf *m, int loc, int *off)
5631 {
5632
5633 while (loc >= 0) {
5634 /* Normal end of search. */
5635 if (m->m_len > loc) {
5636 *off = loc;
5637 return (m);
5638 } else {
5639 loc -= m->m_len;
5640 if (m->m_next == NULL) {
5641 if (loc == 0) {
5642 /* Point at the end of valid data. */
5643 *off = m->m_len;
5644 return (m);
5645 }
5646 return (NULL);
5647 }
5648 m = m->m_next;
5649 }
5650 }
5651 return (NULL);
5652 }
5653
5654 /*
5655 * Inform the corresponding mcache(s) that there's a waiter below.
5656 */
5657 static void
5658 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5659 {
5660 mcache_waiter_inc(m_cache(class));
5661 if (comp) {
5662 if (class == MC_CL) {
5663 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5664 } else if (class == MC_BIGCL) {
5665 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5666 } else if (class == MC_16KCL) {
5667 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5668 } else {
5669 mcache_waiter_inc(m_cache(MC_MBUF_CL));
5670 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5671 }
5672 }
5673 }
5674
5675 /*
5676 * Inform the corresponding mcache(s) that there's no more waiter below.
5677 */
5678 static void
5679 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5680 {
5681 mcache_waiter_dec(m_cache(class));
5682 if (comp) {
5683 if (class == MC_CL) {
5684 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5685 } else if (class == MC_BIGCL) {
5686 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5687 } else if (class == MC_16KCL) {
5688 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5689 } else {
5690 mcache_waiter_dec(m_cache(MC_MBUF_CL));
5691 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5692 }
5693 }
5694 }
5695
5696 /*
5697 * Called during slab (blocking and non-blocking) allocation. If there
5698 * is at least one waiter, and the time since the first waiter was blocked
5699 * exceeds the watchdog timeout, panic the system.
5700 */
5701 static void
5702 mbuf_watchdog(void)
5703 {
5704 struct timeval now;
5705 unsigned int since;
5706
5707 if (mb_waiters == 0 || !mb_watchdog)
5708 return;
5709
5710 microuptime(&now);
5711 since = now.tv_sec - mb_wdtstart.tv_sec;
5712 if (since >= MB_WDT_MAXTIME) {
5713 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5714 mb_waiters, since, mbuf_dump());
5715 /* NOTREACHED */
5716 }
5717 }
5718
5719 /*
5720 * Called during blocking allocation. Returns TRUE if one or more objects
5721 * are available at the per-CPU cache layer and that the allocation should be
5722 * retried at that level.
5723 */
5724 static boolean_t
5725 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5726 {
5727 boolean_t mcache_retry = FALSE;
5728
5729 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5730
5731 /* Check if there's anything at the cache layer */
5732 if (mbuf_cached_above(class, wait)) {
5733 mcache_retry = TRUE;
5734 goto done;
5735 }
5736
5737 /* Nothing? Then try hard to get it from somewhere */
5738 m_reclaim(class, num, (wait & MCR_COMP));
5739
5740 /* We tried hard and got something? */
5741 if (m_infree(class) > 0) {
5742 mbstat.m_wait++;
5743 goto done;
5744 } else if (mbuf_cached_above(class, wait)) {
5745 mbstat.m_wait++;
5746 mcache_retry = TRUE;
5747 goto done;
5748 } else if (wait & MCR_TRYHARD) {
5749 mcache_retry = TRUE;
5750 goto done;
5751 }
5752
5753 /*
5754 * There's really nothing for us right now; inform the
5755 * cache(s) that there is a waiter below and go to sleep.
5756 */
5757 mbuf_waiter_inc(class, (wait & MCR_COMP));
5758
5759 VERIFY(!(wait & MCR_NOSLEEP));
5760
5761 /*
5762 * If this is the first waiter, arm the watchdog timer. Otherwise
5763 * check if we need to panic the system due to watchdog timeout.
5764 */
5765 if (mb_waiters == 0)
5766 microuptime(&mb_wdtstart);
5767 else
5768 mbuf_watchdog();
5769
5770 mb_waiters++;
5771 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5772
5773 /* We are now up; stop getting notified until next round */
5774 mbuf_waiter_dec(class, (wait & MCR_COMP));
5775
5776 /* We waited and got something */
5777 if (m_infree(class) > 0) {
5778 mbstat.m_wait++;
5779 goto done;
5780 } else if (mbuf_cached_above(class, wait)) {
5781 mbstat.m_wait++;
5782 mcache_retry = TRUE;
5783 }
5784 done:
5785 return (mcache_retry);
5786 }
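/*
 * Illustrative sketch of a caller (hypothetical, not compiled): a blocking
 * class allocator would typically keep trying its own freelist and fall
 * back to mbuf_sleep(); a TRUE return means one or more objects surfaced
 * at the mcache layer above, so the allocation should be retried there.
 * `class_freelist_alloc' below is a made-up helper for illustration only.
 */
#if 0
while (need > 0 && !(wait & MCR_NOSLEEP)) {
	if (class_freelist_alloc(class, &need) > 0)
		continue;
	if (mbuf_sleep(class, need, wait))
		break;	/* retry at the per-CPU cache layer */
}
#endif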
5787
5788 static void
5789 mbuf_worker_thread(void)
5790 {
5791 int mbuf_expand;
5792
5793 while (1) {
5794 lck_mtx_lock(mbuf_mlock);
5795
5796 mbuf_expand = 0;
5797 if (mbuf_expand_mcl) {
5798 int n;
5799
5800 /* Adjust to the current number of 2 KB clusters in use */
5801 n = mbuf_expand_mcl -
5802 (m_total(MC_CL) - m_infree(MC_CL));
5803 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
5804 n = m_maxlimit(MC_CL) - m_total(MC_CL);
5805 mbuf_expand_mcl = 0;
5806
5807 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
5808 mbuf_expand++;
5809 }
5810 if (mbuf_expand_big) {
5811 int n;
5812
5813 /* Adjust to the current number of 4 KB clusters in use */
5814 n = mbuf_expand_big -
5815 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
5816 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
5817 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
5818 mbuf_expand_big = 0;
5819
5820 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
5821 mbuf_expand++;
5822 }
5823 if (mbuf_expand_16k) {
5824 int n;
5825
5826 /* Adjust to the current number of 16 KB clusters in use */
5827 n = mbuf_expand_16k -
5828 (m_total(MC_16KCL) - m_infree(MC_16KCL));
5829 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
5830 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
5831 mbuf_expand_16k = 0;
5832
5833 if (n > 0)
5834 (void) freelist_populate(MC_16KCL, n, M_WAIT);
5835 }
5836
5837 /*
5838 * Because we can run out of memory before filling the mbuf
5839 * map, we should not allocate more clusters than there are
5840 * mbufs -- otherwise we could have a large number of useless
5841 * clusters allocated.
5842 */
5843 if (mbuf_expand) {
5844 while (m_total(MC_MBUF) <
5845 (m_total(MC_BIGCL) + m_total(MC_CL))) {
5846 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
5847 break;
5848 }
5849 }
5850
5851 lck_mtx_unlock(mbuf_mlock);
5852
5853 assert_wait(&mbuf_worker_run, THREAD_UNINT);
5854 (void) thread_block((thread_continue_t)mbuf_worker_thread);
5855 }
5856 }
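/*
 * Worked example of the check above (illustrative figures only): with
 * 2048 2 KB clusters and 1024 4 KB clusters in the pool, the loop keeps
 * populating the mbuf freelist until at least 3072 mbufs exist, so every
 * cluster could in principle be paired with an mbuf header.
 */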
5857
5858 static void
5859 mbuf_worker_thread_init(void)
5860 {
5861 mbuf_worker_ready++;
5862 mbuf_worker_thread();
5863 }
5864
5865 static mcl_slab_t *
5866 slab_get(void *buf)
5867 {
5868 mcl_slabg_t *slg;
5869 unsigned int ix, k;
5870
5871 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5872
5873 VERIFY(MBUF_IN_MAP(buf));
5874 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
5875 VERIFY(ix < maxslabgrp);
5876
5877 if ((slg = slabstbl[ix]) == NULL) {
5878 /*
5879 * In the current implementation, we never shrink the memory
5880 * pool (hence the cluster map); if we attempt to reallocate
5881 * a cluster group when it's already allocated, panic since
5882 * this is a sign of memory corruption (slabstbl[ix] got
5883 * nullified). This also means that there shouldn't be any
5884 * hole in the kernel sub-map for the mbuf pool.
5885 */
5886 ++slabgrp;
5887 VERIFY(ix < slabgrp);
5888 /*
5889 * Slab expansion can only be done single-threaded; when
5890 * we get here, it must be as a result of m_clalloc() which
5891 * is serialized and therefore mb_clalloc_busy must be set.
5892 */
5893 VERIFY(mb_clalloc_busy);
5894 lck_mtx_unlock(mbuf_mlock);
5895
5896 /* This is a new buffer; create the slabs group for it */
5897 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
5898 M_WAITOK | M_ZERO);
5899 VERIFY(slg != NULL);
5900
5901 lck_mtx_lock(mbuf_mlock);
5902 /*
5903 * No other thread could have gone into m_clalloc() after
5904 * we dropped the lock above, so verify that it's true.
5905 */
5906 VERIFY(mb_clalloc_busy);
5907
5908 slabstbl[ix] = slg;
5909
5910 /* Chain each slab in the group to its forward neighbor */
5911 for (k = 1; k < NSLABSPMB; k++)
5912 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
5913 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
5914
5915 /* And chain the last slab in the previous group to this */
5916 if (ix > 0) {
5917 VERIFY(slabstbl[ix - 1]->
5918 slg_slab[NSLABSPMB - 1].sl_next == NULL);
5919 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
5920 &slg->slg_slab[0];
5921 }
5922 }
5923
5924 ix = MTOBG(buf) % NSLABSPMB;
5925 VERIFY(ix < NSLABSPMB);
5926
5927 return (&slg->slg_slab[ix]);
5928 }
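/*
 * Illustrative arithmetic (assuming MBSHIFT is 20, i.e. 1 MB worth of
 * clusters per slab group): a buffer located 5 MB + 8 KB past mbutl has
 * ((char *)buf - (char *)mbutl) == 0x502000, so ix = 0x502000 >> 20 = 5
 * selects slabstbl[5]; MTOBG(buf) % NSLABSPMB then picks the individual
 * slab within that group.
 */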
5929
5930 static void
5931 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
5932 void *base, void *head, unsigned int len, int refcnt, int chunks)
5933 {
5934 sp->sl_class = class;
5935 sp->sl_flags = flags;
5936 sp->sl_base = base;
5937 sp->sl_head = head;
5938 sp->sl_len = len;
5939 sp->sl_refcnt = refcnt;
5940 sp->sl_chunks = chunks;
5941 slab_detach(sp);
5942 }
5943
5944 static void
5945 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
5946 {
5947 VERIFY(slab_is_detached(sp));
5948 m_slab_cnt(class)++;
5949 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
5950 sp->sl_flags &= ~SLF_DETACHED;
5951 if (class == MC_16KCL) {
5952 int k;
5953 for (k = 1; k < NSLABSP16KB; k++) {
5954 sp = sp->sl_next;
5955 /* Next slab must already be present */
5956 VERIFY(sp != NULL);
5957 VERIFY(slab_is_detached(sp));
5958 sp->sl_flags &= ~SLF_DETACHED;
5959 }
5960 }
5961 }
5962
5963 static void
5964 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5965 {
5966 VERIFY(!slab_is_detached(sp));
5967 VERIFY(m_slab_cnt(class) > 0);
5968 m_slab_cnt(class)--;
5969 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5970 slab_detach(sp);
5971 if (class == MC_16KCL) {
5972 int k;
5973 for (k = 1; k < NSLABSP16KB; k++) {
5974 sp = sp->sl_next;
5975 /* Next slab must already be present */
5976 VERIFY(sp != NULL);
5977 VERIFY(!slab_is_detached(sp));
5978 slab_detach(sp);
5979 }
5980 }
5981 }
5982
5983 static boolean_t
5984 slab_inrange(mcl_slab_t *sp, void *buf)
5985 {
5986 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5987 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5988 }
5989
5990 #undef panic
5991
5992 static void
5993 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5994 {
5995 int i;
5996 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5997 uintptr_t buf = (uintptr_t)sp->sl_base;
5998
5999 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6000 void *next = ((mcache_obj_t *)buf)->obj_next;
6001 if (next != addr)
6002 continue;
6003 if (!mclverify) {
6004 if (next != NULL && !MBUF_IN_MAP(next)) {
6005 mcache_t *cp = m_cache(sp->sl_class);
6006 panic("%s: %s buffer %p in slab %p modified "
6007 "after free at offset 0: %p out of range "
6008 "[%p-%p)\n", __func__, cp->mc_name,
6009 (void *)buf, sp, next, mbutl, embutl);
6010 /* NOTREACHED */
6011 }
6012 } else {
6013 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6014 (mcache_obj_t *)buf);
6015 mcl_audit_verify_nextptr(next, mca);
6016 }
6017 }
6018 }
6019
6020 static void
6021 slab_detach(mcl_slab_t *sp)
6022 {
6023 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6024 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6025 sp->sl_flags |= SLF_DETACHED;
6026 }
6027
6028 static boolean_t
6029 slab_is_detached(mcl_slab_t *sp)
6030 {
6031 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6032 (intptr_t)sp->sl_link.tqe_prev == -1 &&
6033 (sp->sl_flags & SLF_DETACHED));
6034 }
6035
6036 static void
6037 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6038 mcache_obj_t **con_list, size_t con_size, unsigned int num)
6039 {
6040 mcache_audit_t *mca, *mca_tail;
6041 mcache_obj_t *con = NULL;
6042 boolean_t save_contents = (con_list != NULL);
6043 unsigned int i, ix;
6044
6045 ASSERT(num <= NMBPBG);
6046 ASSERT(con_list == NULL || con_size != 0);
6047
6048 ix = MTOBG(buf);
6049 VERIFY(ix < maxclaudit);
6050
6051 /* Make sure we haven't been here before */
6052 for (i = 0; i < NMBPBG; i++)
6053 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6054
6055 mca = mca_tail = *mca_list;
6056 if (save_contents)
6057 con = *con_list;
6058
6059 for (i = 0; i < num; i++) {
6060 mcache_audit_t *next;
6061
6062 next = mca->mca_next;
6063 bzero(mca, sizeof (*mca));
6064 mca->mca_next = next;
6065 mclaudit[ix].cl_audit[i] = mca;
6066
6067 /* Attach the contents buffer if requested */
6068 if (save_contents) {
6069 VERIFY(con != NULL);
6070 mca->mca_contents_size = con_size;
6071 mca->mca_contents = con;
6072 con = con->obj_next;
6073 bzero(mca->mca_contents, mca->mca_contents_size);
6074 }
6075
6076 mca_tail = mca;
6077 mca = mca->mca_next;
6078 }
6079
6080 if (save_contents)
6081 *con_list = con;
6082
6083 *mca_list = mca_tail->mca_next;
6084 mca_tail->mca_next = NULL;
6085 }
6086
6087 /*
6088 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6089 * the corresponding audit structure for that buffer.
6090 */
6091 static mcache_audit_t *
6092 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6093 {
6094 mcache_audit_t *mca = NULL;
6095 int ix = MTOBG(o);
6096
6097 VERIFY(ix < maxclaudit);
6098 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6099
6100 switch (class) {
6101 case MC_MBUF:
6102 /*
6103 * For the mbuf case, find the index of the page
6104 * used by the mbuf and use that index to locate the
6105 * base address of the page. Then find out the
6106 * mbuf index relative to the page base and use
6107 * it to locate the audit structure.
6108 */
6109 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6110 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6111 break;
6112
6113 case MC_CL:
6114 /*
6115 * Same thing as above, but for 2KB clusters in a page.
6116 */
6117 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6118 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6119 break;
6120
6121 case MC_BIGCL:
6122 case MC_16KCL:
6123 /*
6124 * Same as above, but only return the first element.
6125 */
6126 mca = mclaudit[ix].cl_audit[0];
6127 break;
6128
6129 default:
6130 VERIFY(0);
6131 /* NOTREACHED */
6132 }
6133
6134 return (mca);
6135 }
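/*
 * Illustrative example (assuming MSIZE is 256 and 4 KB pages): an mbuf at
 * offset 0x300 within its page maps to in-page index 0x300 / 256 = 3, so
 * its audit record is mclaudit[ix].cl_audit[3]; a 2 KB cluster at offset
 * 0x800 maps to index 1; 4 KB and 16 KB buffers always use slot 0.
 */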
6136
6137 static void
6138 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6139 boolean_t alloc)
6140 {
6141 struct mbuf *m = addr;
6142 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6143
6144 VERIFY(mca->mca_contents != NULL &&
6145 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6146
6147 if (mclverify)
6148 mcl_audit_verify_nextptr(next, mca);
6149
6150 if (!alloc) {
6151 /* Save constructed mbuf fields */
6152 mcl_audit_save_mbuf(m, mca);
6153 if (mclverify) {
6154 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6155 m_maxsize(MC_MBUF));
6156 }
6157 ((mcache_obj_t *)m)->obj_next = next;
6158 return;
6159 }
6160
6161 /* Check if the buffer has been corrupted while in freelist */
6162 if (mclverify) {
6163 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6164 }
6165 /* Restore constructed mbuf fields */
6166 mcl_audit_restore_mbuf(m, mca, composite);
6167 }
6168
6169 static void
6170 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6171 {
6172 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6173
6174 if (composite) {
6175 struct mbuf *next = m->m_next;
6176 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6177 MBUF_IS_COMPOSITE(ms));
6178 /*
6179 * We could have hand-picked the mbuf fields and restored
6180 * them individually, but that would be a maintenance
6181 * headache. Instead, restore everything that was saved;
6182 * the mbuf layer will recheck and reinitialize anyway.
6183 */
6184 bcopy(ms, m, mca->mca_contents_size);
6185 m->m_next = next;
6186 } else {
6187 /*
6188 * For a regular mbuf (no cluster attached) there's nothing
6189 * to restore other than the type field, which is expected
6190 * to be MT_FREE.
6191 */
6192 m->m_type = ms->m_type;
6193 }
6194 _MCHECK(m);
6195 }
6196
6197 static void
6198 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6199 {
6200 _MCHECK(m);
6201 bcopy(m, mca->mca_contents, mca->mca_contents_size);
6202 }
6203
6204 static void
6205 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6206 boolean_t save_next)
6207 {
6208 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6209
6210 if (!alloc) {
6211 if (mclverify) {
6212 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6213 }
6214 if (save_next) {
6215 mcl_audit_verify_nextptr(next, mca);
6216 ((mcache_obj_t *)addr)->obj_next = next;
6217 }
6218 } else if (mclverify) {
6219 /* Check if the buffer has been corrupted while in freelist */
6220 mcl_audit_verify_nextptr(next, mca);
6221 mcache_audit_free_verify_set(mca, addr, 0, size);
6222 }
6223 }
6224
6225 static void
6226 mcl_audit_mcheck_panic(struct mbuf *m)
6227 {
6228 mcache_audit_t *mca;
6229
6230 MRANGE(m);
6231 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6232
6233 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6234 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6235 /* NOTREACHED */
6236 }
6237
6238 static void
6239 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6240 {
6241 if (next != NULL && !MBUF_IN_MAP(next) &&
6242 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6243 panic("mcl_audit: buffer %p modified after free at offset 0: "
6244 "%p out of range [%p-%p)\n%s\n",
6245 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6246 /* NOTREACHED */
6247 }
6248 }
6249
6250 /* This function turns on mbuf leak detection */
6251 static void
6252 mleak_activate(void)
6253 {
6254 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6255 PE_parse_boot_argn("mleak_sample_factor",
6256 &mleak_table.mleak_sample_factor,
6257 sizeof (mleak_table.mleak_sample_factor));
6258
6259 if (mleak_table.mleak_sample_factor == 0)
6260 mclfindleak = 0;
6261
6262 if (mclfindleak == 0)
6263 return;
6264
6265 vm_size_t alloc_size =
6266 mleak_alloc_buckets * sizeof (struct mallocation);
6267 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6268
6269 MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6270 M_TEMP, M_WAITOK | M_ZERO);
6271 VERIFY(mleak_allocations != NULL);
6272
6273 MALLOC(mleak_traces, struct mtrace *, trace_size,
6274 M_TEMP, M_WAITOK | M_ZERO);
6275 VERIFY(mleak_traces != NULL);
6276
6277 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6278 M_TEMP, M_WAITOK | M_ZERO);
6279 VERIFY(mleak_stat != NULL);
6280 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6281 #ifdef __LP64__
6282 mleak_stat->ml_isaddr64 = 1;
6283 #endif /* __LP64__ */
6284 }
6285
6286 static void
6287 mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6288 {
6289 int temp;
6290
6291 if (mclfindleak == 0)
6292 return;
6293
6294 if (!alloc)
6295 return (mleak_free(addr));
6296
6297 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6298
6299 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6300 uintptr_t bt[MLEAK_STACK_DEPTH];
6301 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6302 mleak_log(bt, addr, logged, num);
6303 }
6304 }
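/*
 * Illustrative note: only every mleak_sample_factor-th allocation is
 * traced.  With a sample factor of, say, 500, roughly 1 in 500
 * allocations has its backtrace captured and handed to mleak_log(),
 * keeping the cost of leak detection low.
 */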
6305
6306 /*
6307 * This function records the allocation in the mleak_allocations table
6308 * and the backtrace in the mleak_traces table.  If the allocation slot is
6309 * in use, replace the old allocation with the new one; if the trace slot is
6310 * in use, return (incrementing the refcount if it records the same trace).
6311 */
6312 static boolean_t
6313 mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6314 {
6315 struct mallocation *allocation;
6316 struct mtrace *trace;
6317 uint32_t trace_index;
6318 int i;
6319
6320 /* Quit if someone else is modifying the tables */
6321 if (!lck_mtx_try_lock_spin(mleak_lock)) {
6322 mleak_table.total_conflicts++;
6323 return (FALSE);
6324 }
6325
6326 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6327 mleak_alloc_buckets)];
6328 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6329 trace = &mleak_traces[trace_index];
6330
6331 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6332 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6333
6334 allocation->hitcount++;
6335 trace->hitcount++;
6336
6337 /*
6338 * If the allocation bucket we want is occupied
6339 * and the occupier has the same trace, just bail.
6340 */
6341 if (allocation->element != NULL &&
6342 trace_index == allocation->trace_index) {
6343 mleak_table.alloc_collisions++;
6344 lck_mtx_unlock(mleak_lock);
6345 return (TRUE);
6346 }
6347
6348 /*
6349 * Store the backtrace in the traces array; an allocation
6350 * count (allocs) of zero means the trace bucket is free.
6351 */
6352 if (trace->allocs > 0 &&
6353 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6354 /* Different, unique trace, but the same hash! Bail out. */
6355 trace->collisions++;
6356 mleak_table.trace_collisions++;
6357 lck_mtx_unlock(mleak_lock);
6358 return (TRUE);
6359 } else if (trace->allocs > 0) {
6360 /* Same trace, already added, so increment refcount */
6361 trace->allocs++;
6362 } else {
6363 /* Found an unused trace bucket, so record the trace here */
6364 if (trace->depth != 0) {
6365 /* this slot was previously used but is now free */
6366 mleak_table.trace_overwrites++;
6367 }
6368 mleak_table.trace_recorded++;
6369 trace->allocs = 1;
6370 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6371 trace->depth = depth;
6372 trace->collisions = 0;
6373 }
6374
6375 /* Step 2: Store the allocation record in the allocations array */
6376 if (allocation->element != NULL) {
6377 /*
6378 * Replace an existing allocation.  No need to preserve the old one,
6379 * because only a subset of the allocations are being
6380 * recorded anyway.
6381 */
6382 mleak_table.alloc_collisions++;
6383 } else if (allocation->trace_index != 0) {
6384 mleak_table.alloc_overwrites++;
6385 }
6386 allocation->element = addr;
6387 allocation->trace_index = trace_index;
6388 allocation->count = num;
6389 mleak_table.alloc_recorded++;
6390 mleak_table.outstanding_allocs++;
6391
6392 /* keep the top MLEAK_NUM_TRACES traces, ordered by allocation count */
6393 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6394 if (mleak_top_trace[i] == NULL ||
6395 mleak_top_trace[i]->allocs <= trace->allocs) {
6396 if (mleak_top_trace[i] != trace) {
6397 int j = MLEAK_NUM_TRACES;
6398 while (--j > i) {
6399 mleak_top_trace[j] =
6400 mleak_top_trace[j - 1];
6401 }
6402 mleak_top_trace[i] = trace;
6403 }
6404 break;
6405 }
6406 }
6407
6408 lck_mtx_unlock(mleak_lock);
6409 return (TRUE);
6410 }
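/*
 * Illustrative example of the top-trace update above: if a trace's
 * allocation count now matches or exceeds that of mleak_top_trace[2],
 * entries 2 through MLEAK_NUM_TRACES-2 shift down one slot and the trace
 * takes slot 2, keeping the array ordered by allocation count.
 */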
6411
6412 static void
6413 mleak_free(mcache_obj_t *addr)
6414 {
6415 while (addr != NULL) {
6416 struct mallocation *allocation = &mleak_allocations
6417 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6418
6419 if (allocation->element == addr &&
6420 allocation->trace_index < mleak_trace_buckets) {
6421 lck_mtx_lock_spin(mleak_lock);
6422 if (allocation->element == addr &&
6423 allocation->trace_index < mleak_trace_buckets) {
6424 struct mtrace *trace;
6425 trace = &mleak_traces[allocation->trace_index];
6426 /* allocs = 0 means trace bucket is unused */
6427 if (trace->allocs > 0)
6428 trace->allocs--;
6429 if (trace->allocs == 0)
6430 trace->depth = 0;
6431 /* NULL element means alloc bucket is unused */
6432 allocation->element = NULL;
6433 mleak_table.outstanding_allocs--;
6434 }
6435 lck_mtx_unlock(mleak_lock);
6436 }
6437 addr = addr->obj_next;
6438 }
6439 }
6440
6441 static struct mbtypes {
6442 int mt_type;
6443 const char *mt_name;
6444 } mbtypes[] = {
6445 { MT_DATA, "data" },
6446 { MT_OOBDATA, "oob data" },
6447 { MT_CONTROL, "ancillary data" },
6448 { MT_HEADER, "packet headers" },
6449 { MT_SOCKET, "socket structures" },
6450 { MT_PCB, "protocol control blocks" },
6451 { MT_RTABLE, "routing table entries" },
6452 { MT_HTABLE, "IMP host table entries" },
6453 { MT_ATABLE, "address resolution tables" },
6454 { MT_FTABLE, "fragment reassembly queue headers" },
6455 { MT_SONAME, "socket names and addresses" },
6456 { MT_SOOPTS, "socket options" },
6457 { MT_RIGHTS, "access rights" },
6458 { MT_IFADDR, "interface addresses" },
6459 { MT_TAG, "packet tags" },
6460 { 0, NULL }
6461 };
6462
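/*
 * Helper for mbuf_dump() below: subtracts the number of bytes just written
 * (`k') from the remaining space (`clen'), bails out to the `done' label
 * if the buffer is exhausted, and otherwise advances the output cursor
 * `c'.  It relies on those locals and that label being in scope at each
 * call site.
 */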
6463 #define MBUF_DUMP_BUF_CHK() { \
6464 clen -= k; \
6465 if (clen < 1) \
6466 goto done; \
6467 c += k; \
6468 }
6469
6470 static char *
6471 mbuf_dump(void)
6472 {
6473 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6474 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6475 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6476 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6477 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6478 uint8_t seen[256];
6479 struct mbtypes *mp;
6480 mb_class_stat_t *sp;
6481 char *c = mbuf_dump_buf;
6482 int i, k, clen = sizeof (mbuf_dump_buf);
6483
6484 mbuf_dump_buf[0] = '\0';
6485
6486 /* synchronize all statistics in the mbuf table */
6487 mbuf_stat_sync();
6488 mbuf_mtypes_sync(TRUE);
6489
6490 sp = &mb_stat->mbs_class[0];
6491 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6492 u_int32_t mem;
6493
6494 if (m_class(i) == MC_MBUF) {
6495 m_mbufs = sp->mbcl_active;
6496 } else if (m_class(i) == MC_CL) {
6497 m_clfree = sp->mbcl_total - sp->mbcl_active;
6498 } else if (m_class(i) == MC_BIGCL) {
6499 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6500 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
6501 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6502 m_16kclusters = sp->mbcl_total;
6503 } else if (m_class(i) == MC_MBUF_CL) {
6504 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6505 } else if (m_class(i) == MC_MBUF_BIGCL) {
6506 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6507 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6508 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6509 }
6510
6511 mem = sp->mbcl_ctotal * sp->mbcl_size;
6512 totmem += mem;
6513 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6514 sp->mbcl_size;
6515
6516 }
6517
6518 /* adjust free counts to include composite caches */
6519 m_clfree += m_mbufclfree;
6520 m_bigclfree += m_mbufbigclfree;
6521 m_16kclfree += m_mbuf16kclfree;
6522
6523 totmbufs = 0;
6524 for (mp = mbtypes; mp->mt_name != NULL; mp++)
6525 totmbufs += mbstat.m_mtypes[mp->mt_type];
6526 if (totmbufs > m_mbufs)
6527 totmbufs = m_mbufs;
6528 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6529 MBUF_DUMP_BUF_CHK();
6530
6531 bzero(&seen, sizeof (seen));
6532 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6533 if (mbstat.m_mtypes[mp->mt_type] != 0) {
6534 seen[mp->mt_type] = 1;
6535 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6536 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6537 MBUF_DUMP_BUF_CHK();
6538 }
6539 }
6540 seen[MT_FREE] = 1;
6541 for (i = 0; i < nmbtypes; i++)
6542 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6543 k = snprintf(c, clen, "\t%u mbufs allocated to "
6544 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6545 MBUF_DUMP_BUF_CHK();
6546 }
6547 if ((m_mbufs - totmbufs) > 0) {
6548 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6549 m_mbufs - totmbufs);
6550 MBUF_DUMP_BUF_CHK();
6551 }
6552 k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6553 "%u/%u mbuf 4KB clusters in use\n",
6554 (unsigned int)(mbstat.m_clusters - m_clfree),
6555 (unsigned int)mbstat.m_clusters,
6556 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6557 (unsigned int)mbstat.m_bigclusters);
6558 MBUF_DUMP_BUF_CHK();
6559
6560 if (njcl > 0) {
6561 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6562 m_16kclusters - m_16kclfree, m_16kclusters,
6563 njclbytes / 1024);
6564 MBUF_DUMP_BUF_CHK();
6565 }
6566 totused = totmem - totfree;
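/*
 * Compute the in-use percentage without overflowing: if totused * 100
 * might exceed ULONG_MAX, scale both numerator and denominator down by
 * 100 first.
 */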
6567 if (totmem == 0) {
6568 totpct = 0;
6569 } else if (totused < (ULONG_MAX / 100)) {
6570 totpct = (totused * 100) / totmem;
6571 } else {
6572 u_long totmem1 = totmem / 100;
6573 u_long totused1 = totused / 100;
6574 totpct = (totused1 * 100) / totmem1;
6575 }
6576 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6577 "in use)\n", totmem / 1024, totpct);
6578 MBUF_DUMP_BUF_CHK();
6579
6580 done:
6581 return (mbuf_dump_buf);
6582 }
6583
6584 #undef MBUF_DUMP_BUF_CHK
6585
6586 SYSCTL_DECL(_kern_ipc);
6587 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6588 CTLFLAG_RD | CTLFLAG_LOCKED,
6589 0, 0, mbstat_sysctl, "S,mbstat", "");
6590 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6591 CTLFLAG_RD | CTLFLAG_LOCKED,
6592 0, 0, mb_stat_sysctl, "S,mb_stat", "");
6593 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6594 CTLFLAG_RD | CTLFLAG_LOCKED,
6595 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6596 SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6597 CTLFLAG_RD | CTLFLAG_LOCKED,
6598 0, 0, mleak_table_sysctl, "S,mleak_table", "");
6599 SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6600 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6601 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6602 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6603 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6604 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");