1 /*
2 * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/malloc.h>
73 #include <sys/mbuf.h>
74 #include <sys/kernel.h>
75 #include <sys/sysctl.h>
76 #include <sys/syslog.h>
77 #include <sys/protosw.h>
78 #include <sys/domain.h>
79 #include <sys/queue.h>
80
81 #include <kern/kern_types.h>
82 #include <kern/simple_lock.h>
83 #include <kern/queue.h>
84 #include <kern/sched_prim.h>
85 #include <kern/cpu_number.h>
86
87 #include <libkern/OSAtomic.h>
88 #include <libkern/libkern.h>
89
90 #include <IOKit/IOMapper.h>
91
92 #include <machine/limits.h>
93 #include <machine/machine_routines.h>
94
95 #if CONFIG_MACF_NET
96 #include <security/mac_framework.h>
97 #endif /* CONFIG_MACF_NET */
98
99 #include <sys/mcache.h>
100
101 /*
102 * MBUF IMPLEMENTATION NOTES.
103 *
104 * There is a total of 5 per-CPU caches:
105 *
106 * MC_MBUF:
107 * This is a cache of rudimentary objects of MSIZE in size; each
108 * object represents an mbuf structure. This cache preserves only
109 * the m_type field of the mbuf during its transactions.
110 *
111 * MC_CL:
112 * This is a cache of rudimentary objects of MCLBYTES in size; each
113 * object represents an mcluster structure. This cache does not
114 * preserve the contents of the objects during its transactions.
115 *
116 * MC_BIGCL:
117 * This is a cache of rudimentary objects of NBPG in size; each
118 * object represents an mbigcluster structure. This cache does not
119 * preserve the contents of the objects during its transactions.
120 *
121 * MC_MBUF_CL:
122 * This is a cache of mbufs each having a cluster attached to it.
123 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
124 * fields of the mbuf related to the external cluster are preserved
125 * during transactions.
126 *
127 * MC_MBUF_BIGCL:
128 * This is a cache of mbufs each having a big cluster attached to it.
129 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
130 * fields of the mbuf related to the external cluster are preserved
131 * during transactions.
132 *
133 * OBJECT ALLOCATION:
134 *
135 * Allocation requests are handled first at the per-CPU (mcache) layer
136 * before falling back to the slab layer. Performance is optimal when
137 * the request is satisfied at the CPU layer because global data/lock
138 * never gets accessed. When the slab layer is entered for allocation,
139 * the slab freelist will be checked first for available objects before
140 * the VM backing store is invoked. Slab layer operations are serialized
141 * for all of the caches as the mbuf global lock is held most of the time.
142 * Allocation paths are different depending on the class of objects:
143 *
144 * a. Rudimentary object:
145 *
146 * { m_get_common(), m_clattach(), m_mclget(),
147 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
148 * composite object allocation }
149 * | ^
150 * | |
151 * | +-----------------------+
152 * v |
153 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
154 * | ^
155 * v |
156 * [CPU cache] -------> (found?) -------+
157 * | |
158 * v |
159 * mbuf_slab_alloc() |
160 * | |
161 * v |
162 * +---------> [freelist] -------> (found?) -------+
163 * | |
164 * | v
165 * | m_clalloc()
166 * | |
167 * | v
168 * +---<<---- kmem_mb_alloc()
169 *
170 * b. Composite object:
171 *
172 * { m_getpackets_internal(), m_allocpacket_internal() }
173 * | ^
174 * | |
175 * | +------ (done) ---------+
176 * v |
177 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
178 * | ^
179 * v |
180 * [CPU cache] -------> (found?) -------+
181 * | |
182 * v |
183 * mbuf_cslab_alloc() |
184 * | |
185 * v |
186 * [freelist] -------> (found?) -------+
187 * | |
188 * v |
189 * (rudimentary object) |
190 * mcache_alloc/mcache_alloc_ext() ------>>-----+
191 *
192 * Auditing notes: If auditing is enabled, buffers will be subjected to
193 * integrity checks by the audit routine. This is done by verifying their
194 * contents against DEADBEEF (free) pattern before returning them to caller.
195 * As part of this step, the routine will also record the transaction and
196 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern. It will
197 * also restore any constructed data structure fields if necessary.
198 *
199 * OBJECT DEALLOCATION:
200 *
201 * Freeing an object simply involves placing it into the CPU cache; this
202 * pollutes the cache to benefit subsequent allocations. The slab layer
203 * will only be entered if the object is to be purged out of the cache.
204 * During normal operations, this happens only when the CPU layer resizes
205 * its bucket while it's adjusting to the allocation load. Deallocation
206 * paths are different depending on the class of objects:
207 *
208 * a. Rudimentary object:
209 *
210 * { m_free(), m_freem_list(), composite object deallocation }
211 * | ^
212 * | |
213 * | +------ (done) ---------+
214 * v |
215 * mcache_free/mcache_free_ext() |
216 * | |
217 * v |
218 * mbuf_slab_audit() |
219 * | |
220 * v |
221 * [CPU cache] ---> (not purging?) -----+
222 * | |
223 * v |
224 * mbuf_slab_free() |
225 * | |
226 * v |
227 * [freelist] ----------->>------------+
228 * (objects never get purged to VM)
229 *
230 * b. Composite object:
231 *
232 * { m_free(), m_freem_list() }
233 * | ^
234 * | |
235 * | +------ (done) ---------+
236 * v |
237 * mcache_free/mcache_free_ext() |
238 * | |
239 * v |
240 * mbuf_cslab_audit() |
241 * | |
242 * v |
243 * [CPU cache] ---> (not purging?) -----+
244 * | |
245 * v |
246 * mbuf_cslab_free() |
247 * | |
248 * v |
249 * [freelist] ---> (not purging?) -----+
250 * | |
251 * v |
252 * (rudimentary object) |
253 * mcache_free/mcache_free_ext() ------->>------+
254 *
255 * Auditing notes: If auditing is enabled, the audit routine will save
256 * any constructed data structure fields (if necessary) before filling the
257 * contents of the buffers with DEADBEEF (free) pattern and recording the
258 * transaction. Buffers that are freed (whether at CPU or slab layer) are
259 * expected to contain the free pattern.
260 *
261 * DEBUGGING:
262 *
263 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
264 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
265 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
266 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Note
267 * that debugging consumes more CPU and memory.
268 *
269 * Each object is associated with exactly one mcache_audit_t structure that
270 * contains the information related to its last buffer transaction. Given
271 * an address of an object, the audit structure can be retrieved by finding
272 * the position of the object relative to the base address of the cluster:
273 *
274 * +------------+ +=============+
275 * | mbuf addr | | mclaudit[i] |
276 * +------------+ +=============+
277 * | | cl_audit[0] |
278 * i = MTOCL(addr) +-------------+
279 * | +-----> | cl_audit[1] | -----> mcache_audit_t
280 * b = CLTOM(i) | +-------------+
281 * | | | ... |
282 * x = MCLIDX(b, addr) | +-------------+
283 * | | | cl_audit[7] |
284 * +-----------------+ +-------------+
285 * (e.g. x == 1)
286 *
287 * The mclaudit[] array is allocated at initialization time, but its contents
288 * get populated when the corresponding cluster is created. Because a cluster
289 * can be turned into NMBPCL mbufs, we preserve enough space for all of the
290 * mbufs so that there is a 1-to-1 mapping between them. A cluster that never
291 * gets (or has not yet been) turned into mbufs will use only cl_audit[0], with
292 * the remaining entries unused. For big clusters, only one entry is allocated
293 * and used for the entire cluster pair.
294 */
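/*
 * As a concrete illustration of the audit lookup shown above, mapping an
 * object address to its mcache_audit_t amounts to roughly the following
 * (a minimal sketch; the actual lookup is performed by mcl_audit_buf2mca(),
 * declared further below, using the MTOCL/CLTOM/MCLIDX macros defined in
 * this file):
 *
 *	int i = MTOCL(addr);				(cluster index of the object)
 *	union mcluster *b = CLTOM(i);			(base address of that cluster)
 *	int x = MCLIDX(b, addr);			(mbuf slot within the cluster)
 *	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
 */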
295
296 /* TODO: should be in header file */
297 /* kernel translator */
298 extern vm_offset_t kmem_mb_alloc(vm_map_t, int);
299 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
300 extern vm_map_t mb_map; /* special map */
301
302 /* Global lock */
303 static lck_mtx_t *mbuf_mlock;
304 static lck_attr_t *mbuf_mlock_attr;
305 static lck_grp_t *mbuf_mlock_grp;
306 static lck_grp_attr_t *mbuf_mlock_grp_attr;
307
308 /* Back-end (common) layer */
309 static void *mbuf_worker_run; /* wait channel for worker thread */
310 static int mbuf_worker_ready; /* worker thread is runnable */
311 static int mbuf_expand_mcl; /* number of cluster creation requests */
312 static int mbuf_expand_big; /* number of big cluster creation requests */
313 static int mbuf_expand_16k; /* number of 16K cluster creation requests */
314 static int ncpu; /* number of CPUs */
315 static int *mcl_paddr; /* Array of cluster physical addresses */
316 static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
317 static mcache_t *ref_cache; /* Cache of cluster reference & flags */
318 static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
319 static unsigned int mbuf_debug; /* patchable mbuf mcache flags */
320 static unsigned int mb_normalized; /* number of packets "normalized" */
321
322 typedef enum {
323 MC_MBUF = 0, /* Regular mbuf */
324 MC_CL, /* Cluster */
325 MC_BIGCL, /* Large (4K) cluster */
326 MC_16KCL, /* Jumbo (16K) cluster */
327 MC_MBUF_CL, /* mbuf + cluster */
328 MC_MBUF_BIGCL, /* mbuf + large (4K) cluster */
329 MC_MBUF_16KCL /* mbuf + jumbo (16K) cluster */
330 } mbuf_class_t;
331
332 #define MBUF_CLASS_MIN MC_MBUF
333 #define MBUF_CLASS_MAX MC_MBUF_16KCL
334 #define MBUF_CLASS_LAST MC_16KCL
335 #define MBUF_CLASS_VALID(c) \
336 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
337 #define MBUF_CLASS_COMPOSITE(c) \
338 ((int)(c) > MBUF_CLASS_LAST)
339
340
341 /*
342 * mbuf specific mcache allocation request flags.
343 */
344 #define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
345
346 /*
347 * Per-cluster slab structure.
348 *
349 * A slab is a cluster control structure that contains one or more object
350 * chunks; the available chunks are chained in the slab's freelist (sl_head).
351 * Each time a chunk is taken out of the slab, the slab's reference count
352 * gets incremented. When all chunks have been taken out, the empty slab
353 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
354 * returned to a slab causes the slab's reference count to be decremented;
355 * it also causes the slab to be reinserted into the class's slab list, if
356 * it is not already there.
357 *
358 * Compartmentalizing the object chunks into slabs allows us to easily
359 * merge one or more slabs together when the adjacent slabs are idle, as
360 * well as to convert or move a slab from one class to another; e.g. the
361 * mbuf cluster slab can be converted to a regular cluster slab when all
362 * mbufs in the slab have been freed.
363 *
364 * A slab may also span across multiple clusters for chunks larger than
365 * a cluster's size. In this case, only the slab of the first cluster is
366 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
367 * that they are part of the larger slab.
368 */
369 typedef struct mcl_slab {
370 struct mcl_slab *sl_next; /* neighboring slab */
371 u_int8_t sl_class; /* controlling mbuf class */
372 int8_t sl_refcnt; /* outstanding allocations */
373 int8_t sl_chunks; /* chunks (bufs) in this slab */
374 u_int16_t sl_flags; /* slab flags (see below) */
375 u_int16_t sl_len; /* slab length */
376 void *sl_base; /* base of allocated memory */
377 void *sl_head; /* first free buffer */
378 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
379 } mcl_slab_t;
380
381 #define SLF_MAPPED 0x0001 /* backed by a mapped page */
382 #define SLF_PARTIAL 0x0002 /* part of another slab */
383 #define SLF_DETACHED 0x0004 /* not in slab freelist */
384
385 /*
386 * The array of slabs is broken into groups, one group per 1MB of kernel
387 * memory, to reduce the footprint. Each group is allocated on demand
388 * whenever a new piece of memory mapped in from the VM crosses the 1MB
389 * boundary.
390 */
391 #define MBSHIFT 20 /* 1MB */
392 #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */
393
394 typedef struct mcl_slabg {
395 mcl_slab_t slg_slab[NSLABSPMB]; /* group of slabs */
396 } mcl_slabg_t;
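/*
 * Illustrative sketch of how a buffer address is mapped to its slab
 * through these groups; this is roughly the lookup performed by
 * slab_get(), declared further below (slabstbl, also defined below,
 * is the table of groups):
 *
 *	unsigned int ix = MTOCL(addr);				(cluster index)
 *	mcl_slabg_t *slg = slabstbl[ix / NSLABSPMB];		(1MB slab group)
 *	mcl_slab_t *sp = &slg->slg_slab[ix % NSLABSPMB];	(slab within group)
 */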
397
398 /*
399 * Per-cluster audit structure.
400 */
401 typedef struct {
402 mcache_audit_t *cl_audit[NMBPCL]; /* array of audits */
403 } mcl_audit_t;
404
405 #if CONFIG_MBUF_NOEXPAND
406 static unsigned int maxmbufcl;
407 #endif /* CONFIG_MBUF_NOEXPAND */
408
409 /*
410 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
411 * and m_ext structures. If auditing is enabled, we allocate a shadow
412 * mbuf structure of this size inside each audit structure, and the
413 * contents of the real mbuf get copied into it when the mbuf is freed.
414 * This allows us to pattern-fill the mbuf for integrity check, and to
415 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
416 * Note that we don't save the contents of clusters when they are freed;
417 * we simply pattern-fill them.
418 */
419 #if defined(__LP64__)
420 #define AUDIT_CONTENTS_SIZE 160
421 #else
422 #define AUDIT_CONTENTS_SIZE 80
423 #endif /* __LP64__ */
424
425 /*
426 * mbuf specific mcache audit flags
427 */
428 #define MB_INUSE 0x01 /* object has not been returned to slab */
429 #define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
430 #define MB_SCVALID 0x04 /* object has valid saved contents */
431
432 /*
433 * Each of the following two arrays holds up to nmbclusters elements.
434 */
435 static mcl_audit_t *mclaudit; /* array of cluster audit information */
436 static mcl_slabg_t **slabstbl; /* cluster slabs table */
437 static unsigned int maxslabgrp; /* max # of entries in slabs table */
438 static unsigned int slabgrp; /* # of entries in slabs table */
439
440 /* Globals */
441 int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
442 int njcl; /* # of clusters for jumbo sizes */
443 int njclbytes; /* size of a jumbo cluster */
444 union mcluster *mbutl; /* first mapped cluster address */
445 union mcluster *embutl; /* ending virtual address of mclusters */
446 int max_linkhdr; /* largest link-level header */
447 int max_protohdr; /* largest protocol header */
448 int max_hdr; /* largest link+protocol header */
449 int max_datalen; /* MHLEN - max_hdr */
450
451 /* TODO: should be in header file */
452 int do_reclaim = 0;
453
454 /* The minimum number of objects that are allocated, to start. */
455 #define MINCL 32
456 #define MINBIGCL (MINCL >> 1)
457 #define MIN16KCL (MINCL >> 2)
458
459 /* Low watermarks (only map in pages once free counts go below) */
460 #define MCL_LOWAT MINCL
461 #define MBIGCL_LOWAT MINBIGCL
462 #define M16KCL_LOWAT MIN16KCL
463
464 typedef struct {
465 mbuf_class_t mtbl_class; /* class type */
466 mcache_t *mtbl_cache; /* mcache for this buffer class */
467 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
468 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
469 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
470 u_int32_t mtbl_maxsize; /* maximum buffer size */
471 int mtbl_minlimit; /* minimum allowed */
472 int mtbl_maxlimit; /* maximum allowed */
473 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
474 } mbuf_table_t;
475
476 #define m_class(c) mbuf_table[c].mtbl_class
477 #define m_cache(c) mbuf_table[c].mtbl_cache
478 #define m_slablist(c) mbuf_table[c].mtbl_slablist
479 #define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
480 #define m_maxsize(c) mbuf_table[c].mtbl_maxsize
481 #define m_minlimit(c) mbuf_table[c].mtbl_minlimit
482 #define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
483 #define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
484 #define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
485 #define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
486 #define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
487 #define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
488 #define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
489 #define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
490 #define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
491 #define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
492 #define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
493 #define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
494 #define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
495 #define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
496
497 static mbuf_table_t mbuf_table[] = {
498 /*
499 * The caches for mbufs, regular clusters and big clusters.
500 */
501 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
502 NULL, NULL, 0, 0, 0, 0 },
503 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
504 NULL, NULL, 0, 0, 0, 0 },
505 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
506 NULL, NULL, 0, 0, 0, 0 },
507 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
508 NULL, NULL, 0, 0, 0, 0 },
509 /*
510 * The following are special caches; they serve as intermediate
511 * caches backed by the above rudimentary caches. Each object
512 * in the cache is an mbuf with a cluster attached to it. Unlike
513 * the above caches, these intermediate caches do not directly
514 * deal with the slab structures; instead, the constructed
515 * cached elements are simply stored in the freelists.
516 */
517 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
518 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
519 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
520 };
521
522 #define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
523
524 static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
525 static int mb_waiters; /* number of sleepers */
526
527 /* The following are used to serialize m_clalloc() */
528 static boolean_t mb_clalloc_busy;
529 static void *mb_clalloc_waitchan = &mb_clalloc_busy;
530 static int mb_clalloc_waiters;
531
532 static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
533 static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
534 static void mbuf_table_init(void);
535 static inline void m_incref(struct mbuf *);
536 static inline u_int32_t m_decref(struct mbuf *);
537 static int m_clalloc(const u_int32_t, const int, const u_int32_t);
538 static void mbuf_worker_thread_init(void);
539 static mcache_obj_t *slab_alloc(mbuf_class_t, int);
540 static void slab_free(mbuf_class_t, mcache_obj_t *);
541 static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
542 unsigned int, int);
543 static void mbuf_slab_free(void *, mcache_obj_t *, int);
544 static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
545 static void mbuf_slab_notify(void *, u_int32_t);
546 static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
547 unsigned int);
548 static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
549 static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
550 unsigned int, int);
551 static void mbuf_cslab_free(void *, mcache_obj_t *, int);
552 static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
553 static int freelist_populate(mbuf_class_t, unsigned int, int);
554 static boolean_t mbuf_cached_above(mbuf_class_t, int);
555 static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
556 static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
557 static int m_howmany(int, size_t);
558 static void mbuf_worker_thread(void);
559 static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
560
561 static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
562 size_t, unsigned int);
563 static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
564 static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
565 static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
566 boolean_t);
567 static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
568 static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
569 static void mcl_audit_mcheck_panic(struct mbuf *);
570 static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
571
572 static mcl_slab_t *slab_get(void *);
573 static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
574 void *, void *, unsigned int, int, int);
575 static void slab_insert(mcl_slab_t *, mbuf_class_t);
576 static void slab_remove(mcl_slab_t *, mbuf_class_t);
577 static boolean_t slab_inrange(mcl_slab_t *, void *);
578 static void slab_nextptr_panic(mcl_slab_t *, void *);
579 static void slab_detach(mcl_slab_t *);
580 static boolean_t slab_is_detached(mcl_slab_t *);
581
582 /*
583 * This flag is set for all mbufs that come out of and into the composite
584 * mbuf + cluster caches, i.e. MC_MBUF_CL, MC_MBUF_BIGCL and MC_MBUF_16KCL. mbufs that
585 * are marked with such a flag have clusters attached to them, and will be
586 * treated differently when they are freed; instead of being placed back
587 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
588 * are placed back into the appropriate composite cache's freelist, and the
589 * actual freeing is deferred until the composite objects are purged. At
590 * such a time, this flag will be cleared from the mbufs and the objects
591 * will be freed into their own separate freelists.
592 */
593 #define EXTF_COMPOSITE 0x1
594
595 #define MEXT_RFA(m) ((m)->m_ext.ext_refflags)
596 #define MEXT_REF(m) (MEXT_RFA(m)->refcnt)
597 #define MEXT_FLAGS(m) (MEXT_RFA(m)->flags)
598 #define MBUF_IS_COMPOSITE(m) \
599 (MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_COMPOSITE))
600
601 /*
602 * Macros used to verify the integrity of the mbuf.
603 */
604 #define _MCHECK(m) { \
605 if ((m)->m_type != MT_FREE) { \
606 if (mclaudit == NULL) \
607 panic("MCHECK: m_type=%d m=%p", \
608 (u_int16_t)(m)->m_type, m); \
609 else \
610 mcl_audit_mcheck_panic(m); \
611 } \
612 }
613
614 #define MBUF_IN_MAP(addr) \
615 ((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
616
617 #define MRANGE(addr) { \
618 if (!MBUF_IN_MAP(addr)) \
619 panic("MRANGE: address out of range 0x%p", addr); \
620 }
621
622 /*
623 * Macro version of mtod.
624 */
625 #define MTOD(m, t) ((t)((m)->m_data))
626
627 /*
628 * Macros to obtain cluster index and base cluster address.
629 */
630 #define MTOCL(x) (((char *)(x) - (char *)mbutl) >> MCLSHIFT)
631 #define CLTOM(x) ((union mcluster *)(mbutl + (x)))
632
633 /*
634 * Macro to find the mbuf index relative to the cluster base.
635 */
636 #define MCLIDX(c, m) (((char *)(m) - (char *)(c)) >> 8)
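/*
 * Worked example (assuming the usual 2KB clusters, i.e. MCLSHIFT == 11,
 * and 256-byte mbufs, so that NMBPCL == 8): for an mbuf located 0x1300
 * bytes past mbutl, MTOCL() yields 0x1300 >> 11 == 2, CLTOM(2) points at
 * byte offset 0x1000 (the base of cluster 2), and MCLIDX() then yields
 * (0x1300 - 0x1000) >> 8 == 3, i.e. the 4th mbuf slot in that cluster.
 */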
637
638 /*
639 * Macros used during mbuf and cluster initialization.
640 */
641 #define MBUF_INIT(m, pkthdr, type) { \
642 _MCHECK(m); \
643 (m)->m_next = (m)->m_nextpkt = NULL; \
644 (m)->m_len = 0; \
645 (m)->m_type = type; \
646 if ((pkthdr) == 0) { \
647 (m)->m_data = (m)->m_dat; \
648 (m)->m_flags = 0; \
649 } else { \
650 (m)->m_data = (m)->m_pktdat; \
651 (m)->m_flags = M_PKTHDR; \
652 (m)->m_pkthdr.rcvif = NULL; \
653 (m)->m_pkthdr.len = 0; \
654 (m)->m_pkthdr.header = NULL; \
655 (m)->m_pkthdr.csum_flags = 0; \
656 (m)->m_pkthdr.csum_data = 0; \
657 (m)->m_pkthdr.reserved0 = NULL; \
658 (m)->m_pkthdr.vlan_tag = 0; \
659 (m)->m_pkthdr.socket_id = 0; \
660 m_tag_init(m); \
661 } \
662 }
663
664 #define MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) { \
665 (m)->m_data = (m)->m_ext.ext_buf = (buf); \
666 (m)->m_flags |= M_EXT; \
667 (m)->m_ext.ext_size = (size); \
668 (m)->m_ext.ext_free = (free); \
669 (m)->m_ext.ext_arg = (arg); \
670 (m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward = \
671 &(m)->m_ext.ext_refs; \
672 MEXT_RFA(m) = (rfa); \
673 MEXT_REF(m) = (ref); \
674 MEXT_FLAGS(m) = (flag); \
675 }
676
677 #define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
678 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
679
680 #define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
681 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
682
683 #define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
684 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
685
686 /*
687 * Macro to convert BSD malloc sleep flag to mcache's
688 */
689 #define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
690
691 /*
692 * The structure that holds all mbuf class statistics exportable via sysctl.
693 * Similar to mbstat structure, the mb_stat structure is protected by the
694 * global mbuf lock. It contains additional information about the classes
695 * that allows for a more accurate view of the state of the allocator.
696 */
697 struct mb_stat *mb_stat;
698
699 #define MB_STAT_SIZE(n) \
700 ((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
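/*
 * MB_STAT_SIZE(n) computes the number of bytes needed for an mb_stat
 * structure whose variable-length mbs_class[] array holds n entries;
 * it is effectively offsetof(mb_stat_t, mbs_class[n]). For example,
 * mbuf_table_init() below allocates mb_stat with
 * MB_STAT_SIZE(NELEM(mbuf_table)) bytes, and mb_stat_sysctl() copies
 * out exactly that many bytes.
 */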
701
702 /*
703 * The legacy structure holding all of the mbuf allocation statistics.
704 * The actual statistics used by the kernel are stored in the mbuf_table
705 * instead, and are updated atomically while the global mbuf lock is held.
706 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
707 * Unlike before, the kernel no longer relies on the contents of mbstat for
708 * its operations (e.g. cluster expansion) because the structure is exposed
709 * to the outside and could possibly be modified, making it unsafe to rely on.
710 * With the exception of the mbstat.m_mtypes array (see below), all of the
711 * statistics are updated as they change.
712 */
713 struct mbstat mbstat;
714
715 #define MBSTAT_MTYPES_MAX \
716 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
717
718 /*
719 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
720 * atomically and stored in a per-CPU structure which is lock-free; this is
721 * done in order to avoid writing to the global mbstat data structure which
722 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
723 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
724 * array and returned to the application. Any updates for types greater
725 * than or equal to MT_MAX are done atomically on mbstat; this slows down
726 * performance but is okay since the kernel uses only up to MT_MAX-1, while
727 * anything beyond that (up to type 255) is considered a corner case.
728 */
729 typedef struct {
730 unsigned int cpu_mtypes[MT_MAX];
731 } __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
732
733 typedef struct {
734 mtypes_cpu_t mbs_cpu[1];
735 } mbuf_mtypes_t;
736
737 static mbuf_mtypes_t *mbuf_mtypes; /* per-CPU statistics */
738
739 #define MBUF_MTYPES_SIZE(n) \
740 ((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
741
742 #define MTYPES_CPU(p) \
743 ((mtypes_cpu_t *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
744
745 /* This should be in a header file */
746 #define atomic_add_32(a, n) ((void) OSAddAtomic(n, (volatile SInt32 *)a))
747
748 #define mtype_stat_add(type, n) { \
749 if ((unsigned)(type) < MT_MAX) { \
750 mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes); \
751 atomic_add_32(&mbs->cpu_mtypes[type], n); \
752 } else if ((unsigned)(type) < MBSTAT_MTYPES_MAX) { \
753 atomic_add_32(&mbstat.m_mtypes[type], n); \
754 } \
755 }
756
757 #define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
758 #define mtype_stat_inc(t) mtype_stat_add(t, 1)
759 #define mtype_stat_dec(t) mtype_stat_sub(t, 1)
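/*
 * For example, the mtype_stat_add(MT_FREE, -NMBPCL) call in slab_free()
 * below adjusts the calling CPU's cpu_mtypes[MT_FREE] counter atomically,
 * without taking any global lock; only type values >= MT_MAX fall through
 * to the global mbstat.m_mtypes[] array.
 */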
760
761 static int
762 mbstat_sysctl SYSCTL_HANDLER_ARGS
763 {
764 #pragma unused(oidp, arg1, arg2)
765 int m, n;
766 mtypes_cpu_t mtc;
767
768 bzero(&mtc, sizeof (mtc));
769 for (m = 0; m < ncpu; m++) {
770 mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
771 mtypes_cpu_t temp;
772
773 bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
774 sizeof (temp.cpu_mtypes));
775
776 for (n = 0; n < MT_MAX; n++)
777 mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
778 }
779 lck_mtx_lock(mbuf_mlock);
780 for (n = 0; n < MT_MAX; n++)
781 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
782 lck_mtx_unlock(mbuf_mlock);
783
784 return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
785 }
786
787 static int
788 mb_stat_sysctl SYSCTL_HANDLER_ARGS
789 {
790 #pragma unused(oidp, arg1, arg2)
791 mcache_t *cp;
792 mcache_cpu_t *ccp;
793 mb_class_stat_t *sp;
794 int k, m, bktsize;
795
796 lck_mtx_lock(mbuf_mlock);
797 for (k = 0; k < NELEM(mbuf_table); k++) {
798 cp = m_cache(k);
799 ccp = &cp->mc_cpu[0];
800 bktsize = ccp->cc_bktsize;
801 sp = mbuf_table[k].mtbl_stats;
802
803 if (cp->mc_flags & MCF_NOCPUCACHE)
804 sp->mbcl_mc_state = MCS_DISABLED;
805 else if (cp->mc_purge_cnt > 0)
806 sp->mbcl_mc_state = MCS_PURGING;
807 else if (bktsize == 0)
808 sp->mbcl_mc_state = MCS_OFFLINE;
809 else
810 sp->mbcl_mc_state = MCS_ONLINE;
811
812 sp->mbcl_mc_cached = 0;
813 for (m = 0; m < ncpu; m++) {
814 ccp = &cp->mc_cpu[m];
815 if (ccp->cc_objs > 0)
816 sp->mbcl_mc_cached += ccp->cc_objs;
817 if (ccp->cc_pobjs > 0)
818 sp->mbcl_mc_cached += ccp->cc_pobjs;
819 }
820 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
821 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
822 sp->mbcl_infree;
823
824 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
825 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
826 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
827
828 /* Calculate total count specific to each class */
829 sp->mbcl_ctotal = sp->mbcl_total;
830 switch (m_class(k)) {
831 case MC_MBUF:
832 /* Deduct mbufs used in composite caches */
833 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
834 m_total(MC_MBUF_BIGCL));
835 break;
836
837 case MC_CL:
838 /* Deduct clusters used in composite cache and mbufs */
839 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
840 (P2ROUNDUP(m_total(MC_MBUF), NMBPCL)/NMBPCL));
841 break;
842
843 case MC_BIGCL:
844 /* Deduct clusters used in composite cache */
845 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
846 break;
847
848 case MC_16KCL:
849 /* Deduct clusters used in composite cache */
850 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
851 break;
852
853 default:
854 break;
855 }
856 }
857 lck_mtx_unlock(mbuf_mlock);
858
859 return (SYSCTL_OUT(req, mb_stat, MB_STAT_SIZE(NELEM(mbuf_table))));
860 }
861
862 static inline void
863 m_incref(struct mbuf *m)
864 {
865 UInt32 old, new;
866 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
867
868 do {
869 old = *addr;
870 new = old + 1;
871 ASSERT(new != 0);
872 } while (!OSCompareAndSwap(old, new, addr));
873 }
874
875 static inline u_int32_t
876 m_decref(struct mbuf *m)
877 {
878 UInt32 old, new;
879 volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
880
881 do {
882 old = *addr;
883 new = old - 1;
884 ASSERT(old != 0);
885 } while (!OSCompareAndSwap(old, new, addr));
886
887 return (new);
888 }
889
890 static void
891 mbuf_table_init(void)
892 {
893 int m;
894
895 MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
896 M_TEMP, M_WAITOK | M_ZERO);
897 VERIFY(mb_stat != NULL);
898
899 mb_stat->mbs_cnt = NELEM(mbuf_table);
900 for (m = 0; m < NELEM(mbuf_table); m++)
901 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
902
903 #if CONFIG_MBUF_JUMBO
904 /*
905 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
906 * this only on platforms where jumbo cluster pool is enabled.
907 */
908 njcl = nmbclusters / 3;
909 njclbytes = M16KCLBYTES;
910 #endif /* CONFIG_MBUF_JUMBO */
911
912 /*
913 * nclusters is going to be split in 2 to hold both the 2K
914 * and the 4K pools, so make sure each half is even.
915 */
916 nclusters = P2ROUNDDOWN(nmbclusters - njcl, 4);
917 if (njcl > 0) {
918 /*
919 * Each jumbo cluster takes 8 2K clusters, so make
920 * sure that the pool size is evenly divisible by 8.
921 */
922 njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
923 }
924
925 #if CONFIG_MBUF_NOEXPAND
926 /* Only use 4k clusters if we're setting aside more than 256k */
927 if (nmbclusters <= 128) {
928 maxmbufcl = nmbclusters / 4;
929 } else {
930 /* Half to big clusters, half to small */
931 maxmbufcl = (nmbclusters / 4) * 3;
932 }
933 #endif /* CONFIG_MBUF_NOEXPAND */
934
935 /*
936 * 1/2 of the map is reserved for 2K clusters. Out of this, 1/16th
937 * of the total number of 2K clusters allocated is reserved and cannot
938 * be turned into mbufs. It can only be used for pure cluster objects.
939 */
940 m_minlimit(MC_CL) = (nclusters >> 5);
941 m_maxlimit(MC_CL) = (nclusters >> 1);
942 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
943 (void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
944
945 /*
946 * The remaining (15/16th) can be turned into mbufs.
947 */
948 m_minlimit(MC_MBUF) = 0;
949 m_maxlimit(MC_MBUF) = (m_maxlimit(MC_CL) - m_minlimit(MC_CL)) * NMBPCL;
950 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
951 (void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
952
953 /*
954 * The other 1/2 of the map is reserved for 4K clusters.
955 */
956 m_minlimit(MC_BIGCL) = 0;
957 m_maxlimit(MC_BIGCL) = m_maxlimit(MC_CL) >> 1;
958 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = NBPG;
959 (void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
960
961 /*
962 * Set limits for the composite classes.
963 */
964 m_minlimit(MC_MBUF_CL) = 0;
965 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL) - m_minlimit(MC_CL);
966 m_maxsize(MC_MBUF_CL) = MCLBYTES;
967 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
968 (void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
969
970 m_minlimit(MC_MBUF_BIGCL) = 0;
971 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
972 m_maxsize(MC_MBUF_BIGCL) = NBPG;
973 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
974 (void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
975
976 /*
977 * And for jumbo classes.
978 */
979 m_minlimit(MC_16KCL) = 0;
980 m_maxlimit(MC_16KCL) = (njcl >> 3);
981 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
982 (void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
983
984 m_minlimit(MC_MBUF_16KCL) = 0;
985 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
986 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
987 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
988 (void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
989
990 /*
991 * Initialize the legacy mbstat structure.
992 */
993 bzero(&mbstat, sizeof (mbstat));
994 mbstat.m_msize = m_maxsize(MC_MBUF);
995 mbstat.m_mclbytes = m_maxsize(MC_CL);
996 mbstat.m_minclsize = MINCLSIZE;
997 mbstat.m_mlen = MLEN;
998 mbstat.m_mhlen = MHLEN;
999 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1000 }
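/*
 * Worked example of the limits computed above, using a hypothetical
 * nmbclusters of 32768 with no jumbo pool (njcl == 0) and NMBPCL == 8:
 *
 *	nclusters		= 32768
 *	m_maxlimit(MC_CL)	= 32768 >> 1 = 16384
 *	m_minlimit(MC_CL)	= 32768 >> 5 = 1024 (pure cluster objects only)
 *	m_maxlimit(MC_MBUF)	= (16384 - 1024) * 8 = 122880
 *	m_maxlimit(MC_BIGCL)	= 16384 >> 1 = 8192
 */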
1001
1002 __private_extern__ void
1003 mbinit(void)
1004 {
1005 unsigned int m;
1006 int initmcl = MINCL;
1007 int mcl_pages;
1008 void *buf;
1009
1010 if (nmbclusters == 0)
1011 nmbclusters = NMBCLUSTERS;
1012
1013 /* Setup the mbuf table */
1014 mbuf_table_init();
1015
1016 /* Global lock for common layer */
1017 mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1018 mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1019 mbuf_mlock_attr = lck_attr_alloc_init();
1020 mbuf_mlock = lck_mtx_alloc_init(mbuf_mlock_grp, mbuf_mlock_attr);
1021
1022 /* Allocate cluster slabs table */
1023 maxslabgrp = P2ROUNDUP(nmbclusters, NSLABSPMB) / NSLABSPMB;
1024 MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1025 M_TEMP, M_WAITOK | M_ZERO);
1026 VERIFY(slabstbl != NULL);
1027
1028 /* Allocate audit structures if needed */
1029 PE_parse_boot_arg("mbuf_debug", &mbuf_debug);
1030 mbuf_debug |= mcache_getflags();
1031 if (mbuf_debug & MCF_AUDIT) {
1032 MALLOC(mclaudit, mcl_audit_t *,
1033 nmbclusters * sizeof (*mclaudit), M_TEMP,
1034 M_WAITOK | M_ZERO);
1035 VERIFY(mclaudit != NULL);
1036
1037 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1038 AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1039 VERIFY(mcl_audit_con_cache != NULL);
1040 }
1041
1042 /* Calculate the number of pages assigned to the cluster pool */
1043 mcl_pages = nmbclusters/(NBPG/CLBYTES);
1044 MALLOC(mcl_paddr, int *, mcl_pages * sizeof (int), M_TEMP, M_WAITOK);
1045 VERIFY(mcl_paddr != NULL);
1046
1047 /* Register with the I/O Bus mapper */
1048 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1049 bzero((char *)mcl_paddr, mcl_pages * sizeof (int));
1050
1051 embutl = (union mcluster *)
1052 ((unsigned char *)mbutl + (nmbclusters * MCLBYTES));
1053
1054 PE_parse_boot_arg("initmcl", &initmcl);
1055
1056 lck_mtx_lock(mbuf_mlock);
1057
1058 if (m_clalloc(MAX(NBPG/CLBYTES, 1) * initmcl, M_WAIT, MCLBYTES) == 0)
1059 panic("mbinit: m_clalloc failed\n");
1060
1061 lck_mtx_unlock(mbuf_mlock);
1062
1063 (void) kernel_thread(kernel_task, mbuf_worker_thread_init);
1064
1065 ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1066 0, 0, MCR_SLEEP);
1067
1068 /* Create the cache for each class */
1069 for (m = 0; m < NELEM(mbuf_table); m++) {
1070 void *allocfunc, *freefunc, *auditfunc;
1071 u_int32_t flags;
1072
1073 flags = mbuf_debug;
1074 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1075 m_class(m) == MC_MBUF_16KCL) {
1076 allocfunc = mbuf_cslab_alloc;
1077 freefunc = mbuf_cslab_free;
1078 auditfunc = mbuf_cslab_audit;
1079 } else {
1080 allocfunc = mbuf_slab_alloc;
1081 freefunc = mbuf_slab_free;
1082 auditfunc = mbuf_slab_audit;
1083 }
1084
1085 /*
1086 * Disable per-CPU caches for jumbo classes if there
1087 * is no jumbo cluster pool available in the system.
1088 * The cache itself is still created (but will never
1089 * be populated) since it simplifies the code.
1090 */
1091 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1092 njcl == 0)
1093 flags |= MCF_NOCPUCACHE;
1094
1095 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1096 allocfunc, freefunc, auditfunc, mbuf_slab_notify,
1097 (void *)m, flags, MCR_SLEEP);
1098 }
1099
1100 /*
1101 * Allocate structure for per-CPU statistics that's aligned
1102 * on the CPU cache boundary; this code assumes that we never
1103 * uninitialize this framework, since the original address
1104 * before alignment is not saved.
1105 */
1106 ncpu = ml_get_max_cpus();
1107 MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1108 M_TEMP, M_WAITOK);
1109 VERIFY(buf != NULL);
1110
1111 mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1112 bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1113
1114 printf("mbinit: done\n");
1115 }
1116
1117 /*
1118 * Obtain a slab of object(s) from the class's freelist.
1119 */
1120 static mcache_obj_t *
1121 slab_alloc(mbuf_class_t class, int wait)
1122 {
1123 mcl_slab_t *sp;
1124 mcache_obj_t *buf;
1125
1126 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1127
1128 VERIFY(class != MC_16KCL || njcl > 0);
1129
1130 /* This should always be NULL for us */
1131 VERIFY(m_cobjlist(class) == NULL);
1132
1133 /*
1134 * Treat composite objects as having longer lifespan by using
1135 * a slab from the reverse direction, in hoping that this could
1136 * reduce the probability of fragmentation for slabs that hold
1137 * more than one buffer chunks (e.g. mbuf slabs). For other
1138 * slabs, this probably doesn't make much of a difference.
1139 */
1140 if (class == MC_MBUF && (wait & MCR_COMP))
1141 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1142 else
1143 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1144
1145 if (sp == NULL) {
1146 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1147 /* The slab list for this class is empty */
1148 return (NULL);
1149 }
1150
1151 VERIFY(m_infree(class) > 0);
1152 VERIFY(!slab_is_detached(sp));
1153 VERIFY(sp->sl_class == class &&
1154 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1155 buf = sp->sl_head;
1156 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1157
1158 if (class == MC_MBUF) {
1159 sp->sl_head = buf->obj_next;
1160 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPCL - 1));
1161 } else {
1162 sp->sl_head = NULL;
1163 }
1164 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1165 slab_nextptr_panic(sp, sp->sl_head);
1166 /* In case sl_head is in the map but not in the slab */
1167 VERIFY(slab_inrange(sp, sp->sl_head));
1168 /* NOTREACHED */
1169 }
1170
1171 /* Increment slab reference */
1172 sp->sl_refcnt++;
1173
1174 if (mclaudit != NULL) {
1175 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1176 mca->mca_uflags = 0;
1177 /* Save contents on mbuf objects only */
1178 if (class == MC_MBUF)
1179 mca->mca_uflags |= MB_SCVALID;
1180 }
1181
1182 if (class == MC_CL) {
1183 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1184 /*
1185 * A 2K cluster slab can have at most 1 reference.
1186 */
1187 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1188 sp->sl_len == m_maxsize(MC_CL) && sp->sl_head == NULL);
1189 } else if (class == MC_BIGCL) {
1190 mcl_slab_t *nsp = sp->sl_next;
1191 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1192 m_infree(MC_MBUF_BIGCL);
1193 /*
1194 * Increment 2nd slab. A 4K big cluster takes
1195 * 2 slabs, each having at most 1 reference.
1196 */
1197 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1198 sp->sl_len == m_maxsize(MC_BIGCL) && sp->sl_head == NULL);
1199 /* Next slab must already be present */
1200 VERIFY(nsp != NULL);
1201 nsp->sl_refcnt++;
1202 VERIFY(!slab_is_detached(nsp));
1203 VERIFY(nsp->sl_class == MC_BIGCL &&
1204 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1205 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1206 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1207 nsp->sl_head == NULL);
1208 } else if (class == MC_16KCL) {
1209 mcl_slab_t *nsp;
1210 int k;
1211
1212 --m_infree(MC_16KCL);
1213 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1214 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1215 /*
1216 * Increment 2nd-8th slab. A 16K big cluster takes
1217 * 8 cluster slabs, each having at most 1 reference.
1218 */
1219 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1220 nsp = nsp->sl_next;
1221 /* Next slab must already be present */
1222 VERIFY(nsp != NULL);
1223 nsp->sl_refcnt++;
1224 VERIFY(!slab_is_detached(nsp));
1225 VERIFY(nsp->sl_class == MC_16KCL &&
1226 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1227 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1228 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1229 nsp->sl_head == NULL);
1230 }
1231 } else {
1232 ASSERT(class == MC_MBUF);
1233 --m_infree(MC_MBUF);
1234 /*
1235 * If auditing is turned on, this check is
1236 * deferred until later in mbuf_slab_audit().
1237 */
1238 if (mclaudit == NULL)
1239 _MCHECK((struct mbuf *)buf);
1240 /*
1241 * Since we have incremented the reference count above,
1242 * an mbuf slab (formerly a 2K cluster slab that was cut
1243 * up into mbufs) must have a reference count between 1
1244 * and NMBPCL at this point.
1245 */
1246 VERIFY(sp->sl_refcnt >= 1 &&
1247 (unsigned short)sp->sl_refcnt <= NMBPCL &&
1248 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1249 VERIFY((unsigned short)sp->sl_refcnt < NMBPCL ||
1250 sp->sl_head == NULL);
1251 }
1252
1253 /* If empty, remove this slab from the class's freelist */
1254 if (sp->sl_head == NULL) {
1255 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPCL);
1256 slab_remove(sp, class);
1257 }
1258
1259 return (buf);
1260 }
1261
1262 /*
1263 * Place a slab of object(s) back into a class's slab list.
1264 */
1265 static void
1266 slab_free(mbuf_class_t class, mcache_obj_t *buf)
1267 {
1268 mcl_slab_t *sp;
1269
1270 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1271
1272 VERIFY(class != MC_16KCL || njcl > 0);
1273 VERIFY(buf->obj_next == NULL);
1274 sp = slab_get(buf);
1275 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1276 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1277
1278 /* Decrement slab reference */
1279 sp->sl_refcnt--;
1280
1281 if (class == MC_CL || class == MC_BIGCL) {
1282 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1283 /*
1284 * A 2K cluster slab can have at most 1 reference
1285 * which must be 0 at this point.
1286 */
1287 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1288 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1289 VERIFY(slab_is_detached(sp));
1290 if (class == MC_BIGCL) {
1291 mcl_slab_t *nsp = sp->sl_next;
1292 VERIFY(IS_P2ALIGNED(buf, NBPG));
1293 /* Next slab must already be present */
1294 VERIFY(nsp != NULL);
1295 /* Decrement 2nd slab reference */
1296 nsp->sl_refcnt--;
1297 /*
1298 * A 4K big cluster takes 2 slabs, both
1299 * must now have 0 reference.
1300 */
1301 VERIFY(slab_is_detached(nsp));
1302 VERIFY(nsp->sl_class == MC_BIGCL &&
1303 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1304 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1305 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1306 nsp->sl_head == NULL);
1307 }
1308 } else if (class == MC_16KCL) {
1309 mcl_slab_t *nsp;
1310 int k;
1311 /*
1312 * A 16K cluster takes 8 cluster slabs, all must
1313 * now have 0 reference.
1314 */
1315 VERIFY(IS_P2ALIGNED(buf, NBPG));
1316 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1317 sp->sl_len == m_maxsize(MC_16KCL) && sp->sl_head == NULL);
1318 VERIFY(slab_is_detached(sp));
1319 for (nsp = sp, k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
1320 nsp = nsp->sl_next;
1321 /* Next slab must already be present */
1322 VERIFY(nsp != NULL);
1323 nsp->sl_refcnt--;
1324 VERIFY(slab_is_detached(nsp));
1325 VERIFY(nsp->sl_class == MC_16KCL &&
1326 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1327 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1328 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1329 nsp->sl_head == NULL);
1330 }
1331 } else {
1332 /*
1333 * An mbuf slab has a total of NMBPCL reference counts.
1334 * Since we have decremented the reference above, it
1335 * must now be between 0 and NMBPCL-1.
1336 */
1337 VERIFY(sp->sl_refcnt >= 0 &&
1338 (unsigned short)sp->sl_refcnt <= (NMBPCL - 1) &&
1339 sp->sl_chunks == NMBPCL && sp->sl_len == m_maxsize(MC_CL));
1340 VERIFY(sp->sl_refcnt < (NMBPCL - 1) ||
1341 (slab_is_detached(sp) && sp->sl_head == NULL));
1342 }
1343
1344 /*
1345 * When auditing is enabled, ensure that the buffer still
1346 * contains the free pattern. Otherwise it got corrupted
1347 * while at the CPU cache layer.
1348 */
1349 if (mclaudit != NULL) {
1350 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1351 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1352 mca->mca_uflags &= ~MB_SCVALID;
1353 }
1354
1355 if (class == MC_CL) {
1356 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1357 } else if (class == MC_BIGCL) {
1358 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1359 m_infree(MC_MBUF_BIGCL);
1360 } else if (class == MC_16KCL) {
1361 ++m_infree(MC_16KCL);
1362 } else {
1363 ++m_infree(MC_MBUF);
1364 buf->obj_next = sp->sl_head;
1365 }
1366 sp->sl_head = buf;
1367
1368 /* All mbufs are freed; return the cluster that we stole earlier */
1369 if (sp->sl_refcnt == 0 && class == MC_MBUF) {
1370 int i = NMBPCL;
1371
1372 m_total(MC_MBUF) -= NMBPCL;
1373 mbstat.m_mbufs = m_total(MC_MBUF);
1374 m_infree(MC_MBUF) -= NMBPCL;
1375 mtype_stat_add(MT_FREE, -NMBPCL);
1376
1377 while (i--) {
1378 struct mbuf *m = sp->sl_head;
1379 VERIFY(m != NULL);
1380 sp->sl_head = m->m_next;
1381 m->m_next = NULL;
1382 }
1383 VERIFY(sp->sl_head == NULL);
1384
1385 /* Remove the slab from the mbuf class's slab list */
1386 slab_remove(sp, class);
1387
1388 /* Reinitialize it as a 2K cluster slab */
1389 slab_init(sp, MC_CL, sp->sl_flags, sp->sl_base, sp->sl_base,
1390 sp->sl_len, 0, 1);
1391
1392 if (mclaudit != NULL)
1393 mcache_set_pattern(MCACHE_FREE_PATTERN,
1394 (caddr_t)sp->sl_head, m_maxsize(MC_CL));
1395
1396 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1397
1398 VERIFY(slab_is_detached(sp));
1399 /* And finally switch class */
1400 class = MC_CL;
1401 }
1402
1403 /* Reinsert the slab to the class's slab list */
1404 if (slab_is_detached(sp))
1405 slab_insert(sp, class);
1406 }
1407
1408 /*
1409 * Common allocator for rudimentary objects called by the CPU cache layer
1410 * during an allocation request whenever there is no available element in the
1411 * bucket layer. It returns one or more elements from the appropriate global
1412 * freelist. If the freelist is empty, it will attempt to populate it and
1413 * retry the allocation.
1414 */
1415 static unsigned int
1416 mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1417 {
1418 mbuf_class_t class = (mbuf_class_t)arg;
1419 unsigned int need = num;
1420 mcache_obj_t **list = *plist;
1421
1422 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1423 ASSERT(need > 0);
1424
1425 lck_mtx_lock(mbuf_mlock);
1426
1427 for (;;) {
1428 if ((*list = slab_alloc(class, wait)) != NULL) {
1429 (*list)->obj_next = NULL;
1430 list = *plist = &(*list)->obj_next;
1431
1432 if (--need == 0) {
1433 /*
1434 * If the number of elements in freelist has
1435 * dropped below low watermark, asynchronously
1436 * populate the freelist now rather than doing
1437 * it later when we run out of elements.
1438 */
1439 if (!mbuf_cached_above(class, wait) &&
1440 m_infree(class) < m_total(class) >> 5) {
1441 (void) freelist_populate(class, 1,
1442 M_DONTWAIT);
1443 }
1444 break;
1445 }
1446 } else {
1447 VERIFY(m_infree(class) == 0 || class == MC_CL);
1448
1449 (void) freelist_populate(class, 1,
1450 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1451
1452 if (m_infree(class) > 0)
1453 continue;
1454
1455 /* Check if there's anything at the cache layer */
1456 if (mbuf_cached_above(class, wait))
1457 break;
1458
1459 /* We have nothing and cannot block; give up */
1460 if (wait & MCR_NOSLEEP) {
1461 if (!(wait & MCR_TRYHARD)) {
1462 m_fail_cnt(class)++;
1463 mbstat.m_drops++;
1464 break;
1465 }
1466 }
1467
1468 /*
1469 * If the freelist is still empty and the caller is
1470 * willing to be blocked, sleep on the wait channel
1471 * until an element is available. Otherwise, if
1472 * MCR_TRYHARD is set, do our best to satisfy the
1473 * request without having to go to sleep.
1474 */
1475 if (mbuf_worker_ready &&
1476 mbuf_sleep(class, need, wait))
1477 break;
1478
1479 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1480 }
1481 }
1482
1483 m_alloc_cnt(class) += num - need;
1484 lck_mtx_unlock(mbuf_mlock);
1485
1486 return (num - need);
1487 }
1488
1489 /*
1490 * Common de-allocator for rudimentary objects called by the CPU cache
1491 * layer when one or more elements need to be returned to the appropriate
1492 * global freelist.
1493 */
1494 static void
1495 mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
1496 {
1497 mbuf_class_t class = (mbuf_class_t)arg;
1498 mcache_obj_t *nlist;
1499 unsigned int num = 0;
1500 int w;
1501
1502 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1503
1504 lck_mtx_lock(mbuf_mlock);
1505
1506 for (;;) {
1507 nlist = list->obj_next;
1508 list->obj_next = NULL;
1509 slab_free(class, list);
1510 ++num;
1511 if ((list = nlist) == NULL)
1512 break;
1513 }
1514 m_free_cnt(class) += num;
1515
1516 if ((w = mb_waiters) > 0)
1517 mb_waiters = 0;
1518
1519 lck_mtx_unlock(mbuf_mlock);
1520
1521 if (w != 0)
1522 wakeup(mb_waitchan);
1523 }
1524
1525 /*
1526 * Common auditor for rudimentary objects called by the CPU cache layer
1527 * during an allocation or free request. For the former, this is called
1528 * after the objects are obtained from either the bucket or slab layer
1529 * and before they are returned to the caller. For the latter, this is
1530 * called immediately during free and before placing the objects into
1531 * the bucket or slab layer.
1532 */
1533 static void
1534 mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1535 {
1536 mbuf_class_t class = (mbuf_class_t)arg;
1537 mcache_audit_t *mca;
1538
1539 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1540
1541 while (list != NULL) {
1542 lck_mtx_lock(mbuf_mlock);
1543 mca = mcl_audit_buf2mca(class, list);
1544
1545 /* Do the sanity checks */
1546 if (class == MC_MBUF) {
1547 mcl_audit_mbuf(mca, list, FALSE, alloc);
1548 ASSERT(mca->mca_uflags & MB_SCVALID);
1549 } else {
1550 mcl_audit_cluster(mca, list, m_maxsize(class),
1551 alloc, TRUE);
1552 ASSERT(!(mca->mca_uflags & MB_SCVALID));
1553 }
1554 /* Record this transaction */
1555 mcache_buffer_log(mca, list, m_cache(class));
1556 if (alloc)
1557 mca->mca_uflags |= MB_INUSE;
1558 else
1559 mca->mca_uflags &= ~MB_INUSE;
1560 /* Unpair the object (unconditionally) */
1561 mca->mca_uptr = NULL;
1562 lck_mtx_unlock(mbuf_mlock);
1563
1564 list = list->obj_next;
1565 }
1566 }
1567
1568 /*
1569 * Common notify routine for all caches. It is called by mcache when
1570 * one or more objects get freed. We use this indication to trigger
1571 * the wakeup of any sleeping threads so that they can retry their
1572 * allocation requests.
1573 */
1574 static void
1575 mbuf_slab_notify(void *arg, u_int32_t reason)
1576 {
1577 mbuf_class_t class = (mbuf_class_t)arg;
1578 int w;
1579
1580 ASSERT(MBUF_CLASS_VALID(class));
1581
1582 if (reason != MCN_RETRYALLOC)
1583 return;
1584
1585 lck_mtx_lock(mbuf_mlock);
1586 if ((w = mb_waiters) > 0) {
1587 m_notified(class)++;
1588 mb_waiters = 0;
1589 }
1590 lck_mtx_unlock(mbuf_mlock);
1591
1592 if (w != 0)
1593 wakeup(mb_waitchan);
1594 }
1595
1596 /*
1597 * Obtain object(s) from the composite class's freelist.
1598 */
1599 static unsigned int
1600 cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
1601 {
1602 unsigned int need = num;
1603 mcl_slab_t *sp, *clsp, *nsp;
1604 struct mbuf *m;
1605 mcache_obj_t **list = *plist;
1606 void *cl;
1607
1608 VERIFY(need > 0);
1609 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1610 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1611
1612 /* Get what we can from the freelist */
1613 while ((*list = m_cobjlist(class)) != NULL) {
1614 MRANGE(*list);
1615
1616 m = (struct mbuf *)*list;
1617 sp = slab_get(m);
1618 cl = m->m_ext.ext_buf;
1619 clsp = slab_get(cl);
1620 VERIFY(m->m_flags == M_EXT && cl != NULL);
1621 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
1622 VERIFY(clsp->sl_refcnt == 1);
1623 if (class == MC_MBUF_BIGCL) {
1624 nsp = clsp->sl_next;
1625 /* Next slab must already be present */
1626 VERIFY(nsp != NULL);
1627 VERIFY(nsp->sl_refcnt == 1);
1628 } else if (class == MC_MBUF_16KCL) {
1629 int k;
1630 for (nsp = clsp, k = 1;
1631 k < (M16KCLBYTES / MCLBYTES); k++) {
1632 nsp = nsp->sl_next;
1633 /* Next slab must already be present */
1634 VERIFY(nsp != NULL);
1635 VERIFY(nsp->sl_refcnt == 1);
1636 }
1637 }
1638
1639 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
1640 !MBUF_IN_MAP(m_cobjlist(class))) {
1641 slab_nextptr_panic(sp, m_cobjlist(class));
1642 /* NOTREACHED */
1643 }
1644 (*list)->obj_next = NULL;
1645 list = *plist = &(*list)->obj_next;
1646
1647 if (--need == 0)
1648 break;
1649 }
1650 m_infree(class) -= (num - need);
1651
1652 return (num - need);
1653 }
1654
1655 /*
1656 * Place object(s) back into a composite class's freelist.
1657 */
1658 static unsigned int
1659 cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
1660 {
1661 mcache_obj_t *o, *tail;
1662 unsigned int num = 0;
1663 struct mbuf *m, *ms;
1664 mcache_audit_t *mca = NULL;
1665 mcache_obj_t *ref_list = NULL;
1666 mcl_slab_t *clsp, *nsp;
1667 void *cl;
1668
1669 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1670 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1671 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1672
1673 o = tail = list;
1674
1675 while ((m = ms = (struct mbuf *)o) != NULL) {
1676 mcache_obj_t *rfa, *nexto = o->obj_next;
1677
1678 /* Do the mbuf sanity checks */
1679 if (mclaudit != NULL) {
1680 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1681 mcache_audit_free_verify(mca, m, 0, m_maxsize(MC_MBUF));
1682 ms = (struct mbuf *)mca->mca_contents;
1683 }
1684
1685 /* Do the cluster sanity checks */
1686 cl = ms->m_ext.ext_buf;
1687 clsp = slab_get(cl);
1688 if (mclaudit != NULL) {
1689 size_t size;
1690 if (class == MC_MBUF_CL)
1691 size = m_maxsize(MC_CL);
1692 else if (class == MC_MBUF_BIGCL)
1693 size = m_maxsize(MC_BIGCL);
1694 else
1695 size = m_maxsize(MC_16KCL);
1696 mcache_audit_free_verify(mcl_audit_buf2mca(MC_CL,
1697 (mcache_obj_t *)cl), cl, 0, size);
1698 }
1699 VERIFY(ms->m_type == MT_FREE);
1700 VERIFY(ms->m_flags == M_EXT);
1701 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1702 VERIFY(clsp->sl_refcnt == 1);
1703 if (class == MC_MBUF_BIGCL) {
1704 nsp = clsp->sl_next;
1705 /* Next slab must already be present */
1706 VERIFY(nsp != NULL);
1707 VERIFY(nsp->sl_refcnt == 1);
1708 } else if (class == MC_MBUF_16KCL) {
1709 int k;
1710 for (nsp = clsp, k = 1;
1711 k < (M16KCLBYTES / MCLBYTES); k++) {
1712 nsp = nsp->sl_next;
1713 /* Next slab must already be present */
1714 VERIFY(nsp != NULL);
1715 VERIFY(nsp->sl_refcnt == 1);
1716 }
1717 }
1718
1719 /*
1720 * If we're asked to purge, restore the actual mbuf using the
1721 * contents of the shadow structure (if auditing is enabled)
1722 * and clear the EXTF_COMPOSITE flag from the mbuf, as we are
1723 * about to free it and the attached cluster into their caches.
1724 */
1725 if (purged) {
1726 /* Restore constructed mbuf fields */
1727 if (mclaudit != NULL)
1728 mcl_audit_restore_mbuf(m, mca, TRUE);
1729
1730 MEXT_REF(m) = 0;
1731 MEXT_FLAGS(m) = 0;
1732
1733 rfa = (mcache_obj_t *)MEXT_RFA(m);
1734 rfa->obj_next = ref_list;
1735 ref_list = rfa;
1736 MEXT_RFA(m) = NULL;
1737
1738 m->m_type = MT_FREE;
1739 m->m_flags = m->m_len = 0;
1740 m->m_next = m->m_nextpkt = NULL;
1741
1742 /* Save mbuf fields and make auditing happy */
1743 if (mclaudit != NULL)
1744 mcl_audit_mbuf(mca, o, FALSE, FALSE);
1745
1746 VERIFY(m_total(class) > 0);
1747 m_total(class)--;
1748
1749 /* Free the mbuf */
1750 o->obj_next = NULL;
1751 slab_free(MC_MBUF, o);
1752
1753 /* And free the cluster */
1754 ((mcache_obj_t *)cl)->obj_next = NULL;
1755 if (class == MC_MBUF_CL)
1756 slab_free(MC_CL, cl);
1757 else if (class == MC_MBUF_BIGCL)
1758 slab_free(MC_BIGCL, cl);
1759 else
1760 slab_free(MC_16KCL, cl);
1761 }
1762
1763 ++num;
1764 tail = o;
1765 o = nexto;
1766 }
1767
1768 if (!purged) {
1769 tail->obj_next = m_cobjlist(class);
1770 m_cobjlist(class) = list;
1771 m_infree(class) += num;
1772 } else if (ref_list != NULL) {
1773 mcache_free_ext(ref_cache, ref_list);
1774 }
1775
1776 return (num);
1777 }
1778
1779 /*
1780 * Common allocator for composite objects called by the CPU cache layer
1781 * during an allocation request whenever there is no available element in
1782 * the bucket layer. It returns one or more composite elements from the
1783 * appropriate global freelist. If the freelist is empty, it will attempt
1784 * to obtain the rudimentary objects from their caches and construct them
1785 * into composite mbuf + cluster objects.
1786 */
1787 static unsigned int
1788 mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
1789 int wait)
1790 {
1791 mbuf_class_t class = (mbuf_class_t)arg;
1792 mcache_t *cp = NULL;
1793 unsigned int num = 0, cnum = 0, want = needed;
1794 mcache_obj_t *ref_list = NULL;
1795 mcache_obj_t *mp_list = NULL;
1796 mcache_obj_t *clp_list = NULL;
1797 mcache_obj_t **list;
1798 struct ext_ref *rfa;
1799 struct mbuf *m;
1800 void *cl;
1801
1802 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1803 ASSERT(needed > 0);
1804
1805 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
1806
1807 /* There should not be any slab for this class */
1808 VERIFY(m_slab_cnt(class) == 0 &&
1809 m_slablist(class).tqh_first == NULL &&
1810 m_slablist(class).tqh_last == NULL);
1811
1812 lck_mtx_lock(mbuf_mlock);
1813
1814 /* Try using the freelist first */
1815 num = cslab_alloc(class, plist, needed);
1816 list = *plist;
1817 if (num == needed) {
1818 m_alloc_cnt(class) += num;
1819 lck_mtx_unlock(mbuf_mlock);
1820 return (needed);
1821 }
1822
1823 lck_mtx_unlock(mbuf_mlock);
1824
1825 /*
1826 * We could not satisfy the request using the freelist alone;
1827 * allocate from the appropriate rudimentary caches and use
1828 * whatever we can get to construct the composite objects.
1829 */
1830 needed -= num;
1831
1832 /*
1833 * Mark these allocation requests as coming from a composite cache.
1834 * Also, if the caller is willing to be blocked, mark the request
1835 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
1836 * slab layer waiting for the individual object when one or more
1837 * of the already-constructed composite objects are available.
1838 */
1839 wait |= MCR_COMP;
1840 if (!(wait & MCR_NOSLEEP))
1841 wait |= MCR_FAILOK;
1842
1843 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
1844 if (needed == 0) {
1845 ASSERT(mp_list == NULL);
1846 goto fail;
1847 }
1848 if (class == MC_MBUF_CL)
1849 cp = m_cache(MC_CL);
1850 else if (class == MC_MBUF_BIGCL)
1851 cp = m_cache(MC_BIGCL);
1852 else
1853 cp = m_cache(MC_16KCL);
1854 needed = mcache_alloc_ext(cp, &clp_list, needed, wait);
1855 if (needed == 0) {
1856 ASSERT(clp_list == NULL);
1857 goto fail;
1858 }
1859 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
1860 if (needed == 0) {
1861 ASSERT(ref_list == NULL);
1862 goto fail;
1863 }
1864
1865 /*
1866 * By this time "needed" is MIN(mbuf, cluster, ref). Any
1867 * leftovers will get freed accordingly before we return to the caller.
1868 */
1869 for (cnum = 0; cnum < needed; cnum++) {
1870 struct mbuf *ms;
1871
1872 m = ms = (struct mbuf *)mp_list;
1873 mp_list = mp_list->obj_next;
1874
1875 cl = clp_list;
1876 clp_list = clp_list->obj_next;
1877 ((mcache_obj_t *)cl)->obj_next = NULL;
1878
1879 rfa = (struct ext_ref *)ref_list;
1880 ref_list = ref_list->obj_next;
1881 ((mcache_obj_t *)rfa)->obj_next = NULL;
1882
1883 /*
1884 * If auditing is enabled, construct the shadow mbuf
1885 * in the audit structure instead of in the actual one.
1886 * mbuf_cslab_audit() will take care of restoring the
1887 * contents after the integrity check.
1888 */
1889 if (mclaudit != NULL) {
1890 mcache_audit_t *mca, *cl_mca;
1891 size_t size;
1892
1893 lck_mtx_lock(mbuf_mlock);
1894 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
1895 ms = ((struct mbuf *)mca->mca_contents);
1896 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
1897
1898 /*
1899 * Pair them up. Note that this is done at the time
1900 * the mbuf+cluster objects are constructed. This
1901 * information should be treated as a "best effort"
1902 * debugging hint, since more than one mbuf can refer
1903 * to a cluster. In that case, the cluster might not
1904 * be freed along with the mbuf it was paired with.
1905 */
1906 mca->mca_uptr = cl_mca;
1907 cl_mca->mca_uptr = mca;
1908
1909 ASSERT(mca->mca_uflags & MB_SCVALID);
1910 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
1911 lck_mtx_unlock(mbuf_mlock);
1912
1913 /* Technically, they are in the freelist */
1914 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
1915 m_maxsize(MC_MBUF));
1916 if (class == MC_MBUF_CL)
1917 size = m_maxsize(MC_CL);
1918 else if (class == MC_MBUF_BIGCL)
1919 size = m_maxsize(MC_BIGCL);
1920 else
1921 size = m_maxsize(MC_16KCL);
1922 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, size);
1923 }
1924
1925 MBUF_INIT(ms, 0, MT_FREE);
1926 if (class == MC_MBUF_16KCL) {
1927 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1928 } else if (class == MC_MBUF_BIGCL) {
1929 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1930 } else {
1931 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
1932 }
1933 VERIFY(ms->m_flags == M_EXT);
1934 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
1935
1936 *list = (mcache_obj_t *)m;
1937 (*list)->obj_next = NULL;
1938 list = *plist = &(*list)->obj_next;
1939 }
1940
1941 fail:
1942 /*
1943 * Free up what's left of the above.
1944 */
1945 if (mp_list != NULL)
1946 mcache_free_ext(m_cache(MC_MBUF), mp_list);
1947 if (clp_list != NULL)
1948 mcache_free_ext(cp, clp_list);
1949 if (ref_list != NULL)
1950 mcache_free_ext(ref_cache, ref_list);
1951
1952 lck_mtx_lock(mbuf_mlock);
1953 if (num > 0 || cnum > 0) {
1954 m_total(class) += cnum;
1955 VERIFY(m_total(class) <= m_maxlimit(class));
1956 m_alloc_cnt(class) += num + cnum;
1957 }
1958 if ((num + cnum) < want)
1959 m_fail_cnt(class) += (want - (num + cnum));
1960 lck_mtx_unlock(mbuf_mlock);
1961
1962 return (num + cnum);
1963 }
1964
1965 /*
1966 * Common de-allocator for composite objects called by the CPU cache
1967 * layer when one or more elements need to be returned to the appropriate
1968 * global freelist.
1969 */
1970 static void
1971 mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
1972 {
1973 mbuf_class_t class = (mbuf_class_t)arg;
1974 unsigned int num;
1975 int w;
1976
1977 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
1978
1979 lck_mtx_lock(mbuf_mlock);
1980
1981 num = cslab_free(class, list, purged);
1982 m_free_cnt(class) += num;
1983
1984 if ((w = mb_waiters) > 0)
1985 mb_waiters = 0;
1986
1987 lck_mtx_unlock(mbuf_mlock);
1988
1989 if (w != 0)
1990 wakeup(mb_waitchan);
1991 }
1992
1993 /*
1994 * Common auditor for composite objects called by the CPU cache layer
1995 * during an allocation or free request. For the former, this is called
1996 * after the objects are obtained from either the bucket or slab layer
1997 * and before they are returned to the caller. For the latter, this is
1998 * called immediately during free and before placing the objects into
1999 * the bucket or slab layer.
2000 */
2001 static void
2002 mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2003 {
2004 mbuf_class_t class = (mbuf_class_t)arg;
2005 mcache_audit_t *mca;
2006 struct mbuf *m, *ms;
2007 mcl_slab_t *clsp, *nsp;
2008 size_t size;
2009 void *cl;
2010
2011 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2012
2013 while ((m = ms = (struct mbuf *)list) != NULL) {
2014 lck_mtx_lock(mbuf_mlock);
2015 /* Do the mbuf sanity checks and record its transaction */
2016 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2017 mcl_audit_mbuf(mca, m, TRUE, alloc);
2018 mcache_buffer_log(mca, m, m_cache(class));
2019 if (alloc)
2020 mca->mca_uflags |= MB_COMP_INUSE;
2021 else
2022 mca->mca_uflags &= ~MB_COMP_INUSE;
2023
2024 /*
2025 * Use the shadow mbuf in the audit structure if we are
2026 * freeing, since the contents of the actual mbuf have been
2027 * pattern-filled by the above call to mcl_audit_mbuf().
2028 */
2029 if (!alloc)
2030 ms = (struct mbuf *)mca->mca_contents;
2031
2032 /* Do the cluster sanity checks and record its transaction */
2033 cl = ms->m_ext.ext_buf;
2034 clsp = slab_get(cl);
2035 VERIFY(ms->m_flags == M_EXT && cl != NULL);
2036 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2037 VERIFY(clsp->sl_refcnt == 1);
2038 if (class == MC_MBUF_BIGCL) {
2039 nsp = clsp->sl_next;
2040 /* Next slab must already be present */
2041 VERIFY(nsp != NULL);
2042 VERIFY(nsp->sl_refcnt == 1);
2043 } else if (class == MC_MBUF_16KCL) {
2044 int k;
2045 for (nsp = clsp, k = 1;
2046 k < (M16KCLBYTES / MCLBYTES); k++) {
2047 nsp = nsp->sl_next;
2048 /* Next slab must already be present */
2049 VERIFY(nsp != NULL);
2050 VERIFY(nsp->sl_refcnt == 1);
2051 }
2052 }
2053
2054 mca = mcl_audit_buf2mca(MC_CL, cl);
2055 if (class == MC_MBUF_CL)
2056 size = m_maxsize(MC_CL);
2057 else if (class == MC_MBUF_BIGCL)
2058 size = m_maxsize(MC_BIGCL);
2059 else
2060 size = m_maxsize(MC_16KCL);
2061 mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2062 mcache_buffer_log(mca, cl, m_cache(class));
2063 if (alloc)
2064 mca->mca_uflags |= MB_COMP_INUSE;
2065 else
2066 mca->mca_uflags &= ~MB_COMP_INUSE;
2067 lck_mtx_unlock(mbuf_mlock);
2068
2069 list = list->obj_next;
2070 }
2071 }
2072
2073 /*
2074 * Allocate some number of mbuf clusters and place them on the cluster freelist.
2075 */
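/*
 * A sketch of the page-to-buffer arithmetic used below, assuming the
 * usual 4KB page size for NBPG: each page yields two 2KB clusters or
 * one 4KB cluster, and every four pages yield one 16KB cluster, which
 * is why the routine returns numpages << 1, numpages, or
 * numpages / (M16KCLBYTES / NBPG), respectively.
 */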
2076 static int
2077 m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2078 {
2079 int i;
2080 vm_size_t size = 0;
2081 int numpages = 0;
2082 vm_offset_t page = 0;
2083 mcache_audit_t *mca_list = NULL;
2084 mcache_obj_t *con_list = NULL;
2085 mcl_slab_t *sp;
2086
2087 VERIFY(bufsize == m_maxsize(MC_CL) ||
2088 bufsize == m_maxsize(MC_BIGCL) || bufsize == m_maxsize(MC_16KCL));
2089
2090 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2091
2092 /*
2093 * Multiple threads may attempt to populate the cluster map one
2094 * after another. Since we drop the lock below prior to acquiring
2095 * the physical page(s), our view of the cluster map may no longer
2096 * be accurate, and we could end up over-committing the pages beyond
2097 * the maximum allowed for each class. To prevent it, this entire
2098 * operation (including the page mapping) is serialized.
2099 */
2100 while (mb_clalloc_busy) {
2101 mb_clalloc_waiters++;
2102 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2103 (PZERO-1), "m_clalloc", NULL);
2104 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2105 }
2106
2107 /* We are busy now; tell everyone else to go away */
2108 mb_clalloc_busy = TRUE;
2109
2110 /*
2111 * Honor the caller's wish to block or not block. We have a way
2112 * to grow the pool asynchronously using the mbuf worker thread.
2113 */
2114 i = m_howmany(num, bufsize);
2115 if (i == 0 || (wait & M_DONTWAIT))
2116 goto out;
2117
2118 lck_mtx_unlock(mbuf_mlock);
2119
2120 size = round_page_32(i * bufsize);
2121 page = kmem_mb_alloc(mb_map, size);
2122
2123 if (page == 0) {
2124 if (bufsize <= m_maxsize(MC_BIGCL)) {
2125 /* If that failed, fall back to 1 page (2KB/4KB requests only) */
2126 size = NBPG;
2127 page = kmem_mb_alloc(mb_map, size);
2128 }
2129
2130 if (page == 0) {
2131 lck_mtx_lock(mbuf_mlock);
2132 goto out;
2133 }
2134 }
2135
2136 VERIFY(IS_P2ALIGNED(page, NBPG));
2137 numpages = size / NBPG;
2138
2139 /* If auditing is enabled, allocate the audit structures now */
2140 if (mclaudit != NULL) {
2141 int needed;
2142
2143 /*
2144 * Yes, I realize this is a waste of memory for clusters
2145 * that never get transformed into mbufs, as we may end
2146 * up with NMBPCL-1 unused audit structures per cluster.
2147 * But doing so tremendously simplifies the allocation
2148 * strategy, since at this point we are not holding the
2149 * mbuf lock and the caller is okay to be blocked. For
2150 * the case of big clusters, we allocate one structure
2151 * for each as we never turn them into mbufs.
2152 */
2153 if (bufsize == m_maxsize(MC_CL)) {
2154 needed = numpages * 2 * NMBPCL;
2155
2156 i = mcache_alloc_ext(mcl_audit_con_cache,
2157 &con_list, needed, MCR_SLEEP);
2158
2159 VERIFY(con_list != NULL && i == needed);
2160 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2161 needed = numpages;
2162 } else {
2163 needed = numpages / (M16KCLBYTES / NBPG);
2164 }
2165
2166 i = mcache_alloc_ext(mcache_audit_cache,
2167 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2168
2169 VERIFY(mca_list != NULL && i == needed);
2170 }
2171
2172 lck_mtx_lock(mbuf_mlock);
2173
2174 for (i = 0; i < numpages; i++, page += NBPG) {
2175 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2176 ppnum_t new_page = pmap_find_phys(kernel_pmap,
2177 (vm_address_t)page);
2178
2179 /*
2180 * If no mapper is available, the following call is a no-op
2181 * and returns the input page; if there is a mapper, the
2182 * appropriate I/O page is returned.
2183 */
2184 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2185 mcl_paddr[offset] = new_page << PGSHIFT;
2186
2187 /* Pattern-fill this fresh page */
2188 if (mclaudit != NULL)
2189 mcache_set_pattern(MCACHE_FREE_PATTERN,
2190 (caddr_t)page, NBPG);
2191
2192 if (bufsize == m_maxsize(MC_CL)) {
2193 union mcluster *mcl = (union mcluster *)page;
2194
2195 /* 1st cluster in the page */
2196 sp = slab_get(mcl);
2197 if (mclaudit != NULL)
2198 mcl_audit_init(mcl, &mca_list, &con_list,
2199 AUDIT_CONTENTS_SIZE, NMBPCL);
2200
2201 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2202 slab_init(sp, MC_CL, SLF_MAPPED,
2203 mcl, mcl, bufsize, 0, 1);
2204
2205 /* Insert this slab */
2206 slab_insert(sp, MC_CL);
2207
2208 /* Update stats now since slab_get() drops the lock */
2209 mbstat.m_clfree = ++m_infree(MC_CL) +
2210 m_infree(MC_MBUF_CL);
2211 mbstat.m_clusters = ++m_total(MC_CL);
2212 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2213
2214 /* 2nd cluster in the page */
2215 sp = slab_get(++mcl);
2216 if (mclaudit != NULL)
2217 mcl_audit_init(mcl, &mca_list, &con_list,
2218 AUDIT_CONTENTS_SIZE, NMBPCL);
2219
2220 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2221 slab_init(sp, MC_CL, SLF_MAPPED,
2222 mcl, mcl, bufsize, 0, 1);
2223
2224 /* Insert this slab */
2225 slab_insert(sp, MC_CL);
2226
2227 /* Update stats now since slab_get() drops the lock */
2228 mbstat.m_clfree = ++m_infree(MC_CL) +
2229 m_infree(MC_MBUF_CL);
2230 mbstat.m_clusters = ++m_total(MC_CL);
2231 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2232 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2233 union mbigcluster *mbc = (union mbigcluster *)page;
2234 mcl_slab_t *nsp;
2235
2236 /* One for the entire page */
2237 sp = slab_get(mbc);
2238 if (mclaudit != NULL)
2239 mcl_audit_init(mbc, &mca_list, NULL, 0, 1);
2240
2241 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2242 slab_init(sp, MC_BIGCL, SLF_MAPPED,
2243 mbc, mbc, bufsize, 0, 1);
2244
2245 /* 2nd cluster's slab is part of the previous one */
2246 nsp = slab_get(((union mcluster *)page) + 1);
2247 slab_init(nsp, MC_BIGCL, SLF_MAPPED | SLF_PARTIAL,
2248 mbc, NULL, 0, 0, 0);
2249
2250 /* Insert this slab */
2251 slab_insert(sp, MC_BIGCL);
2252
2253 /* Update stats now since slab_get() drops the lock */
2254 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2255 m_infree(MC_MBUF_BIGCL);
2256 mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2257 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2258 } else if ((i % (M16KCLBYTES / NBPG)) == 0) {
2259 union m16kcluster *m16kcl = (union m16kcluster *)page;
2260 mcl_slab_t *nsp;
2261 int k;
2262
2263 VERIFY(njcl > 0);
2264 /* One for the entire 16KB */
2265 sp = slab_get(m16kcl);
2266 if (mclaudit != NULL)
2267 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2268
2269 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2270 slab_init(sp, MC_16KCL, SLF_MAPPED,
2271 m16kcl, m16kcl, bufsize, 0, 1);
2272
2273 /* The 2nd-8th clusters' slabs are part of the first one */
2274 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
2275 nsp = slab_get(((union mcluster *)page) + k);
2276 VERIFY(nsp->sl_refcnt == 0 &&
2277 nsp->sl_flags == 0);
2278 slab_init(nsp, MC_16KCL,
2279 SLF_MAPPED | SLF_PARTIAL,
2280 m16kcl, NULL, 0, 0, 0);
2281 }
2282
2283 /* Insert this slab */
2284 slab_insert(sp, MC_16KCL);
2285
2286 /* Update stats now since slab_get() drops the lock */
2287 m_infree(MC_16KCL)++;
2288 m_total(MC_16KCL)++;
2289 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2290 }
2291 }
2292 VERIFY(mca_list == NULL && con_list == NULL);
2293
2294 /* We're done; let others enter */
2295 mb_clalloc_busy = FALSE;
2296 if (mb_clalloc_waiters > 0) {
2297 mb_clalloc_waiters = 0;
2298 wakeup(mb_clalloc_waitchan);
2299 }
2300
2301 if (bufsize == m_maxsize(MC_CL))
2302 return (numpages << 1);
2303 else if (bufsize == m_maxsize(MC_BIGCL))
2304 return (numpages);
2305
2306 VERIFY(bufsize == m_maxsize(MC_16KCL));
2307 return (numpages / (M16KCLBYTES / NBPG));
2308
2309 out:
2310 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2311
2312 /* We're done; let others enter */
2313 mb_clalloc_busy = FALSE;
2314 if (mb_clalloc_waiters > 0) {
2315 mb_clalloc_waiters = 0;
2316 wakeup(mb_clalloc_waitchan);
2317 }
2318
2319 /*
2320 * When non-blocking, we kick the mbuf worker thread if we have to grow the
2321 * pool or if the number of free clusters is less than requested.
2322 */
2323 if (bufsize == m_maxsize(MC_CL)) {
2324 if (i > 0) {
2325 /*
2326 * Remember total number of clusters needed
2327 * at this time.
2328 */
2329 i += m_total(MC_CL);
2330 if (i > mbuf_expand_mcl) {
2331 mbuf_expand_mcl = i;
2332 if (mbuf_worker_ready)
2333 wakeup((caddr_t)&mbuf_worker_run);
2334 }
2335 }
2336
2337 if (m_infree(MC_CL) >= num)
2338 return (1);
2339 } else if (bufsize == m_maxsize(MC_BIGCL)) {
2340 if (i > 0) {
2341 /*
2342 * Remember total number of 4KB clusters needed
2343 * at this time.
2344 */
2345 i += m_total(MC_BIGCL);
2346 if (i > mbuf_expand_big) {
2347 mbuf_expand_big = i;
2348 if (mbuf_worker_ready)
2349 wakeup((caddr_t)&mbuf_worker_run);
2350 }
2351 }
2352
2353 if (m_infree(MC_BIGCL) >= num)
2354 return (1);
2355 } else {
2356 if (i > 0) {
2357 /*
2358 * Remember total number of 16KB clusters needed
2359 * at this time.
2360 */
2361 i += m_total(MC_16KCL);
2362 if (i > mbuf_expand_16k) {
2363 mbuf_expand_16k = i;
2364 if (mbuf_worker_ready)
2365 wakeup((caddr_t)&mbuf_worker_run);
2366 }
2367 }
2368
2369 if (m_infree(MC_16KCL) >= num)
2370 return (1);
2371 }
2372 return (0);
2373 }
2374
2375 /*
2376 * Populate the global freelist of the corresponding buffer class.
2377 */
2378 static int
2379 freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2380 {
2381 mcache_obj_t *o = NULL;
2382 int i;
2383
2384 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2385 class == MC_16KCL);
2386
2387 #if CONFIG_MBUF_NOEXPAND
2388 if ((mbstat.m_mbufs / NMBPCL) >= maxmbufcl) {
2389 #if DEBUG
2390 static int printonce = 1;
2391 if (printonce == 1) {
2392 printonce = 0;
2393 printf("m_expand failed, allocated %ld out of %d "
2394 "clusters\n", mbstat.m_mbufs / NMBPCL,
2395 nmbclusters);
2396 }
2397 #endif /* DEBUG */
2398 return (0);
2399 }
2400 #endif /* CONFIG_MBUF_NOEXPAND */
2401
2402 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2403
2404 switch (class) {
2405 case MC_MBUF:
2406 case MC_CL:
2407 i = m_clalloc(num, wait, m_maxsize(MC_CL));
2408
2409 /* Respect the 2K clusters minimum limit */
2410 if (m_total(MC_CL) == m_maxlimit(MC_CL) &&
2411 m_infree(MC_CL) <= m_minlimit(MC_CL)) {
2412 if (class != MC_CL || (wait & MCR_COMP))
2413 return (0);
2414 }
2415 if (class == MC_CL)
2416 return (i != 0);
2417 break;
2418
2419 case MC_BIGCL:
2420 case MC_16KCL:
2421 return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2422 /* NOTREACHED */
2423
2424 default:
2425 VERIFY(0);
2426 /* NOTREACHED */
2427 }
2428
2429 /* Steal a cluster and cut it up to create NMBPCL mbufs */
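/*
 * (Illustrative arithmetic: with the usual 256-byte mbufs and 2KB
 * clusters, NMBPCL works out to 2048 / 256 = 8 mbufs per stolen
 * cluster.)
 */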
2430 if ((o = slab_alloc(MC_CL, wait)) != NULL) {
2431 struct mbuf *m = (struct mbuf *)o;
2432 mcache_audit_t *mca = NULL;
2433 mcl_slab_t *sp = slab_get(o);
2434
2435 VERIFY(slab_is_detached(sp) &&
2436 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2437
2438 /* Make sure that the cluster is unmolested while in the freelist */
2439 if (mclaudit != NULL) {
2440 mca = mcl_audit_buf2mca(MC_CL, o);
2441 mcache_audit_free_verify(mca, o, 0, m_maxsize(MC_CL));
2442 }
2443
2444 /* Reinitialize it as an mbuf slab */
2445 slab_init(sp, MC_MBUF, sp->sl_flags, sp->sl_base, NULL,
2446 sp->sl_len, 0, NMBPCL);
2447
2448 VERIFY(m == (struct mbuf *)sp->sl_base);
2449 VERIFY(sp->sl_head == NULL);
2450
2451 m_total(MC_MBUF) += NMBPCL;
2452 mbstat.m_mbufs = m_total(MC_MBUF);
2453 m_infree(MC_MBUF) += NMBPCL;
2454 mtype_stat_add(MT_FREE, NMBPCL);
2455
2456 i = NMBPCL;
2457 while (i--) {
2458 /*
2459 * If auditing is enabled, construct the shadow mbuf
2460 * in the audit structure instead of the actual one.
2461 * mbuf_slab_audit() will take care of restoring the
2462 * contents after the integrity check.
2463 */
2464 if (mclaudit != NULL) {
2465 struct mbuf *ms;
2466 mca = mcl_audit_buf2mca(MC_MBUF,
2467 (mcache_obj_t *)m);
2468 ms = ((struct mbuf *)mca->mca_contents);
2469 ms->m_type = MT_FREE;
2470 } else {
2471 m->m_type = MT_FREE;
2472 }
2473 m->m_next = sp->sl_head;
2474 sp->sl_head = (void *)m++;
2475 }
2476
2477 /* Insert it into the mbuf class's slab list */
2478 slab_insert(sp, MC_MBUF);
2479
2480 if ((i = mb_waiters) > 0)
2481 mb_waiters = 0;
2482 if (i != 0)
2483 wakeup(mb_waitchan);
2484
2485 return (1);
2486 }
2487
2488 return (0);
2489 }
2490
2491 /*
2492 * (Inaccurately) check if it might be worth a trip back to the
2493 * mcache layer due to the availability of objects there. We'll
2494 * end up back here if there's nothing up there.
2495 */
2496 static boolean_t
2497 mbuf_cached_above(mbuf_class_t class, int wait)
2498 {
2499 switch (class) {
2500 case MC_MBUF:
2501 if (wait & MCR_COMP)
2502 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
2503 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2504 break;
2505
2506 case MC_CL:
2507 if (wait & MCR_COMP)
2508 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
2509 break;
2510
2511 case MC_BIGCL:
2512 if (wait & MCR_COMP)
2513 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
2514 break;
2515
2516 case MC_16KCL:
2517 if (wait & MCR_COMP)
2518 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
2519 break;
2520
2521 case MC_MBUF_CL:
2522 case MC_MBUF_BIGCL:
2523 case MC_MBUF_16KCL:
2524 break;
2525
2526 default:
2527 VERIFY(0);
2528 /* NOTREACHED */
2529 }
2530
2531 return (!mcache_bkt_isempty(m_cache(class)));
2532 }
2533
2534 /*
2535 * If possible, convert constructed objects to raw ones.
2536 */
2537 static boolean_t
2538 mbuf_steal(mbuf_class_t class, unsigned int num)
2539 {
2540 mcache_obj_t *top = NULL;
2541 mcache_obj_t **list = &top;
2542 unsigned int tot = 0;
2543
2544 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2545
2546 switch (class) {
2547 case MC_MBUF:
2548 case MC_CL:
2549 case MC_BIGCL:
2550 case MC_16KCL:
2551 return (FALSE);
2552
2553 case MC_MBUF_CL:
2554 case MC_MBUF_BIGCL:
2555 case MC_MBUF_16KCL:
2556 /* Get the required number of constructed objects if possible */
2557 if (m_infree(class) > m_minlimit(class)) {
2558 tot = cslab_alloc(class, &list,
2559 MIN(num, m_infree(class)));
2560 }
2561
2562 /* And destroy them to get back the raw objects */
2563 if (top != NULL)
2564 (void) cslab_free(class, top, 1);
2565 break;
2566
2567 default:
2568 VERIFY(0);
2569 /* NOTREACHED */
2570 }
2571
2572 return (tot == num);
2573 }
2574
2575 static void
2576 m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
2577 {
2578 int m, bmap = 0;
2579
2580 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2581
2582 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
2583 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2584 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2585
2586 /*
2587 * This logic can be made smarter; for now, simply mark
2588 * all other related classes as potential victims.
2589 */
2590 switch (class) {
2591 case MC_MBUF:
2592 m_wantpurge(MC_CL)++;
2593 m_wantpurge(MC_MBUF_CL)++;
2594 m_wantpurge(MC_MBUF_BIGCL)++;
2595 break;
2596
2597 case MC_CL:
2598 m_wantpurge(MC_MBUF)++;
2599 if (!comp)
2600 m_wantpurge(MC_MBUF_CL)++;
2601 break;
2602
2603 case MC_BIGCL:
2604 if (!comp)
2605 m_wantpurge(MC_MBUF_BIGCL)++;
2606 break;
2607
2608 case MC_16KCL:
2609 if (!comp)
2610 m_wantpurge(MC_MBUF_16KCL)++;
2611 break;
2612
2613 default:
2614 VERIFY(0);
2615 /* NOTREACHED */
2616 }
2617
2618 /*
2619 * Run through each marked class and check if we really need to
2620 * purge (and therefore temporarily disable) the per-CPU caches
2621 * layer used by the class. If so, remember the classes since
2622 * we are going to drop the lock below prior to purging.
2623 */
2624 for (m = 0; m < NELEM(mbuf_table); m++) {
2625 if (m_wantpurge(m) > 0) {
2626 m_wantpurge(m) = 0;
2627 /*
2628 * Try hard to steal the required number of objects
2629 * from the freelist of other mbuf classes. Only
2630 * purge and disable the per-CPU caches layer when
2631 * we don't have enough; it's the last resort.
2632 */
2633 if (!mbuf_steal(m, num))
2634 bmap |= (1 << m);
2635 }
2636 }
2637
2638 lck_mtx_unlock(mbuf_mlock);
2639
2640 if (bmap != 0) {
2641 /* drain is performed in pfslowtimo(), to avoid deadlocks */
2642 do_reclaim = 1;
2643
2644 /* Sigh; we have no other choice but to ask mcache to purge */
2645 for (m = 0; m < NELEM(mbuf_table); m++) {
2646 if ((bmap & (1 << m)) &&
2647 mcache_purge_cache(m_cache(m))) {
2648 lck_mtx_lock(mbuf_mlock);
2649 m_purge_cnt(m)++;
2650 mbstat.m_drain++;
2651 lck_mtx_unlock(mbuf_mlock);
2652 }
2653 }
2654 } else {
2655 /*
2656 * Request mcache to reap extra elements from all of its caches;
2657 * note that all reaps are serialized and happen only at a fixed
2658 * interval.
2659 */
2660 mcache_reap();
2661 }
2662 lck_mtx_lock(mbuf_mlock);
2663 }
2664
2665 static inline struct mbuf *
2666 m_get_common(int wait, short type, int hdr)
2667 {
2668 struct mbuf *m;
2669 int mcflags = MSLEEPF(wait);
2670
2671 /* Is this due to a non-blocking retry? If so, then try harder */
2672 if (mcflags & MCR_NOSLEEP)
2673 mcflags |= MCR_TRYHARD;
2674
2675 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
2676 if (m != NULL) {
2677 MBUF_INIT(m, hdr, type);
2678 mtype_stat_inc(type);
2679 mtype_stat_dec(MT_FREE);
2680 #if CONFIG_MACF_NET
2681 if (hdr && mac_init_mbuf(m, wait) != 0) {
2682 m_free(m);
2683 return (NULL);
2684 }
2685 #endif /* CONFIG_MACF_NET */
2686 }
2687 return (m);
2688 }
2689
2690 /*
2691 * Space allocation routines; these are also available as macros
2692 * for critical paths.
2693 */
2694 #define _M_GET(wait, type) m_get_common(wait, type, 0)
2695 #define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
2696 #define _M_RETRY(wait, type) _M_GET(wait, type)
2697 #define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
2698 #define _MGET(m, how, type) ((m) = _M_GET(how, type))
2699 #define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
2700
2701 struct mbuf *
2702 m_get(int wait, int type)
2703 {
2704 return (_M_GET(wait, type));
2705 }
2706
2707 struct mbuf *
2708 m_gethdr(int wait, int type)
2709 {
2710 return (_M_GETHDR(wait, type));
2711 }
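
/*
 * Usage sketch (illustrative only): the wait argument is M_WAIT or
 * M_DONTWAIT; a non-blocking request may return NULL.
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
 *		return (ENOBUFS);
 *	...
 *	m_freem(m);
 */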
2712
2713 struct mbuf *
2714 m_retry(int wait, int type)
2715 {
2716 return (_M_RETRY(wait, type));
2717 }
2718
2719 struct mbuf *
2720 m_retryhdr(int wait, int type)
2721 {
2722 return (_M_RETRYHDR(wait, type));
2723 }
2724
2725 struct mbuf *
2726 m_getclr(int wait, int type)
2727 {
2728 struct mbuf *m;
2729
2730 _MGET(m, wait, type);
2731 if (m != NULL)
2732 bzero(MTOD(m, caddr_t), MLEN);
2733 return (m);
2734 }
2735
2736 struct mbuf *
2737 m_free(struct mbuf *m)
2738 {
2739 struct mbuf *n = m->m_next;
2740
2741 if (m->m_type == MT_FREE)
2742 panic("m_free: freeing an already freed mbuf");
2743
2744 /* Free the aux data and tags, if there are any */
2745 if (m->m_flags & M_PKTHDR) {
2746 m_tag_delete_chain(m, NULL);
2747 }
2748
2749 if (m->m_flags & M_EXT) {
2750 u_int32_t refcnt;
2751 u_int32_t flags;
2752
2753 refcnt = m_decref(m);
2754 flags = MEXT_FLAGS(m);
2755 if (refcnt == 0 && flags == 0) {
2756 if (m->m_ext.ext_free == NULL) {
2757 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2758 } else if (m->m_ext.ext_free == m_bigfree) {
2759 mcache_free(m_cache(MC_BIGCL),
2760 m->m_ext.ext_buf);
2761 } else if (m->m_ext.ext_free == m_16kfree) {
2762 mcache_free(m_cache(MC_16KCL),
2763 m->m_ext.ext_buf);
2764 } else {
2765 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2766 m->m_ext.ext_size, m->m_ext.ext_arg);
2767 }
2768 mcache_free(ref_cache, MEXT_RFA(m));
2769 MEXT_RFA(m) = NULL;
2770 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2771 VERIFY(m->m_type != MT_FREE);
2772
2773 mtype_stat_dec(m->m_type);
2774 mtype_stat_inc(MT_FREE);
2775
2776 m->m_type = MT_FREE;
2777 m->m_flags = M_EXT;
2778 m->m_len = 0;
2779 m->m_next = m->m_nextpkt = NULL;
2780
2781 /* "Free" into the intermediate cache */
2782 if (m->m_ext.ext_free == NULL) {
2783 mcache_free(m_cache(MC_MBUF_CL), m);
2784 } else if (m->m_ext.ext_free == m_bigfree) {
2785 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2786 } else {
2787 VERIFY(m->m_ext.ext_free == m_16kfree);
2788 mcache_free(m_cache(MC_MBUF_16KCL), m);
2789 }
2790 return (n);
2791 }
2792 }
2793
2794 if (m->m_type != MT_FREE) {
2795 mtype_stat_dec(m->m_type);
2796 mtype_stat_inc(MT_FREE);
2797 }
2798
2799 m->m_type = MT_FREE;
2800 m->m_flags = m->m_len = 0;
2801 m->m_next = m->m_nextpkt = NULL;
2802
2803 mcache_free(m_cache(MC_MBUF), m);
2804
2805 return (n);
2806 }
2807
2808 __private_extern__ struct mbuf *
2809 m_clattach(struct mbuf *m, int type, caddr_t extbuf,
2810 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
2811 int wait)
2812 {
2813 struct ext_ref *rfa = NULL;
2814
2815 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
2816 return (NULL);
2817
2818 if (m->m_flags & M_EXT) {
2819 u_int32_t refcnt;
2820 u_int32_t flags;
2821
2822 refcnt = m_decref(m);
2823 flags = MEXT_FLAGS(m);
2824 if (refcnt == 0 && flags == 0) {
2825 if (m->m_ext.ext_free == NULL) {
2826 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
2827 } else if (m->m_ext.ext_free == m_bigfree) {
2828 mcache_free(m_cache(MC_BIGCL),
2829 m->m_ext.ext_buf);
2830 } else if (m->m_ext.ext_free == m_16kfree) {
2831 mcache_free(m_cache(MC_16KCL),
2832 m->m_ext.ext_buf);
2833 } else {
2834 (*(m->m_ext.ext_free))(m->m_ext.ext_buf,
2835 m->m_ext.ext_size, m->m_ext.ext_arg);
2836 }
2837 /* Re-use the reference structure */
2838 rfa = MEXT_RFA(m);
2839 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
2840 VERIFY(m->m_type != MT_FREE);
2841
2842 mtype_stat_dec(m->m_type);
2843 mtype_stat_inc(MT_FREE);
2844
2845 m->m_type = MT_FREE;
2846 m->m_flags = M_EXT;
2847 m->m_len = 0;
2848 m->m_next = m->m_nextpkt = NULL;
2849 /* "Free" into the intermediate cache */
2850 if (m->m_ext.ext_free == NULL) {
2851 mcache_free(m_cache(MC_MBUF_CL), m);
2852 } else if (m->m_ext.ext_free == m_bigfree) {
2853 mcache_free(m_cache(MC_MBUF_BIGCL), m);
2854 } else {
2855 VERIFY(m->m_ext.ext_free == m_16kfree);
2856 mcache_free(m_cache(MC_MBUF_16KCL), m);
2857 }
2858 /*
2859 * Allocate a new mbuf, since we didn't divorce
2860 * the composite mbuf + cluster pair above.
2861 */
2862 if ((m = _M_GETHDR(wait, type)) == NULL)
2863 return (NULL);
2864 }
2865 }
2866
2867 if (rfa == NULL &&
2868 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
2869 m_free(m);
2870 return (NULL);
2871 }
2872
2873 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
2874
2875 return (m);
2876 }
2877
2878 /* m_mclget() adds an mbuf cluster to a normal mbuf */
2879 struct mbuf *
2880 m_mclget(struct mbuf *m, int wait)
2881 {
2882 struct ext_ref *rfa;
2883
2884 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2885 return (m);
2886
2887 m->m_ext.ext_buf = m_mclalloc(wait);
2888 if (m->m_ext.ext_buf != NULL) {
2889 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2890 } else {
2891 mcache_free(ref_cache, rfa);
2892 }
2893 return (m);
2894 }
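
/*
 * Typical usage sketch (illustrative only): attach a cluster to a fresh
 * pkthdr mbuf and check M_EXT, since m_mclget() returns the mbuf even
 * when the cluster allocation fails.
 *
 *	struct mbuf *m;
 *
 *	if ((m = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
 *		return (ENOBUFS);
 *	m = m_mclget(m, M_DONTWAIT);
 *	if (!(m->m_flags & M_EXT)) {
 *		m_freem(m);
 *		return (ENOBUFS);
 *	}
 */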
2895
2896 /* Allocate an mbuf cluster */
2897 caddr_t
2898 m_mclalloc(int wait)
2899 {
2900 int mcflags = MSLEEPF(wait);
2901
2902 /* Is this due to a non-blocking retry? If so, then try harder */
2903 if (mcflags & MCR_NOSLEEP)
2904 mcflags |= MCR_TRYHARD;
2905
2906 return (mcache_alloc(m_cache(MC_CL), mcflags));
2907 }
2908
2909 /* Free an mbuf cluster */
2910 void
2911 m_mclfree(caddr_t p)
2912 {
2913 mcache_free(m_cache(MC_CL), p);
2914 }
2915
2916 /*
2917 * m_mclhasreference() checks whether the cluster of an mbuf is
2918 * referenced by another mbuf.
2919 */
2920 int
2921 m_mclhasreference(struct mbuf *m)
2922 {
2923 if (!(m->m_flags & M_EXT))
2924 return (0);
2925
2926 ASSERT(MEXT_RFA(m) != NULL);
2927
2928 return (MEXT_REF(m) > 1);
2929 }
2930
2931 __private_extern__ caddr_t
2932 m_bigalloc(int wait)
2933 {
2934 int mcflags = MSLEEPF(wait);
2935
2936 /* Is this due to a non-blocking retry? If so, then try harder */
2937 if (mcflags & MCR_NOSLEEP)
2938 mcflags |= MCR_TRYHARD;
2939
2940 return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
2941 }
2942
2943 __private_extern__ void
2944 m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2945 {
2946 mcache_free(m_cache(MC_BIGCL), p);
2947 }
2948
2949 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
2950 __private_extern__ struct mbuf *
2951 m_mbigget(struct mbuf *m, int wait)
2952 {
2953 struct ext_ref *rfa;
2954
2955 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2956 return (m);
2957
2958 m->m_ext.ext_buf = m_bigalloc(wait);
2959 if (m->m_ext.ext_buf != NULL) {
2960 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2961 } else {
2962 mcache_free(ref_cache, rfa);
2963 }
2964 return (m);
2965 }
2966
2967 __private_extern__ caddr_t
2968 m_16kalloc(int wait)
2969 {
2970 int mcflags = MSLEEPF(wait);
2971
2972 /* Is this due to a non-blocking retry? If so, then try harder */
2973 if (mcflags & MCR_NOSLEEP)
2974 mcflags |= MCR_TRYHARD;
2975
2976 return (mcache_alloc(m_cache(MC_16KCL), mcflags));
2977 }
2978
2979 __private_extern__ void
2980 m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
2981 {
2982 mcache_free(m_cache(MC_16KCL), p);
2983 }
2984
2985 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
2986 __private_extern__ struct mbuf *
2987 m_m16kget(struct mbuf *m, int wait)
2988 {
2989 struct ext_ref *rfa;
2990
2991 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
2992 return (m);
2993
2994 m->m_ext.ext_buf = m_16kalloc(wait);
2995 if (m->m_ext.ext_buf != NULL) {
2996 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
2997 } else {
2998 mcache_free(ref_cache, rfa);
2999 }
3000 return (m);
3001 }
3002
3003 /* Copy the pkthdr from "from" to "to"; tags move to "to" and are purged from "from" */
3004 void
3005 m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3006 {
3007 #if CONFIG_MACF_NET
3008 /* We will be taking over the tags of 'to' */
3009 if (to->m_flags & M_PKTHDR)
3010 m_tag_delete_chain(to, NULL);
3011 #endif /* CONFIG_MACF_NET */
3012 to->m_pkthdr = from->m_pkthdr; /* especially tags */
3013 m_tag_init(from); /* purge tags from src */
3014 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3015 if ((to->m_flags & M_EXT) == 0)
3016 to->m_data = to->m_pktdat;
3017 }
3018
3019 /*
3020 * Duplicate "from"'s mbuf pkthdr in "to".
3021 * "from" must have M_PKTHDR set, and "to" must be empty.
3022 * In particular, this does a deep copy of the packet tags.
3023 */
3024 static int
3025 m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3026 {
3027 #if CONFIG_MACF_NET
3028 if (to->m_flags & M_PKTHDR)
3029 m_tag_delete_chain(to, NULL);
3030 #endif /* CONFIG_MACF_NET */
3031 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3032 if ((to->m_flags & M_EXT) == 0)
3033 to->m_data = to->m_pktdat;
3034 to->m_pkthdr = from->m_pkthdr;
3035 m_tag_init(to);
3036 return (m_tag_copy_chain(to, from, how));
3037 }
3038
3039 /*
3040 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3041 * if wantall is not set, return whatever number was available. Set up the
3042 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3043 * are chained on the m_nextpkt field. Any packets requested beyond this
3044 * are chained onto the last packet header's m_next field. The size of
3045 * the cluster is controlled by the parameter bufsize.
3046 */
3047 __private_extern__ struct mbuf *
3048 m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3049 int wait, int wantall, size_t bufsize)
3050 {
3051 struct mbuf *m;
3052 struct mbuf **np, *top;
3053 unsigned int pnum, needed = *num_needed;
3054 mcache_obj_t *mp_list = NULL;
3055 int mcflags = MSLEEPF(wait);
3056 u_int32_t flag;
3057 struct ext_ref *rfa;
3058 mcache_t *cp;
3059 void *cl;
3060
3061 ASSERT(bufsize == m_maxsize(MC_CL) ||
3062 bufsize == m_maxsize(MC_BIGCL) ||
3063 bufsize == m_maxsize(MC_16KCL));
3064
3065 /*
3066 * Caller must first check for njcl because this
3067 * routine is internal and not exposed/used via KPI.
3068 */
3069 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3070
3071 top = NULL;
3072 np = &top;
3073 pnum = 0;
3074
3075 /*
3076 * If the caller doesn't want all the requested buffers, or isn't
3077 * willing to block, try hard to get what we can without blocking.
3078 * This effectively overrides MCR_SLEEP, since this thread will not
3079 * go to sleep if we can't get all the buffers.
3080 */
3081 if (!wantall || (mcflags & MCR_NOSLEEP))
3082 mcflags |= MCR_TRYHARD;
3083
3084 /* Allocate the composite mbuf + cluster elements from the cache */
3085 if (bufsize == m_maxsize(MC_CL))
3086 cp = m_cache(MC_MBUF_CL);
3087 else if (bufsize == m_maxsize(MC_BIGCL))
3088 cp = m_cache(MC_MBUF_BIGCL);
3089 else
3090 cp = m_cache(MC_MBUF_16KCL);
3091 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3092
3093 for (pnum = 0; pnum < needed; pnum++) {
3094 m = (struct mbuf *)mp_list;
3095 mp_list = mp_list->obj_next;
3096
3097 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3098 cl = m->m_ext.ext_buf;
3099 rfa = MEXT_RFA(m);
3100
3101 ASSERT(cl != NULL && rfa != NULL);
3102 VERIFY(MBUF_IS_COMPOSITE(m));
3103
3104 flag = MEXT_FLAGS(m);
3105
3106 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3107 if (bufsize == m_maxsize(MC_16KCL)) {
3108 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3109 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3110 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3111 } else {
3112 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3113 }
3114
3115 if (num_with_pkthdrs > 0) {
3116 --num_with_pkthdrs;
3117 #if CONFIG_MACF_NET
3118 if (mac_mbuf_label_init(m, wait) != 0) {
3119 m_free(m);
3120 break;
3121 }
3122 #endif /* CONFIG_MACF_NET */
3123 }
3124
3125 *np = m;
3126 if (num_with_pkthdrs > 0)
3127 np = &m->m_nextpkt;
3128 else
3129 np = &m->m_next;
3130 }
3131 ASSERT(pnum != *num_needed || mp_list == NULL);
3132 if (mp_list != NULL)
3133 mcache_free_ext(cp, mp_list);
3134
3135 if (pnum > 0) {
3136 mtype_stat_add(MT_DATA, pnum);
3137 mtype_stat_sub(MT_FREE, pnum);
3138 }
3139
3140 if (wantall && (pnum != *num_needed)) {
3141 if (top != NULL)
3142 m_freem_list(top);
3143 return (NULL);
3144 }
3145
3146 *num_needed = pnum;
3147 return (top);
3148 }
3149
3150 /*
3151 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
3152 * wantall is not set, return whatever number was available. The size of
3153 * each mbuf in the list is controlled by the parameter packetlen. Each
3154 * mbuf of the list may have a chain of mbufs linked by m_next; each mbuf
3155 * in the chain is called a segment. If maxsegments is not NULL and the
3156 * value pointed to is not zero, it specifies the maximum number of segments
3157 * for a chain of mbufs. If maxsegments is NULL or the value pointed to
3158 * is zero, the caller has no restriction on the number of segments.
3159 * The actual number of segments of an mbuf chain is returned in the value
3160 * pointed to by maxsegments.
3161 */
3162 __private_extern__ struct mbuf *
3163 m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3164 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3165 {
3166 struct mbuf **np, *top, *first = NULL;
3167 size_t bufsize, r_bufsize;
3168 unsigned int num = 0;
3169 unsigned int nsegs = 0;
3170 unsigned int needed, resid;
3171 int mcflags = MSLEEPF(wait);
3172 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3173 mcache_t *cp = NULL, *rcp = NULL;
3174
3175 if (*numlist == 0)
3176 return (NULL);
3177
3178 top = NULL;
3179 np = &top;
3180
3181 if (wantsize == 0) {
3182 if (packetlen <= MINCLSIZE) {
3183 bufsize = packetlen;
3184 } else if (packetlen > m_maxsize(MC_CL)) {
3185 /* Use 4KB if jumbo cluster pool isn't available */
3186 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3187 bufsize = m_maxsize(MC_BIGCL);
3188 else
3189 bufsize = m_maxsize(MC_16KCL);
3190 } else {
3191 bufsize = m_maxsize(MC_CL);
3192 }
3193 } else if (wantsize == m_maxsize(MC_CL) ||
3194 wantsize == m_maxsize(MC_BIGCL) ||
3195 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3196 bufsize = wantsize;
3197 } else {
3198 return (NULL);
3199 }
3200
3201 if (bufsize <= MHLEN) {
3202 nsegs = 1;
3203 } else if (bufsize <= MINCLSIZE) {
3204 if (maxsegments != NULL && *maxsegments == 1) {
3205 bufsize = m_maxsize(MC_CL);
3206 nsegs = 1;
3207 } else {
3208 nsegs = 2;
3209 }
3210 } else if (bufsize == m_maxsize(MC_16KCL)) {
3211 VERIFY(njcl > 0);
3212 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3213 } else if (bufsize == m_maxsize(MC_BIGCL)) {
3214 nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3215 } else {
3216 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3217 }
3218 if (maxsegments != NULL) {
3219 if (*maxsegments && nsegs > *maxsegments) {
3220 *maxsegments = nsegs;
3221 return (NULL);
3222 }
3223 *maxsegments = nsegs;
3224 }
3225
3226 /*
3227 * If the caller doesn't want all the requested buffers, or isn't
3228 * willing to block, try hard to get what we can without blocking.
3229 * This effectively overrides MCR_SLEEP, since this thread will not
3230 * go to sleep if we can't get all the buffers.
3231 */
3232 if (!wantall || (mcflags & MCR_NOSLEEP))
3233 mcflags |= MCR_TRYHARD;
3234
3235 /*
3236 * Simple case where all elements in the lists/chains are mbufs.
3237 * Unless bufsize is greater than MHLEN, each segment chain is made
3238 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
3239 * of 2 mbufs; the second one is used for the residual data, i.e.
3240 * the remaining data that cannot fit into the first mbuf.
3241 */
3242 if (bufsize <= MINCLSIZE) {
3243 /* Allocate the elements in one shot from the mbuf cache */
3244 ASSERT(bufsize <= MHLEN || nsegs == 2);
3245 cp = m_cache(MC_MBUF);
3246 needed = mcache_alloc_ext(cp, &mp_list,
3247 (*numlist) * nsegs, mcflags);
3248
3249 /*
3250 * The number of elements must be even if we are to use an
3251 * mbuf (instead of a cluster) to store the residual data.
3252 * If we couldn't allocate the requested number of mbufs,
3253 * trim the number down (if it's odd) in order to avoid
3254 * creating a partial segment chain.
3255 */
3256 if (bufsize > MHLEN && (needed & 0x1))
3257 needed--;
3258
3259 while (num < needed) {
3260 struct mbuf *m;
3261
3262 m = (struct mbuf *)mp_list;
3263 mp_list = mp_list->obj_next;
3264 ASSERT(m != NULL);
3265
3266 MBUF_INIT(m, 1, MT_DATA);
3267 #if CONFIG_MACF_NET
3268 if (mac_init_mbuf(m, wait) != 0) {
3269 m_free(m);
3270 break;
3271 }
3272 #endif /* CONFIG_MACF_NET */
3273 num++;
3274 if (bufsize > MHLEN) {
3275 /* A second mbuf for this segment chain */
3276 m->m_next = (struct mbuf *)mp_list;
3277 mp_list = mp_list->obj_next;
3278 ASSERT(m->m_next != NULL);
3279
3280 MBUF_INIT(m->m_next, 0, MT_DATA);
3281 num++;
3282 }
3283 *np = m;
3284 np = &m->m_nextpkt;
3285 }
3286 ASSERT(num != *numlist || mp_list == NULL);
3287
3288 if (num > 0) {
3289 mtype_stat_add(MT_DATA, num);
3290 mtype_stat_sub(MT_FREE, num);
3291 }
3292 num /= nsegs;
3293
3294 /* We've got them all; return to caller */
3295 if (num == *numlist)
3296 return (top);
3297
3298 goto fail;
3299 }
3300
3301 /*
3302 * Complex cases where elements are made up of one or more composite
3303 * mbufs + cluster, depending on packetlen. Each N-segment chain can
3304 * be illustrated as follows:
3305 *
3306 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3307 *
3308 * Every composite mbuf + cluster element comes from the intermediate
3309 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
3310 * the last composite element will come from the MC_MBUF_CL cache,
3311 * unless the residual data is larger than 2KB, in which case we use the
3312 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
3313 * data is defined as extra data beyond the first element that cannot
3314 * fit into the previous element, i.e. there is no residual data if
3315 * the chain only has 1 segment.
3316 */
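/*
 * Worked example (illustrative numbers): for packetlen = 5000 with 2KB
 * clusters (bufsize = 2048), nsegs above came out to
 * ((5000 - 1) >> MCLSHIFT) + 1 = 3, and the residual computed below is
 * 5000 % 2048 = 904 bytes; since 904 fits in a 2KB cluster, the last
 * segment also comes from the MC_MBUF_CL composite cache.
 */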
3317 r_bufsize = bufsize;
3318 resid = packetlen > bufsize ? packetlen % bufsize : 0;
3319 if (resid > 0) {
3320 /* There is residual data; figure out the cluster size */
3321 if (wantsize == 0 && packetlen > MINCLSIZE) {
3322 /*
3323 * Caller didn't request that all of the segments
3324 * in the chain use the same cluster size; use the
3325 * smaller of the cluster sizes.
3326 */
3327 if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3328 r_bufsize = m_maxsize(MC_16KCL);
3329 else if (resid > m_maxsize(MC_CL))
3330 r_bufsize = m_maxsize(MC_BIGCL);
3331 else
3332 r_bufsize = m_maxsize(MC_CL);
3333 } else {
3334 /* Use the same cluster size as the other segments */
3335 resid = 0;
3336 }
3337 }
3338
3339 needed = *numlist;
3340 if (resid > 0) {
3341 /*
3342 * Attempt to allocate composite mbuf + cluster elements for
3343 * the residual data in each chain; record the number of such
3344 * elements that can be allocated so that we know how many
3345 * segment chains we can afford to create.
3346 */
3347 if (r_bufsize <= m_maxsize(MC_CL))
3348 rcp = m_cache(MC_MBUF_CL);
3349 else if (r_bufsize <= m_maxsize(MC_BIGCL))
3350 rcp = m_cache(MC_MBUF_BIGCL);
3351 else
3352 rcp = m_cache(MC_MBUF_16KCL);
3353 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3354
3355 if (needed == 0)
3356 goto fail;
3357
3358 /* This is temporarily reduced for calculation */
3359 ASSERT(nsegs > 1);
3360 nsegs--;
3361 }
3362
3363 /*
3364 * Attempt to allocate the rest of the composite mbuf + cluster
3365 * elements for the number of segment chains that we need.
3366 */
3367 if (bufsize <= m_maxsize(MC_CL))
3368 cp = m_cache(MC_MBUF_CL);
3369 else if (bufsize <= m_maxsize(MC_BIGCL))
3370 cp = m_cache(MC_MBUF_BIGCL);
3371 else
3372 cp = m_cache(MC_MBUF_16KCL);
3373 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3374
3375 /* Round it down to avoid creating a partial segment chain */
3376 needed = (needed / nsegs) * nsegs;
3377 if (needed == 0)
3378 goto fail;
3379
3380 if (resid > 0) {
3381 /*
3382 * We're about to construct the chain(s); take into account
3383 * the number of segments we have created above to hold the
3384 * residual data for each chain, as well as restore the
3385 * original count of segments per chain.
3386 */
3387 ASSERT(nsegs > 0);
3388 needed += needed / nsegs;
3389 nsegs++;
3390 }
3391
3392 for (;;) {
3393 struct mbuf *m;
3394 u_int32_t flag;
3395 struct ext_ref *rfa;
3396 void *cl;
3397 int pkthdr;
3398
3399 ++num;
3400 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3401 m = (struct mbuf *)mp_list;
3402 mp_list = mp_list->obj_next;
3403 } else {
3404 m = (struct mbuf *)rmp_list;
3405 rmp_list = rmp_list->obj_next;
3406 }
3407 ASSERT(m != NULL);
3408 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3409 VERIFY(m->m_ext.ext_free == NULL ||
3410 m->m_ext.ext_free == m_bigfree ||
3411 m->m_ext.ext_free == m_16kfree);
3412
3413 cl = m->m_ext.ext_buf;
3414 rfa = MEXT_RFA(m);
3415
3416 ASSERT(cl != NULL && rfa != NULL);
3417 VERIFY(MBUF_IS_COMPOSITE(m));
3418
3419 flag = MEXT_FLAGS(m);
3420
3421 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
3422 if (pkthdr)
3423 first = m;
3424 MBUF_INIT(m, pkthdr, MT_DATA);
3425 if (m->m_ext.ext_free == m_16kfree) {
3426 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3427 } else if (m->m_ext.ext_free == m_bigfree) {
3428 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3429 } else {
3430 MBUF_CL_INIT(m, cl, rfa, 1, flag);
3431 }
3432 #if CONFIG_MACF_NET
3433 if (pkthdr && mac_init_mbuf(m, wait) != 0) {
3434 --num;
3435 m_free(m);
3436 break;
3437 }
3438 #endif /* CONFIG_MACF_NET */
3439
3440 *np = m;
3441 if ((num % nsegs) == 0)
3442 np = &first->m_nextpkt;
3443 else
3444 np = &m->m_next;
3445
3446 if (num == needed)
3447 break;
3448 }
3449
3450 if (num > 0) {
3451 mtype_stat_add(MT_DATA, num);
3452 mtype_stat_sub(MT_FREE, num);
3453 }
3454
3455 num /= nsegs;
3456
3457 /* We've got them all; return to caller */
3458 if (num == *numlist) {
3459 ASSERT(mp_list == NULL && rmp_list == NULL);
3460 return (top);
3461 }
3462
3463 fail:
3464 /* Free up what's left of the above */
3465 if (mp_list != NULL)
3466 mcache_free_ext(cp, mp_list);
3467 if (rmp_list != NULL)
3468 mcache_free_ext(rcp, rmp_list);
3469 if (wantall && top != NULL) {
3470 m_freem(top);
3471 return (NULL);
3472 }
3473 *numlist = num;
3474 return (top);
3475 }
3476
3477 /*
3478 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3479 * packets on the receive ring.
3480 */
3481 __private_extern__ struct mbuf *
3482 m_getpacket_how(int wait)
3483 {
3484 unsigned int num_needed = 1;
3485
3486 return (m_getpackets_internal(&num_needed, 1, wait, 1,
3487 m_maxsize(MC_CL)));
3488 }
3489
3490 /*
3491 * Best effort to get an mbuf cluster + pkthdr. Used by drivers to allocate
3492 * packets on the receive ring.
3493 */
3494 struct mbuf *
3495 m_getpacket(void)
3496 {
3497 unsigned int num_needed = 1;
3498
3499 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
3500 m_maxsize(MC_CL)));
3501 }
3502
3503 /*
3504 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
3505 * if this can't be met, return whatever number was available. Set up the
3506 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
3507 * are chained on the m_nextpkt field. Any packets requested beyond this are
3508 * chained onto the last packet header's m_next field.
3509 */
3510 struct mbuf *
3511 m_getpackets(int num_needed, int num_with_pkthdrs, int how)
3512 {
3513 unsigned int n = num_needed;
3514
3515 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
3516 m_maxsize(MC_CL)));
3517 }
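
/*
 * Usage sketch (illustrative only): ask for up to 32 packets, each a
 * pkthdr mbuf with a 2KB cluster attached; since wantall is 0, fewer
 * (or none) may be returned.
 *
 *	struct mbuf *list, *m;
 *
 *	list = m_getpackets(32, 32, M_DONTWAIT);
 *	for (m = list; m != NULL; m = m->m_nextpkt)
 *		driver_input(m);
 *
 * where driver_input() stands for a hypothetical per-packet handler.
 */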
3518
3519 /*
3520 * Return a list of mbuf hdrs set up as packet hdrs chained together
3521 * on the m_nextpkt field.
3522 */
3523 struct mbuf *
3524 m_getpackethdrs(int num_needed, int how)
3525 {
3526 struct mbuf *m;
3527 struct mbuf **np, *top;
3528
3529 top = NULL;
3530 np = &top;
3531
3532 while (num_needed--) {
3533 m = _M_RETRYHDR(how, MT_DATA);
3534 if (m == NULL)
3535 break;
3536
3537 *np = m;
3538 np = &m->m_nextpkt;
3539 }
3540
3541 return (top);
3542 }
3543
3544 /*
3545 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count
3546 * of packets freed. Used by the drivers.
3547 */
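/*
 * Usage sketch (illustrative only; "rx_done_list" is a hypothetical
 * driver variable holding packets linked via m_nextpkt):
 *
 *	int freed;
 *
 *	freed = m_freem_list(rx_done_list);
 *	rx_done_list = NULL;
 */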
3548 int
3549 m_freem_list(struct mbuf *m)
3550 {
3551 struct mbuf *nextpkt;
3552 mcache_obj_t *mp_list = NULL;
3553 mcache_obj_t *mcl_list = NULL;
3554 mcache_obj_t *mbc_list = NULL;
3555 mcache_obj_t *m16k_list = NULL;
3556 mcache_obj_t *m_mcl_list = NULL;
3557 mcache_obj_t *m_mbc_list = NULL;
3558 mcache_obj_t *m_m16k_list = NULL;
3559 mcache_obj_t *ref_list = NULL;
3560 int pktcount = 0;
3561 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
3562
3563 while (m != NULL) {
3564 pktcount++;
3565
3566 nextpkt = m->m_nextpkt;
3567 m->m_nextpkt = NULL;
3568
3569 while (m != NULL) {
3570 struct mbuf *next = m->m_next;
3571 mcache_obj_t *o, *rfa;
3572 u_int32_t refcnt, flags;
3573
3574 if (m->m_type == MT_FREE)
3575 panic("m_freem_list: freeing an already freed mbuf");
3576
3577 if (m->m_type != MT_FREE)
3578 mt_free++;
3579
3580 if (m->m_flags & M_PKTHDR) {
3581 m_tag_delete_chain(m, NULL);
3582 }
3583
3584 if (!(m->m_flags & M_EXT))
3585 goto simple_free;
3586
3587 o = (mcache_obj_t *)m->m_ext.ext_buf;
3588 refcnt = m_decref(m);
3589 flags = MEXT_FLAGS(m);
3590 if (refcnt == 0 && flags == 0) {
3591 if (m->m_ext.ext_free == NULL) {
3592 o->obj_next = mcl_list;
3593 mcl_list = o;
3594 } else if (m->m_ext.ext_free == m_bigfree) {
3595 o->obj_next = mbc_list;
3596 mbc_list = o;
3597 } else if (m->m_ext.ext_free == m_16kfree) {
3598 o->obj_next = m16k_list;
3599 m16k_list = o;
3600 } else {
3601 (*(m->m_ext.ext_free))((caddr_t)o,
3602 m->m_ext.ext_size,
3603 m->m_ext.ext_arg);
3604 }
3605 rfa = (mcache_obj_t *)MEXT_RFA(m);
3606 rfa->obj_next = ref_list;
3607 ref_list = rfa;
3608 MEXT_RFA(m) = NULL;
3609 } else if (refcnt == 0 && (flags & EXTF_COMPOSITE)) {
3610 VERIFY(m->m_type != MT_FREE);
3611 /*
3612 * Amortize the costs of atomic operations
3613 * by doing them at the end, if possible.
3614 */
3615 if (m->m_type == MT_DATA)
3616 mt_data++;
3617 else if (m->m_type == MT_HEADER)
3618 mt_header++;
3619 else if (m->m_type == MT_SONAME)
3620 mt_soname++;
3621 else if (m->m_type == MT_TAG)
3622 mt_tag++;
3623 else
3624 mtype_stat_dec(m->m_type);
3625
3626 m->m_type = MT_FREE;
3627 m->m_flags = M_EXT;
3628 m->m_len = 0;
3629 m->m_next = m->m_nextpkt = NULL;
3630
3631 /* "Free" into the intermediate cache */
3632 o = (mcache_obj_t *)m;
3633 if (m->m_ext.ext_free == NULL) {
3634 o->obj_next = m_mcl_list;
3635 m_mcl_list = o;
3636 } else if (m->m_ext.ext_free == m_bigfree) {
3637 o->obj_next = m_mbc_list;
3638 m_mbc_list = o;
3639 } else {
3640 VERIFY(m->m_ext.ext_free == m_16kfree);
3641 o->obj_next = m_m16k_list;
3642 m_m16k_list = o;
3643 }
3644 m = next;
3645 continue;
3646 }
3647 simple_free:
3648 /*
3649 * Amortize the costs of atomic operations
3650 * by doing them at the end, if possible.
3651 */
3652 if (m->m_type == MT_DATA)
3653 mt_data++;
3654 else if (m->m_type == MT_HEADER)
3655 mt_header++;
3656 else if (m->m_type == MT_SONAME)
3657 mt_soname++;
3658 else if (m->m_type == MT_TAG)
3659 mt_tag++;
3660 else if (m->m_type != MT_FREE)
3661 mtype_stat_dec(m->m_type);
3662
3663 m->m_type = MT_FREE;
3664 m->m_flags = m->m_len = 0;
3665 m->m_next = m->m_nextpkt = NULL;
3666
3667 ((mcache_obj_t *)m)->obj_next = mp_list;
3668 mp_list = (mcache_obj_t *)m;
3669
3670 m = next;
3671 }
3672
3673 m = nextpkt;
3674 }
3675
3676 if (mt_free > 0)
3677 mtype_stat_add(MT_FREE, mt_free);
3678 if (mt_data > 0)
3679 mtype_stat_sub(MT_DATA, mt_data);
3680 if (mt_header > 0)
3681 mtype_stat_sub(MT_HEADER, mt_header);
3682 if (mt_soname > 0)
3683 mtype_stat_sub(MT_SONAME, mt_soname);
3684 if (mt_tag > 0)
3685 mtype_stat_sub(MT_TAG, mt_tag);
3686
3687 if (mp_list != NULL)
3688 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3689 if (mcl_list != NULL)
3690 mcache_free_ext(m_cache(MC_CL), mcl_list);
3691 if (mbc_list != NULL)
3692 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
3693 if (m16k_list != NULL)
3694 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
3695 if (m_mcl_list != NULL)
3696 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
3697 if (m_mbc_list != NULL)
3698 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
3699 if (m_m16k_list != NULL)
3700 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
3701 if (ref_list != NULL)
3702 mcache_free_ext(ref_cache, ref_list);
3703
3704 return (pktcount);
3705 }
3706
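/*
 * Illustrative sketch, not part of the original file: a driver completing
 * a batch of transmitted packets (chained via m_nextpkt) in a single call
 * and using the returned count, e.g. to feed its output-packet statistics.
 * The function name is hypothetical.
 */
#if 0	/* example only */
static int
example_tx_complete(struct mbuf *done_list)
{
	/* One call frees every packet and every mbuf in each packet */
	return (m_freem_list(done_list));
}
#endif
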
3707 void
3708 m_freem(struct mbuf *m)
3709 {
3710 while (m != NULL)
3711 m = m_free(m);
3712 }
3713
3714 /*
3715 * Mbuffer utility routines.
3716 */
3717
3718 /*
3719 * Compute the amount of space available before the current start
3720 * of data in an mbuf.
3721 */
3722 int
3723 m_leadingspace(struct mbuf *m)
3724 {
3725 if (m->m_flags & M_EXT) {
3726 if (MCLHASREFERENCE(m))
3727 return (0);
3728 return (m->m_data - m->m_ext.ext_buf);
3729 }
3730 if (m->m_flags & M_PKTHDR)
3731 return (m->m_data - m->m_pktdat);
3732 return (m->m_data - m->m_dat);
3733 }
3734
3735 /*
3736 * Compute the amount of space available after the end of data in an mbuf.
3737 */
3738 int
3739 m_trailingspace(struct mbuf *m)
3740 {
3741 if (m->m_flags & M_EXT) {
3742 if (MCLHASREFERENCE(m))
3743 return (0);
3744 return (m->m_ext.ext_buf + m->m_ext.ext_size -
3745 (m->m_data + m->m_len));
3746 }
3747 return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
3748 }
3749
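/*
 * Illustrative sketch, not part of the original file: using the two
 * routines above to decide whether a header and a trailer fit into the
 * existing mbuf without further allocation.  Since both return 0 when
 * the cluster is shared, this doubles as a "safe to write in place" test.
 * The function name is hypothetical.
 */
#if 0	/* example only */
static int
example_room_for(struct mbuf *m, int hdrlen, int trlen)
{
	return (m_leadingspace(m) >= hdrlen &&
	    m_trailingspace(m) >= trlen);
}
#endif
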
3750 /*
3751 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
3752 * copy junk along. Does not adjust packet header length.
3753 */
3754 struct mbuf *
3755 m_prepend(struct mbuf *m, int len, int how)
3756 {
3757 struct mbuf *mn;
3758
3759 _MGET(mn, how, m->m_type);
3760 if (mn == NULL) {
3761 m_freem(m);
3762 return (NULL);
3763 }
3764 if (m->m_flags & M_PKTHDR) {
3765 M_COPY_PKTHDR(mn, m);
3766 m->m_flags &= ~M_PKTHDR;
3767 }
3768 mn->m_next = m;
3769 m = mn;
3770 if (len < MHLEN)
3771 MH_ALIGN(m, len);
3772 m->m_len = len;
3773 return (m);
3774 }
3775
3776 /*
3777 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
3778 * chain, copy junk along, and adjust length.
3779 */
3780 struct mbuf *
3781 m_prepend_2(struct mbuf *m, int len, int how)
3782 {
3783 if (M_LEADINGSPACE(m) >= len) {
3784 m->m_data -= len;
3785 m->m_len += len;
3786 } else {
3787 m = m_prepend(m, len, how);
3788 }
3789 if ((m) && (m->m_flags & M_PKTHDR))
3790 m->m_pkthdr.len += len;
3791 return (m);
3792 }
3793
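/*
 * Illustrative sketch, not part of the original file: prepending a
 * 14-byte link-layer style header with m_prepend_2(), which reuses
 * leading space when available and falls back to m_prepend() otherwise.
 * The header size and function name are hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_add_header(struct mbuf *m)
{
	m = m_prepend_2(m, 14, M_DONTWAIT);
	if (m == NULL)
		return (NULL);		/* the chain was freed on failure */
	bzero(MTOD(m, caddr_t), 14);	/* caller fills in the real header */
	return (m);
}
#endif
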
3794 /*
3795 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
3796 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
3797 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
3798 */
3799 int MCFail;
3800
3801 struct mbuf *
3802 m_copym(struct mbuf *m, int off0, int len, int wait)
3803 {
3804 struct mbuf *n, *mhdr = NULL, **np;
3805 int off = off0;
3806 struct mbuf *top;
3807 int copyhdr = 0;
3808
3809 if (off < 0 || len < 0)
3810 panic("m_copym: invalid offset %d or len %d", off, len);
3811
3812 if (off == 0 && (m->m_flags & M_PKTHDR)) {
3813 mhdr = m;
3814 copyhdr = 1;
3815 }
3816
3817 while (off >= m->m_len) {
3818 if (m->m_next == NULL)
3819 panic("m_copym: invalid mbuf chain");
3820 off -= m->m_len;
3821 m = m->m_next;
3822 }
3823 np = &top;
3824 top = NULL;
3825
3826 while (len > 0) {
3827 if (m == NULL) {
3828 if (len != M_COPYALL)
3829 panic("m_copym: len != M_COPYALL");
3830 break;
3831 }
3832
3833 n = _M_RETRY(wait, m->m_type);
3834 *np = n;
3835
3836 if (n == NULL)
3837 goto nospace;
3838
3839 if (copyhdr != 0) {
3840 M_COPY_PKTHDR(n, mhdr);
3841 if (len == M_COPYALL)
3842 n->m_pkthdr.len -= off0;
3843 else
3844 n->m_pkthdr.len = len;
3845 copyhdr = 0;
3846 }
3847 if (len == M_COPYALL) {
3848 if (MIN(len, (m->m_len - off)) == len) {
3849 printf("m->m_len %ld - off %d = %ld, %ld\n",
3850 m->m_len, off, m->m_len - off,
3851 MIN(len, (m->m_len - off)));
3852 }
3853 }
3854 n->m_len = MIN(len, (m->m_len - off));
3855 if (n->m_len == M_COPYALL) {
3856 printf("n->m_len == M_COPYALL, fixing\n");
3857 n->m_len = MHLEN;
3858 }
3859 if (m->m_flags & M_EXT) {
3860 n->m_ext = m->m_ext;
3861 m_incref(m);
3862 n->m_data = m->m_data + off;
3863 n->m_flags |= M_EXT;
3864 } else {
3865 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3866 (unsigned)n->m_len);
3867 }
3868 if (len != M_COPYALL)
3869 len -= n->m_len;
3870 off = 0;
3871 m = m->m_next;
3872 np = &n->m_next;
3873 }
3874
3875 if (top == NULL)
3876 MCFail++;
3877
3878 return (top);
3879 nospace:
3880
3881 m_freem(top);
3882 MCFail++;
3883 return (NULL);
3884 }
3885
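/*
 * Illustrative sketch, not part of the original file: taking a reference-
 * counted copy of an entire packet before handing the original off, e.g.
 * for packet taps.  Cluster data is shared rather than duplicated; see
 * m_dup() below for a deep copy.  The function name is hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_snapshot(struct mbuf *m)
{
	/* Copy from offset 0 to the end of the chain */
	return (m_copym(m, 0, M_COPYALL, M_DONTWAIT));
}
#endif
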
3886 /*
3887 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
3888 * within this routine.  Also, the last mbuf and offset accessed are passed
3889 * out and can be passed back in to avoid having to rescan the entire mbuf
3890 * list (normally hung off of the socket).
3891 */
3892 struct mbuf *
3893 m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
3894 struct mbuf **m_last, int *m_off)
3895 {
3896 struct mbuf *n, **np = NULL;
3897 int off = off0, len = len0;
3898 struct mbuf *top = NULL;
3899 int mcflags = MSLEEPF(wait);
3900 int copyhdr = 0;
3901 int type = 0;
3902 mcache_obj_t *list = NULL;
3903 int needed = 0;
3904
3905 if (off == 0 && (m->m_flags & M_PKTHDR))
3906 copyhdr = 1;
3907
3908 if (*m_last != NULL) {
3909 m = *m_last;
3910 off = *m_off;
3911 } else {
3912 while (off >= m->m_len) {
3913 off -= m->m_len;
3914 m = m->m_next;
3915 }
3916 }
3917
3918 n = m;
3919 while (len > 0) {
3920 needed++;
3921 ASSERT(n != NULL);
3922 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
3923 n = n->m_next;
3924 }
3925 needed++;
3926 len = len0;
3927
3928 /*
3929 * If the caller doesn't want to be put to sleep, mark it with
3930 * MCR_TRYHARD so that we may reclaim buffers from other places
3931 * before giving up.
3932 */
3933 if (mcflags & MCR_NOSLEEP)
3934 mcflags |= MCR_TRYHARD;
3935
3936 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
3937 mcflags) != needed)
3938 goto nospace;
3939
3940 needed = 0;
3941 while (len > 0) {
3942 n = (struct mbuf *)list;
3943 list = list->obj_next;
3944 ASSERT(n != NULL && m != NULL);
3945
3946 type = (top == NULL) ? MT_HEADER : m->m_type;
3947 MBUF_INIT(n, (top == NULL), type);
3948 #if CONFIG_MACF_NET
3949 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
3950 mtype_stat_inc(MT_HEADER);
3951 mtype_stat_dec(MT_FREE);
3952 m_free(n);
3953 goto nospace;
3954 }
3955 #endif /* CONFIG_MACF_NET */
3956
3957 if (top == NULL) {
3958 top = n;
3959 np = &top->m_next;
3960 continue;
3961 } else {
3962 needed++;
3963 *np = n;
3964 }
3965
3966 if (copyhdr) {
3967 M_COPY_PKTHDR(n, m);
3968 n->m_pkthdr.len = len;
3969 copyhdr = 0;
3970 }
3971 n->m_len = MIN(len, (m->m_len - off));
3972
3973 if (m->m_flags & M_EXT) {
3974 n->m_ext = m->m_ext;
3975 m_incref(m);
3976 n->m_data = m->m_data + off;
3977 n->m_flags |= M_EXT;
3978 } else {
3979 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
3980 (unsigned)n->m_len);
3981 }
3982 len -= n->m_len;
3983
3984 if (len == 0) {
3985 if ((off + n->m_len) == m->m_len) {
3986 *m_last = m->m_next;
3987 *m_off = 0;
3988 } else {
3989 *m_last = m;
3990 *m_off = off + n->m_len;
3991 }
3992 break;
3993 }
3994 off = 0;
3995 m = m->m_next;
3996 np = &n->m_next;
3997 }
3998
3999 mtype_stat_inc(MT_HEADER);
4000 mtype_stat_add(type, needed);
4001 mtype_stat_sub(MT_FREE, needed + 1);
4002
4003 ASSERT(list == NULL);
4004 return (top);
4005
4006 nospace:
4007 if (list != NULL)
4008 mcache_free_ext(m_cache(MC_MBUF), list);
4009 if (top != NULL)
4010 m_freem(top);
4011 MCFail++;
4012 return (NULL);
4013 }
4014
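/*
 * Illustrative sketch, not part of the original file: copying a long
 * chain in fixed-size pieces while letting the routine above remember
 * where the previous copy stopped, so each call avoids rescanning from
 * the head.  The 4 KB piece size and function name are hypothetical;
 * "total" is assumed not to exceed the data in the chain.
 */
#if 0	/* example only */
static void
example_chunked_copy(struct mbuf *sb_chain, int total)
{
	struct mbuf *last = NULL, *piece;
	int off = 0, moff = 0, len;

	while (off < total) {
		len = MIN(4096, total - off);
		piece = m_copym_with_hdrs(sb_chain, off, len, M_DONTWAIT,
		    &last, &moff);
		if (piece == NULL)
			break;
		m_freem(piece);		/* a real caller would transmit it */
		off += len;
	}
}
#endif
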
4015 /*
4016 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4017 * continuing for "len" bytes, into the indicated buffer.
4018 */
4019 void
4020 m_copydata(struct mbuf *m, int off, int len, caddr_t cp)
4021 {
4022 unsigned count;
4023
4024 if (off < 0 || len < 0)
4025 panic("m_copydata: invalid offset %d or len %d", off, len);
4026
4027 while (off > 0) {
4028 if (m == NULL)
4029 panic("m_copydata: invalid mbuf chain");
4030 if (off < m->m_len)
4031 break;
4032 off -= m->m_len;
4033 m = m->m_next;
4034 }
4035 while (len > 0) {
4036 if (m == NULL)
4037 panic("m_copydata: invalid mbuf chain");
4038 count = MIN(m->m_len - off, len);
4039 bcopy(MTOD(m, caddr_t) + off, cp, count);
4040 len -= count;
4041 cp += count;
4042 off = 0;
4043 m = m->m_next;
4044 }
4045 }
4046
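/*
 * Illustrative sketch, not part of the original file: peeking at a fixed
 * size header that may be split across several mbufs, without modifying
 * the chain.  Assumes the first mbuf carries a packet header; the
 * function name is hypothetical.
 */
#if 0	/* example only */
static int
example_peek_header(struct mbuf *m, u_int8_t *hdr, int hlen)
{
	if (m->m_pkthdr.len < hlen)
		return (0);		/* packet too short */
	m_copydata(m, 0, hlen, (caddr_t)hdr);
	return (1);
}
#endif
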
4047 /*
4048 * Concatenate mbuf chain n to m. Both chains must be of the same type
4049 * (e.g. MT_DATA). The m_pkthdr, if any, is not updated.
4050 */
4051 void
4052 m_cat(struct mbuf *m, struct mbuf *n)
4053 {
4054 while (m->m_next)
4055 m = m->m_next;
4056 while (n) {
4057 if ((m->m_flags & M_EXT) ||
4058 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4059 /* just join the two chains */
4060 m->m_next = n;
4061 return;
4062 }
4063 /* splat the data from one into the other */
4064 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4065 (u_int)n->m_len);
4066 m->m_len += n->m_len;
4067 n = m_free(n);
4068 }
4069 }
4070
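/*
 * Illustrative sketch, not part of the original file: appending a
 * continuation fragment to a partially reassembled packet, assuming both
 * chains carry packet headers.  Because m_cat() may copy small data into
 * the first chain and free the source, the caller must not touch "frag"
 * afterwards, and must fix up the packet header length itself.
 */
#if 0	/* example only */
static void
example_append_fragment(struct mbuf *head, struct mbuf *frag)
{
	int fraglen = frag->m_pkthdr.len;

	m_cat(head, frag);		/* "frag" may be freed here */
	head->m_pkthdr.len += fraglen;	/* m_cat() leaves pkthdr alone */
}
#endif
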
4071 void
4072 m_adj(struct mbuf *mp, int req_len)
4073 {
4074 int len = req_len;
4075 struct mbuf *m;
4076 int count;
4077
4078 if ((m = mp) == NULL)
4079 return;
4080 if (len >= 0) {
4081 /*
4082 * Trim from head.
4083 */
4084 while (m != NULL && len > 0) {
4085 if (m->m_len <= len) {
4086 len -= m->m_len;
4087 m->m_len = 0;
4088 m = m->m_next;
4089 } else {
4090 m->m_len -= len;
4091 m->m_data += len;
4092 len = 0;
4093 }
4094 }
4095 m = mp;
4096 if (m->m_flags & M_PKTHDR)
4097 m->m_pkthdr.len -= (req_len - len);
4098 } else {
4099 /*
4100 * Trim from tail. Scan the mbuf chain,
4101 * calculating its length and finding the last mbuf.
4102 * If the adjustment only affects this mbuf, then just
4103 * adjust and return. Otherwise, rescan and truncate
4104 * after the remaining size.
4105 */
4106 len = -len;
4107 count = 0;
4108 for (;;) {
4109 count += m->m_len;
4110 if (m->m_next == (struct mbuf *)0)
4111 break;
4112 m = m->m_next;
4113 }
4114 if (m->m_len >= len) {
4115 m->m_len -= len;
4116 m = mp;
4117 if (m->m_flags & M_PKTHDR)
4118 m->m_pkthdr.len -= len;
4119 return;
4120 }
4121 count -= len;
4122 if (count < 0)
4123 count = 0;
4124 /*
4125 * Correct length for chain is "count".
4126 * Find the mbuf with last data, adjust its length,
4127 * and toss data from remaining mbufs on chain.
4128 */
4129 m = mp;
4130 if (m->m_flags & M_PKTHDR)
4131 m->m_pkthdr.len = count;
4132 for (; m; m = m->m_next) {
4133 if (m->m_len >= count) {
4134 m->m_len = count;
4135 break;
4136 }
4137 count -= m->m_len;
4138 }
4139 while ((m = m->m_next))
4140 m->m_len = 0;
4141 }
4142 }
4143
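/*
 * Illustrative sketch, not part of the original file: using m_adj() to
 * strip an encapsulation header from the front of a packet and a 4-byte
 * trailer (e.g. a frame checksum) from the end.  The trailer size and
 * function name are hypothetical.
 */
#if 0	/* example only */
static void
example_strip_encap(struct mbuf *m, int encap_hlen)
{
	m_adj(m, encap_hlen);	/* positive length: trim from the head */
	m_adj(m, -4);		/* negative length: trim from the tail */
}
#endif
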
4144 /*
4145 * Rearrange an mbuf chain so that len bytes are contiguous
4146 * and in the data area of an mbuf (so that mtod and dtom
4147 * will work for a structure of size len). Returns the resulting
4148 * mbuf chain on success, frees it and returns null on failure.
4149 * If there is room, it will add up to max_protohdr-len extra bytes to the
4150 * contiguous region in an attempt to avoid being called next time.
4151 */
4152 int MPFail;
4153
4154 struct mbuf *
4155 m_pullup(struct mbuf *n, int len)
4156 {
4157 struct mbuf *m;
4158 int count;
4159 int space;
4160
4161 /*
4162 * If first mbuf has no cluster, and has room for len bytes
4163 * without shifting current data, pullup into it,
4164 * otherwise allocate a new mbuf to prepend to the chain.
4165 */
4166 if ((n->m_flags & M_EXT) == 0 &&
4167 n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4168 if (n->m_len >= len)
4169 return (n);
4170 m = n;
4171 n = n->m_next;
4172 len -= m->m_len;
4173 } else {
4174 if (len > MHLEN)
4175 goto bad;
4176 _MGET(m, M_DONTWAIT, n->m_type);
4177 if (m == 0)
4178 goto bad;
4179 m->m_len = 0;
4180 if (n->m_flags & M_PKTHDR) {
4181 M_COPY_PKTHDR(m, n);
4182 n->m_flags &= ~M_PKTHDR;
4183 }
4184 }
4185 space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4186 do {
4187 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4188 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4189 (unsigned)count);
4190 len -= count;
4191 m->m_len += count;
4192 n->m_len -= count;
4193 space -= count;
4194 if (n->m_len)
4195 n->m_data += count;
4196 else
4197 n = m_free(n);
4198 } while (len > 0 && n);
4199 if (len > 0) {
4200 (void) m_free(m);
4201 goto bad;
4202 }
4203 m->m_next = n;
4204 return (m);
4205 bad:
4206 m_freem(n);
4207 MPFail++;
4208 return (0);
4209 }
4210
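/*
 * Illustrative sketch, not part of the original file: the usual
 * m_pullup() pattern applied before casting the data pointer to a header
 * structure.  "struct example_hdr" and the function name are
 * hypothetical.
 */
#if 0	/* example only */
struct example_hdr {
	u_int16_t eh_type;
	u_int16_t eh_len;
};

static struct example_hdr *
example_get_hdr(struct mbuf **mp)
{
	struct mbuf *m = *mp;

	if (m->m_len < (int)sizeof (struct example_hdr)) {
		/* On failure the chain has already been freed */
		m = m_pullup(m, sizeof (struct example_hdr));
		*mp = m;
		if (m == NULL)
			return (NULL);
	}
	return (MTOD(m, struct example_hdr *));
}
#endif
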
4211 /*
4212 * Partition an mbuf chain in two pieces, returning the tail --
4213 * all but the first len0 bytes. In case of failure, it returns NULL and
4214 * attempts to restore the chain to its original state.
4215 */
4216 struct mbuf *
4217 m_split(struct mbuf *m0, int len0, int wait)
4218 {
4219 struct mbuf *m, *n;
4220 unsigned len = len0, remain;
4221
4222 for (m = m0; m && len > m->m_len; m = m->m_next)
4223 len -= m->m_len;
4224 if (m == NULL)
4225 return (NULL);
4226 remain = m->m_len - len;
4227 if (m0->m_flags & M_PKTHDR) {
4228 _MGETHDR(n, wait, m0->m_type);
4229 if (n == NULL)
4230 return (NULL);
4231 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4232 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4233 m0->m_pkthdr.len = len0;
4234 if (m->m_flags & M_EXT)
4235 goto extpacket;
4236 if (remain > MHLEN) {
4237 /* m can't be the lead packet */
4238 MH_ALIGN(n, 0);
4239 n->m_next = m_split(m, len, wait);
4240 if (n->m_next == NULL) {
4241 (void) m_free(n);
4242 return (NULL);
4243 } else
4244 return (n);
4245 } else
4246 MH_ALIGN(n, remain);
4247 } else if (remain == 0) {
4248 n = m->m_next;
4249 m->m_next = NULL;
4250 return (n);
4251 } else {
4252 _MGET(n, wait, m->m_type);
4253 if (n == NULL)
4254 return (NULL);
4255 M_ALIGN(n, remain);
4256 }
4257 extpacket:
4258 if (m->m_flags & M_EXT) {
4259 n->m_flags |= M_EXT;
4260 n->m_ext = m->m_ext;
4261 m_incref(m);
4262 n->m_data = m->m_data + len;
4263 } else {
4264 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4265 }
4266 n->m_len = remain;
4267 m->m_len = len;
4268 n->m_next = m->m_next;
4269 m->m_next = NULL;
4270 return (n);
4271 }
4272
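/*
 * Illustrative sketch, not part of the original file: splitting a packet
 * into a header part and a payload part at a byte offset, e.g. for
 * segmentation.  The function name is hypothetical.
 */
#if 0	/* example only */
static int
example_segment(struct mbuf *m, int hdrlen, struct mbuf **tailp)
{
	*tailp = m_split(m, hdrlen, M_DONTWAIT);
	if (*tailp == NULL)
		return (0);	/* "m" is left intact; caller may retry */
	/* "m" now holds the first hdrlen bytes, *tailp the remainder */
	return (1);
}
#endif
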
4273 /*
4274 * Routine to copy from device local memory into mbufs.
4275 */
4276 struct mbuf *
4277 m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4278 void (*copy)(const void *, void *, size_t))
4279 {
4280 struct mbuf *m;
4281 struct mbuf *top = NULL, **mp = &top;
4282 int off = off0, len;
4283 char *cp;
4284 char *epkt;
4285
4286 cp = buf;
4287 epkt = cp + totlen;
4288 if (off) {
4289 /*
4290 * If 'off' is non-zero, packet is trailer-encapsulated,
4291 * so we have to skip the type and length fields.
4292 */
4293 cp += off + 2 * sizeof (u_int16_t);
4294 totlen -= 2 * sizeof (u_int16_t);
4295 }
4296 _MGETHDR(m, M_DONTWAIT, MT_DATA);
4297 if (m == NULL)
4298 return (NULL);
4299 m->m_pkthdr.rcvif = ifp;
4300 m->m_pkthdr.len = totlen;
4301 m->m_len = MHLEN;
4302
4303 while (totlen > 0) {
4304 if (top != NULL) {
4305 _MGET(m, M_DONTWAIT, MT_DATA);
4306 if (m == NULL) {
4307 m_freem(top);
4308 return (NULL);
4309 }
4310 m->m_len = MLEN;
4311 }
4312 len = MIN(totlen, epkt - cp);
4313 if (len >= MINCLSIZE) {
4314 MCLGET(m, M_DONTWAIT);
4315 if (m->m_flags & M_EXT) {
4316 m->m_len = len = MIN(len, m_maxsize(MC_CL));
4317 } else {
4318 /* give up when it's out of cluster mbufs */
4319 if (top != NULL)
4320 m_freem(top);
4321 m_freem(m);
4322 return (NULL);
4323 }
4324 } else {
4325 /*
4326 * Place initial small packet/header at end of mbuf.
4327 */
4328 if (len < m->m_len) {
4329 if (top == NULL &&
4330 len + max_linkhdr <= m->m_len)
4331 m->m_data += max_linkhdr;
4332 m->m_len = len;
4333 } else {
4334 len = m->m_len;
4335 }
4336 }
4337 if (copy)
4338 copy(cp, MTOD(m, caddr_t), (unsigned)len);
4339 else
4340 bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4341 cp += len;
4342 *mp = m;
4343 mp = &m->m_next;
4344 totlen -= len;
4345 if (cp == epkt)
4346 cp = buf;
4347 }
4348 return (top);
4349 }
4350
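/*
 * Illustrative sketch, not part of the original file: a driver with a
 * contiguous on-board receive buffer handing a frame to the stack.
 * Passing a NULL copy routine makes m_devget() fall back to bcopy();
 * the buffer and function names are hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_rx_frame(struct ifnet *ifp, char *devbuf, int framelen)
{
	/* off0 == 0: no trailer encapsulation to skip */
	return (m_devget(devbuf, framelen, 0, ifp, NULL));
}
#endif
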
4351 /*
4352 * Cluster freelist allocation check.
4353 */
4354 static int
4355 m_howmany(int num, size_t bufsize)
4356 {
4357 int i = 0, j = 0;
4358 u_int32_t m_clusters, m_bigclusters, m_16kclusters;
4359 u_int32_t m_clfree, m_bigclfree, m_16kclfree;
4360
4361 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4362
4363 m_clusters = m_total(MC_CL);
4364 m_bigclusters = m_total(MC_BIGCL);
4365 m_16kclusters = m_total(MC_16KCL);
4366 m_clfree = m_infree(MC_CL);
4367 m_bigclfree = m_infree(MC_BIGCL);
4368 m_16kclfree = m_infree(MC_16KCL);
4369
4370 /* Bail if we've maxed out the mbuf memory map */
4371 if ((bufsize != m_maxsize(MC_16KCL) &&
4372 (m_clusters + (m_bigclusters << 1) >= nclusters)) ||
4373 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
4374 (m_16kclusters << 3) >= njcl)) {
4375 #if DEBUG
4376 if (bufsize == MCLBYTES && num > m_clfree) {
4377 printf("m_howmany - out of small clusters, "
4378 "%d short\n", num - mbstat.m_clfree);
4379 }
4380 #endif /* DEBUG */
4381 return (0);
4382 }
4383
4384 if (bufsize == m_maxsize(MC_CL)) {
4385 /* Under minimum */
4386 if (m_clusters < MINCL)
4387 return (MINCL - m_clusters);
4388 /* Too few (free < 1/16 total) and not over maximum */
4389 if (m_clusters < m_maxlimit(MC_CL)) {
4390 if (m_clfree >= MCL_LOWAT)
4391 return (0);
4392 if (num >= m_clfree)
4393 i = num - m_clfree;
4394 if (((m_clusters + num) >> 4) > m_clfree)
4395 j = ((m_clusters + num) >> 4) - m_clfree;
4396 i = MAX(i, j);
4397 if (i + m_clusters >= m_maxlimit(MC_CL))
4398 i = m_maxlimit(MC_CL) - m_clusters;
4399 }
4400 VERIFY((m_total(MC_CL) + i) <= m_maxlimit(MC_CL));
4401 } else if (bufsize == m_maxsize(MC_BIGCL)) {
4402 /* Under minimum */
4403 if (m_bigclusters < MINBIGCL)
4404 return (MINBIGCL - m_bigclusters);
4405 /* Too few (free < 1/16 total) and not over maximum */
4406 if (m_bigclusters < m_maxlimit(MC_BIGCL)) {
4407 if (m_bigclfree >= MBIGCL_LOWAT)
4408 return (0);
4409 if (num >= m_bigclfree)
4410 i = num - m_bigclfree;
4411 if (((m_bigclusters + num) >> 4) > m_bigclfree)
4412 j = ((m_bigclusters + num) >> 4) - m_bigclfree;
4413 i = MAX(i, j);
4414 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
4415 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
4416 }
4417 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
4418 } else {
4419 VERIFY(njcl > 0);
4420 /* Under minimum */
4421 if (m_16kclusters < MIN16KCL)
4422 return (MIN16KCL - m_16kclusters);
4423 /* Too few (free < 1/16 total) and not over maximum */
4424 if (m_16kclusters < m_maxlimit(MC_16KCL)) {
4425 if (m_16kclfree >= M16KCL_LOWAT)
4426 return (0);
4427 if (num >= m_16kclfree)
4428 i = num - m_16kclfree;
4429 if (((m_16kclusters + num) >> 4) > m_16kclfree)
4430 j = ((m_16kclusters + num) >> 4) - m_16kclfree;
4431 i = MAX(i, j);
4432 if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
4433 i = m_maxlimit(MC_16KCL) - m_16kclusters;
4434 }
4435 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
4436 }
4437
4438 return (i);
4439 }
4440
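/*
 * Worked example, not part of the original file, of the "free < 1/16 of
 * total" branch above for 2 KB clusters.  Assume m_clusters = 1600 (at
 * least MINCL and below m_maxlimit(MC_CL)), m_clfree = 40 (below
 * MCL_LOWAT) and num = 64.  Then i = 64 - 40 = 24 and
 * j = ((1600 + 64) >> 4) - 40 = 104 - 40 = 64, so m_howmany() asks for
 * MAX(24, 64) = 64 additional clusters, clipped against m_maxlimit(MC_CL).
 */
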
4441 /*
4442 * Copy data from a buffer back into the indicated mbuf chain,
4443 * starting "off" bytes from the beginning, extending the mbuf
4444 * chain if necessary.
4445 */
4446 void
4447 m_copyback(struct mbuf *m0, int off, int len, caddr_t cp)
4448 {
4449 int mlen;
4450 struct mbuf *m = m0, *n;
4451 int totlen = 0;
4452
4453 if (m0 == NULL)
4454 return;
4455 while (off > (mlen = m->m_len)) {
4456 off -= mlen;
4457 totlen += mlen;
4458 if (m->m_next == NULL) {
4459 n = m_getclr(M_DONTWAIT, m->m_type);
4460 if (n == NULL)
4461 goto out;
4462 n->m_len = MIN(MLEN, len + off);
4463 m->m_next = n;
4464 }
4465 m = m->m_next;
4466 }
4467 while (len > 0) {
4468 mlen = MIN(m->m_len - off, len);
4469 bcopy(cp, off + MTOD(m, caddr_t), (unsigned)mlen);
4470 cp += mlen;
4471 len -= mlen;
4472 mlen += off;
4473 off = 0;
4474 totlen += mlen;
4475 if (len == 0)
4476 break;
4477 if (m->m_next == NULL) {
4478 n = _M_GET(M_DONTWAIT, m->m_type);
4479 if (n == NULL)
4480 break;
4481 n->m_len = MIN(MLEN, len);
4482 m->m_next = n;
4483 }
4484 m = m->m_next;
4485 }
4486 out:
4487 if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
4488 m->m_pkthdr.len = totlen;
4489 }
4490
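/*
 * Illustrative sketch, not part of the original file: overwriting a
 * 2-byte field at a known offset inside a packet (for instance a
 * checksum slot), letting m_copyback() cross mbuf boundaries and extend
 * the chain if the offset lies beyond the current data.  The function
 * name is hypothetical.
 */
#if 0	/* example only */
static void
example_store16(struct mbuf *m, int off, u_int16_t val)
{
	m_copyback(m, off, sizeof (val), (caddr_t)&val);
}
#endif
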
4491 char *
4492 mcl_to_paddr(char *addr)
4493 {
4494 int base_phys;
4495
4496 if (!MBUF_IN_MAP(addr))
4497 return (NULL);
4498 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
4499
4500 if (base_phys == 0)
4501 return (NULL);
4502 return ((char *)((int)base_phys | ((int)addr & PGOFSET)));
4503 }
4504
4505 /*
4506 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
4507 * And really copy the thing. That way, we don't "precompute" checksums
4508 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
4509 * small packets, don't dup into a cluster. That way received packets
4510 * don't take up too much room in the sockbuf (cf. sbspace()).
4511 */
4512 int MDFail;
4513
4514 struct mbuf *
4515 m_dup(struct mbuf *m, int how)
4516 {
4517 struct mbuf *n, **np;
4518 struct mbuf *top;
4519 int copyhdr = 0;
4520
4521 np = &top;
4522 top = NULL;
4523 if (m->m_flags & M_PKTHDR)
4524 copyhdr = 1;
4525
4526 /*
4527 * Quick check: if we have one mbuf and its data fits in an
4528 * mbuf with packet header, just copy and go.
4529 */
4530 if (m->m_next == NULL) {
4531 /* Then just move the data into an mbuf and be done... */
4532 if (copyhdr) {
4533 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
4534 if ((n = _M_GETHDR(how, m->m_type)) == NULL)
4535 return (NULL);
4536 n->m_len = m->m_len;
4537 m_dup_pkthdr(n, m, how);
4538 bcopy(m->m_data, n->m_data, m->m_len);
4539 return (n);
4540 }
4541 } else if (m->m_len <= MLEN) {
4542 if ((n = _M_GET(how, m->m_type)) == NULL)
4543 return (NULL);
4544 bcopy(m->m_data, n->m_data, m->m_len);
4545 n->m_len = m->m_len;
4546 return (n);
4547 }
4548 }
4549 while (m != NULL) {
4550 #if BLUE_DEBUG
4551 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
4552 m->m_data);
4553 #endif
4554 if (copyhdr)
4555 n = _M_GETHDR(how, m->m_type);
4556 else
4557 n = _M_GET(how, m->m_type);
4558 if (n == NULL)
4559 goto nospace;
4560 if (m->m_flags & M_EXT) {
4561 if (m->m_len <= m_maxsize(MC_CL))
4562 MCLGET(n, how);
4563 else if (m->m_len <= m_maxsize(MC_BIGCL))
4564 n = m_mbigget(n, how);
4565 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
4566 n = m_m16kget(n, how);
4567 if (!(n->m_flags & M_EXT)) {
4568 (void) m_free(n);
4569 goto nospace;
4570 }
4571 }
4572 *np = n;
4573 if (copyhdr) {
4574 /* Don't use M_COPY_PKTHDR: preserve m_data */
4575 m_dup_pkthdr(n, m, how);
4576 copyhdr = 0;
4577 if (!(n->m_flags & M_EXT))
4578 n->m_data = n->m_pktdat;
4579 }
4580 n->m_len = m->m_len;
4581 /*
4582 * Get the dup on the same boundary as the original.
4583 * Assume that the two mbufs have the same offset to data area
4584 * (up to word boundaries).
4585 */
4586 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
4587 m = m->m_next;
4588 np = &n->m_next;
4589 #if BLUE_DEBUG
4590 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
4591 n->m_data);
4592 #endif
4593 }
4594
4595 if (top == NULL)
4596 MDFail++;
4597 return (top);
4598
4599 nospace:
4600 m_freem(top);
4601 MDFail++;
4602 return (NULL);
4603 }
4604
4605 #define MBUF_MULTIPAGES(m) \
4606 (((m)->m_flags & M_EXT) && \
4607 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \
4608 (!IS_P2ALIGNED((m)->m_data, NBPG) && \
4609 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
4610
4611 static struct mbuf *
4612 m_expand(struct mbuf *m, struct mbuf **last)
4613 {
4614 struct mbuf *top = NULL;
4615 struct mbuf **nm = &top;
4616 uintptr_t data0, data;
4617 unsigned int len0, len;
4618
4619 VERIFY(MBUF_MULTIPAGES(m));
4620 VERIFY(m->m_next == NULL);
4621 data0 = (uintptr_t)m->m_data;
4622 len0 = m->m_len;
4623 *last = top;
4624
4625 for (;;) {
4626 struct mbuf *n;
4627
4628 data = data0;
4629 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
4630 len = NBPG;
4631 else if (!IS_P2ALIGNED(data, NBPG) &&
4632 P2ROUNDUP(data, NBPG) < (data + len0))
4633 len = P2ROUNDUP(data, NBPG) - data;
4634 else
4635 len = len0;
4636
4637 VERIFY(len > 0);
4638 VERIFY(m->m_flags & M_EXT);
4639 m->m_data = (void *)data;
4640 m->m_len = len;
4641
4642 *nm = *last = m;
4643 nm = &m->m_next;
4644 m->m_next = NULL;
4645
4646 data0 += len;
4647 len0 -= len;
4648 if (len0 == 0)
4649 break;
4650
4651 n = _M_RETRY(M_DONTWAIT, MT_DATA);
4652 if (n == NULL) {
4653 m_freem(top);
4654 top = *last = NULL;
4655 break;
4656 }
4657
4658 n->m_ext = m->m_ext;
4659 m_incref(m);
4660 n->m_flags |= M_EXT;
4661 m = n;
4662 }
4663 return (top);
4664 }
4665
4666 struct mbuf *
4667 m_normalize(struct mbuf *m)
4668 {
4669 struct mbuf *top = NULL;
4670 struct mbuf **nm = &top;
4671 boolean_t expanded = FALSE;
4672
4673 while (m != NULL) {
4674 struct mbuf *n;
4675
4676 n = m->m_next;
4677 m->m_next = NULL;
4678
4679 /* Does the data cross one or more page boundaries? */
4680 if (MBUF_MULTIPAGES(m)) {
4681 struct mbuf *last;
4682 if ((m = m_expand(m, &last)) == NULL) {
4683 m_freem(n);
4684 m_freem(top);
4685 top = NULL;
4686 break;
4687 }
4688 *nm = m;
4689 nm = &last->m_next;
4690 expanded = TRUE;
4691 } else {
4692 *nm = m;
4693 nm = &m->m_next;
4694 }
4695 m = n;
4696 }
4697 if (expanded)
4698 atomic_add_32(&mb_normalized, 1);
4699 return (top);
4700 }
4701
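/*
 * Illustrative sketch, not part of the original file: a driver whose DMA
 * engine cannot handle buffers spanning page boundaries normalizing a
 * chain before mapping it for transmit.  The function name is
 * hypothetical.
 */
#if 0	/* example only */
static struct mbuf *
example_prepare_for_dma(struct mbuf *m)
{
	/* On failure the chain is freed and NULL is returned */
	return (m_normalize(m));
}
#endif
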
4702 void
4703 m_mchtype(struct mbuf *m, int t)
4704 {
4705 mtype_stat_inc(t);
4706 mtype_stat_dec(m->m_type);
4707 (m)->m_type = t;
4708 }
4709
4710 void *
4711 m_mtod(struct mbuf *m)
4712 {
4713 return (MTOD(m, void *));
4714 }
4715
4716 struct mbuf *
4717 m_dtom(void *x)
4718 {
4719 return ((struct mbuf *)((u_long)(x) & ~(MSIZE-1)));
4720 }
4721
4722 void
4723 m_mcheck(struct mbuf *m)
4724 {
4725 _MCHECK(m);
4726 }
4727
4728 /*
4729 * Inform the corresponding mcache(s) that there's a waiter below.
4730 */
4731 static void
4732 mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
4733 {
4734 mcache_waiter_inc(m_cache(class));
4735 if (comp) {
4736 if (class == MC_CL) {
4737 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4738 } else if (class == MC_BIGCL) {
4739 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4740 } else if (class == MC_16KCL) {
4741 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
4742 } else {
4743 mcache_waiter_inc(m_cache(MC_MBUF_CL));
4744 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
4745 }
4746 }
4747 }
4748
4749 /*
4750 * Inform the corresponding mcache(s) that there's no more waiter below.
4751 */
4752 static void
4753 mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
4754 {
4755 mcache_waiter_dec(m_cache(class));
4756 if (comp) {
4757 if (class == MC_CL) {
4758 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4759 } else if (class == MC_BIGCL) {
4760 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4761 } else if (class == MC_16KCL) {
4762 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
4763 } else {
4764 mcache_waiter_dec(m_cache(MC_MBUF_CL));
4765 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
4766 }
4767 }
4768 }
4769
4770 /*
4771 * Called during blocking allocation. Returns TRUE if one or more objects
4772 * are available at the per-CPU caches layer and that allocation should be
4773 * retried at that level.
4774 */
4775 static boolean_t
4776 mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
4777 {
4778 boolean_t mcache_retry = FALSE;
4779
4780 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4781
4782 /* Check if there's anything at the cache layer */
4783 if (mbuf_cached_above(class, wait)) {
4784 mcache_retry = TRUE;
4785 goto done;
4786 }
4787
4788 /* Nothing? Then try hard to get it from somewhere */
4789 m_reclaim(class, num, (wait & MCR_COMP));
4790
4791 /* We tried hard and got something? */
4792 if (m_infree(class) > 0) {
4793 mbstat.m_wait++;
4794 goto done;
4795 } else if (mbuf_cached_above(class, wait)) {
4796 mbstat.m_wait++;
4797 mcache_retry = TRUE;
4798 goto done;
4799 } else if (wait & MCR_TRYHARD) {
4800 mcache_retry = TRUE;
4801 goto done;
4802 }
4803
4804 /*
4805 * There's really nothing for us right now; inform the
4806 * cache(s) that there is a waiter below and go to sleep.
4807 */
4808 mbuf_waiter_inc(class, (wait & MCR_COMP));
4809
4810 VERIFY(!(wait & MCR_NOSLEEP));
4811 mb_waiters++;
4812 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
4813
4814 /* We are now up; stop getting notified until next round */
4815 mbuf_waiter_dec(class, (wait & MCR_COMP));
4816
4817 /* We waited and got something */
4818 if (m_infree(class) > 0) {
4819 mbstat.m_wait++;
4820 goto done;
4821 } else if (mbuf_cached_above(class, wait)) {
4822 mbstat.m_wait++;
4823 mcache_retry = TRUE;
4824 }
4825 done:
4826 return (mcache_retry);
4827 }
4828
4829 static void
4830 mbuf_worker_thread(void)
4831 {
4832 int mbuf_expand;
4833
4834 while (1) {
4835 lck_mtx_lock(mbuf_mlock);
4836
4837 mbuf_expand = 0;
4838 if (mbuf_expand_mcl) {
4839 int n;
4840
4841 /* Adjust to current number of clusters in use */
4842 n = mbuf_expand_mcl -
4843 (m_total(MC_CL) - m_infree(MC_CL));
4844 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
4845 n = m_maxlimit(MC_CL) - m_total(MC_CL);
4846 mbuf_expand_mcl = 0;
4847
4848 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
4849 mbuf_expand++;
4850 }
4851 if (mbuf_expand_big) {
4852 int n;
4853
4854 /* Adjust to current number of 4 KB clusters in use */
4855 n = mbuf_expand_big -
4856 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
4857 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
4858 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
4859 mbuf_expand_big = 0;
4860
4861 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
4862 mbuf_expand++;
4863 }
4864 if (mbuf_expand_16k) {
4865 int n;
4866
4867 /* Adjust to current number of 16 KB clusters in use */
4868 n = mbuf_expand_16k -
4869 (m_total(MC_16KCL) - m_infree(MC_16KCL));
4870 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
4871 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
4872 mbuf_expand_16k = 0;
4873
4874 if (n > 0)
4875 (void) freelist_populate(MC_16KCL, n, M_WAIT);
4876 }
4877
4878 /*
4879 * Because we can run out of memory before filling the mbuf
4880 * map, we should not allocate more clusters than there are
4881 * mbufs -- otherwise we could have a large number of useless
4882 * clusters allocated.
4883 */
4884 if (mbuf_expand) {
4885 while (m_total(MC_MBUF) <
4886 (m_total(MC_BIGCL) + m_total(MC_CL))) {
4887 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
4888 break;
4889 }
4890 }
4891
4892 lck_mtx_unlock(mbuf_mlock);
4893
4894 assert_wait(&mbuf_worker_run, THREAD_UNINT);
4895 (void) thread_block((thread_continue_t)mbuf_worker_thread);
4896 }
4897 }
4898
4899 static void
4900 mbuf_worker_thread_init(void)
4901 {
4902 mbuf_worker_ready++;
4903 mbuf_worker_thread();
4904 }
4905
4906 static mcl_slab_t *
4907 slab_get(void *buf)
4908 {
4909 mcl_slabg_t *slg;
4910 unsigned int ix, k;
4911
4912 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4913
4914 VERIFY(MBUF_IN_MAP(buf));
4915 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
4916 VERIFY(ix < maxslabgrp);
4917
4918 if ((slg = slabstbl[ix]) == NULL) {
4919 /*
4920 * In the current implementation, we never shrink the memory
4921 * pool (hence the cluster map); if we attempt to reallocate
4922 * a cluster group when it's already allocated, panic since
4923 * this is a sign of a memory corruption (slabstbl[ix] got
4924 * nullified). This also means that there shouldn't be any
4925 * hole in the kernel sub-map for the mbuf pool.
4926 */
4927 ++slabgrp;
4928 VERIFY(ix < slabgrp);
4929 /*
4930 * Slabs expansion can only be done single threaded; when
4931 * we get here, it must be as a result of m_clalloc() which
4932 * is serialized and therefore mb_clalloc_busy must be set.
4933 */
4934 VERIFY(mb_clalloc_busy);
4935 lck_mtx_unlock(mbuf_mlock);
4936
4937 /* This is a new buffer; create the slabs group for it */
4938 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
4939 M_WAITOK | M_ZERO);
4940 VERIFY(slg != NULL);
4941
4942 lck_mtx_lock(mbuf_mlock);
4943 /*
4944 * No other thread could have gone into m_clalloc() after
4945 * we dropped the lock above, so verify that it's true.
4946 */
4947 VERIFY(mb_clalloc_busy);
4948
4949 slabstbl[ix] = slg;
4950
4951 /* Chain each slab in the group to its forward neighbor */
4952 for (k = 1; k < NSLABSPMB; k++)
4953 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
4954 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
4955
4956 /* And chain the last slab in the previous group to this */
4957 if (ix > 0) {
4958 VERIFY(slabstbl[ix - 1]->
4959 slg_slab[NSLABSPMB - 1].sl_next == NULL);
4960 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
4961 &slg->slg_slab[0];
4962 }
4963 }
4964
4965 ix = MTOCL(buf) % NSLABSPMB;
4966 VERIFY(ix < NSLABSPMB);
4967
4968 return (&slg->slg_slab[ix]);
4969 }
4970
4971 static void
4972 slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
4973 void *base, void *head, unsigned int len, int refcnt, int chunks)
4974 {
4975 sp->sl_class = class;
4976 sp->sl_flags = flags;
4977 sp->sl_base = base;
4978 sp->sl_head = head;
4979 sp->sl_len = len;
4980 sp->sl_refcnt = refcnt;
4981 sp->sl_chunks = chunks;
4982 slab_detach(sp);
4983 }
4984
4985 static void
4986 slab_insert(mcl_slab_t *sp, mbuf_class_t class)
4987 {
4988 VERIFY(slab_is_detached(sp));
4989 m_slab_cnt(class)++;
4990 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
4991 sp->sl_flags &= ~SLF_DETACHED;
4992 if (class == MC_BIGCL) {
4993 sp = sp->sl_next;
4994 /* Next slab must already be present */
4995 VERIFY(sp != NULL);
4996 VERIFY(slab_is_detached(sp));
4997 sp->sl_flags &= ~SLF_DETACHED;
4998 } else if (class == MC_16KCL) {
4999 int k;
5000 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5001 sp = sp->sl_next;
5002 /* Next slab must already be present */
5003 VERIFY(sp != NULL);
5004 VERIFY(slab_is_detached(sp));
5005 sp->sl_flags &= ~SLF_DETACHED;
5006 }
5007 }
5008 }
5009
5010 static void
5011 slab_remove(mcl_slab_t *sp, mbuf_class_t class)
5012 {
5013 VERIFY(!slab_is_detached(sp));
5014 VERIFY(m_slab_cnt(class) > 0);
5015 m_slab_cnt(class)--;
5016 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
5017 slab_detach(sp);
5018 if (class == MC_BIGCL) {
5019 sp = sp->sl_next;
5020 /* Next slab must already be present */
5021 VERIFY(sp != NULL);
5022 VERIFY(!slab_is_detached(sp));
5023 slab_detach(sp);
5024 } else if (class == MC_16KCL) {
5025 int k;
5026 for (k = 1; k < (M16KCLBYTES / MCLBYTES); k++) {
5027 sp = sp->sl_next;
5028 /* Next slab must already be present */
5029 VERIFY(sp != NULL);
5030 VERIFY(!slab_is_detached(sp));
5031 slab_detach(sp);
5032 }
5033 }
5034 }
5035
5036 static boolean_t
5037 slab_inrange(mcl_slab_t *sp, void *buf)
5038 {
5039 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
5040 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
5041 }
5042
5044
5045 static void
5046 slab_nextptr_panic(mcl_slab_t *sp, void *addr)
5047 {
5048 int i;
5049 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
5050 uintptr_t buf = (uintptr_t)sp->sl_base;
5051
5052 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
5053 void *next = ((mcache_obj_t *)buf)->obj_next;
5054 if (next != addr)
5055 continue;
5056 if (mclaudit == NULL) {
5057 if (next != NULL && !MBUF_IN_MAP(next)) {
5058 mcache_t *cp = m_cache(sp->sl_class);
5059 panic("%s: %s buffer %p in slab %p modified "
5060 "after free at offset 0: %p out of range "
5061 "[%p-%p)\n", __func__, cp->mc_name,
5062 (void *)buf, sp, next, mbutl, embutl);
5063 /* NOTREACHED */
5064 }
5065 } else {
5066 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
5067 (mcache_obj_t *)buf);
5068 mcl_audit_verify_nextptr(next, mca);
5069 }
5070 }
5071 }
5072
5073 static void
5074 slab_detach(mcl_slab_t *sp)
5075 {
5076 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
5077 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
5078 sp->sl_flags |= SLF_DETACHED;
5079 }
5080
5081 static boolean_t
5082 slab_is_detached(mcl_slab_t *sp)
5083 {
5084 return ((intptr_t)sp->sl_link.tqe_next == -1 &&
5085 (intptr_t)sp->sl_link.tqe_prev == -1 &&
5086 (sp->sl_flags & SLF_DETACHED));
5087 }
5088
5089 static void
5090 mcl_audit_init(void *buf, mcache_audit_t **mca_list,
5091 mcache_obj_t **con_list, size_t con_size, unsigned int num)
5092 {
5093 mcache_audit_t *mca, *mca_tail;
5094 mcache_obj_t *con = NULL;
5095 boolean_t save_contents = (con_list != NULL);
5096 unsigned int i, ix;
5097
5098 ASSERT(num <= NMBPCL);
5099 ASSERT(con_list == NULL || con_size != 0);
5100
5101 ix = MTOCL(buf);
5102 /* Make sure we haven't been here before */
5103 for (i = 0; i < NMBPCL; i++)
5104 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
5105
5106 mca = mca_tail = *mca_list;
5107 if (save_contents)
5108 con = *con_list;
5109
5110 for (i = 0; i < num; i++) {
5111 mcache_audit_t *next;
5112
5113 next = mca->mca_next;
5114 bzero(mca, sizeof (*mca));
5115 mca->mca_next = next;
5116 mclaudit[ix].cl_audit[i] = mca;
5117
5118 /* Attach the contents buffer if requested */
5119 if (save_contents) {
5120 VERIFY(con != NULL);
5121 mca->mca_contents_size = con_size;
5122 mca->mca_contents = con;
5123 con = con->obj_next;
5124 bzero(mca->mca_contents, mca->mca_contents_size);
5125 }
5126
5127 mca_tail = mca;
5128 mca = mca->mca_next;
5129 }
5130
5131 if (save_contents)
5132 *con_list = con;
5133
5134 *mca_list = mca_tail->mca_next;
5135 mca_tail->mca_next = NULL;
5136 }
5137
5138 /*
5139 * Given an address of a buffer (mbuf/cluster/big cluster), return
5140 * the corresponding audit structure for that buffer.
5141 */
5142 static mcache_audit_t *
5143 mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
5144 {
5145 mcache_audit_t *mca = NULL;
5146 int ix = MTOCL(o);
5147
5148 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
5149
5150 switch (class) {
5151 case MC_MBUF:
5152 /*
5153 * For the mbuf case, find the index of the cluster
5154 * used by the mbuf and use that index to locate the
5155 * base address of the cluster. Then find out the
5156 * mbuf index relative to the cluster base and use
5157 * it to locate the audit structure.
5158 */
5159 VERIFY(MCLIDX(CLTOM(ix), o) < (int)NMBPCL);
5160 mca = mclaudit[ix].cl_audit[MCLIDX(CLTOM(ix), o)];
5161 break;
5162
5163 case MC_CL:
5164 case MC_BIGCL:
5165 case MC_16KCL:
5166 /*
5167 * Same as above, but only return the first element.
5168 */
5169 mca = mclaudit[ix].cl_audit[0];
5170 break;
5171
5172 default:
5173 VERIFY(0);
5174 /* NOTREACHED */
5175 }
5176
5177 return (mca);
5178 }
5179
5180 static void
5181 mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
5182 boolean_t alloc)
5183 {
5184 struct mbuf *m = addr;
5185 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
5186
5187 VERIFY(mca->mca_contents != NULL &&
5188 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
5189
5190 mcl_audit_verify_nextptr(next, mca);
5191
5192 if (!alloc) {
5193 /* Save constructed mbuf fields */
5194 mcl_audit_save_mbuf(m, mca);
5195 mcache_set_pattern(MCACHE_FREE_PATTERN, m, m_maxsize(MC_MBUF));
5196 ((mcache_obj_t *)m)->obj_next = next;
5197 return;
5198 }
5199
5200 /* Check if the buffer has been corrupted while in freelist */
5201 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
5202
5203 /* Restore constructed mbuf fields */
5204 mcl_audit_restore_mbuf(m, mca, composite);
5205 }
5206
5207 static void
5208 mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
5209 {
5210 struct mbuf *ms = (struct mbuf *)mca->mca_contents;
5211
5212 if (composite) {
5213 struct mbuf *next = m->m_next;
5214 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
5215 MBUF_IS_COMPOSITE(ms));
5216 /*
5217 * We could have hand-picked the mbuf fields and restored
5218 * them individually, but that would be a maintenance
5219 * headache. Instead, restore everything that was saved;
5220 * the mbuf layer will recheck and reinitialize anyway.
5221 */
5222 bcopy(ms, m, mca->mca_contents_size);
5223 m->m_next = next;
5224 } else {
5225 /*
5226 * For a regular mbuf (no cluster attached) there's nothing
5227 * to restore other than the type field, which is expected
5228 * to be MT_FREE.
5229 */
5230 m->m_type = ms->m_type;
5231 }
5232 _MCHECK(m);
5233 }
5234
5235 static void
5236 mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
5237 {
5238 _MCHECK(m);
5239 bcopy(m, mca->mca_contents, mca->mca_contents_size);
5240 }
5241
5242 static void
5243 mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
5244 boolean_t save_next)
5245 {
5246 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
5247
5248 if (!alloc) {
5249 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
5250 if (save_next) {
5251 mcl_audit_verify_nextptr(next, mca);
5252 ((mcache_obj_t *)addr)->obj_next = next;
5253 }
5254 } else {
5255 /* Check if the buffer has been corrupted while in freelist */
5256 mcl_audit_verify_nextptr(next, mca);
5257 mcache_audit_free_verify_set(mca, addr, 0, size);
5258 }
5259 }
5260
5261 static void
5262 mcl_audit_mcheck_panic(struct mbuf *m)
5263 {
5264 mcache_audit_t *mca;
5265
5266 MRANGE(m);
5267 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
5268
5269 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
5270 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
5271 /* NOTREACHED */
5272 }
5273
5274 static void
5275 mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
5276 {
5277 if (next != NULL && next != (void *)MCACHE_FREE_PATTERN &&
5278 !MBUF_IN_MAP(next)) {
5279 panic("mcl_audit: buffer %p modified after free at offset 0: "
5280 "%p out of range [%p-%p)\n%s\n",
5281 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
5282 /* NOTREACHED */
5283 }
5284 }
5285
5286 SYSCTL_DECL(_kern_ipc);
5287 SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RD | CTLFLAG_LOCKED,
5288 0, 0, mbstat_sysctl, "S,mbstat", "");
5289 SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, CTLFLAG_RD | CTLFLAG_LOCKED,
5290 0, 0, mb_stat_sysctl, "S,mb_stat", "");
5291 SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, CTLFLAG_RD | CTLFLAG_LOCKED,
5292 &mb_normalized, 0, "");