2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
46 * Carnegie Mellon requests users of this software to return to
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
60 * Author: Avadis Tevanian, Jr.
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
76 #include <kern/bits.h>
77 #include <kern/startup.h>
78 #include <kern/kern_types.h>
79 #include <kern/assert.h>
80 #include <kern/backtrace.h>
81 #include <kern/host.h>
82 #include <kern/macro_help.h>
83 #include <kern/sched.h>
84 #include <kern/locks.h>
85 #include <kern/sched_prim.h>
86 #include <kern/misc_protos.h>
87 #include <kern/thread_call.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/kalloc.h>
91 #include <prng/random.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
99 #include <pexpert/pexpert.h>
101 #include <machine/machparam.h>
102 #include <machine/machine_routines.h> /* ml_cpu_get_info */
104 #include <os/atomic.h>
106 #include <libkern/OSDebug.h>
107 #include <libkern/OSAtomic.h>
108 #include <libkern/section_keywords.h>
109 #include <sys/kdebug.h>
111 #include <san/kasan.h>
114 #define ZONE_ENABLE_LOGGING 0
115 #elif DEBUG || DEVELOPMENT
116 #define ZONE_ENABLE_LOGGING 1
118 #define ZONE_ENABLE_LOGGING 0
121 extern void vm_pageout_garbage_collect(int collect
);
123 /* Returns pid of the task with the largest number of VM map entries. */
124 extern pid_t
find_largest_process_vm_map_entries(void);
127 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
128 * For any other pid we try to kill that process synchronously.
130 extern boolean_t
memorystatus_kill_on_zone_map_exhaustion(pid_t pid
);
132 extern zone_t vm_map_entry_zone
;
133 extern zone_t vm_object_zone
;
134 extern vm_offset_t kmapoff_kaddr
;
135 extern unsigned int kmapoff_pgcnt
;
136 extern unsigned int stack_total
;
137 extern unsigned long long stack_allocs
;
140 * The max # of elements in a chunk should fit into
141 * zone_page_metadata.free_count (uint16_t).
143 * Update this if the type of free_count changes.
145 #define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX)
147 #define ZONE_PAGECOUNT_BITS 14
149 /* Zone elements must fit both a next pointer and a backup pointer */
150 #define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t))
151 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
153 /* per-cpu zones are special because of counters */
154 #define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
156 struct zone_map_range
{
157 vm_offset_t min_address
;
158 vm_offset_t max_address
;
161 struct zone_page_metadata
{
162 /* The index of the zone this metadata page belongs to */
166 * zm_secondary_page == 0: number of pages in this run
167 * zm_secondary_page == 1: offset to the chunk start
169 uint16_t zm_page_count
: ZONE_PAGECOUNT_BITS
;
171 /* Whether this page is part of a chunk run */
172 uint16_t zm_percpu
: 1;
173 uint16_t zm_secondary_page
: 1;
176 * The start of the freelist can be maintained as a 16-bit
177 * offset instead of a pointer because the free elements would
178 * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
179 * of the allocation chunk.
181 * Offset from start of the allocation chunk to free element
184 uint16_t zm_freelist_offs
;
187 * zm_secondary_page == 0: number of allocated elements in the chunk
188 * zm_secondary_page == 1: unused
190 * PAGE_METADATA_EMPTY_FREELIST indicates an empty freelist
192 uint16_t zm_alloc_count
;
193 #define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX
195 zone_pva_t zm_page_next
;
196 zone_pva_t zm_page_prev
;
199 * This is only for the sake of debuggers
201 #define ZONE_FOREIGN_COOKIE 0x123456789abcdef
202 uint64_t zm_foreign_cookie
[];
206 /* Align elements that use the zone page list to 32 byte boundaries. */
207 #define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
209 static_assert(sizeof(struct zone_page_metadata
) == 16, "validate packing");
211 static __security_const_late
struct {
212 struct zone_map_range zi_map_range
;
213 struct zone_map_range zi_general_range
;
214 struct zone_map_range zi_meta_range
;
215 struct zone_map_range zi_foreign_range
;
218 * The metadata lives within the zi_meta_range address range.
220 * The correct formula to find a metadata index is:
221 * absolute_page_index - page_index(zi_meta_range.min_address)
223 * And then this index is used to dereference zi_meta_range.min_address
224 * as a `struct zone_page_metadata` array.
226 * To avoid doing that substraction all the time in the various fast-paths,
227 * zi_array_base is offset by `page_index(zi_meta_range.min_address)`
228 * to avoid redoing that math all the time.
230 struct zone_page_metadata
*zi_array_base
;
234 * The zone_locks_grp allows for collecting lock statistics.
235 * All locks are associated to this group in zinit.
236 * Look at tools/lockstat for debugging lock contention.
238 LCK_GRP_DECLARE(zone_locks_grp
, "zone_locks");
239 LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck
, &zone_locks_grp
);
242 * Exclude more than one concurrent garbage collection
244 LCK_GRP_DECLARE(zone_gc_lck_grp
, "zone_gc");
245 LCK_MTX_EARLY_DECLARE(zone_gc_lock
, &zone_gc_lck_grp
);
247 boolean_t panic_include_zprint
= FALSE
;
248 mach_memory_info_t
*panic_kext_memory_info
= NULL
;
249 vm_size_t panic_kext_memory_size
= 0;
252 * Protects zone_array, num_zones, num_zones_in_use, and
253 * zone_destroyed_bitmap
255 static SIMPLE_LOCK_DECLARE(all_zones_lock
, 0);
256 static unsigned int num_zones_in_use
;
257 unsigned int _Atomic num_zones
;
258 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count
;
261 #define MAX_ZONES 566
262 #else /* !KASAN_ZALLOC */
263 #define MAX_ZONES 402
264 #endif/* !KASAN_ZALLOC */
265 struct zone zone_array
[MAX_ZONES
];
267 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
268 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count
;
270 /* Used to keep track of destroyed slots in the zone_array */
271 static bitmap_t zone_destroyed_bitmap
[BITMAP_LEN(MAX_ZONES
)];
273 /* number of pages used by all zones */
274 static long _Atomic zones_phys_page_count
;
276 /* number of zone mapped pages used by all zones */
277 static long _Atomic zones_phys_page_mapped_count
;
280 * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not
281 * not break third party kexts that haven't yet been recompiled
282 * to use the new iokit macros.
284 #if XNU_TARGET_OS_OSX && __x86_64__
285 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0
287 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \
288 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE
291 #define ZSECURITY_DEFAULT ( \
292 ZSECURITY_OPTIONS_SEQUESTER | \
293 ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \
294 ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \
295 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \
297 TUNABLE(zone_security_options_t
, zsecurity_options
, "zs", ZSECURITY_DEFAULT
);
300 /* enable tags for zones that ask for it */
301 TUNABLE(bool, zone_tagging_on
, "-zt", false);
302 #endif /* VM_MAX_TAG_ZONES */
304 #if DEBUG || DEVELOPMENT
305 TUNABLE(bool, zalloc_disable_copyio_check
, "-no-copyio-zalloc-check", false);
306 __options_decl(zalloc_debug_t
, uint32_t, {
307 ZALLOC_DEBUG_ZONEGC
= 0x00000001,
308 ZALLOC_DEBUG_ZCRAM
= 0x00000002,
311 TUNABLE(zalloc_debug_t
, zalloc_debug
, "zalloc_debug", 0);
312 #endif /* DEBUG || DEVELOPMENT */
314 /* Making pointer scanning leaks detection possible for all zones */
315 TUNABLE(bool, zone_leaks_scan_enable
, "-zl", false);
317 #define zone_leaks_scan_enable false
321 * Async allocation of zones
322 * This mechanism allows for bootstrapping an empty zone which is setup with
323 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
324 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
325 * This will prime the zone for the next use.
327 * Currently the thread_callout function (zalloc_async) will loop through all zones
328 * looking for any zone with async_pending set and do the work for it.
330 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
331 * then zalloc_noblock to an empty zone may succeed.
333 static void zalloc_async(thread_call_param_t p0
, thread_call_param_t p1
);
334 static thread_call_data_t call_async_alloc
;
335 static void zcram_and_lock(zone_t zone
, vm_offset_t newmem
, vm_size_t size
);
338 * Zone Corruption Debugging
340 * We use four techniques to detect modification of a zone element
341 * after it's been freed.
343 * (1) Check the freelist next pointer for sanity.
344 * (2) Store a backup of the next pointer at the end of the element,
345 * and compare it to the primary next pointer when the element is allocated
346 * to detect corruption of the freelist due to use-after-free bugs.
347 * The backup pointer is also XORed with a per-boot random cookie.
348 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
349 * and check for that value when the element is being reused to make sure
350 * no part of the element has been modified while it was on the freelist.
351 * This will also help catch read-after-frees, as code will now dereference
352 * 0xdeadbeef instead of a valid but freed pointer.
353 * (4) If the zfree_clear_mem flag is set clear the element on free and
354 * assert that it is still clear when alloc-ed.
356 * (1) and (2) occur for every allocation and free to a zone.
357 * This is done to make it slightly more difficult for an attacker to
358 * manipulate the freelist to behave in a specific way.
360 * Poisoning (3) occurs periodically for every N frees (counted per-zone).
361 * If -zp is passed as a boot arg, poisoning occurs for every free.
363 * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
364 * flag on creation or if the element size is less than one cacheline.
366 * Performance slowdown is inversely proportional to the frequency of poisoning,
367 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
368 * and higher. You can expect to find a 100% reproducible bug in an average of
369 * N tries, with a standard deviation of about N, but you will want to set
370 * "-zp" to always poison every free if you are attempting to reproduce
373 * For a more heavyweight, but finer-grained method of detecting misuse
374 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
376 * Zone Corruption Logging
378 * You can also track where corruptions come from by using the boot-arguments
379 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
380 * in this document for more implementation and usage information.
382 * Zone Leak Detection
384 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
385 * found later in this file via the showtopztrace and showz* macros in kgmacros,
386 * or use zlog without the -zc argument.
390 #define ZP_DEFAULT_SAMPLING_FACTOR 16
391 #define ZP_DEFAULT_SCALE_FACTOR 4
394 * set by zp-factor=N boot arg
396 * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by
397 * passing the -no-zp boot-arg.
399 * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
400 * set by passing the -zp boot-arg.
402 static TUNABLE(uint32_t, zp_factor
, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR
);
404 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
405 static TUNABLE(uint32_t, zp_scale
, "zp-scale", ZP_DEFAULT_SCALE_FACTOR
);
407 /* initialized to a per-boot random value in zp_bootstrap */
408 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie
;
409 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie
;
410 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size
;
411 static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max
;
413 static SECURITY_READ_ONLY_LATE(vm_map_t
) zone_submaps
[Z_SUBMAP_IDX_COUNT
];
414 static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx
;
416 static struct bool_gen zone_bool_gen
;
417 static zone_t
zone_find_largest(void);
418 static void zone_drop_free_elements(zone_t z
);
420 #define submap_for_zone(z) zone_submaps[(z)->submap_idx]
421 #define MAX_SUBMAP_NAME 16
423 /* Globals for random boolean generator for elements in free list */
424 #define MAX_ENTROPY_PER_ZCRAM 4
428 * Specifies a single zone to enable CPU caching for.
429 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
431 static char cache_zone_name
[MAX_ZONE_NAME
];
432 static TUNABLE(bool, zcc_kalloc
, "zcc_kalloc", false);
434 __header_always_inline
bool
435 zone_caching_enabled(zone_t z
)
437 return z
->zcache
.zcc_depot
!= NULL
;
440 __header_always_inline
bool
441 zone_caching_enabled(zone_t z __unused
)
445 #endif /* CONFIG_ZCACHE */
447 #pragma mark Zone metadata
449 __enum_closed_decl(zone_addr_kind_t
, bool, {
454 static inline zone_id_t
457 return (zone_id_t
)(z
- zone_array
);
461 zone_has_index(zone_t z
, zone_id_t zid
)
463 return zone_array
+ zid
== z
;
466 static inline vm_size_t
467 zone_elem_count(zone_t zone
, vm_size_t alloc_size
, zone_addr_kind_t kind
)
469 if (kind
== ZONE_ADDR_NATIVE
) {
471 return PAGE_SIZE
/ zone_elem_size(zone
);
473 return alloc_size
/ zone_elem_size(zone
);
475 assert(alloc_size
== PAGE_SIZE
);
476 return (PAGE_SIZE
- ZONE_PAGE_FIRST_OFFSET(kind
)) / zone_elem_size(zone
);
482 zone_metadata_corruption(zone_t zone
, struct zone_page_metadata
*meta
,
485 panic("zone metadata corruption: %s (meta %p, zone %s%s)",
486 kind
, meta
, zone_heap_name(zone
), zone
->z_name
);
491 zone_invalid_element_addr_panic(zone_t zone
, vm_offset_t addr
)
493 panic("zone element pointer validation failed (addr: %p, zone %s%s)",
494 (void *)addr
, zone_heap_name(zone
), zone
->z_name
);
499 zone_page_metadata_index_confusion_panic(zone_t zone
, vm_offset_t addr
,
500 struct zone_page_metadata
*meta
)
502 panic("%p not in the expected zone %s%s (%d != %d)",
503 (void *)addr
, zone_heap_name(zone
), zone
->z_name
,
504 meta
->zm_index
, zone_index(zone
));
509 zone_page_metadata_native_queue_corruption(zone_t zone
, zone_pva_t
*queue
)
511 panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
512 queue
->packed_address
, queue
, zone_heap_name(zone
),
518 zone_page_metadata_list_corruption(zone_t zone
, struct zone_page_metadata
*meta
)
520 panic("metadata list corruption through element %p detected in zone %s%s",
521 meta
, zone_heap_name(zone
), zone
->z_name
);
526 zone_page_metadata_foreign_queue_corruption(zone_t zone
, zone_pva_t
*queue
)
528 panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
529 queue
->packed_address
, queue
, zone_heap_name(zone
), zone
->z_name
);
534 zone_page_metadata_foreign_confusion_panic(zone_t zone
, vm_offset_t addr
)
536 panic("manipulating foreign address %p in a native-only zone %s%s",
537 (void *)addr
, zone_heap_name(zone
), zone
->z_name
);
542 zone_invalid_foreign_addr_panic(zone_t zone
, vm_offset_t addr
)
544 panic("addr %p being freed to foreign zone %s%s not from foreign range",
545 (void *)addr
, zone_heap_name(zone
), zone
->z_name
);
550 zone_page_meta_accounting_panic(zone_t zone
, struct zone_page_metadata
*meta
,
553 panic("accounting mismatch (%s) for zone %s%s, meta %p", kind
,
554 zone_heap_name(zone
), zone
->z_name
, meta
);
559 zone_accounting_panic(zone_t zone
, const char *kind
)
561 panic("accounting mismatch (%s) for zone %s%s", kind
,
562 zone_heap_name(zone
), zone
->z_name
);
567 zone_nofail_panic(zone_t zone
)
569 panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
570 zone_heap_name(zone
), zone
->z_name
);
574 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
575 #define zone_range_load(r, rmin, rmax) \
576 asm("ldp %[rmin], %[rmax], [%[range]]" \
577 : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
580 #define zone_range_load(r, rmin, rmax) \
581 ({ rmin = (r)->min_address; rmax = (r)->max_address; })
584 __header_always_inline
bool
585 zone_range_contains(const struct zone_map_range
*r
, vm_offset_t addr
, vm_offset_t size
)
587 vm_offset_t rmin
, rmax
;
590 * The `&` is not a typo: we really expect the check to pass,
591 * so encourage the compiler to eagerly load and test without branches
593 zone_range_load(r
, rmin
, rmax
);
594 return (addr
>= rmin
) & (addr
+ size
>= rmin
) & (addr
+ size
<= rmax
);
597 __header_always_inline vm_size_t
598 zone_range_size(const struct zone_map_range
*r
)
600 vm_offset_t rmin
, rmax
;
602 zone_range_load(r
, rmin
, rmax
);
606 #define from_zone_map(addr, size) \
607 zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
609 #define from_general_submap(addr, size) \
610 zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
612 #define from_foreign_range(addr, size) \
613 zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
615 #define from_native_meta_map(addr) \
616 zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
617 sizeof(struct zone_page_metadata))
619 #define zone_addr_kind(addr, size) \
620 (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
622 __header_always_inline
bool
623 zone_pva_is_null(zone_pva_t page
)
625 return page
.packed_address
== 0;
628 __header_always_inline
bool
629 zone_pva_is_queue(zone_pva_t page
)
631 // actual kernel pages have the top bit set
632 return (int32_t)page
.packed_address
> 0;
635 __header_always_inline
bool
636 zone_pva_is_equal(zone_pva_t pva1
, zone_pva_t pva2
)
638 return pva1
.packed_address
== pva2
.packed_address
;
641 __header_always_inline
void
642 zone_queue_set_head(zone_t z
, zone_pva_t queue
, zone_pva_t oldv
,
643 struct zone_page_metadata
*meta
)
645 zone_pva_t
*queue_head
= &((zone_pva_t
*)zone_array
)[queue
.packed_address
];
647 if (!zone_pva_is_equal(*queue_head
, oldv
)) {
648 zone_page_metadata_list_corruption(z
, meta
);
650 *queue_head
= meta
->zm_page_next
;
653 __header_always_inline zone_pva_t
654 zone_queue_encode(zone_pva_t
*headp
)
656 return (zone_pva_t
){ (uint32_t)(headp
- (zone_pva_t
*)zone_array
) };
659 __header_always_inline zone_pva_t
660 zone_pva_from_addr(vm_address_t addr
)
662 // cannot use atop() because we want to maintain the sign bit
663 return (zone_pva_t
){ (uint32_t)((intptr_t)addr
>> PAGE_SHIFT
) };
666 __header_always_inline vm_address_t
667 zone_pva_to_addr(zone_pva_t page
)
669 // cause sign extension so that we end up with the right address
670 return (vm_offset_t
)(int32_t)page
.packed_address
<< PAGE_SHIFT
;
673 __header_always_inline
struct zone_page_metadata
*
674 zone_pva_to_meta(zone_pva_t page
, zone_addr_kind_t kind
)
676 if (kind
== ZONE_ADDR_NATIVE
) {
677 return &zone_info
.zi_array_base
[page
.packed_address
];
679 return (struct zone_page_metadata
*)zone_pva_to_addr(page
);
683 __header_always_inline zone_pva_t
684 zone_pva_from_meta(struct zone_page_metadata
*meta
, zone_addr_kind_t kind
)
686 if (kind
== ZONE_ADDR_NATIVE
) {
687 uint32_t index
= (uint32_t)(meta
- zone_info
.zi_array_base
);
688 return (zone_pva_t
){ index
};
690 return zone_pva_from_addr((vm_address_t
)meta
);
694 __header_always_inline
struct zone_page_metadata
*
695 zone_meta_from_addr(vm_offset_t addr
, zone_addr_kind_t kind
)
697 if (kind
== ZONE_ADDR_NATIVE
) {
698 return zone_pva_to_meta(zone_pva_from_addr(addr
), kind
);
700 return (struct zone_page_metadata
*)trunc_page(addr
);
704 #define zone_native_meta_from_addr(addr) \
705 zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
707 __header_always_inline vm_offset_t
708 zone_meta_to_addr(struct zone_page_metadata
*meta
, zone_addr_kind_t kind
)
710 if (kind
== ZONE_ADDR_NATIVE
) {
711 return ptoa((int)(meta
- zone_info
.zi_array_base
));
713 return (vm_offset_t
)meta
;
717 __header_always_inline
void
718 zone_meta_queue_push(zone_t z
, zone_pva_t
*headp
,
719 struct zone_page_metadata
*meta
, zone_addr_kind_t kind
)
721 zone_pva_t head
= *headp
;
722 zone_pva_t queue_pva
= zone_queue_encode(headp
);
723 struct zone_page_metadata
*tmp
;
725 meta
->zm_page_next
= head
;
726 if (!zone_pva_is_null(head
)) {
727 tmp
= zone_pva_to_meta(head
, kind
);
728 if (!zone_pva_is_equal(tmp
->zm_page_prev
, queue_pva
)) {
729 zone_page_metadata_list_corruption(z
, meta
);
731 tmp
->zm_page_prev
= zone_pva_from_meta(meta
, kind
);
733 meta
->zm_page_prev
= queue_pva
;
734 *headp
= zone_pva_from_meta(meta
, kind
);
737 __header_always_inline
struct zone_page_metadata
*
738 zone_meta_queue_pop(zone_t z
, zone_pva_t
*headp
, zone_addr_kind_t kind
,
739 vm_offset_t
*page_addrp
)
741 zone_pva_t head
= *headp
;
742 struct zone_page_metadata
*meta
= zone_pva_to_meta(head
, kind
);
743 vm_offset_t page_addr
= zone_pva_to_addr(head
);
744 struct zone_page_metadata
*tmp
;
746 if (kind
== ZONE_ADDR_NATIVE
&& !from_native_meta_map(meta
)) {
747 zone_page_metadata_native_queue_corruption(z
, headp
);
749 if (kind
== ZONE_ADDR_FOREIGN
&& from_zone_map(meta
, sizeof(*meta
))) {
750 zone_page_metadata_foreign_queue_corruption(z
, headp
);
753 if (!zone_pva_is_null(meta
->zm_page_next
)) {
754 tmp
= zone_pva_to_meta(meta
->zm_page_next
, kind
);
755 if (!zone_pva_is_equal(tmp
->zm_page_prev
, head
)) {
756 zone_page_metadata_list_corruption(z
, meta
);
758 tmp
->zm_page_prev
= meta
->zm_page_prev
;
760 *headp
= meta
->zm_page_next
;
762 *page_addrp
= page_addr
;
766 __header_always_inline
void
767 zone_meta_requeue(zone_t z
, zone_pva_t
*headp
,
768 struct zone_page_metadata
*meta
, zone_addr_kind_t kind
)
770 zone_pva_t meta_pva
= zone_pva_from_meta(meta
, kind
);
771 struct zone_page_metadata
*tmp
;
773 if (!zone_pva_is_null(meta
->zm_page_next
)) {
774 tmp
= zone_pva_to_meta(meta
->zm_page_next
, kind
);
775 if (!zone_pva_is_equal(tmp
->zm_page_prev
, meta_pva
)) {
776 zone_page_metadata_list_corruption(z
, meta
);
778 tmp
->zm_page_prev
= meta
->zm_page_prev
;
780 if (zone_pva_is_queue(meta
->zm_page_prev
)) {
781 zone_queue_set_head(z
, meta
->zm_page_prev
, meta_pva
, meta
);
783 tmp
= zone_pva_to_meta(meta
->zm_page_prev
, kind
);
784 if (!zone_pva_is_equal(tmp
->zm_page_next
, meta_pva
)) {
785 zone_page_metadata_list_corruption(z
, meta
);
787 tmp
->zm_page_next
= meta
->zm_page_next
;
790 zone_meta_queue_push(z
, headp
, meta
, kind
);
794 * Routine to populate a page backing metadata in the zone_metadata_region.
795 * Must be called without the zone lock held as it might potentially block.
798 zone_meta_populate(struct zone_page_metadata
*from
, struct zone_page_metadata
*to
)
800 vm_offset_t page_addr
= trunc_page(from
);
802 for (; page_addr
< (vm_offset_t
)to
; page_addr
+= PAGE_SIZE
) {
805 * This can race with another thread doing a populate on the same metadata
806 * page, where we see an updated pmap but unmapped KASan shadow, causing a
807 * fault in the shadow when we first access the metadata page. Avoid this
808 * by always synchronizing on the zone_metadata_region lock with KASan.
810 if (pmap_find_phys(kernel_pmap
, page_addr
)) {
816 kern_return_t ret
= KERN_SUCCESS
;
818 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
819 lck_mtx_lock(&zone_metadata_region_lck
);
820 if (0 == pmap_find_phys(kernel_pmap
, page_addr
)) {
821 ret
= kernel_memory_populate(kernel_map
, page_addr
,
822 PAGE_SIZE
, KMA_NOPAGEWAIT
| KMA_KOBJECT
| KMA_ZERO
,
823 VM_KERN_MEMORY_OSFMK
);
825 lck_mtx_unlock(&zone_metadata_region_lck
);
827 if (ret
== KERN_SUCCESS
) {
832 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
833 * to bad system deadlocks, so if the allocation failed,
834 * we need to do the VM_PAGE_WAIT() outside of the lock.
842 zone_allocated_element_offset_is_valid(zone_t zone
, vm_offset_t addr
,
843 vm_offset_t page
, zone_addr_kind_t kind
)
845 vm_offset_t offs
= addr
- page
- ZONE_PAGE_FIRST_OFFSET(kind
);
846 vm_offset_t esize
= zone_elem_size(zone
);
848 if (esize
& (esize
- 1)) { /* not a power of 2 */
849 return (offs
% esize
) == 0;
851 return (offs
& (esize
- 1)) == 0;
855 __attribute__((always_inline
))
856 static struct zone_page_metadata
*
857 zone_allocated_element_resolve(zone_t zone
, vm_offset_t addr
,
858 vm_offset_t
*pagep
, zone_addr_kind_t
*kindp
)
860 struct zone_page_metadata
*meta
;
861 zone_addr_kind_t kind
;
863 vm_offset_t esize
= zone_elem_size(zone
);
865 kind
= zone_addr_kind(addr
, esize
);
866 page
= trunc_page(addr
);
867 meta
= zone_meta_from_addr(addr
, kind
);
869 if (kind
== ZONE_ADDR_NATIVE
) {
870 if (meta
->zm_secondary_page
) {
871 if (meta
->zm_percpu
) {
872 zone_invalid_element_addr_panic(zone
, addr
);
874 page
-= ptoa(meta
->zm_page_count
);
875 meta
-= meta
->zm_page_count
;
877 } else if (!zone
->allows_foreign
) {
878 zone_page_metadata_foreign_confusion_panic(zone
, addr
);
880 } else if (!from_foreign_range(addr
, esize
)) {
881 zone_invalid_foreign_addr_panic(zone
, addr
);
883 } else if (!pmap_kernel_va(addr
)) {
884 zone_invalid_element_addr_panic(zone
, addr
);
888 if (!zone_allocated_element_offset_is_valid(zone
, addr
, page
, kind
)) {
889 zone_invalid_element_addr_panic(zone
, addr
);
892 if (!zone_has_index(zone
, meta
->zm_index
)) {
893 zone_page_metadata_index_confusion_panic(zone
, addr
, meta
);
905 __attribute__((always_inline
))
907 zone_allocated_element_validate(zone_t zone
, vm_offset_t addr
)
909 zone_allocated_element_resolve(zone
, addr
, NULL
, NULL
);
912 __header_always_inline vm_offset_t
913 zone_page_meta_get_freelist(zone_t zone
, struct zone_page_metadata
*meta
,
916 assert(!meta
->zm_secondary_page
);
917 if (meta
->zm_freelist_offs
== PAGE_METADATA_EMPTY_FREELIST
) {
921 vm_size_t size
= ptoa(meta
->zm_percpu
? 1 : meta
->zm_page_count
);
922 if (meta
->zm_freelist_offs
+ zone_elem_size(zone
) > size
) {
923 zone_metadata_corruption(zone
, meta
, "freelist corruption");
926 return page
+ meta
->zm_freelist_offs
;
929 __header_always_inline
void
930 zone_page_meta_set_freelist(struct zone_page_metadata
*meta
,
931 vm_offset_t page
, vm_offset_t addr
)
933 assert(!meta
->zm_secondary_page
);
935 meta
->zm_freelist_offs
= (uint16_t)(addr
- page
);
937 meta
->zm_freelist_offs
= PAGE_METADATA_EMPTY_FREELIST
;
942 zone_page_meta_is_sane_element(zone_t zone
, struct zone_page_metadata
*meta
,
943 vm_offset_t page
, vm_offset_t element
, zone_addr_kind_t kind
)
946 /* ends of the freelist are NULL */
949 if (element
< page
+ ZONE_PAGE_FIRST_OFFSET(kind
)) {
952 vm_size_t size
= ptoa(meta
->zm_percpu
? 1 : meta
->zm_page_count
);
953 if (element
> page
+ size
- zone_elem_size(zone
)) {
959 /* Routine to get the size of a zone allocated address.
960 * If the address doesnt belong to the zone maps, returns 0.
963 zone_element_size(void *addr
, zone_t
*z
)
965 struct zone_page_metadata
*meta
;
966 struct zone
*src_zone
;
968 if (from_zone_map(addr
, sizeof(void *))) {
969 meta
= zone_native_meta_from_addr(addr
);
970 src_zone
= &zone_array
[meta
->zm_index
];
974 return zone_elem_size(src_zone
);
977 if (__improbable(gzalloc_enabled())) {
979 if (gzalloc_element_size(addr
, z
, &gzsize
)) {
983 #endif /* CONFIG_GZALLOC */
988 /* This function just formats the reason for the panics by redoing the checks */
991 zone_require_panic(zone_t zone
, void *addr
)
996 if (!from_zone_map(addr
, zone_elem_size(zone
))) {
997 panic("zone_require failed: address not in a zone (addr: %p)", addr
);
1000 zindex
= zone_native_meta_from_addr(addr
)->zm_index
;
1001 other
= &zone_array
[zindex
];
1002 if (zindex
>= os_atomic_load(&num_zones
, relaxed
) || !other
->z_self
) {
1003 panic("zone_require failed: invalid zone index %d "
1004 "(addr: %p, expected: %s%s)", zindex
,
1005 addr
, zone_heap_name(zone
), zone
->z_name
);
1007 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1008 "(addr: %p, expected: %s%s)",
1009 zindex
, zone_heap_name(other
), other
->z_name
,
1010 addr
, zone_heap_name(zone
), zone
->z_name
);
1016 zone_id_require_panic(zone_id_t zid
, void *addr
)
1018 zone_require_panic(&zone_array
[zid
], addr
);
1022 * Routines to panic if a pointer is not mapped to an expected zone.
1023 * This can be used as a means of pinning an object to the zone it is expected
1024 * to be a part of. Causes a panic if the address does not belong to any
1025 * specified zone, does not belong to any zone, has been freed and therefore
1026 * unmapped from the zone, or the pointer contains an uninitialized value that
1027 * does not belong to any zone.
1029 * Note that this can only work with collectable zones without foreign pages.
1032 zone_require(zone_t zone
, void *addr
)
1034 if (__probable(from_general_submap(addr
, zone_elem_size(zone
)) &&
1035 (zone_has_index(zone
, zone_native_meta_from_addr(addr
)->zm_index
)))) {
1039 if (__probable(gzalloc_enabled())) {
1043 zone_require_panic(zone
, addr
);
1047 zone_id_require(zone_id_t zid
, vm_size_t esize
, void *addr
)
1049 if (__probable(from_general_submap(addr
, esize
) &&
1050 (zid
== zone_native_meta_from_addr(addr
)->zm_index
))) {
1054 if (__probable(gzalloc_enabled())) {
1058 zone_id_require_panic(zid
, addr
);
1062 zone_owns(zone_t zone
, void *addr
)
1064 if (__probable(from_general_submap(addr
, zone_elem_size(zone
)) &&
1065 (zone_has_index(zone
, zone_native_meta_from_addr(addr
)->zm_index
)))) {
1069 if (__probable(gzalloc_enabled())) {
1077 #if VM_MAX_TAG_ZONES
1079 // for zones with tagging enabled:
1081 // calculate a pointer to the tag base entry,
1082 // holding either a uint32_t the first tag offset for a page in the zone map,
1083 // or two uint16_t tags if the page can only hold one or two elements
1085 #define ZTAGBASE(zone, element) \
1086 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
1088 // pointer to the tag for an element
1089 #define ZTAG(zone, element) \
1091 vm_tag_t * result; \
1092 if ((zone)->tags_inline) { \
1093 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
1094 if ((page_mask & element) >= zone_elem_size(zone)) result++; \
1096 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \
1102 static vm_offset_t zone_tagbase_min
;
1103 static vm_offset_t zone_tagbase_max
;
1104 static vm_offset_t zone_tagbase_map_size
;
1105 static vm_map_t zone_tagbase_map
;
1107 static vm_offset_t zone_tags_min
;
1108 static vm_offset_t zone_tags_max
;
1109 static vm_offset_t zone_tags_map_size
;
1110 static vm_map_t zone_tags_map
;
1112 // simple heap allocator for allocating the tags for new memory
1114 LCK_MTX_EARLY_DECLARE(ztLock
, &zone_locks_grp
); /* heap lock */
1117 ztFreeIndexCount
= 8,
1118 ztFreeIndexMax
= (ztFreeIndexCount
- 1),
1123 #if __LITTLE_ENDIAN__
1129 // ztBlock needs free bit least significant
1130 #error !__LITTLE_ENDIAN__
1133 typedef struct ztBlock ztBlock
;
1135 static ztBlock
* ztBlocks
;
1136 static uint32_t ztBlocksCount
;
1137 static uint32_t ztBlocksFree
;
1140 ztLog2up(uint32_t size
)
1145 size
= 32 - __builtin_clz(size
- 1);
1151 ztLog2down(uint32_t size
)
1153 size
= 31 - __builtin_clz(size
);
1158 ztFault(vm_map_t map
, const void * address
, size_t size
, uint32_t flags
)
1160 vm_map_offset_t addr
= (vm_map_offset_t
) address
;
1161 vm_map_offset_t page
, end
;
1163 page
= trunc_page(addr
);
1164 end
= round_page(addr
+ size
);
1166 for (; page
< end
; page
+= page_size
) {
1167 if (!pmap_find_phys(kernel_pmap
, page
)) {
1168 kern_return_t __unused
1169 ret
= kernel_memory_populate(map
, page
, PAGE_SIZE
,
1170 KMA_KOBJECT
| flags
, VM_KERN_MEMORY_DIAG
);
1171 assert(ret
== KERN_SUCCESS
);
1177 ztPresent(const void * address
, size_t size
)
1179 vm_map_offset_t addr
= (vm_map_offset_t
) address
;
1180 vm_map_offset_t page
, end
;
1183 page
= trunc_page(addr
);
1184 end
= round_page(addr
+ size
);
1185 for (result
= TRUE
; (page
< end
); page
+= page_size
) {
1186 result
= pmap_find_phys(kernel_pmap
, page
);
1196 ztDump(boolean_t sanity
);
1198 ztDump(boolean_t sanity
)
1202 for (q
= 0; q
<= ztFreeIndexMax
; q
++) {
1206 cq
= ztLog2down(ztBlocks
[p
].size
);
1207 if (cq
> ztFreeIndexMax
) {
1208 cq
= ztFreeIndexMax
;
1210 if (!ztBlocks
[p
].free
1211 || ((p
!= q
) && (q
!= cq
))
1212 || (ztBlocks
[ztBlocks
[p
].next
].prev
!= p
)
1213 || (ztBlocks
[ztBlocks
[p
].prev
].next
!= p
)) {
1214 kprintf("zterror at %d", p
);
1216 kprintf("zterror at %d", p
);
1221 kprintf("zt[%03d]%c %d, %d, %d\n",
1222 p
, ztBlocks
[p
].free
? 'F' : 'A',
1223 ztBlocks
[p
].next
, ztBlocks
[p
].prev
,
1225 p
= ztBlocks
[p
].next
;
1235 printf("-----------------------\n");
1241 #define ZTBDEQ(idx) \
1242 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
1243 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
1246 ztFree(zone_t zone __unused
, uint32_t index
, uint32_t count
)
1248 uint32_t q
, w
, p
, size
, merge
;
1251 ztBlocksFree
+= count
;
1253 // merge with preceding
1254 merge
= (index
+ count
);
1255 if ((merge
< ztBlocksCount
)
1256 && ztPresent(&ztBlocks
[merge
], sizeof(ztBlocks
[merge
]))
1257 && ztBlocks
[merge
].free
) {
1259 count
+= ztBlocks
[merge
].size
;
1262 // merge with following
1263 merge
= (index
- 1);
1264 if ((merge
> ztFreeIndexMax
)
1265 && ztPresent(&ztBlocks
[merge
], sizeof(ztBlocks
[merge
]))
1266 && ztBlocks
[merge
].free
) {
1267 size
= ztBlocks
[merge
].size
;
1273 q
= ztLog2down(count
);
1274 if (q
> ztFreeIndexMax
) {
1278 // queue in order of size
1280 p
= ztBlocks
[w
].next
;
1284 if (ztBlocks
[p
].size
>= count
) {
1289 ztBlocks
[p
].prev
= index
;
1290 ztBlocks
[w
].next
= index
;
1293 ztFault(zone_tags_map
, &ztBlocks
[index
], sizeof(ztBlocks
[index
]), 0);
1295 // mark first & last with free flag and size
1296 ztBlocks
[index
].free
= TRUE
;
1297 ztBlocks
[index
].size
= count
;
1298 ztBlocks
[index
].prev
= w
;
1299 ztBlocks
[index
].next
= p
;
1301 index
+= (count
- 1);
1303 ztFault(zone_tags_map
, &ztBlocks
[index
], sizeof(ztBlocks
[index
]), 0);
1304 ztBlocks
[index
].free
= TRUE
;
1305 ztBlocks
[index
].size
= count
;
1310 ztAlloc(zone_t zone
, uint32_t count
)
1312 uint32_t q
, w
, p
, leftover
;
1316 q
= ztLog2up(count
);
1317 if (q
> ztFreeIndexMax
) {
1323 p
= ztBlocks
[w
].next
;
1327 if (ztBlocks
[p
].size
>= count
) {
1328 // dequeue, mark both ends allocated
1329 ztBlocks
[w
].next
= ztBlocks
[p
].next
;
1330 ztBlocks
[ztBlocks
[p
].next
].prev
= w
;
1331 ztBlocks
[p
].free
= FALSE
;
1332 ztBlocksFree
-= ztBlocks
[p
].size
;
1333 if (ztBlocks
[p
].size
> 1) {
1334 ztBlocks
[p
+ ztBlocks
[p
].size
- 1].free
= FALSE
;
1337 // fault all the allocation
1338 ztFault(zone_tags_map
, &ztBlocks
[p
], count
* sizeof(ztBlocks
[p
]), 0);
1339 // mark last as allocated
1341 ztBlocks
[p
+ count
- 1].free
= FALSE
;
1344 leftover
= ztBlocks
[p
].size
- count
;
1346 ztFree(zone
, p
+ ztBlocks
[p
].size
- leftover
, leftover
);
1354 }while (q
<= ztFreeIndexMax
);
1361 zone_tagging_init(vm_size_t max_zonemap_size
)
1364 vm_map_kernel_flags_t vmk_flags
;
1367 // allocate submaps VM_KERN_MEMORY_DIAG
1369 zone_tagbase_map_size
= atop(max_zonemap_size
) * sizeof(uint32_t);
1370 vmk_flags
= VM_MAP_KERNEL_FLAGS_NONE
;
1371 vmk_flags
.vmkf_permanent
= TRUE
;
1372 ret
= kmem_suballoc(kernel_map
, &zone_tagbase_min
, zone_tagbase_map_size
,
1373 FALSE
, VM_FLAGS_ANYWHERE
, vmk_flags
, VM_KERN_MEMORY_DIAG
,
1376 if (ret
!= KERN_SUCCESS
) {
1377 panic("zone_init: kmem_suballoc failed");
1379 zone_tagbase_max
= zone_tagbase_min
+ round_page(zone_tagbase_map_size
);
1381 zone_tags_map_size
= 2048 * 1024 * sizeof(vm_tag_t
);
1382 vmk_flags
= VM_MAP_KERNEL_FLAGS_NONE
;
1383 vmk_flags
.vmkf_permanent
= TRUE
;
1384 ret
= kmem_suballoc(kernel_map
, &zone_tags_min
, zone_tags_map_size
,
1385 FALSE
, VM_FLAGS_ANYWHERE
, vmk_flags
, VM_KERN_MEMORY_DIAG
,
1388 if (ret
!= KERN_SUCCESS
) {
1389 panic("zone_init: kmem_suballoc failed");
1391 zone_tags_max
= zone_tags_min
+ round_page(zone_tags_map_size
);
1393 ztBlocks
= (ztBlock
*) zone_tags_min
;
1394 ztBlocksCount
= (uint32_t)(zone_tags_map_size
/ sizeof(ztBlock
));
1396 // initialize the qheads
1397 lck_mtx_lock(&ztLock
);
1399 ztFault(zone_tags_map
, &ztBlocks
[0], sizeof(ztBlocks
[0]), 0);
1400 for (idx
= 0; idx
< ztFreeIndexCount
; idx
++) {
1401 ztBlocks
[idx
].free
= TRUE
;
1402 ztBlocks
[idx
].next
= idx
;
1403 ztBlocks
[idx
].prev
= idx
;
1404 ztBlocks
[idx
].size
= 0;
1406 // free remaining space
1407 ztFree(NULL
, ztFreeIndexCount
, ztBlocksCount
- ztFreeIndexCount
);
1409 lck_mtx_unlock(&ztLock
);
1413 ztMemoryAdd(zone_t zone
, vm_offset_t mem
, vm_size_t size
)
1416 uint32_t count
, block
, blocks
, idx
;
1420 tagbase
= ZTAGBASE(zone
, mem
);
1422 lck_mtx_lock(&ztLock
);
1425 ztFault(zone_tagbase_map
, tagbase
, pages
* sizeof(uint32_t), 0);
1427 if (!zone
->tags_inline
) {
1429 count
= (uint32_t)(size
/ zone_elem_size(zone
));
1430 blocks
= ((count
+ ztTagsPerBlock
- 1) / ztTagsPerBlock
);
1431 block
= ztAlloc(zone
, blocks
);
1435 assert(-1U != block
);
1438 lck_mtx_unlock(&ztLock
);
1440 if (!zone
->tags_inline
) {
1441 // set tag base for each page
1442 block
*= ztTagsPerBlock
;
1443 for (idx
= 0; idx
< pages
; idx
++) {
1444 vm_offset_t esize
= zone_elem_size(zone
);
1445 tagbase
[idx
] = block
+ (uint32_t)((ptoa(idx
) + esize
- 1) / esize
);
1451 ztMemoryRemove(zone_t zone
, vm_offset_t mem
, vm_size_t size
)
1454 uint32_t count
, block
, blocks
, idx
;
1457 // set tag base for each page
1459 tagbase
= ZTAGBASE(zone
, mem
);
1461 for (idx
= 0; idx
< pages
; idx
++) {
1462 tagbase
[idx
] = 0xFFFFFFFF;
1465 lck_mtx_lock(&ztLock
);
1466 if (!zone
->tags_inline
) {
1467 count
= (uint32_t)(size
/ zone_elem_size(zone
));
1468 blocks
= ((count
+ ztTagsPerBlock
- 1) / ztTagsPerBlock
);
1469 assert(block
!= 0xFFFFFFFF);
1470 block
/= ztTagsPerBlock
;
1471 ztFree(NULL
/* zone is unlocked */, block
, blocks
);
1474 lck_mtx_unlock(&ztLock
);
1478 zone_index_from_tag_index(uint32_t tag_zone_index
, vm_size_t
* elem_size
)
1480 simple_lock(&all_zones_lock
, &zone_locks_grp
);
1482 zone_index_foreach(idx
) {
1483 zone_t z
= &zone_array
[idx
];
1487 if (tag_zone_index
!= z
->tag_zone_index
) {
1491 *elem_size
= zone_elem_size(z
);
1492 simple_unlock(&all_zones_lock
);
1496 simple_unlock(&all_zones_lock
);
1501 #endif /* VM_MAX_TAG_ZONES */
1502 #pragma mark zalloc helpers
1511 zone_heap_name(zone_t z
)
1513 if (__probable(z
->kalloc_heap
< KHEAP_ID_COUNT
)) {
1514 return kalloc_heap_names
[z
->kalloc_heap
];
1519 static inline vm_size_t
1520 zone_submaps_approx_size(void)
1524 for (unsigned idx
= 0; idx
<= zone_last_submap_idx
; idx
++) {
1525 size
+= zone_submaps
[idx
]->size
;
1532 zone_maps_owned(vm_address_t addr
, vm_size_t size
)
1534 return from_zone_map(addr
, size
);
1539 vm_map_size_t
*psize
,
1540 vm_map_size_t
*pfree
,
1541 vm_map_size_t
*plargest_free
)
1543 vm_map_sizes(zone_submaps
[Z_SUBMAP_IDX_GENERAL_MAP
], psize
, pfree
, plargest_free
);
1547 zone_submap(zone_t zone
)
1549 return submap_for_zone(zone
);
1555 return zpercpu_early_count
;
1559 track_this_zone(const char *zonename
, const char *logname
)
1562 const char *zc
= zonename
;
1563 const char *lc
= logname
;
1566 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1569 for (len
= 1; len
<= MAX_ZONE_NAME
; zc
++, lc
++, len
++) {
1571 * If the current characters don't match, check for a space in
1572 * in the zone name and a corresponding period in the log name.
1573 * If that's not there, then the strings don't match.
1576 if (*zc
!= *lc
&& !(*zc
== ' ' && *lc
== '.')) {
1581 * The strings are equal so far. If we're at the end, then it's a match.
1592 #if DEBUG || DEVELOPMENT
1595 zone_element_info(void *addr
, vm_tag_t
* ptag
)
1598 vm_tag_t tag
= VM_KERN_MEMORY_NONE
;
1599 struct zone_page_metadata
*meta
;
1600 struct zone
*src_zone
;
1602 if (from_zone_map(addr
, sizeof(void *))) {
1603 meta
= zone_native_meta_from_addr(addr
);
1604 src_zone
= &zone_array
[meta
->zm_index
];
1605 #if VM_MAX_TAG_ZONES
1606 if (__improbable(src_zone
->tags
)) {
1607 tag
= (ZTAG(src_zone
, (vm_offset_t
) addr
)[0] >> 1);
1609 #endif /* VM_MAX_TAG_ZONES */
1610 size
= zone_elem_size(src_zone
);
1613 gzalloc_element_size(addr
, NULL
, &size
);
1614 #endif /* CONFIG_GZALLOC */
1620 #endif /* DEBUG || DEVELOPMENT */
1622 /* Someone wrote to freed memory. */
1625 zone_element_was_modified_panic(
1627 vm_offset_t element
,
1629 vm_offset_t expected
,
1632 panic("a freed zone element has been modified in zone %s%s: "
1633 "expected %p but found %p, bits changed %p, "
1634 "at offset %d of %d in element %p, cookies %p %p",
1635 zone_heap_name(zone
),
1639 (void *) (expected
^ found
),
1641 (uint32_t) zone_elem_size(zone
),
1643 (void *) zp_nopoison_cookie
,
1644 (void *) zp_poisoned_cookie
);
1647 /* The backup pointer is stored in the last pointer-sized location in an element. */
1648 __header_always_inline vm_offset_t
*
1649 get_backup_ptr(vm_size_t elem_size
, vm_offset_t
*element
)
1651 return (vm_offset_t
*)((vm_offset_t
)element
+ elem_size
- sizeof(vm_offset_t
));
1655 * The primary and backup pointers don't match.
1656 * Determine which one was likely the corrupted pointer, find out what it
1657 * probably should have been, and panic.
1661 backup_ptr_mismatch_panic(
1663 struct zone_page_metadata
*page_meta
,
1665 vm_offset_t element
)
1667 vm_offset_t primary
= *(vm_offset_t
*)element
;
1668 vm_offset_t backup
= *get_backup_ptr(zone_elem_size(zone
), &element
);
1669 vm_offset_t likely_backup
;
1670 vm_offset_t likely_primary
;
1671 zone_addr_kind_t kind
= zone_addr_kind(page
, zone_elem_size(zone
));
1673 likely_primary
= primary
^ zp_nopoison_cookie
;
1674 boolean_t sane_backup
;
1675 boolean_t sane_primary
= zone_page_meta_is_sane_element(zone
, page_meta
,
1676 page
, likely_primary
, kind
);
1677 boolean_t element_was_poisoned
= (backup
& 0x1);
1679 #if defined(__LP64__)
1680 /* We can inspect the tag in the upper bits for additional confirmation */
1681 if ((backup
& 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1682 element_was_poisoned
= TRUE
;
1683 } else if ((backup
& 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1684 element_was_poisoned
= FALSE
;
1688 if (element_was_poisoned
) {
1689 likely_backup
= backup
^ zp_poisoned_cookie
;
1691 likely_backup
= backup
^ zp_nopoison_cookie
;
1693 sane_backup
= zone_page_meta_is_sane_element(zone
, page_meta
,
1694 page
, likely_backup
, kind
);
1696 /* The primary is definitely the corrupted one */
1697 if (!sane_primary
&& sane_backup
) {
1698 zone_element_was_modified_panic(zone
, element
, primary
, (likely_backup
^ zp_nopoison_cookie
), 0);
1701 /* The backup is definitely the corrupted one */
1702 if (sane_primary
&& !sane_backup
) {
1703 zone_element_was_modified_panic(zone
, element
, backup
,
1704 (likely_primary
^ (element_was_poisoned
? zp_poisoned_cookie
: zp_nopoison_cookie
)),
1705 zone_elem_size(zone
) - sizeof(vm_offset_t
));
1709 * Not sure which is the corrupted one.
1710 * It's less likely that the backup pointer was overwritten with
1711 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1712 * primary pointer has been overwritten with a sane but incorrect address.
1714 if (sane_primary
&& sane_backup
) {
1715 zone_element_was_modified_panic(zone
, element
, primary
, (likely_backup
^ zp_nopoison_cookie
), 0);
1718 /* Neither are sane, so just guess. */
1719 zone_element_was_modified_panic(zone
, element
, primary
, (likely_backup
^ zp_nopoison_cookie
), 0);
1723 * zone_sequestered_page_get
1726 static struct zone_page_metadata
*
1727 zone_sequestered_page_get(zone_t z
, vm_offset_t
*page
)
1729 const zone_addr_kind_t kind
= ZONE_ADDR_NATIVE
;
1731 if (!zone_pva_is_null(z
->pages_sequester
)) {
1732 if (os_sub_overflow(z
->sequester_page_count
, z
->alloc_pages
,
1733 &z
->sequester_page_count
)) {
1734 zone_accounting_panic(z
, "sequester_page_count wrap-around");
1736 return zone_meta_queue_pop(z
, &z
->pages_sequester
, kind
, page
);
1743 * zone_sequestered_page_populate
1745 * page_meta is invalid on failure
1747 static kern_return_t
1748 zone_sequestered_page_populate(zone_t z
, struct zone_page_metadata
*page_meta
,
1749 vm_offset_t space
, vm_size_t alloc_size
, int zflags
)
1751 kern_return_t retval
;
1753 assert(alloc_size
== ptoa(z
->alloc_pages
));
1754 retval
= kernel_memory_populate(submap_for_zone(z
), space
, alloc_size
,
1755 zflags
, VM_KERN_MEMORY_ZONE
);
1756 if (retval
!= KERN_SUCCESS
) {
1758 zone_meta_queue_push(z
, &z
->pages_sequester
, page_meta
, ZONE_ADDR_NATIVE
);
1759 z
->sequester_page_count
+= z
->alloc_pages
;
1765 #pragma mark Zone poisoning/zeroing
1768 * Initialize zone poisoning
1769 * called from zone_bootstrap before any allocations are made from zalloc
1778 * Initialize backup pointer random cookie for poisoned elements
1779 * Try not to call early_random() back to back, it may return
1780 * the same value if mach_absolute_time doesn't have sufficient time
1781 * to tick over between calls. <rdar://problem/11597395>
1782 * (This is only a problem on embedded devices)
1784 zp_poisoned_cookie
= (uintptr_t) early_random();
1786 /* -zp: enable poisoning for every alloc and free */
1787 if (PE_parse_boot_argn("-zp", temp_buf
, sizeof(temp_buf
))) {
1791 /* -no-zp: disable poisoning */
1792 if (PE_parse_boot_argn("-no-zp", temp_buf
, sizeof(temp_buf
))) {
1794 printf("Zone poisoning disabled\n");
1797 /* Initialize backup pointer random cookie for unpoisoned elements */
1798 zp_nopoison_cookie
= (uintptr_t) early_random();
1801 if (zp_poisoned_cookie
== zp_nopoison_cookie
) {
1802 panic("early_random() is broken: %p and %p are not random\n",
1803 (void *) zp_poisoned_cookie
, (void *) zp_nopoison_cookie
);
1808 * Use the last bit in the backup pointer to hint poisoning state
1809 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
1810 * the low bits are zero.
1812 zp_poisoned_cookie
|= (uintptr_t)0x1ULL
;
1813 zp_nopoison_cookie
&= ~((uintptr_t)0x1ULL
);
1815 #if defined(__LP64__)
1817 * Make backup pointers more obvious in GDB for 64 bit
1818 * by making OxFFFFFF... ^ cookie = 0xFACADE...
1819 * (0xFACADE = 0xFFFFFF ^ 0x053521)
1820 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
1821 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
1822 * by the sanity check, so it's OK for that part of the cookie to be predictable.
1824 * TODO: Use #defines, xors, and shifts
1827 zp_poisoned_cookie
&= 0x000000FFFFFFFFFF;
1828 zp_poisoned_cookie
|= 0x0535210000000000; /* 0xFACADE */
1830 zp_nopoison_cookie
&= 0x000000FFFFFFFFFF;
1831 zp_nopoison_cookie
|= 0x3f00110000000000; /* 0xC0FFEE */
1835 * Initialize zp_min_size to two cachelines. Elements smaller than this will
1838 ml_cpu_info_t cpu_info
;
1839 ml_cpu_get_info(&cpu_info
);
1840 zp_min_size
= 2 * cpu_info
.cache_line_size
;
1844 zone_poison_count_init(zone_t zone
)
1846 return zp_factor
+ (((uint32_t)zone_elem_size(zone
)) >> zp_scale
) ^
1847 (mach_absolute_time() & 0x7);
1850 #if ZALLOC_ENABLE_POISONING
1852 zfree_poison_element(zone_t zone
, uint32_t *zp_count
, vm_offset_t elem
)
1854 bool poison
= false;
1855 uint32_t zp_count_local
;
1857 assert(!zone
->percpu
);
1858 if (zp_factor
!= 0) {
1860 * Poison the memory of every zp_count-th element before it ends up
1861 * on the freelist to catch use-after-free and use of uninitialized
1864 * Every element is poisoned when zp_factor is set to 1.
1867 zp_count_local
= os_atomic_load(zp_count
, relaxed
);
1868 if (__improbable(zp_count_local
== 0 || zp_factor
== 1)) {
1871 os_atomic_store(zp_count
, zone_poison_count_init(zone
), relaxed
);
1873 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
1874 vm_offset_t
*element_cursor
= ((vm_offset_t
*) elem
);
1875 vm_offset_t
*end_cursor
= (vm_offset_t
*)(elem
+ zone_elem_size(zone
));
1877 for (; element_cursor
< end_cursor
; element_cursor
++) {
1878 *element_cursor
= ZONE_POISON
;
1881 os_atomic_store(zp_count
, zp_count_local
- 1, relaxed
);
1883 * Zero first zp_min_size bytes of elements that aren't being poisoned.
1884 * Element size is larger than zp_min_size in this path as elements
1885 * that are smaller will always be zero-ed.
1887 bzero((void *) elem
, zp_min_size
);
1894 zfree_poison_element(zone_t zone
, uint32_t *zp_count
, vm_offset_t elem
)
1896 #pragma unused(zone, zp_count, elem)
1897 assert(!zone
->percpu
);
1902 __attribute__((always_inline
))
1904 zfree_clear(zone_t zone
, vm_offset_t addr
, vm_size_t elem_size
)
1906 assert(zone
->zfree_clear_mem
);
1908 zpercpu_foreach_cpu(i
) {
1909 bzero((void *)(addr
+ ptoa(i
)), elem_size
);
1912 bzero((void *)addr
, elem_size
);
1919 * Zero the element if zone has zfree_clear_mem flag set else poison
1920 * the element if zp_count hits 0.
1922 __attribute__((always_inline
))
1924 zfree_clear_or_poison(zone_t zone
, uint32_t *zp_count
, vm_offset_t addr
)
1926 vm_size_t elem_size
= zone_elem_size(zone
);
1928 if (zone
->zfree_clear_mem
) {
1929 return zfree_clear(zone
, addr
, elem_size
);
1932 return zfree_poison_element(zone
, zp_count
, (vm_offset_t
)addr
);
1936 * Clear out the old next pointer and backup to avoid leaking the zone
1937 * poisoning cookie and so that only values on the freelist have a valid
1941 zone_clear_freelist_pointers(zone_t zone
, vm_offset_t addr
)
1943 vm_offset_t perm_value
= 0;
1945 if (!zone
->zfree_clear_mem
) {
1946 perm_value
= ZONE_POISON
;
1949 vm_offset_t
*primary
= (vm_offset_t
*) addr
;
1950 vm_offset_t
*backup
= get_backup_ptr(zone_elem_size(zone
), primary
);
1952 *primary
= perm_value
;
1953 *backup
= perm_value
;
1956 #if ZALLOC_ENABLE_POISONING
1959 zone_element_not_clear_panic(zone_t zone
, void *addr
)
1961 panic("Zone element %p was modified after free for zone %s%s: "
1962 "Expected element to be cleared", addr
, zone_heap_name(zone
),
1967 * Validate that the element was not tampered with while it was in the
1971 zalloc_validate_element(zone_t zone
, vm_offset_t addr
, vm_size_t size
, bool validate
)
1974 assert(zone
->zfree_clear_mem
);
1975 zpercpu_foreach_cpu(i
) {
1976 if (memcmp_zero_ptr_aligned((void *)(addr
+ ptoa(i
)), size
)) {
1977 zone_element_not_clear_panic(zone
, (void *)(addr
+ ptoa(i
)));
1980 } else if (zone
->zfree_clear_mem
) {
1981 if (memcmp_zero_ptr_aligned((void *)addr
, size
)) {
1982 zone_element_not_clear_panic(zone
, (void *)addr
);
1984 } else if (__improbable(validate
)) {
1985 const vm_offset_t
*p
= (vm_offset_t
*)addr
;
1986 const vm_offset_t
*end
= (vm_offset_t
*)(addr
+ size
);
1988 for (; p
< end
; p
++) {
1989 if (*p
!= ZONE_POISON
) {
1990 zone_element_was_modified_panic(zone
, addr
,
1991 *p
, ZONE_POISON
, (vm_offset_t
)p
- addr
);
1996 * If element wasn't poisoned or entirely cleared, validate that the
1997 * minimum bytes that were cleared on free haven't been corrupted.
1998 * addr is advanced by ptr size as we have already validated and cleared
1999 * the freelist pointer/zcache canary.
2001 if (memcmp_zero_ptr_aligned((void *) (addr
+ sizeof(vm_offset_t
)),
2002 zp_min_size
- sizeof(vm_offset_t
))) {
2003 zone_element_not_clear_panic(zone
, (void *)addr
);
2007 #endif /* ZALLOC_ENABLE_POISONING */
2009 #pragma mark Zone Leak Detection
2012 * Zone leak debugging code
2014 * When enabled, this code keeps a log to track allocations to a particular zone that have not
2015 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
2016 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
2019 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
2020 * is the name of the zone you wish to log.
2022 * This code only tracks one zone, so you need to identify which one is leaking first.
2023 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
2024 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
2025 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
2026 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
2027 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
2028 * See the help in the kgmacros for usage info.
2031 * Zone corruption logging
2033 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
2034 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
2035 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
2036 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
2037 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
2038 * corrupted to examine its history. This should lead to the source of the corruption.
2041 /* Returns TRUE if we rolled over the counter at factor */
2042 __header_always_inline
bool
2043 sample_counter(volatile uint32_t *count_p
, uint32_t factor
)
2045 uint32_t old_count
, new_count
= 0;
2046 if (count_p
!= NULL
) {
2047 os_atomic_rmw_loop(count_p
, old_count
, new_count
, relaxed
, {
2048 new_count
= old_count
+ 1;
2049 if (new_count
>= factor
) {
2055 return new_count
== 0;
#if ZONE_ENABLE_LOGGING

/* Log allocations and frees to help debug a zone element corruption */
TUNABLE(bool, corruption_debug_flag, "-zc", false);

#define MAX_NUM_ZONES_ALLOWED_LOGGING   10 /* Maximum 10 zones can be logged at once */

static int  max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
static int  num_zones_logged = 0;

/*
 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
 * is the number of stacks suspected of leaking, we don't need many records.
 */

#if defined(__LP64__)
#define ZRECORDS_MAX            2560            /* Max records allowed in the log */
#else
#define ZRECORDS_MAX            1536            /* Max records allowed in the log */
#endif
#define ZRECORDS_DEFAULT        1024            /* default records in log if zrecs is not specified in boot-args */

static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
static void
zone_enable_logging(zone_t z)
{
	z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
	    (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);

	if (z->zlog_btlog) {
		printf("zone: logging started for zone %s%s\n",
		    zone_heap_name(z), z->z_name);
	} else {
		printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
		z->zone_logging = false;
	}
}
/*!
 * @function zone_setup_logging
 *
 * Optionally sets up a zone for logging.
 *
 * We recognize two boot-args:
 *
 *	zlog=<zone_to_log>
 *	zrecs=<num_records_in_log>
 *
 * The zlog arg is used to specify the zone name that should be logged,
 * and zrecs is used to control the size of the log.
 *
 * If zrecs is not specified, a default value is used.
 */
static void
zone_setup_logging(zone_t z)
{
	char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
	char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
	char zlog_val[MAX_ZONE_NAME];  /* the zone name we're logging, if any */

	/*
	 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
	 *
	 * This prevents accidentally hogging too much kernel memory
	 * and making the system unusable.
	 */
	if (log_records > ZRECORDS_MAX) {
		log_records = ZRECORDS_MAX;
	}

	/*
	 * Append kalloc heap name to zone name (if zone is used by kalloc)
	 */
	snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);

	/* zlog0 isn't allowed. */
	for (int i = 1; i <= max_num_zones_to_log; i++) {
		snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);

		if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
		    track_this_zone(zone_name, zlog_val)) {
			z->zone_logging = true;
			num_zones_logged++;
			break;
		}
	}

	/*
	 * Backwards compatibility with the old boot-arg used to specify single
	 * zone logging, i.e. zlog. Needs to happen after the newer zlogN checks
	 * because the prefix will match all the zlogN boot-args.
	 */
	if (!z->zone_logging &&
	    PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
	    track_this_zone(zone_name, zlog_val)) {
		z->zone_logging = true;
		num_zones_logged++;
	}

	/*
	 * If we want to log a zone, see if we need to allocate buffer space for
	 * the log.
	 *
	 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
	 * we have to defer allocation in that case.
	 *
	 * zone_init() will finish the job.
	 *
	 * If we want to log one of the VM related zones that's set up early on,
	 * we will skip allocation of the log until zinit is called again later
	 * on some other zone.
	 */
	if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
		zone_enable_logging(z);
	}
}

/*
 * Each record in the log contains a pointer to the zone element it refers to,
 * and a small array to hold the pc's from the stack trace. A
 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
 * If the log fills, old records are replaced as if it were a circular buffer.
 */

/*
 * Decide if we want to log this zone by doing a string compare between a zone name and the name
 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
 * match a space in the zone name.
 */

/*
 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
 * the buffer for the records has been allocated.
 */
#define DO_LOGGING(z)           (z->zlog_btlog != NULL)
#else /* !ZONE_ENABLE_LOGGING */
#define DO_LOGGING(z)           0
#endif /* !ZONE_ENABLE_LOGGING */
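/*
 * Illustrative sketch (not the actual track_this_zone() implementation):
 * a minimal comparison loop under the rule described above, where '.' in the
 * boot-arg name matches ' ' in the zone name. The helper name is hypothetical.
 */
#if 0 /* example only */
static bool
example_name_match(const char *zonename, const char *logname)
{
	size_t i;

	for (i = 0; logname[i] != '\0'; i++) {
		char zc = zonename[i], lc = logname[i];
		/* '.' in the boot-arg stands in for a space in the zone name */
		if (zc != lc && !(zc == ' ' && lc == '.')) {
			return false;
		}
	}
	return zonename[i] == '\0';
}
#endif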
/*
 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
 * backtrace. On every free, we examine the table, determine whether the allocation was being tracked,
 * and if so stop tracking it.
 *
 * We track the allocations in the zallocations hash table, which stores the address that was returned from
 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
 * backtraces - we don't store them more than once.
 *
 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
 * a large amount of virtual space.
 */
#define ZLEAK_STATE_ENABLED             0x01    /* Zone leak monitoring should be turned on if zone_map fills up. */
#define ZLEAK_STATE_ACTIVE              0x02    /* We are actively collecting traces. */
#define ZLEAK_STATE_ACTIVATING          0x04    /* Some thread is doing setup; others should move along. */
#define ZLEAK_STATE_FAILED              0x08    /* Attempt to allocate tables failed.  We will not try again. */
uint32_t        zleak_state = 0;                /* State of collection, as above */

boolean_t       panic_include_ztrace = FALSE;           /* Enable zleak logging on panic */
vm_size_t       zleak_global_tracking_threshold;        /* Size of zone map at which to start collecting data */
vm_size_t       zleak_per_zone_tracking_threshold;      /* Size a zone will have before we will collect data on it */
unsigned int    zleak_sample_factor = 1000;             /* Allocations per sample attempt */
/*
 * Counters for allocation statistics.
 */

/* Times two active records want to occupy the same spot */
unsigned int z_alloc_collisions = 0;
unsigned int z_trace_collisions = 0;

/* Times a new record lands on a spot previously occupied by a freed allocation */
unsigned int z_alloc_overwrites = 0;
unsigned int z_trace_overwrites = 0;

/* Times a new alloc or trace is put into the hash table */
unsigned int z_alloc_recorded = 0;
unsigned int z_trace_recorded = 0;

/* Times zleak_log returned false due to not being able to acquire the lock */
unsigned int z_total_conflicts = 0;
/*
 * Structure for keeping track of an allocation
 * An allocation bucket is in use if its element is not NULL
 */
struct zallocation {
	uintptr_t               za_element;     /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
	vm_size_t               za_size;        /* how much memory did this allocation take up? */
	uint32_t                za_trace_index; /* index into ztraces for backtrace associated with allocation */
	/* TODO: #if this out */
	uint32_t                za_hit_count;   /* for determining effectiveness of hash function */
};

/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;

vm_size_t zleak_max_zonemap_size;

/* Hashmaps of allocations and their corresponding traces */
static struct zallocation *zallocations;
static struct ztrace *ztraces;

/* not static so that panic can see this, see kern/debug.c */
struct ztrace *top_ztrace;

/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
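/*
 * Illustrative sketch (not part of the build): how the two hash tables relate.
 * Looking up a tracked address walks from zallocations[] to the uniqued
 * backtrace in ztraces[]; the helper name below is hypothetical.
 */
#if 0 /* example only */
static struct ztrace *
example_trace_for_address(uintptr_t addr)
{
	struct zallocation *za = &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	if (za->za_element != addr) {
		return NULL;    /* not currently tracked (or bucket reused) */
	}
	return &ztraces[za->za_trace_index];
}
#endif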
/*
 * Initializes the zone leak monitor.  Called from zone_init()
 */
void
zleak_init(vm_size_t max_zonemap_size)
{
	char scratch_buf[16];
	boolean_t zleak_enable_flag = FALSE;

	zleak_max_zonemap_size = max_zonemap_size;
	zleak_global_tracking_threshold = max_zonemap_size / 2;
	zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;

#if CONFIG_EMBEDDED
	if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
		zleak_enable_flag = TRUE;
		printf("zone leak detection enabled\n");
	} else {
		zleak_enable_flag = FALSE;
		printf("zone leak detection disabled\n");
	}
#else /* CONFIG_EMBEDDED */
	/* -zleakoff (flag to disable zone leak monitor) */
	if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
		zleak_enable_flag = FALSE;
		printf("zone leak detection disabled\n");
	} else {
		zleak_enable_flag = TRUE;
		printf("zone leak detection enabled\n");
	}
#endif /* CONFIG_EMBEDDED */

	/* zfactor=XXXX (override how often to sample the zone allocator) */
	if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
		printf("Zone leak factor override: %u\n", zleak_sample_factor);
	}

	/* zleak-allocs=XXXX (override number of buckets in zallocations) */
	if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
		printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
		if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
			printf("Override isn't a power of two, bad things might happen!\n");
		}
	}

	/* zleak-traces=XXXX (override number of buckets in ztraces) */
	if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
		printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
		if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
			printf("Override isn't a power of two, bad things might happen!\n");
		}
	}

	if (zleak_enable_flag) {
		zleak_state = ZLEAK_STATE_ENABLED;
	}
}
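/*
 * Worked example of the power-of-two check used above (added for clarity,
 * not from the original source): for a power of two such as 0x1000,
 * 0x1000 & (0x1000 - 1) == 0x1000 & 0x0FFF == 0, so the check passes;
 * for a non power of two such as 0x1800, 0x1800 & 0x17FF == 0x1000 != 0,
 * so the warning fires. The masks in hashaddr()/hashbacktrace() rely on
 * the bucket counts having this property.
 */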
/*
 * Support for kern.zleak.active sysctl - a simplified
 * version of the zleak_state variable.
 */
int
get_zleak_state(void)
{
	if (zleak_state & ZLEAK_STATE_FAILED) {
		return -1;
	}
	if (zleak_state & ZLEAK_STATE_ACTIVE) {
		return 1;
	}
	return 0;
}
kern_return_t
zleak_activate(void)
{
	kern_return_t retval;
	vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
	vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
	void *allocations_ptr = NULL;
	void *traces_ptr = NULL;

	/* Only one thread attempts to activate at a time */
	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
		return KERN_SUCCESS;
	}

	/* Indicate that we're doing the setup */
	lck_spin_lock(&zleak_lock);
	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
		lck_spin_unlock(&zleak_lock);
		return KERN_SUCCESS;
	}

	zleak_state |= ZLEAK_STATE_ACTIVATING;
	lck_spin_unlock(&zleak_lock);

	/* Allocate and zero tables */
	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&allocations_ptr,
	    z_alloc_size, VM_KERN_MEMORY_OSFMK);
	if (retval != KERN_SUCCESS) {
		goto fail;
	}

	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t *)&traces_ptr,
	    z_trace_size, VM_KERN_MEMORY_OSFMK);
	if (retval != KERN_SUCCESS) {
		goto fail;
	}

	bzero(allocations_ptr, z_alloc_size);
	bzero(traces_ptr, z_trace_size);

	/* Everything's set.  Install tables, mark active. */
	zallocations = allocations_ptr;
	ztraces = traces_ptr;

	/*
	 * Initialize the top_ztrace to the first entry in ztraces,
	 * so we don't have to check for null in zleak_log
	 */
	top_ztrace = &ztraces[0];

	/*
	 * Note that we do need a barrier between installing
	 * the tables and setting the active flag, because the zfree()
	 * path accesses the table without a lock if we're active.
	 */
	lck_spin_lock(&zleak_lock);
	zleak_state |= ZLEAK_STATE_ACTIVE;
	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
	lck_spin_unlock(&zleak_lock);

	return KERN_SUCCESS;

fail:
	/*
	 * If we fail to allocate memory, don't further tax
	 * the system by trying again.
	 */
	lck_spin_lock(&zleak_lock);
	zleak_state |= ZLEAK_STATE_FAILED;
	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
	lck_spin_unlock(&zleak_lock);

	if (allocations_ptr != NULL) {
		kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
	}
	if (traces_ptr != NULL) {
		kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
	}

	return retval;
}
/*
 * TODO: What about allocations that never get deallocated,
 * especially ones with unique backtraces? Should we wait to record
 * until after boot has completed?
 * (How many persistent zallocs are there?)
 */

/*
 * This function records the allocation in the allocations table,
 * and stores the associated backtrace in the traces table
 * (or just increments the refcount if the trace is already recorded)
 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
 * the associated trace's refcount is decremented.
 * If the trace slot is in use, it returns.
 * The refcount is incremented by the amount of memory the allocation consumes.
 * The return value indicates whether to try again next time.
 */
static boolean_t
zleak_log(uintptr_t* bt,
    uintptr_t addr,
    uint32_t depth,
    vm_size_t allocation_size)
{
	/* Quit if there's someone else modifying the hash tables */
	if (!lck_spin_try_lock(&zleak_lock)) {
		z_total_conflicts++;
		return FALSE;
	}

	struct zallocation *allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
	struct ztrace *trace = &ztraces[trace_index];

	allocation->za_hit_count++;
	trace->zt_hit_count++;

	/*
	 * If the allocation bucket we want to be in is occupied, and if the occupier
	 * has the same trace as us, just bail.
	 */
	if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
		z_alloc_collisions++;

		lck_spin_unlock(&zleak_lock);
		return TRUE;
	}

	/* STEP 1: Store the backtrace in the traces array. */
	/* A size of zero indicates that the trace bucket is free. */

	if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
		/*
		 * Different unique trace with same hash!
		 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
		 * and get out of the way for later chances
		 */
		trace->zt_collisions++;
		z_trace_collisions++;

		lck_spin_unlock(&zleak_lock);
		return TRUE;
	} else if (trace->zt_size > 0) {
		/* Same trace, already added, so increment refcount */
		trace->zt_size += allocation_size;
	} else {
		/* Found an unused trace bucket, record the trace here! */
		if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
			z_trace_overwrites++;
		}

		z_trace_recorded++;
		trace->zt_size = allocation_size;
		memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));

		trace->zt_depth = depth;
		trace->zt_collisions = 0;
	}

	/* STEP 2: Store the allocation record in the allocations array. */

	if (allocation->za_element != (uintptr_t) 0) {
		/*
		 * Straight up replace any allocation record that was there.  We don't want to do the work
		 * to preserve the allocation entries that were there, because we only record a subset of the
		 * allocations anyways.
		 */

		z_alloc_collisions++;

		struct ztrace *associated_trace = &ztraces[allocation->za_trace_index];
		/* Knock off old allocation's size, not the new allocation */
		associated_trace->zt_size -= allocation->za_size;
	} else if (allocation->za_trace_index != 0) {
		/* Slot previously used but not currently in use */
		z_alloc_overwrites++;
	}

	allocation->za_element = addr;
	allocation->za_trace_index = trace_index;
	allocation->za_size = allocation_size;

	z_alloc_recorded++;

	if (top_ztrace->zt_size < trace->zt_size) {
		top_ztrace = trace;
	}

	lck_spin_unlock(&zleak_lock);
	return TRUE;
}
/*
 * Free the allocation record and release the stacktrace.
 * This should be as fast as possible because it will be called for every free.
 */
__attribute__((noinline))
static void
zleak_free(uintptr_t addr,
    vm_size_t allocation_size)
{
	if (addr == (uintptr_t) 0) {
		return;
	}

	struct zallocation *allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	/* Double-checked locking: check to find out if we're interested, lock, check to make
	 * sure it hasn't changed, then modify it, and release the lock.
	 */

	if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
		/* if the allocation was the one, grab the lock, check again, then delete it */
		lck_spin_lock(&zleak_lock);

		if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
			struct ztrace *trace;

			/* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
			if (allocation->za_size != allocation_size) {
				panic("Freeing as size %lu memory that was allocated with size %lu\n",
				    (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
			}

			trace = &ztraces[allocation->za_trace_index];

			/* size of 0 indicates trace bucket is unused */
			if (trace->zt_size > 0) {
				trace->zt_size -= allocation_size;
			}

			/* A NULL element means the allocation bucket is unused */
			allocation->za_element = 0;
		}
		lck_spin_unlock(&zleak_lock);
	}
}
#endif /* CONFIG_ZLEAKS */
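/*
 * Illustrative sketch (not from the original source) of the double-checked
 * locking pattern used by zleak_free() above: a cheap unlocked check filters
 * out the common case, and the check is repeated under the lock before
 * mutating. The names "table", "table_buckets" and "table_lock" are
 * hypothetical.
 */
#if 0 /* example only */
static void
example_double_checked_remove(uintptr_t addr)
{
	struct zallocation *slot = &table[hashaddr(addr, table_buckets)];

	if (slot->za_element == addr) {          /* cheap, racy check */
		lck_spin_lock(&table_lock);
		if (slot->za_element == addr) {  /* re-check under the lock */
			slot->za_element = 0;
		}
		lck_spin_unlock(&table_lock);
	}
}
#endif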
/*
 * These functions live outside of CONFIG_ZLEAKS because they are also used in
 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
 */

/* "Thomas Wang's 32/64 bit mix functions."  http://www.concentric.net/~Ttwang/tech/inthash.htm */
uintptr_t
hash_mix(uintptr_t x)

uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{
	uintptr_t hash = 0;
	uintptr_t mask = max_size - 1;

	while (depth) {
		hash += bt[--depth];
	}

	hash = hash_mix(hash) & mask;

	assert(hash < max_size);

	return (uint32_t) hash;
}

/*
 * TODO: Determine how well distributed this is
 * max_size must be a power of 2. i.e 0x10000 because 0x10000-1 is 0x0FFFF which is a great bitmask
 */
uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t hash = 0;
	uintptr_t mask = max_size - 1;

	hash = hash_mix(pt) & mask;

	assert(hash < max_size);

	return (uint32_t) hash;
}
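/*
 * Worked example of the masking above (added for clarity, not from the
 * original source): with max_size = 0x10000 the mask is 0xFFFF, so a mixed
 * value such as 0x123456789ABCDEF0 indexes bucket 0xDEF0 (57072), which is
 * always < max_size. This only distributes well because max_size is a power
 * of two; a non power of two would require the more expensive modulo.
 */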
/* End of all leak-detection code */

#pragma mark zone creation, configuration, destruction

static zone_t
zone_init_defaults(zone_id_t zid)
{
	zone_t z = &zone_array[zid];

	z->page_count_max = ~0u;
	z->collectable = true;
	z->expandable = true;
	z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;

	simple_lock_init(&z->lock, 0);

	return z;
}
static bool
zone_is_initializing(zone_t z)
{
	return !z->z_self && !z->destroyed;
}

static void
zone_set_max(zone_t z, vm_size_t max)
{
	if (z->kasan_redzone) {
		/*
		 * Adjust the max memory for the kasan redzones
		 */
		max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
	}
	if (max < z->percpu ? 1 : z->alloc_pages) {
		max = z->percpu ? 1 : z->alloc_pages;
	}
	max = atop(round_page(max));
	z->page_count_max = max;
}

void
zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
{
	if (!zone_is_initializing(zone)) {
		panic("%s: called after zone_create()", __func__);
	}
	if (sub_map_idx > zone_last_submap_idx) {
		panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
	}
	zone->submap_idx = sub_map_idx;
}
void
zone_set_noexpand(zone_t zone, vm_size_t max)
{
	if (!zone_is_initializing(zone)) {
		panic("%s: called after zone_create()", __func__);
	}
	zone->expandable = false;
	zone_set_max(zone, max);
}

void
zone_set_exhaustible(zone_t zone, vm_size_t max)
{
	if (!zone_is_initializing(zone)) {
		panic("%s: called after zone_create()", __func__);
	}
	zone->expandable = false;
	zone->exhaustible = true;
	zone_set_max(zone, max);
}
/*!
 * @function zone_create_find
 *
 * Finds an unused zone for the given name and element size.
 *
 * @param name          the zone name
 * @param size          the element size (including redzones, ...)
 * @param flags         the flags passed to @c zone_create*
 * @param zid           the desired zone ID or ZONE_ID_ANY
 *
 * @returns             a zone to initialize further.
 */
2758 zone_create_flags_t flags
,
2764 simple_lock(&all_zones_lock
, &zone_locks_grp
);
2766 nzones
= (zone_id_t
)os_atomic_load(&num_zones
, relaxed
);
2767 assert(num_zones_in_use
<= nzones
&& nzones
< MAX_ZONES
);
2769 if (__improbable(nzones
< ZONE_ID__FIRST_DYNAMIC
)) {
2771 * The first time around, make sure the reserved zone IDs
2772 * have an initialized lock as zone_index_foreach() will
2775 while (nzones
< ZONE_ID__FIRST_DYNAMIC
) {
2776 zone_init_defaults(nzones
++);
2779 os_atomic_store(&num_zones
, nzones
, release
);
2782 if (zid
!= ZONE_ID_ANY
) {
2783 if (zid
>= ZONE_ID__FIRST_DYNAMIC
) {
2784 panic("zone_create: invalid desired zone ID %d for %s",
2787 if (flags
& ZC_DESTRUCTIBLE
) {
2788 panic("zone_create: ID %d (%s) must be permanent", zid
, name
);
2790 if (zone_array
[zid
].z_self
) {
2791 panic("zone_create: creating zone ID %d (%s) twice", zid
, name
);
2793 z
= &zone_array
[zid
];
2795 if (flags
& ZC_DESTRUCTIBLE
) {
2797 * If possible, find a previously zdestroy'ed zone in the
2798 * zone_array that we can reuse.
2800 for (int i
= bitmap_first(zone_destroyed_bitmap
, MAX_ZONES
);
2801 i
>= 0; i
= bitmap_next(zone_destroyed_bitmap
, i
)) {
2805 * If the zone name and the element size are the
2806 * same, we can just reuse the old zone struct.
2808 if (strcmp(z
->z_name
, name
) || zone_elem_size(z
) != size
) {
2811 bitmap_clear(zone_destroyed_bitmap
, i
);
2812 z
->destroyed
= false;
2820 z
= zone_init_defaults(zid
);
2823 * The release barrier pairs with the acquire in
2824 * zone_index_foreach() and makes sure that enumeration loops
2825 * always see an initialized zone lock.
2827 os_atomic_store(&num_zones
, nzones
, release
);
2832 simple_unlock(&all_zones_lock
);
2839 zone_create_panic(const char *name
, const char *f1
, const char *f2
)
2841 panic("zone_create: creating zone %s: flag %s and %s are incompatible",
2844 #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
2845 if ((flags) & forbidden_flag) { \
2846 zone_create_panic(name, #current_flag, #forbidden_flag); \
/*
 * Adjusts the size of the element based on minimum size, alignment
 * and kasan redzones
 */
static vm_size_t
zone_elem_adjust_size(
	const char      *name __unused,
	vm_size_t       elem_size,
	zone_create_flags_t flags,
	vm_size_t       *redzone __unused)
{
	vm_size_t size;

	/*
	 * Adjust element size for minimum size and pointer alignment
	 */
	size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
	if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
		size = ZONE_MIN_ELEM_SIZE;
	}

#if KASAN_ZALLOC
	/*
	 * Expand the zone allocation size to include the redzones.
	 *
	 * For page-multiple zones add a full guard page because they
	 * likely require alignment.
	 */
	vm_size_t redzone_tmp;
	if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
		redzone_tmp = 0;
	} else if ((size & PAGE_MASK) == 0) {
		if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
			panic("zone_create: zone %s can't provide more than PAGE_SIZE"
			    "alignment", name);
		}
		redzone_tmp = PAGE_SIZE;
	} else if (flags & ZC_ALIGNMENT_REQUIRED) {
		redzone_tmp = 0;
	} else {
		redzone_tmp = KASAN_GUARD_SIZE;
	}
	size += redzone_tmp * 2;
	if (redzone) {
		*redzone = redzone_tmp;
	}
#endif
	return size;
}
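/*
 * Worked example of the rounding above (added for clarity, not from the
 * original source): on LP64, sizeof(vm_offset_t) is 8, so an element size of
 * 13 becomes (13 + 7) & ~7 = 16, while 16 stays 16; sizes below
 * ZONE_MIN_ELEM_SIZE are then bumped up to that minimum for non per-cpu
 * zones.
 */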
/*
 * Returns the allocation chunk size that has least fragmentation
 */
static vm_size_t
zone_get_min_alloc_granule(
	vm_size_t       elem_size,
	zone_create_flags_t flags)
{
	vm_size_t alloc_granule = PAGE_SIZE;
	if (flags & ZC_PERCPU) {
		alloc_granule = PAGE_SIZE * zpercpu_count();
		if (PAGE_SIZE % elem_size > 256) {
			panic("zone_create: per-cpu zone has too much fragmentation");
		}
	} else if ((elem_size & PAGE_MASK) == 0) {
		/* zero fragmentation by definition */
		alloc_granule = elem_size;
	} else if (alloc_granule % elem_size == 0) {
		/* zero fragmentation by definition */
	} else {
		vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
		vm_size_t alloc_tmp = PAGE_SIZE;
		while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
			vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
			if (frag_tmp < frag) {
				frag = frag_tmp;
				alloc_granule = alloc_tmp;
			}
		}
	}
	return alloc_granule;
}
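/*
 * Worked example (added for clarity, not from the original source): with
 * 4K pages and elem_size = 160 bytes, a single page wastes 4096 % 160 = 96
 * bytes (~2% fragmentation), while a 2-page chunk wastes 8192 % 160 = 32
 * bytes (~0.4%), so the loop above prefers the 2-page granule as long as it
 * stays within ZONE_MAX_ALLOC_SIZE.
 */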
2932 zone_get_foreign_alloc_size(
2933 const char *name __unused
,
2934 vm_size_t elem_size
,
2935 zone_create_flags_t flags
,
2938 vm_size_t adjusted_size
= zone_elem_adjust_size(name
, elem_size
, flags
,
2940 vm_size_t alloc_granule
= zone_get_min_alloc_granule(adjusted_size
,
2942 vm_size_t min_size
= min_pages
* PAGE_SIZE
;
2944 * Round up min_size to a multiple of alloc_granule
2946 return ((min_size
+ alloc_granule
- 1) / alloc_granule
)
2954 zone_create_flags_t flags
,
2955 zone_id_t desired_zid
,
2956 void (^extra_setup
)(zone_t
))
2962 if (size
> ZONE_MAX_ALLOC_SIZE
) {
2963 panic("zone_create: element size too large: %zd", (size_t)size
);
2966 size
= zone_elem_adjust_size(name
, size
, flags
, &redzone
);
2968 * Allocate the zone slot, return early if we found an older match.
2970 z
= zone_create_find(name
, size
, flags
, desired_zid
);
2971 if (__improbable(z
->z_self
)) {
2972 /* We found a zone to reuse */
2977 * Initialize the zone properly.
2981 * If the kernel is post lockdown, copy the zone name passed in.
2982 * Else simply maintain a pointer to the name string as it can only
2983 * be a core XNU zone (no unloadable kext exists before lockdown).
2985 if (startup_phase
>= STARTUP_SUB_LOCKDOWN
) {
2986 size_t nsz
= MIN(strlen(name
) + 1, MACH_ZONE_NAME_MAX_LEN
);
2987 char *buf
= zalloc_permanent(nsz
, ZALIGN_NONE
);
2988 strlcpy(buf
, name
, nsz
);
2994 * If zone_init() hasn't run yet, the permanent zones do not exist.
2995 * We can limp along without properly initialized stats for a while,
2996 * zone_init() will rebuild the missing stats when it runs.
2998 if (__probable(zone_array
[ZONE_ID_PERCPU_PERMANENT
].z_self
)) {
2999 z
->z_stats
= zalloc_percpu_permanent_type(struct zone_stats
);
3002 alloc
= zone_get_min_alloc_granule(size
, flags
);
3004 if (flags
& ZC_KALLOC_HEAP
) {
3005 size_t rem
= (alloc
% size
) / (alloc
/ size
);
3008 * Try to grow the elements size and spread them more if the remaining
3009 * space is large enough.
3011 size
+= rem
& ~(KALLOC_MINALIGN
- 1);
3014 z
->pcpu_elem_size
= z
->z_elem_size
= (uint16_t)size
;
3015 z
->alloc_pages
= (uint16_t)atop(alloc
);
3017 z
->kasan_redzone
= redzone
;
3018 if (strncmp(name
, "fakestack.", sizeof("fakestack.") - 1) == 0) {
3019 z
->kasan_fakestacks
= true;
3027 if (flags
& ZC_SEQUESTER
) {
3028 z
->va_sequester
= true;
3031 /* ZC_CACHING applied after all configuration is done */
3033 if (flags
& ZC_PERCPU
) {
3035 * ZC_CACHING is disallowed because it uses per-cpu zones for its
3036 * implementation and it would be circular. These allocations are
3037 * also quite expensive, so caching feels dangerous memory wise too.
3039 * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
3040 * pointer-sized allocations which poisoning doesn't support.
3042 zone_create_assert_not_both(name
, flags
, ZC_PERCPU
, ZC_CACHING
);
3043 zone_create_assert_not_both(name
, flags
, ZC_PERCPU
, ZC_ALLOW_FOREIGN
);
3045 z
->gzalloc_exempt
= true;
3046 z
->zfree_clear_mem
= true;
3047 z
->pcpu_elem_size
*= zpercpu_count();
3049 if (flags
& ZC_ZFREE_CLEARMEM
) {
3050 z
->zfree_clear_mem
= true;
3052 if (flags
& ZC_NOGC
) {
3053 z
->collectable
= false;
3055 if (flags
& ZC_NOENCRYPT
) {
3056 z
->noencrypt
= true;
3058 if (flags
& ZC_ALIGNMENT_REQUIRED
) {
3059 z
->alignment_required
= true;
3061 if (flags
& ZC_NOGZALLOC
) {
3062 z
->gzalloc_exempt
= true;
3064 if (flags
& ZC_NOCALLOUT
) {
3065 z
->no_callout
= true;
3067 if (flags
& ZC_DESTRUCTIBLE
) {
3068 zone_create_assert_not_both(name
, flags
, ZC_DESTRUCTIBLE
, ZC_CACHING
);
3069 zone_create_assert_not_both(name
, flags
, ZC_DESTRUCTIBLE
, ZC_ALLOW_FOREIGN
);
3070 z
->destructible
= true;
3074 * Handle Internal flags
3076 if (flags
& ZC_ALLOW_FOREIGN
) {
3077 z
->allows_foreign
= true;
3079 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA
& zsecurity_options
) &&
3080 (flags
& ZC_DATA_BUFFERS
)) {
3081 z
->submap_idx
= Z_SUBMAP_IDX_BAG_OF_BYTES_MAP
;
3083 if (flags
& ZC_KASAN_NOQUARANTINE
) {
3084 z
->kasan_noquarantine
= true;
3086 /* ZC_KASAN_NOREDZONE already handled */
3089 * Then if there's extra tuning, do it
3096 * Configure debugging features
3099 gzalloc_zone_init(z
); /* might set z->gzalloc_tracked */
3101 #if ZONE_ENABLE_LOGGING
3102 if (!z
->gzalloc_tracked
&& num_zones_logged
< max_num_zones_to_log
) {
3104 * Check for and set up zone leak detection if requested via boot-args.
3105 * might set z->zone_logging
3107 zone_setup_logging(z
);
3109 #endif /* ZONE_ENABLE_LOGGING */
3110 #if VM_MAX_TAG_ZONES
3111 if (!z
->gzalloc_tracked
&& z
->kalloc_heap
&& zone_tagging_on
) {
3112 static int tag_zone_index
;
3113 vm_offset_t esize
= zone_elem_size(z
);
3115 z
->tags_inline
= (((page_size
+ esize
- 1) / esize
) <=
3116 (sizeof(uint32_t) / sizeof(uint16_t)));
3117 z
->tag_zone_index
= os_atomic_inc_orig(&tag_zone_index
, relaxed
);
3118 assert(z
->tag_zone_index
< VM_MAX_TAG_ZONES
);
3123 * Finally, fixup properties based on security policies, boot-args, ...
3125 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA
& zsecurity_options
) &&
3126 z
->kalloc_heap
== KHEAP_ID_DATA_BUFFERS
) {
3127 z
->submap_idx
= Z_SUBMAP_IDX_BAG_OF_BYTES_MAP
;
3130 if ((ZSECURITY_OPTIONS_SEQUESTER
& zsecurity_options
) &&
3131 (flags
& ZC_NOSEQUESTER
) == 0 &&
3132 z
->submap_idx
== Z_SUBMAP_IDX_GENERAL_MAP
) {
3133 z
->va_sequester
= true;
3137 * Always clear zone elements smaller than a cacheline,
3138 * because it's pretty close to free.
3140 if (size
<= zp_min_size
) {
3141 z
->zfree_clear_mem
= true;
3143 if (zp_factor
!= 0 && !z
->zfree_clear_mem
) {
3144 z
->zp_count
= zone_poison_count_init(z
);
3148 if ((flags
& ZC_NOCACHING
) == 0) {
3150 * Append kalloc heap name to zone name (if zone is used by kalloc)
3152 char temp_zone_name
[MAX_ZONE_NAME
] = "";
3153 snprintf(temp_zone_name
, MAX_ZONE_NAME
, "%s%s", zone_heap_name(z
), z
->z_name
);
3155 /* Check if boot-arg specified it should have a cache */
3156 if (track_this_zone(temp_zone_name
, cache_zone_name
)) {
3157 flags
|= ZC_CACHING
;
3158 } else if (zcc_kalloc
&& z
->kalloc_heap
) {
3159 flags
|= ZC_CACHING
;
3162 if ((flags
& ZC_CACHING
) &&
3163 !z
->tags
&& !z
->zone_logging
&& !z
->gzalloc_tracked
) {
3166 #endif /* CONFIG_ZCACHE */
3177 zone_create_startup(struct zone_create_startup_spec
*spec
)
3179 *spec
->z_var
= zone_create_ext(spec
->z_name
, spec
->z_size
,
3180 spec
->z_flags
, spec
->z_zid
, spec
->z_setup
);
3184 * The 4 first field of a zone_view and a zone alias, so that the zone_or_view_t
3185 * union works. trust but verify.
3187 #define zalloc_check_zov_alias(f1, f2) \
3188 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
3189 zalloc_check_zov_alias(z_self
, zv_zone
);
3190 zalloc_check_zov_alias(z_stats
, zv_stats
);
3191 zalloc_check_zov_alias(z_name
, zv_name
);
3192 zalloc_check_zov_alias(z_views
, zv_next
);
3193 #undef zalloc_check_zov_alias
3197 zone_view_startup_init(struct zone_view_startup_spec
*spec
)
3199 struct kalloc_heap
*heap
= NULL
;
3200 zone_view_t zv
= spec
->zv_view
;
3203 switch (spec
->zv_heapid
) {
3204 case KHEAP_ID_DEFAULT
:
3205 heap
= KHEAP_DEFAULT
;
3207 case KHEAP_ID_DATA_BUFFERS
:
3208 heap
= KHEAP_DATA_BUFFERS
;
3218 z
= kalloc_heap_zone_for_size(heap
, spec
->zv_size
);
3222 assert(spec
->zv_size
<= zone_elem_size(z
));
3226 zv
->zv_stats
= zalloc_percpu_permanent_type(struct zone_stats
);
3227 zv
->zv_next
= z
->z_views
;
3228 if (z
->z_views
== NULL
&& z
->kalloc_heap
== KHEAP_ID_NONE
) {
3230 * count the raw view for zones not in a heap,
3231 * kalloc_heap_init() already counts it for its members.
3233 zone_view_count
+= 2;
3235 zone_view_count
+= 1;
3244 zone_create_flags_t flags
)
3246 return zone_create_ext(name
, size
, flags
, ZONE_ID_ANY
, NULL
);
3251 vm_size_t size
, /* the size of an element */
3252 vm_size_t max
, /* maximum memory to use */
3253 vm_size_t alloc __unused
, /* allocation size */
3254 const char *name
) /* a name for the zone */
3256 zone_t z
= zone_create(name
, size
, ZC_DESTRUCTIBLE
);
3257 zone_set_max(z
, max
);
3264 unsigned int zindex
= zone_index(z
);
3268 if (!z
->destructible
|| zone_caching_enabled(z
) || z
->allows_foreign
) {
3269 panic("zdestroy: Zone %s%s isn't destructible",
3270 zone_heap_name(z
), z
->z_name
);
3273 if (!z
->z_self
|| z
->expanding_no_vm_priv
|| z
->expanding_vm_priv
||
3274 z
->async_pending
|| z
->waiting
) {
3275 panic("zdestroy: Zone %s%s in an invalid state for destruction",
3276 zone_heap_name(z
), z
->z_name
);
3281 * Unset the valid bit. We'll hit an assert failure on further operations
3282 * on this zone, until zinit() is called again.
3284 * Leave the zone valid for KASan as we will see zfree's on quarantined free
3285 * elements even after the zone is destroyed.
3289 z
->destroyed
= true;
3292 /* Dump all the free elements */
3293 zone_drop_free_elements(z
);
3296 if (__improbable(z
->gzalloc_tracked
)) {
3297 /* If the zone is gzalloc managed dump all the elements in the free cache */
3298 gzalloc_empty_free_cache(z
);
3304 while (!zone_pva_is_null(z
->pages_sequester
)) {
3305 struct zone_page_metadata
*page_meta
;
3306 vm_offset_t free_addr
;
3308 page_meta
= zone_sequestered_page_get(z
, &free_addr
);
3310 kmem_free(submap_for_zone(z
), free_addr
, ptoa(z
->alloc_pages
));
3315 /* Assert that all counts are zero */
3316 if (z
->countavail
|| z
->countfree
|| zone_size_wired(z
) ||
3317 z
->allfree_page_count
|| z
->sequester_page_count
) {
3318 panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
3319 zone_heap_name(z
), z
->z_name
);
3322 /* consistency check: make sure everything is indeed empty */
3323 assert(zone_pva_is_null(z
->pages_any_free_foreign
));
3324 assert(zone_pva_is_null(z
->pages_all_used_foreign
));
3325 assert(zone_pva_is_null(z
->pages_all_free
));
3326 assert(zone_pva_is_null(z
->pages_intermediate
));
3327 assert(zone_pva_is_null(z
->pages_all_used
));
3328 assert(zone_pva_is_null(z
->pages_sequester
));
3333 simple_lock(&all_zones_lock
, &zone_locks_grp
);
3335 assert(!bitmap_test(zone_destroyed_bitmap
, zindex
));
3336 /* Mark the zone as empty in the bitmap */
3337 bitmap_set(zone_destroyed_bitmap
, zindex
);
3339 assert(num_zones_in_use
> 0);
3341 simple_unlock(&all_zones_lock
);
#pragma mark zone (re)fill, jetsam

/*
 * Dealing with zone allocations from the mach VM code.
 *
 * The implementation of the mach VM itself uses the zone allocator
 * for things like the vm_map_entry data structure. In order to prevent
 * an infinite recursion problem when adding more pages to a zone, zalloc
 * uses a replenish thread to refill the VM layer's zones before they have
 * too few remaining free entries. The reserved remaining free entries
 * guarantee that the VM routines can get entries from already mapped pages.
 *
 * In order for that to work, the amount of allocations in the nested
 * case has to be bounded. There are currently 2 replenish zones, and
 * if each needs 1 element of each zone to add a new page to itself, that
 * gives us a minimum reserve of 2 elements.
 *
 * There is also a deadlock issue with the zone garbage collection thread,
 * or any thread that is trying to free zone pages. While holding
 * the kernel's map lock they may need to allocate new VM map entries, hence
 * we need enough reserve to allow them to get past the point of holding the
 * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
 * until the replenish threads can finish. Since there's only 1 GC thread at a time,
 * that adds a minimum of 1 to the reserve size.
 *
 * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (from ARM)
 * as the refill size on all platforms.
 *
 * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2,
 * zalloc_ext() will wake the replenish thread. The replenish thread runs
 * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
 * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4
 * elements left. Below that point only the replenish threads themselves and the GC
 * thread may continue to use from the reserve.
 */
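/*
 * Illustrative sketch (not from the original source) of the thresholds
 * described above, expressed in elements: "reserve_target" stands for the
 * ZONE_REPLENISH_TARGET worth of elements (z->prio_refill_count). The
 * helper names below are hypothetical.
 */
#if 0 /* example only */
static void
example_check_reserve(zone_t z, bool privileged_caller)
{
	uint32_t reserve_target = z->prio_refill_count;

	if (z->countfree <= reserve_target / 2) {
		wake_replenish_thread(z);       /* hypothetical: kick the refill */
	}
	if (z->countfree <= reserve_target / 4 && !privileged_caller) {
		wait_for_replenish(z);          /* hypothetical: only replenish/GC may dip lower */
	}
}
#endif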
3379 static unsigned zone_replenish_loops
;
3380 static unsigned zone_replenish_wakeups
;
3381 static unsigned zone_replenish_wakeups_initiated
;
3382 static unsigned zone_replenish_throttle_count
;
3384 #define ZONE_REPLENISH_TARGET (16 * 1024)
3385 static unsigned zone_replenish_active
= 0; /* count of zones currently replenishing */
3386 static unsigned zone_replenish_max_threads
= 0;
3388 LCK_GRP_DECLARE(zone_replenish_lock_grp
, "zone_replenish_lock");
3389 LCK_SPIN_DECLARE(zone_replenish_lock
, &zone_replenish_lock_grp
);
3393 zone_replenish_panic(zone_t zone
, kern_return_t kr
)
3395 panic_include_zprint
= TRUE
;
3397 if ((zleak_state
& ZLEAK_STATE_ACTIVE
)) {
3398 panic_include_ztrace
= TRUE
;
3400 #endif /* CONFIG_ZLEAKS */
3401 if (kr
== KERN_NO_SPACE
) {
3402 zone_t zone_largest
= zone_find_largest();
3403 panic("zalloc: zone map exhausted while allocating from zone %s%s, "
3404 "likely due to memory leak in zone %s%s "
3405 "(%lu total bytes, %d elements allocated)",
3406 zone_heap_name(zone
), zone
->z_name
,
3407 zone_heap_name(zone_largest
), zone_largest
->z_name
,
3408 (unsigned long)zone_size_wired(zone_largest
),
3409 zone_count_allocated(zone_largest
));
3411 panic("zalloc: %s%s (%d elements) retry fail %d",
3412 zone_heap_name(zone
), zone
->z_name
,
3413 zone_count_allocated(zone
), kr
);
3417 zone_replenish_locked(zone_t z
, zalloc_flags_t flags
, bool asynchronously
)
3419 int kmaflags
= KMA_KOBJECT
| KMA_ZERO
;
3420 vm_offset_t space
, alloc_size
;
3425 kmaflags
|= KMA_NOENCRYPT
;
3427 if (flags
& Z_NOPAGEWAIT
) {
3428 kmaflags
|= KMA_NOPAGEWAIT
;
3431 kmaflags
|= KMA_PERMANENT
;
3435 struct zone_page_metadata
*page_meta
= NULL
;
3438 * Try to allocate our regular chunk of pages,
3439 * unless the system is under massive pressure
3440 * and we're looking for more than 2 pages.
3442 if (!z
->percpu
&& z
->alloc_pages
> 2 && (vm_pool_low() || retry
> 0)) {
3443 alloc_size
= round_page(zone_elem_size(z
));
3445 alloc_size
= ptoa(z
->alloc_pages
);
3446 page_meta
= zone_sequestered_page_get(z
, &space
);
3453 * Do the zone leak activation here because zleak_activate()
3454 * may block, and can't be done on the way out.
3456 if (__improbable(zleak_state
& ZLEAK_STATE_ENABLED
)) {
3457 if (!(zleak_state
& ZLEAK_STATE_ACTIVE
) &&
3458 zone_submaps_approx_size() >= zleak_global_tracking_threshold
) {
3459 kr
= zleak_activate();
3460 if (kr
!= KERN_SUCCESS
) {
3461 printf("Failed to activate live zone leak debugging (%d).\n", kr
);
3465 #endif /* CONFIG_ZLEAKS */
3468 * Trigger jetsams via the vm_pageout_garbage_collect thread if
3469 * we're running out of zone memory
3471 if (is_zone_map_nearing_exhaustion()) {
3472 thread_wakeup((event_t
) &vm_pageout_garbage_collect
);
3476 kr
= zone_sequestered_page_populate(z
, page_meta
, space
,
3477 alloc_size
, kmaflags
);
3479 if (z
->submap_idx
== Z_SUBMAP_IDX_GENERAL_MAP
&& z
->kalloc_heap
!= KHEAP_ID_NONE
) {
3480 kmaflags
|= KMA_KHEAP
;
3482 kr
= kernel_memory_allocate(submap_for_zone(z
),
3483 &space
, alloc_size
, 0, kmaflags
, VM_KERN_MEMORY_ZONE
);
3487 if (kr
== KERN_NO_SPACE
&& z
->allows_foreign
) {
3489 * For zones allowing foreign pages, fallback to the kernel map
3491 kr
= kernel_memory_allocate(kernel_map
, &space
,
3492 alloc_size
, 0, kmaflags
, VM_KERN_MEMORY_ZONE
);
3496 if (kr
== KERN_SUCCESS
) {
3500 if (flags
& Z_NOPAGEWAIT
) {
3505 if (asynchronously
) {
3506 assert_wait_timeout(&z
->prio_refill_count
,
3507 THREAD_UNINT
, 1, 100 * NSEC_PER_USEC
);
3508 thread_block(THREAD_CONTINUE_NULL
);
3509 } else if (++retry
== 3) {
3510 zone_replenish_panic(z
, kr
);
3516 zcram_and_lock(z
, space
, alloc_size
);
3519 if (__improbable(zleak_state
& ZLEAK_STATE_ACTIVE
)) {
3521 zone_size_wired(z
) >= zleak_per_zone_tracking_threshold
) {
3525 #endif /* CONFIG_ZLEAKS */
3529 * High priority VM privileged thread used to asynchronously refill a given zone.
3530 * These are needed for data structures used by the lower level VM itself. The
3531 * replenish thread maintains a reserve of elements, so that the VM will never
3532 * block in the zone allocator.
3536 zone_replenish_thread(void *_z
, wait_result_t __unused wr
)
3540 current_thread()->options
|= (TH_OPT_VMPRIV
| TH_OPT_ZONE_PRIV
);
3544 assert(z
->z_self
== z
);
3545 assert(z
->zone_replenishing
);
3546 assert(z
->prio_refill_count
!= 0);
3548 while (z
->countfree
< z
->prio_refill_count
) {
3549 assert(!z
->expanding_no_vm_priv
);
3550 assert(!z
->expanding_vm_priv
);
3552 zone_replenish_locked(z
, Z_WAITOK
, true);
3554 assert(z
->z_self
== z
);
3555 zone_replenish_loops
++;
3558 /* Wakeup any potentially throttled allocations. */
3561 assert_wait(&z
->prio_refill_count
, THREAD_UNINT
);
3564 * We finished refilling the zone, so decrement the active count
3565 * and wake up any waiting GC threads.
3567 lck_spin_lock(&zone_replenish_lock
);
3568 assert(zone_replenish_active
> 0);
3569 if (--zone_replenish_active
== 0) {
3570 thread_wakeup((event_t
)&zone_replenish_active
);
3572 lck_spin_unlock(&zone_replenish_lock
);
3574 z
->zone_replenishing
= false;
3577 thread_block(THREAD_CONTINUE_NULL
);
3578 zone_replenish_wakeups
++;
3583 zone_prio_refill_configure(zone_t z
)
3589 assert(!z
->prio_refill_count
&& !z
->destructible
);
3590 z
->prio_refill_count
= (uint16_t)(ZONE_REPLENISH_TARGET
/ zone_elem_size(z
));
3591 z
->zone_replenishing
= true;
3594 lck_spin_lock(&zone_replenish_lock
);
3595 ++zone_replenish_max_threads
;
3596 ++zone_replenish_active
;
3597 lck_spin_unlock(&zone_replenish_lock
);
3600 tres
= kernel_thread_start_priority(zone_replenish_thread
, z
,
3601 MAXPRI_KERNEL
, &th
);
3602 if (tres
!= KERN_SUCCESS
) {
3603 panic("zone_prio_refill_configure, thread create: 0x%x", tres
);
3606 thread_deallocate(th
);
3610 zone_randomize_freelist(zone_t zone
, struct zone_page_metadata
*meta
,
3611 vm_offset_t size
, zone_addr_kind_t kind
, unsigned int *entropy_buffer
)
3613 const vm_size_t elem_size
= zone_elem_size(zone
);
3614 vm_offset_t left
, right
, head
, base
;
3615 vm_offset_t element
;
3617 left
= ZONE_PAGE_FIRST_OFFSET(kind
);
3618 right
= size
- ((size
- left
) % elem_size
);
3620 base
= zone_meta_to_addr(meta
, kind
);
3622 while (left
< right
) {
3623 if (zone_leaks_scan_enable
|| __improbable(zone
->tags
) ||
3624 random_bool_gen_bits(&zone_bool_gen
, entropy_buffer
, MAX_ENTROPY_PER_ZCRAM
, 1)) {
3625 element
= base
+ left
;
3629 element
= base
+ right
;
3632 vm_offset_t
*primary
= (vm_offset_t
*)element
;
3633 vm_offset_t
*backup
= get_backup_ptr(elem_size
, primary
);
3635 *primary
= *backup
= head
^ zp_nopoison_cookie
;
3639 meta
->zm_freelist_offs
= (uint16_t)(head
- base
);
3643 * Cram the given memory into the specified zone. Update the zone page count accordingly.
3646 zcram_and_lock(zone_t zone
, vm_offset_t newmem
, vm_size_t size
)
3648 unsigned int entropy_buffer
[MAX_ENTROPY_PER_ZCRAM
] = { 0 };
3649 struct zone_page_metadata
*meta
;
3650 zone_addr_kind_t kind
;
3651 uint32_t pg_count
= (uint32_t)atop(size
);
3652 uint32_t zindex
= zone_index(zone
);
3653 uint32_t free_count
;
3654 uint16_t empty_freelist_offs
= PAGE_METADATA_EMPTY_FREELIST
;
3656 /* Basic sanity checks */
3657 assert(zone
!= ZONE_NULL
&& newmem
!= (vm_offset_t
)0);
3658 assert((newmem
& PAGE_MASK
) == 0);
3659 assert((size
& PAGE_MASK
) == 0);
3661 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC
, ZALLOC_ZCRAM
) | DBG_FUNC_START
,
3664 kind
= zone_addr_kind(newmem
, size
);
3665 #if DEBUG || DEVELOPMENT
3666 if (zalloc_debug
& ZALLOC_DEBUG_ZCRAM
) {
3667 kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone
,
3668 zone_heap_name(zone
), zone
->z_name
, (uintptr_t)newmem
,
3669 kind
== ZONE_ADDR_FOREIGN
? "[F]" : "", (uintptr_t)size
);
3671 #endif /* DEBUG || DEVELOPMENT */
3674 * Initialize the metadata for all pages. We dont need the zone lock
3675 * here because we are not manipulating any zone related state yet.
3677 * This includes randomizing the freelists as the metadata isn't
3681 if (kind
== ZONE_ADDR_NATIVE
) {
3683 * We're being called by zfill,
3684 * zone_replenish_thread or vm_page_more_fictitious,
3686 * which will only either allocate a single page, or `alloc_pages`
3689 assert(pg_count
<= zone
->alloc_pages
);
3692 * Make sure the range of metadata entries we're about to init
3693 * have proper physical backing, then initialize them.
3695 meta
= zone_meta_from_addr(newmem
, kind
);
3696 zone_meta_populate(meta
, meta
+ pg_count
);
3698 if (zone
->permanent
) {
3699 empty_freelist_offs
= 0;
3702 meta
[0] = (struct zone_page_metadata
){
3704 .zm_page_count
= pg_count
,
3705 .zm_percpu
= zone
->percpu
,
3706 .zm_freelist_offs
= empty_freelist_offs
,
3709 for (uint32_t i
= 1; i
< pg_count
; i
++) {
3710 meta
[i
] = (struct zone_page_metadata
){
3713 .zm_percpu
= zone
->percpu
,
3714 .zm_secondary_page
= true,
3715 .zm_freelist_offs
= empty_freelist_offs
,
3719 if (!zone
->permanent
) {
3720 zone_randomize_freelist(zone
, meta
,
3721 zone
->percpu
? PAGE_SIZE
: size
, kind
, entropy_buffer
);
3724 if (!zone
->allows_foreign
|| !from_foreign_range(newmem
, size
)) {
3725 panic("zcram_and_lock: foreign memory [%lx] being crammed is "
3726 "outside of foreign range", (uintptr_t)newmem
);
3730 * We cannot support elements larger than page size for foreign
3731 * memory because we put metadata on the page itself for each
3732 * page of foreign memory.
3734 * We need to do this in order to be able to reach the metadata
3735 * when any element is freed.
3737 assert(!zone
->percpu
&& !zone
->permanent
);
3738 assert(zone_elem_size(zone
) <= PAGE_SIZE
- sizeof(struct zone_page_metadata
));
3740 bzero((void *)newmem
, size
);
3742 for (vm_offset_t offs
= 0; offs
< size
; offs
+= PAGE_SIZE
) {
3743 meta
= (struct zone_page_metadata
*)(newmem
+ offs
);
3744 *meta
= (struct zone_page_metadata
){
3747 .zm_freelist_offs
= empty_freelist_offs
,
3749 meta
->zm_foreign_cookie
[0] = ZONE_FOREIGN_COOKIE
;
3750 zone_randomize_freelist(zone
, meta
, PAGE_SIZE
, kind
,
3755 #if VM_MAX_TAG_ZONES
3756 if (__improbable(zone
->tags
)) {
3757 assert(kind
== ZONE_ADDR_NATIVE
&& !zone
->percpu
);
3758 ztMemoryAdd(zone
, newmem
, size
);
3760 #endif /* VM_MAX_TAG_ZONES */
3763 * Insert the initialized pages / metadatas into the right lists.
3767 assert(zone
->z_self
== zone
);
3769 zone
->page_count
+= pg_count
;
3770 if (zone
->page_count_hwm
< zone
->page_count
) {
3771 zone
->page_count_hwm
= zone
->page_count
;
3773 os_atomic_add(&zones_phys_page_count
, pg_count
, relaxed
);
3775 if (kind
== ZONE_ADDR_NATIVE
) {
3776 os_atomic_add(&zones_phys_page_mapped_count
, pg_count
, relaxed
);
3777 if (zone
->permanent
) {
3778 zone_meta_queue_push(zone
, &zone
->pages_intermediate
, meta
, kind
);
3780 zone_meta_queue_push(zone
, &zone
->pages_all_free
, meta
, kind
);
3781 zone
->allfree_page_count
+= meta
->zm_page_count
;
3783 free_count
= zone_elem_count(zone
, size
, kind
);
3784 zone
->countfree
+= free_count
;
3785 zone
->countavail
+= free_count
;
3787 free_count
= zone_elem_count(zone
, PAGE_SIZE
, kind
);
3788 for (vm_offset_t offs
= 0; offs
< size
; offs
+= PAGE_SIZE
) {
3789 meta
= (struct zone_page_metadata
*)(newmem
+ offs
);
3790 zone_meta_queue_push(zone
, &zone
->pages_any_free_foreign
, meta
, kind
);
3791 zone
->countfree
+= free_count
;
3792 zone
->countavail
+= free_count
;
3796 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC
, ZALLOC_ZCRAM
) | DBG_FUNC_END
, zindex
);
3800 zcram(zone_t zone
, vm_offset_t newmem
, vm_size_t size
)
3802 zcram_and_lock(zone
, newmem
, size
);
3807 * Fill a zone with enough memory to contain at least nelem elements.
3808 * Return the number of elements actually put into the zone, which may
3809 * be more than the caller asked for since the memory allocation is
3810 * rounded up to the next zone allocation size.
3820 vm_size_t alloc_size
= ptoa(zone
->alloc_pages
);
3821 vm_size_t nalloc_inc
= zone_elem_count(zone
, alloc_size
, ZONE_ADDR_NATIVE
);
3822 vm_size_t nalloc
= 0, goal
= MAX(0, nelem
);
3823 int kmaflags
= KMA_KOBJECT
| KMA_ZERO
;
3825 if (zone
->noencrypt
) {
3826 kmaflags
|= KMA_NOENCRYPT
;
3829 assert(!zone
->allows_foreign
&& !zone
->permanent
);
3832 * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
3833 * running out of zone memory
3835 if (is_zone_map_nearing_exhaustion()) {
3836 thread_wakeup((event_t
) &vm_pageout_garbage_collect
);
3839 if (zone
->va_sequester
) {
3843 struct zone_page_metadata
*page_meta
;
3844 page_meta
= zone_sequestered_page_get(zone
, &memory
);
3845 if (NULL
== page_meta
) {
3850 kr
= zone_sequestered_page_populate(zone
, page_meta
,
3851 memory
, alloc_size
, kmaflags
);
3852 if (KERN_SUCCESS
!= kr
) {
3856 zcram_and_lock(zone
, memory
, alloc_size
);
3857 nalloc
+= nalloc_inc
;
3858 } while (nalloc
< goal
);
3864 while (nalloc
< goal
) {
3865 kr
= kernel_memory_allocate(submap_for_zone(zone
), &memory
,
3866 alloc_size
, 0, kmaflags
, VM_KERN_MEMORY_ZONE
);
3867 if (kr
!= KERN_SUCCESS
) {
3868 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
3869 __func__
, (unsigned long)(nalloc
* alloc_size
));
3873 zcram(zone
, memory
, alloc_size
);
3874 nalloc
+= nalloc_inc
;
/*
 * We're being very conservative here and picking a value of 95%. We might need to lower this if
 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
 */
#define ZONE_MAP_JETSAM_LIMIT_DEFAULT   95

/*
 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
 */
TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
    ZONE_MAP_JETSAM_LIMIT_DEFAULT);
void
get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
{
	vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
	*current_size = ptoa_64(phys_pages);
	*capacity = zone_phys_mapped_max;
}

void
get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
{
	zone_t largest_zone = zone_find_largest();

	/*
	 * Append kalloc heap name to zone name (if zone is used by kalloc)
	 */
	snprintf(zone_name, zone_name_len, "%s%s",
	    zone_heap_name(largest_zone), largest_zone->z_name);

	*zone_size = zone_size_wired(largest_zone);
}

boolean_t
is_zone_map_nearing_exhaustion(void)
{
	vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
	return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
}
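/*
 * Worked example (added for clarity, not from the original source): with the
 * default zone_map_jetsam_limit of 95 and a hypothetical zone_phys_mapped_max
 * of 4 GB, the check above reports exhaustion once more than
 * 4 GB * 95 / 100 = 3.8 GB of zone memory is physically mapped.
 */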
#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98

/*
 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the
 * memorystatus thread to walk through the jetsam priority bands and kill processes.
 */
3930 kill_process_in_largest_zone(void)
3933 zone_t largest_zone
= zone_find_largest();
3935 printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
3936 ptoa_64(os_atomic_load(&zones_phys_page_mapped_count
, relaxed
)), ptoa_64(zone_phys_mapped_max
),
3937 ptoa_64(os_atomic_load(&zones_phys_page_count
, relaxed
)),
3938 (uint64_t)zone_submaps_approx_size(),
3939 (uint64_t)zone_range_size(&zone_info
.zi_map_range
),
3940 zone_map_jetsam_limit
);
3941 printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone
),
3942 largest_zone
->z_name
, (uintptr_t)zone_size_wired(largest_zone
));
3945 * We want to make sure we don't call this function from userspace.
3946 * Or we could end up trying to synchronously kill the process
3947 * whose context we're in, causing the system to hang.
3949 assert(current_task() == kernel_task
);
3952 * If vm_object_zone is the largest, check to see if the number of
3953 * elements in vm_map_entry_zone is comparable.
3955 * If so, consider vm_map_entry_zone as the largest. This lets us target
3956 * a specific process to jetsam to quickly recover from the zone map
3959 if (largest_zone
== vm_object_zone
) {
3960 unsigned int vm_object_zone_count
= zone_count_allocated(vm_object_zone
);
3961 unsigned int vm_map_entry_zone_count
= zone_count_allocated(vm_map_entry_zone
);
3962 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3963 if (vm_map_entry_zone_count
>= ((vm_object_zone_count
* VMENTRY_TO_VMOBJECT_COMPARISON_RATIO
) / 100)) {
3964 largest_zone
= vm_map_entry_zone
;
3965 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
3966 (uintptr_t)zone_size_wired(largest_zone
));
3970 /* TODO: Extend this to check for the largest process in other zones as well. */
3971 if (largest_zone
== vm_map_entry_zone
) {
3972 pid
= find_largest_process_vm_map_entries();
3974 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
3975 "Waking up memorystatus thread.\n", zone_heap_name(largest_zone
),
3976 largest_zone
->z_name
);
3978 if (!memorystatus_kill_on_zone_map_exhaustion(pid
)) {
3979 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid
);
3983 #pragma mark zalloc module init
3986 * Initialize the "zone of zones" which uses fixed memory allocated
3987 * earlier in memory initialization. zone_bootstrap is called
3992 zone_bootstrap(void)
3994 /* Validate struct zone_page_metadata expectations */
3995 if ((1U << ZONE_PAGECOUNT_BITS
) <
3996 atop(ZONE_MAX_ALLOC_SIZE
) * sizeof(struct zone_page_metadata
)) {
3997 panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
4000 /* Validate struct zone_packed_virtual_address expectations */
4001 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS
< 0, "the top bit must be 1");
4002 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS
- PAGE_SHIFT
> 31) {
4003 panic("zone_pva_t can't pack a kernel page address in 31 bits");
4006 zpercpu_early_count
= ml_early_cpu_max_number() + 1;
4008 /* Set up zone element poisoning */
4011 random_bool_init(&zone_bool_gen
);
4014 * the KASAN quarantine for kalloc doesn't understand heaps
4015 * and trips the heap confusion panics. At the end of the day,
4016 * all these security measures are double duty with KASAN.
4018 * On 32bit kernels, these protections are just too expensive.
4020 #if !defined(__LP64__) || KASAN_ZALLOC
4021 zsecurity_options
&= ~ZSECURITY_OPTIONS_SEQUESTER
;
4022 zsecurity_options
&= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA
;
4023 zsecurity_options
&= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC
;
4026 thread_call_setup(&call_async_alloc
, zalloc_async
, NULL
);
4029 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
4030 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name
, sizeof(cache_zone_name
))) {
4031 printf("zcache: caching enabled for zone %s\n", cache_zone_name
);
4033 #endif /* CONFIG_ZCACHE */
4038 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024)
4040 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024)
4042 #endif /* __LP64__ */
4044 #define SINGLE_GUARD 16384
4045 #define MULTI_GUARD (3 * SINGLE_GUARD)
4048 static inline vm_offset_t
4049 zone_restricted_va_max(void)
4051 vm_offset_t compressor_max
= VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR
);
4052 vm_offset_t vm_page_max
= VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR
);
4054 return trunc_page(MIN(compressor_max
, vm_page_max
));
4060 zone_tunables_fixup(void)
4062 if (zone_map_jetsam_limit
== 0 || zone_map_jetsam_limit
> 100) {
4063 zone_map_jetsam_limit
= ZONE_MAP_JETSAM_LIMIT_DEFAULT
;
4066 STARTUP(TUNABLES
, STARTUP_RANK_MIDDLE
, zone_tunables_fixup
);
4070 zone_phys_size_max(void)
4072 mach_vm_size_t zsize
;
4075 if (PE_parse_boot_argn("zsize", &zsizearg
, sizeof(zsizearg
))) {
4076 zsize
= zsizearg
* (1024ULL * 1024);
4078 zsize
= sane_size
>> 2; /* Set target zone size as 1/4 of physical memory */
4079 #if defined(__LP64__)
4080 zsize
+= zsize
>> 1;
4081 #endif /* __LP64__ */
4084 if (zsize
< CONFIG_ZONE_MAP_MIN
) {
4085 zsize
= CONFIG_ZONE_MAP_MIN
; /* Clamp to min */
4087 if (zsize
> sane_size
>> 1) {
4088 zsize
= sane_size
>> 1; /* Clamp to half of RAM max */
4090 if (zsizearg
== 0 && zsize
> ZONE_MAP_MAX
) {
4091 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
4092 vm_size_t orig_zsize
= zsize
;
4093 zsize
= ZONE_MAP_MAX
;
4094 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
4095 (uintptr_t)orig_zsize
, (uintptr_t)zsize
);
4098 assert((vm_size_t
) zsize
== zsize
);
4099 return (vm_size_t
)trunc_page(zsize
);
static struct zone_map_range
zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
{
	struct zone_map_range r;
	kern_return_t kr;

	if (guard) {
		vm_map_offset_t addr = *submap_min;
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

		vmk_flags.vmkf_permanent = TRUE;
		kr = vm_map_enter(kernel_map, &addr, size, 0,
		    VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		*submap_min = (vm_offset_t)addr;
	} else {
		kr = kernel_memory_allocate(kernel_map, submap_min, size,
		    0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
	}
	if (kr != KERN_SUCCESS) {
		panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
		    (uintptr_t)*submap_min, (size_t)size, kr);
	}

	r.min_address = *submap_min;
	*submap_min  += size;
	r.max_address = *submap_min;

	return r;
}
static void
zone_submap_init(
	vm_offset_t *submap_min,
	unsigned idx,
	uint64_t zone_sub_map_numer,
	uint64_t *remaining_denom,
	vm_offset_t *remaining_size,
	vm_size_t guard_size)
{
	vm_offset_t submap_start, submap_end;
	vm_size_t submap_size;
	vm_map_t submap;
	kern_return_t kr;

	submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
	    *remaining_denom);
	submap_start = *submap_min;
	submap_end = submap_start + submap_size;

#if defined(__LP64__)
	if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
		vm_offset_t restricted_va_max = zone_restricted_va_max();
		if (submap_end > restricted_va_max) {
#if DEBUG || DEVELOPMENT
			printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
			    (size_t)(restricted_va_max - submap_start) >> 20,
			    (size_t)submap_size >> 20);
#endif /* DEBUG || DEVELOPMENT */
			guard_size += submap_end - restricted_va_max;
			*remaining_size -= submap_end - restricted_va_max;
			submap_end = restricted_va_max;
			submap_size = restricted_va_max - submap_start;
		}

		vm_packing_verify_range("vm_compressor",
		    submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
		vm_packing_verify_range("vm_page",
		    submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
	}
#endif /* defined(__LP64__) */

	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_permanent = TRUE;
	kr = kmem_suballoc(kernel_map, submap_min, submap_size,
	    FALSE, VM_FLAGS_FIXED, vmk_flags,
	    VM_KERN_MEMORY_ZONE, &submap);
	if (kr != KERN_SUCCESS) {
		panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
		    idx, (void *)submap_start, (void *)submap_end, kr);
	}

#if DEBUG || DEVELOPMENT
	printf("zone_init: submap[%d] %p:%p (%zuM)\n",
	    idx, (void *)submap_start, (void *)submap_end,
	    (size_t)submap_size >> 20);
#endif /* DEBUG || DEVELOPMENT */

	zone_submaps[idx] = submap;
	*submap_min = submap_end;
	*remaining_size -= submap_size;
	*remaining_denom -= zone_sub_map_numer;

	zone_init_allocate_va(submap_min, guard_size, true);
}
/*
 * Global initialization of Zone Allocator.
 * Runs after zone_bootstrap.
 */
__startup_func
static void
zone_init(void)
{
	vm_size_t zone_meta_size;
	vm_size_t zone_map_size;
	vm_size_t remaining_size;
	vm_offset_t submap_min = 0;

	if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
		zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
	} else {
		zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
	}
	zone_phys_mapped_max = zone_phys_size_max();

#if defined(__LP64__)
	zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
#else
	zone_map_size = zone_phys_mapped_max;
#endif /* defined(__LP64__) */

	zone_meta_size = round_page(atop(zone_map_size) *
	    sizeof(struct zone_page_metadata));

	/*
	 * Zone "map" layout:
	 *
	 * [ VA_RESTRICTED ]  <-- LP64 only
	 * [ SINGLE_GUARD  ]  <-- LP64 only
	 * [ meta          ]
	 * [ SINGLE_GUARD  ]
	 * [ map<i>        ]  \ for each extra map
	 * [ MULTI_GUARD   ]  /
	 */
	remaining_size = zone_map_size;
#if defined(__LP64__)
	remaining_size -= SINGLE_GUARD;
#endif /* defined(__LP64__) */
	remaining_size -= zone_meta_size + SINGLE_GUARD;
	remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
	    Z_SUBMAP_IDX_GENERAL_MAP + 1);

#if VM_MAX_TAG_ZONES
	if (zone_tagging_on) {
		zone_tagging_init(zone_map_size);
	}
#endif /* VM_MAX_TAG_ZONES */

	uint64_t remaining_denom = 0;
	uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
#if defined(__LP64__)
		[Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
#endif /* defined(__LP64__) */
		[Z_SUBMAP_IDX_GENERAL_MAP]       = 40,
		[Z_SUBMAP_IDX_BAG_OF_BYTES_MAP]  = 40,
	};

	for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
#if DEBUG || DEVELOPMENT
		char submap_name[MAX_SUBMAP_NAME];
		snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
		PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
#endif /* DEBUG || DEVELOPMENT */
		remaining_denom += zone_sub_map_numer[idx];
	}

	/*
	 * And now allocate the various pieces of VA and submaps.
	 *
	 * Make a first allocation of contiguous VA, that we'll deallocate,
	 * and we'll carve-out memory in that range again linearly.
	 * The kernel is still single threaded at this stage.
	 */
	struct zone_map_range *map_range = &zone_info.zi_map_range;

	*map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
	submap_min = map_range->min_address;
	kmem_free(kernel_map, submap_min, zone_map_size);

#if defined(__LP64__)
	/*
	 * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
	 * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
	 */
	zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
	    zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
	    &remaining_size, SINGLE_GUARD);
#endif /* defined(__LP64__) */

	/*
	 * Allocate metadata array
	 */
	zone_info.zi_meta_range =
	    zone_init_allocate_va(&submap_min, zone_meta_size, true);
	zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);

	zone_info.zi_array_base =
	    (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
	    zone_pva_from_addr(map_range->min_address).packed_address;

	/*
	 * Allocate other submaps
	 */
	for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
		zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
		    &remaining_denom, &remaining_size, MULTI_GUARD);
	}

	vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
	zone_info.zi_general_range.min_address = vm_map_min(general_map);
	zone_info.zi_general_range.max_address = vm_map_max(general_map);

	assert(submap_min == map_range->max_address);

#if CONFIG_GZALLOC
	gzalloc_init(zone_map_size);
#endif /* CONFIG_GZALLOC */

	zone_create_flags_t kma_flags = ZC_NOCACHING |
	    ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
	    ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;

	(void)zone_create_ext("vm.permanent", 1, kma_flags,
	    ZONE_ID_PERMANENT, ^(zone_t z){
		z->permanent = true;
		z->pcpu_elem_size = 1;
#if defined(__LP64__)
		z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
#endif
	});
	(void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
	    ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
		z->permanent = true;
		z->pcpu_elem_size = zpercpu_count();
#if defined(__LP64__)
		z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
#endif
	});

	/*
	 * Now fix the zones that are missing their zone stats
	 * we don't really know if zfree()s happened so our stats
	 * are slightly off for early boot. ¯\_(ツ)_/¯
	 */
	zone_index_foreach(idx) {
		zone_t tz = &zone_array[idx];

		if (tz->z_self) {
			zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);

			zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
			    (tz->countavail - tz->countfree) *
			    zone_elem_size(tz);
			assert(tz->z_stats == NULL);
			tz->z_stats = zs;
#if ZONE_ENABLE_LOGGING
			if (tz->zone_logging && !tz->zlog_btlog) {
				zone_enable_logging(tz);
			}
#endif /* ZONE_ENABLE_LOGGING */
		}
	}

#if CONFIG_ZLEAKS
	/*
	 * Initialize the zone leak monitor
	 */
	zleak_init(zone_map_size);
#endif /* CONFIG_ZLEAKS */

#if VM_MAX_TAG_ZONES
	if (zone_tagging_on) {
		vm_allocation_zones_init();
	}
#endif /* VM_MAX_TAG_ZONES */
}
STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
void
zone_set_foreign_range(
	vm_offset_t range_min,
	vm_offset_t range_max)
{
	zone_info.zi_foreign_range.min_address = range_min;
	zone_info.zi_foreign_range.max_address = range_max;
}

vm_offset_t
zone_foreign_mem_init(vm_size_t size)
{
	vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);

	zone_set_foreign_range(mem, mem + size);
	return mem;
}
#if KASAN_ZALLOC

/*
 * Called from zfree() to add the element being freed to the KASan quarantine.
 *
 * Returns true if the newly-freed element made it into the quarantine without
 * displacing another, false otherwise. In the latter case, addrp points to the
 * address of the displaced element, which will be freed by the zone.
 */
static bool
kasan_quarantine_freed_element(
	zone_t          *zonep,         /* the zone the element is being freed to */
	void            **addrp)        /* address of the element being freed */
{
	zone_t zone = *zonep;
	void *addr = *addrp;

	/*
	 * Resize back to the real allocation size and hand off to the KASan
	 * quarantine. `addr` may then point to a different allocation, if the
	 * current element replaced another in the quarantine. The zone then
	 * takes ownership of the swapped out free element.
	 */
	vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
	vm_size_t sz = usersz;

	if (addr && zone->kasan_redzone) {
		kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
		addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
		assert(sz == zone_elem_size(zone));
	}
	if (addr && !zone->kasan_noquarantine) {
		kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
		if (!addr) {
			return TRUE;
		}
	}
	if (addr && zone->kasan_noquarantine) {
		kasan_unpoison(addr, zone_elem_size(zone));
	}
	*addrp = addr;
	return FALSE;
}

#endif /* KASAN_ZALLOC */
static bool
zone_needs_async_refill(zone_t zone)
{
	if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
		return false;
	}

	return zone->expandable || zone->page_count < zone->page_count_max;
}
__attribute__((noinline))
static void
zone_refill_synchronously_locked(
	zone_t          zone,
	zalloc_flags_t  flags)
{
	thread_t thr = current_thread();
	bool set_expanding_vm_priv = false;
	zone_pva_t orig = zone->pages_intermediate;

	while ((flags & Z_NOWAIT) == 0 && (zone->permanent
	    ? zone_pva_is_equal(zone->pages_intermediate, orig)
	    : zone->countfree == 0)) {
		/*
		 * zone is empty, try to expand it
		 *
		 * Note that we now allow up to 2 threads (1 vm_privileged and
		 * 1 non-vm_privileged) to expand the zone concurrently...
		 *
		 * this is necessary to avoid stalling vm_privileged threads
		 * running critical code necessary to continue
		 * compressing/swapping pages (i.e. making new free pages) from
		 * stalling behind non-vm_privileged threads waiting to acquire
		 * free pages when the vm_page_free_count is below the
		 * vm_page_free_reserved limit.
		 */
		if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
		    (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
			/*
			 * This is a non-vm_privileged thread and a non-vm_privileged or
			 * a vm_privileged thread is already expanding the zone, or
			 * this is a vm_privileged thread and a vm_privileged thread is
			 * already expanding the zone...
			 *
			 * In either case wait for a thread to finish, then try again.
			 */
			zone->waiting = true;
			assert_wait(zone, THREAD_UNINT);
			unlock_zone(zone);
			thread_block(THREAD_CONTINUE_NULL);
			lock_zone(zone);
			continue;
		}

		if (zone->page_count >= zone->page_count_max) {
			if (zone->exhaustible) {
				break;
			}
			if (zone->expandable) {
				/*
				 * If we're expandable, just don't go through this again.
				 */
				zone->page_count_max = ~0u;
			} else {
				panic_include_zprint = true;
#if CONFIG_ZLEAKS
				if (zleak_state & ZLEAK_STATE_ACTIVE) {
					panic_include_ztrace = true;
				}
#endif /* CONFIG_ZLEAKS */
				panic("zalloc: zone \"%s\" empty.", zone->z_name);
			}
		}

		/*
		 * It is possible that a BG thread is refilling/expanding the zone
		 * and gets pre-empted during that operation. That blocks all other
		 * threads from making progress leading to a watchdog timeout. To
		 * avoid that, boost the thread priority using the rwlock boost
		 */
		set_thread_rwlock_boost();

		if ((thr->options & TH_OPT_VMPRIV)) {
			zone->expanding_vm_priv = true;
			set_expanding_vm_priv = true;
		} else {
			zone->expanding_no_vm_priv = true;
		}

		zone_replenish_locked(zone, flags, false);

		if (set_expanding_vm_priv == true) {
			zone->expanding_vm_priv = false;
		} else {
			zone->expanding_no_vm_priv = false;
		}

		if (zone->waiting) {
			zone->waiting = false;
			thread_wakeup(zone);
		}
		clear_thread_rwlock_boost();

		if (zone->countfree == 0) {
			assert(flags & Z_NOPAGEWAIT);
			break;
		}
	}

	if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
	    zone_needs_async_refill(zone) && !vm_pool_low()) {
		zone->async_pending = true;
		unlock_zone(zone);
		thread_call_enter(&call_async_alloc);
		lock_zone(zone);
		assert(zone->z_self == zone);
	}
}
__attribute__((noinline))
static void
zone_refill_asynchronously_locked(zone_t zone)
{
	uint32_t min_free = zone->prio_refill_count / 2;
	uint32_t resv_free = zone->prio_refill_count / 4;
	thread_t thr = current_thread();

	/*
	 * Nothing to do if there are plenty of elements.
	 */
	while (zone->countfree <= min_free) {
		/*
		 * Wakeup the replenish thread if not running.
		 */
		if (!zone->zone_replenishing) {
			lck_spin_lock(&zone_replenish_lock);
			assert(zone_replenish_active < zone_replenish_max_threads);
			++zone_replenish_active;
			lck_spin_unlock(&zone_replenish_lock);
			zone->zone_replenishing = true;
			zone_replenish_wakeups_initiated++;
			thread_wakeup(&zone->prio_refill_count);
		}

		/*
		 * We'll let VM_PRIV threads continue to allocate until the
		 * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
		 * may keep drawing from the reserve.
		 *
		 * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish thread itself.
		 * Replenish threads *need* to use the reserve. GC threads need to
		 * get through the current allocation, but then will wait at a higher
		 * level after they've dropped any locks which would deadlock the
		 * replenish thread.
		 */
		if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
		    (thr->options & TH_OPT_ZONE_PRIV)) {
			break;
		}

		/*
		 * Wait for the replenish threads to add more elements for us to allocate from.
		 */
		zone_replenish_throttle_count++;
		unlock_zone(zone);
		assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
		thread_block(THREAD_CONTINUE_NULL);
		lock_zone(zone);

		assert(zone->z_self == zone);
	}

	/*
	 * If we're here because of zone_gc(), we didn't wait for
	 * zone_replenish_thread to finish. So we need to ensure that
	 * we will successfully grab an element.
	 *
	 * This only applies to TH_OPT_ZONE_PRIV threads operating on
	 * zones that have a replenish thread configured.
	 * The value of (refill_level / 2) in the previous bit of code should have
	 * given us headroom even though this thread didn't wait.
	 */
	if (thr->options & TH_OPT_ZONE_PRIV) {
		assert(zone->countfree != 0);
	}
}
#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS

__attribute__((noinline))
static void
zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
{
	uintptr_t zbt[MAX_ZTRACE_DEPTH];        /* used in zone leak logging and zone leak detection */
	unsigned int numsaved = 0;

#if ZONE_ENABLE_LOGGING
	if (DO_LOGGING(zone)) {
		numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
		    __builtin_frame_address(0), NULL);
		btlog_add_entry(zone->zlog_btlog, (void *)addr,
		    ZOP_ALLOC, (void **)zbt, numsaved);
	}
#endif /* ZONE_ENABLE_LOGGING */

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: capture a backtrace every zleak_sample_factor
	 * allocations in this zone.
	 */
	if (__improbable(zone->zleak_on)) {
		if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
			/* Avoid backtracing twice if zone logging is on */
			if (numsaved == 0) {
				numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
				    __builtin_frame_address(1), NULL);
			}
			/* Sampling can fail if another sample is happening at the same time in a different zone. */
			if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
				/* If it failed, roll back the counter so we sample the next allocation instead. */
				zone->zleak_capture = zleak_sample_factor;
			}
		}
	}

	if (__improbable(zone_leaks_scan_enable &&
	    !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
		unsigned int count, idx;
		/* Fill element, from tail, with backtrace in reverse order */
		if (numsaved == 0) {
			numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
			    __builtin_frame_address(1), NULL);
		}
		count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
		if (count >= numsaved) {
			count = numsaved - 1;
		}
		for (idx = 0; idx < count; idx++) {
			((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
		}
	}
#endif /* CONFIG_ZLEAKS */
}
static inline bool
zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
{
#if ZONE_ENABLE_LOGGING
	if (DO_LOGGING(zone)) {
		return true;
	}
#endif /* ZONE_ENABLE_LOGGING */
#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: capture a backtrace every zleak_sample_factor
	 * allocations in this zone.
	 */
	if (zone->zleak_on) {
		return true;
	}
	if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
		return true;
	}
#endif /* CONFIG_ZLEAKS */
	return false;
}
#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
#if ZONE_ENABLE_LOGGING

__attribute__((noinline))
static void
zfree_log_trace(zone_t zone, vm_offset_t addr)
{
	/*
	 * See if we're doing logging on this zone.
	 *
	 * There are two styles of logging used depending on
	 * whether we're trying to catch a leak or corruption.
	 */
	if (__improbable(DO_LOGGING(zone))) {
		if (corruption_debug_flag) {
			uintptr_t zbt[MAX_ZTRACE_DEPTH];
			unsigned int numsaved;
			/*
			 * We're logging to catch a corruption.
			 *
			 * Add a record of this zfree operation to log.
			 */
			numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
			    __builtin_frame_address(1), NULL);
			btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
			    (void **)zbt, numsaved);
		} else {
			/*
			 * We're logging to catch a leak.
			 *
			 * Remove any record we might have for this element
			 * since it's being freed. Note that we may not find it
			 * if the buffer overflowed and that's OK.
			 *
			 * Since the log is of a limited size, old records get
			 * overwritten if there are more zallocs than zfrees.
			 */
			btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
		}
	}
}
#endif /* ZONE_ENABLE_LOGGING */
/*
 * Removes an element from the zone's free list, returning 0 if the free list is empty.
 * Verifies that the next-pointer and backup next-pointer are intact,
 * and verifies that a poisoned element hasn't been modified.
 */
vm_offset_t
zalloc_direct_locked(
	zone_t          zone,
	zalloc_flags_t  flags __unused,
	vm_size_t       waste __unused)
{
	struct zone_page_metadata *page_meta;
	zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
	vm_offset_t element, page, validate_bit = 0;

	/* if zone is empty, bail */
	if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
		kind = ZONE_ADDR_FOREIGN;
		page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
		page = (vm_offset_t)page_meta;
	} else if (!zone_pva_is_null(zone->pages_intermediate)) {
		page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
		page = zone_pva_to_addr(zone->pages_intermediate);
	} else if (!zone_pva_is_null(zone->pages_all_free)) {
		page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
		page = zone_pva_to_addr(zone->pages_all_free);
		if (os_sub_overflow(zone->allfree_page_count,
		    page_meta->zm_page_count, &zone->allfree_page_count)) {
			zone_accounting_panic(zone, "allfree_page_count wrap-around");
		}
	} else {
		zone_accounting_panic(zone, "countfree corruption");
	}

	if (!zone_has_index(zone, page_meta->zm_index)) {
		zone_page_metadata_index_confusion_panic(zone, page, page_meta);
	}

	element = zone_page_meta_get_freelist(zone, page_meta, page);

	vm_offset_t *primary = (vm_offset_t *) element;
	vm_offset_t *backup  = get_backup_ptr(zone_elem_size(zone), primary);

	/*
	 * since the primary next pointer is xor'ed with zp_nopoison_cookie
	 * for obfuscation, retrieve the original value back
	 */
	vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
	vm_offset_t next_element_primary = *primary;
	vm_offset_t next_element_backup = *backup;

	/*
	 * backup_ptr_mismatch_panic will determine what next_element
	 * should have been, and print it appropriately
	 */
	if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
		backup_ptr_mismatch_panic(zone, page_meta, page, element);
	}

	/* Check the backup pointer for the regular cookie */
	if (__improbable(next_element_primary != next_element_backup)) {
		/* Check for the poisoned cookie instead */
		if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
			/* Neither cookie is valid, corruption has occurred */
			backup_ptr_mismatch_panic(zone, page_meta, page, element);
		}

		/*
		 * Element was marked as poisoned, so check its integrity before using it.
		 */
		validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
	} else if (zone->zfree_clear_mem) {
		validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
	}

	/* Remove this element from the free list */
	zone_page_meta_set_freelist(page_meta, page, next_element);

	if (kind == ZONE_ADDR_FOREIGN) {
		if (next_element == 0) {
			/* last foreign element allocated on page, move to all_used_foreign */
			zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
		}
	} else if (next_element == 0) {
		zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
	} else if (page_meta->zm_alloc_count == 0) {
		/* remove from free, move to intermediate */
		zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
	}

	if (os_add_overflow(page_meta->zm_alloc_count, 1,
	    &page_meta->zm_alloc_count)) {
		/*
		 * This will not catch a lot of errors, the proper check
		 * would be against the number of elements this run should
		 * have which is expensive to count.
		 *
		 * But zm_alloc_count is a 16 bit number which could
		 * theoretically be valuable to cause to wrap around,
		 * so the cheap overflow check is still worth doing.
		 */
		zone_page_meta_accounting_panic(zone, page_meta,
		    "zm_alloc_count overflow");
	}
	if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
		zone_accounting_panic(zone, "countfree wrap-around");
	}

#if VM_MAX_TAG_ZONES
	if (__improbable(zone->tags)) {
		vm_tag_t tag = zalloc_flags_get_tag(flags);
		// set the tag with b0 clear so the block remains inuse
		ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
		vm_tag_update_zone_size(tag, zone->tag_zone_index,
		    zone_elem_size(zone), waste);
	}
#endif /* VM_MAX_TAG_ZONES */

#if KASAN_ZALLOC
	if (zone->percpu) {
		zpercpu_foreach_cpu(i) {
			kasan_poison_range(element + ptoa(i),
			    zone_elem_size(zone), ASAN_VALID);
		}
	} else {
		kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
	}
#endif /* KASAN_ZALLOC */

	return element | validate_bit;
}
/*
 * zalloc returns an element from the specified zone.
 *
 * The function is noinline when zlog can be used so that the backtracing can
 * reliably skip the zalloc_ext() and zalloc_log_or_trace_leaks()
 * frames.
 */
#if ZONE_ENABLE_LOGGING
__attribute__((noinline))
#endif
void *
zalloc_ext(
	zone_t          zone,
	zone_stats_t    zstats,
	zalloc_flags_t  flags,
	vm_size_t       waste)
{
	vm_offset_t addr = 0;
	vm_size_t elem_size = zone_elem_size(zone);

	/*
	 * KASan uses zalloc() for fakestack, which can be called anywhere.
	 * However, we make sure these calls can never block.
	 */
	assert(zone->kasan_fakestacks ||
	    ml_get_interrupts_enabled() ||
	    ml_is_quiescing() ||
	    debug_mode_active() ||
	    startup_phase < STARTUP_SUB_EARLY_BOOT);

	/*
	 * Make sure Z_NOFAIL was not obviously misused
	 */
	if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
		assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
	}

#if CONFIG_ZCACHE
	/*
	 * Note: if zone caching is on, gzalloc and tags aren't used
	 * so we can always check this first
	 */
	if (zone_caching_enabled(zone)) {
		addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
		if (__probable(addr)) {
			goto allocated_from_cache;
		}
	}
#endif /* CONFIG_ZCACHE */

#if CONFIG_GZALLOC
	if (__improbable(zone->gzalloc_tracked)) {
		addr = gzalloc_alloc(zone, zstats, flags);
		goto allocated_from_gzalloc;
	}
#endif /* CONFIG_GZALLOC */
#if VM_MAX_TAG_ZONES
	if (__improbable(zone->tags)) {
		vm_tag_t tag = zalloc_flags_get_tag(flags);
		if (tag == VM_KERN_MEMORY_NONE) {
			/*
			 * zone views into heaps can lead to a site-less call
			 * and we fallback to KALLOC as a tag for those.
			 */
			tag = VM_KERN_MEMORY_KALLOC;
			flags |= Z_VM_TAG(tag);
		}
		vm_tag_will_update_zone(tag, zone->tag_zone_index);
	}
#endif /* VM_MAX_TAG_ZONES */

	lock_zone(zone);
	assert(zone->z_self == zone);

	/*
	 * Check if we need another thread to replenish the zone or
	 * if we have to wait for a replenish thread to finish.
	 * This is used for elements, like vm_map_entry, which are
	 * needed themselves to implement zalloc().
	 */
	if (__improbable(zone->prio_refill_count &&
	    zone->countfree <= zone->prio_refill_count / 2)) {
		zone_refill_asynchronously_locked(zone);
	} else if (__improbable(zone->countfree == 0)) {
		zone_refill_synchronously_locked(zone, flags);
		if (__improbable(zone->countfree == 0)) {
			unlock_zone(zone);
			if (__improbable(flags & Z_NOFAIL)) {
				zone_nofail_panic(zone);
			}
			DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
			return NULL;
		}
	}

	addr = zalloc_direct_locked(zone, flags, waste);
	if (__probable(zstats != NULL)) {
		/*
		 * The few vm zones used before zone_init() runs do not have
		 * per-cpu stats yet
		 */
		int cpu = cpu_number();
		zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
#if ZALLOC_DETAILED_STATS
		if (waste) {
			zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
		}
#endif /* ZALLOC_DETAILED_STATS */
	}

	unlock_zone(zone);

#if ZALLOC_ENABLE_POISONING
	bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
#endif
	addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
	zone_clear_freelist_pointers(zone, addr);
#if ZALLOC_ENABLE_POISONING
	/*
	 * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
	 * so we will check the first word even if we just
	 * cleared it.
	 */
	zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
	    validate);
#endif /* ZALLOC_ENABLE_POISONING */

allocated_from_cache:
#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
	if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
		zalloc_log_or_trace_leaks(zone, addr);
	}
#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */

#if CONFIG_GZALLOC
allocated_from_gzalloc:
#endif /* CONFIG_GZALLOC */
#if KASAN_ZALLOC
	if (zone->kasan_redzone) {
		addr = kasan_alloc(addr, elem_size,
		    elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
		elem_size -= 2 * zone->kasan_redzone;
	}
	/*
	 * Initialize buffer with unique pattern only if memory
	 * wasn't expected to be zeroed.
	 */
	if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
		kasan_leak_init(addr, elem_size);
	}
#endif /* KASAN_ZALLOC */
	if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
		bzero((void *)addr, elem_size);
	}

	TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);

	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
	return (void *)addr;
}
void *
zalloc(union zone_or_view zov)
{
	return zalloc_flags(zov, Z_WAITOK);
}

void *
zalloc_noblock(union zone_or_view zov)
{
	return zalloc_flags(zov, Z_NOWAIT);
}

void *
zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
{
	zone_t zone = zov.zov_view->zv_zone;
	zone_stats_t zstats = zov.zov_view->zv_stats;
	assert(!zone->percpu);
	return zalloc_ext(zone, zstats, flags, 0);
}

void *
zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
{
	zone_t zone = zov.zov_view->zv_zone;
	zone_stats_t zstats = zov.zov_view->zv_stats;
	assert(zone->percpu);
	return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
}
static void *
_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
{
	const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
	struct zone_page_metadata *page_meta;
	vm_offset_t offs, addr;
	zone_pva_t pva;

	assert(ml_get_interrupts_enabled() ||
	    ml_is_quiescing() ||
	    debug_mode_active() ||
	    startup_phase < STARTUP_SUB_EARLY_BOOT);

	size = (size + mask) & ~mask;
	assert(size <= PAGE_SIZE);

	lock_zone(zone);
	assert(zone->z_self == zone);

	for (;;) {
		pva = zone->pages_intermediate;
		while (!zone_pva_is_null(pva)) {
			page_meta = zone_pva_to_meta(pva, kind);
			if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
				goto found;
			}
			pva = page_meta->zm_page_next;
		}

		zone_refill_synchronously_locked(zone, Z_WAITOK);
	}

found:
	offs = (page_meta->zm_freelist_offs + mask) & ~mask;
	page_meta->zm_freelist_offs = offs + size;
	page_meta->zm_alloc_count += size;
	zone->countfree -= size;
	if (__probable(zone->z_stats)) {
		zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
	}

	if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
		zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
	}

	unlock_zone(zone);

	addr = offs + zone_pva_to_addr(pva);

	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
	return (void *)addr;
}
static void *
_zalloc_permanent_large(size_t size, vm_offset_t mask)
{
	kern_return_t kr;
	vm_offset_t addr = 0;

	kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
	    KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
	    VM_KERN_MEMORY_KALLOC);
	if (kr != KERN_SUCCESS) {
		panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
		    size, kr);
	}
	return (void *)addr;
}
void *
zalloc_permanent(vm_size_t size, vm_offset_t mask)
{
	if (size <= PAGE_SIZE) {
		zone_t zone = &zone_array[ZONE_ID_PERMANENT];
		return _zalloc_permanent(zone, size, mask);
	}
	return _zalloc_permanent_large(size, mask);
}

void *
zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
{
	zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
	return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
}
static void
zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
{
	zone_index_foreach(i) {
		zone_t z = &zone_array[i];

		if (z->no_callout) {
			/* async_pending will never be set */
			continue;
		}

		lock_zone(z);
		if (z->z_self && z->async_pending) {
			z->async_pending = false;
			zone_refill_synchronously_locked(z, Z_WAITOK);
		}
		unlock_zone(z);
	}
}
/*
 * Adds the element to the head of the zone's free list
 * Keeps a backup next-pointer at the end of the element
 */
void
zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
{
	struct zone_page_metadata *page_meta;
	vm_offset_t page, old_head;
	zone_addr_kind_t kind;
	vm_size_t elem_size = zone_elem_size(zone);

	vm_offset_t *primary = (vm_offset_t *) element;
	vm_offset_t *backup  = get_backup_ptr(elem_size, primary);

	page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
	old_head = zone_page_meta_get_freelist(zone, page_meta, page);

	if (__improbable(old_head == element)) {
		panic("zfree: double free of %p to zone %s%s\n",
		    (void *) element, zone_heap_name(zone), zone->z_name);
	}

#if ZALLOC_ENABLE_POISONING
	if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
		assert(zone->percpu);
		poison = false;
	}
#else
	poison = false;
#endif /* ZALLOC_ENABLE_POISONING */

	/*
	 * Always write a redundant next pointer
	 * So that it is more difficult to forge, xor it with a random cookie
	 * A poisoned element is indicated by using zp_poisoned_cookie
	 * instead of zp_nopoison_cookie
	 */
	*backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);

	/*
	 * Insert this element at the head of the free list. We also xor the
	 * primary pointer with the zp_nopoison_cookie to make sure a free
	 * element does not provide the location of the next free element directly.
	 */
	*primary = old_head ^ zp_nopoison_cookie;

#if VM_MAX_TAG_ZONES
	if (__improbable(zone->tags)) {
		vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
		// set the tag with b0 clear so the block remains inuse
		ZTAG(zone, element)[0] = 0xFFFE;
		vm_tag_update_zone_size(tag, zone->tag_zone_index,
		    -((int64_t)elem_size), 0);
	}
#endif /* VM_MAX_TAG_ZONES */

	zone_page_meta_set_freelist(page_meta, page, element);
	if (os_sub_overflow(page_meta->zm_alloc_count, 1,
	    &page_meta->zm_alloc_count)) {
		zone_page_meta_accounting_panic(zone, page_meta,
		    "alloc_count wrap-around");
	}
	zone->countfree++;

	if (kind == ZONE_ADDR_FOREIGN) {
		if (old_head == 0) {
			/* first foreign element freed on page, move from all_used_foreign */
			zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
		}
	} else if (page_meta->zm_alloc_count == 0) {
		/* whether the page was on the intermediate or all_used queue, move it to free */
		zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
		zone->allfree_page_count += page_meta->zm_page_count;
	} else if (old_head == 0) {
		/* first free element on page, move from all_used */
		zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
	}

#if KASAN_ZALLOC
	if (zone->percpu) {
		zpercpu_foreach_cpu(i) {
			kasan_poison_range(element + ptoa(i), elem_size,
			    ASAN_HEAP_FREED);
		}
	} else {
		kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
	}
#endif /* KASAN_ZALLOC */
}
/*
 * The function is noinline when zlog can be used so that the backtracing can
 * reliably skip the zfree_ext() and zfree_log_trace()
 * frames.
 */
#if ZONE_ENABLE_LOGGING
__attribute__((noinline))
#endif
void
zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
{
	vm_offset_t elem = (vm_offset_t)addr;
	vm_size_t elem_size = zone_elem_size(zone);
	bool poison = false;

	DTRACE_VM2(zfree, zone_t, zone, void*, addr);
	TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);

#if KASAN_ZALLOC
	if (kasan_quarantine_freed_element(&zone, &addr)) {
		return;
	}
	/*
	 * kasan_quarantine_freed_element() might return a different
	 * {zone, addr} than the one being freed for kalloc heaps.
	 *
	 * Make sure we reload everything.
	 */
	elem = (vm_offset_t)addr;
	elem_size = zone_elem_size(zone);
#endif /* KASAN_ZALLOC */

#if CONFIG_ZLEAKS
	/*
	 * Zone leak detection: un-track the allocation
	 */
	if (__improbable(zone->zleak_on)) {
		zleak_free(elem, elem_size);
	}
#endif /* CONFIG_ZLEAKS */

#if CONFIG_ZCACHE
	/*
	 * Note: if zone caching is on, gzalloc and tags aren't used
	 * so we can always check this first
	 */
	if (zone_caching_enabled(zone)) {
		return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
	}
#endif /* CONFIG_ZCACHE */

#if CONFIG_GZALLOC
	if (__improbable(zone->gzalloc_tracked)) {
		return gzalloc_free(zone, zstats, addr);
	}
#endif /* CONFIG_GZALLOC */

#if ZONE_ENABLE_LOGGING
	if (__improbable(DO_LOGGING(zone))) {
		zfree_log_trace(zone, elem);
	}
#endif /* ZONE_ENABLE_LOGGING */

	if (zone->zfree_clear_mem) {
		poison = zfree_clear(zone, elem, elem_size);
	}

	lock_zone(zone);
	assert(zone->z_self == zone);

	if (!poison) {
		poison = zfree_poison_element(zone, &zone->zp_count, elem);
	}

	if (__probable(zstats != NULL)) {
		/*
		 * The few vm zones used before zone_init() runs do not have
		 * per-cpu stats yet
		 */
		zpercpu_get(zstats)->zs_mem_freed += elem_size;
	}

	zfree_direct_locked(zone, elem, poison);

	unlock_zone(zone);
}

void
(zfree)(union zone_or_view zov, void *addr)
{
	zone_t zone = zov.zov_view->zv_zone;
	zone_stats_t zstats = zov.zov_view->zv_stats;
	assert(!zone->percpu);
	zfree_ext(zone, zstats, addr);
}

void
zfree_percpu(union zone_or_view zov, void *addr)
{
	zone_t zone = zov.zov_view->zv_zone;
	zone_stats_t zstats = zov.zov_view->zv_stats;
	assert(zone->percpu);
	zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
}
#pragma mark vm integration, MIG routines

/*
 * Drops (i.e. frees) the elements in the all free pages queue of a zone.
 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
 */
static void
zone_drop_free_elements(zone_t z)
{
	const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
	unsigned int total_freed_pages = 0;
	struct zone_page_metadata *page_meta, *seq_meta;
	vm_address_t page_addr;
	vm_size_t size_to_free;
	vm_size_t free_count;
	uint32_t page_count;

	current_thread()->options |= TH_OPT_ZONE_PRIV;
	lock_zone(z);

	while (!zone_pva_is_null(z->pages_all_free)) {
		/*
		 * If any replenishment threads are running, defer to them,
		 * so that we don't deplete reserved zones.
		 *
		 * The timing of the check isn't super important, as there are
		 * enough reserves to allow freeing an extra page_meta.
		 *
		 * Hence, we can check without grabbing the lock every time
		 * through the loop. We do need the lock however to avoid
		 * missing a wakeup when we decide to block.
		 */
		if (zone_replenish_active > 0) {
			lck_spin_lock(&zone_replenish_lock);
			if (zone_replenish_active > 0) {
				assert_wait(&zone_replenish_active, THREAD_UNINT);
				lck_spin_unlock(&zone_replenish_lock);
				unlock_zone(z);
				thread_block(THREAD_CONTINUE_NULL);
				lock_zone(z);
				continue;
			}
			lck_spin_unlock(&zone_replenish_lock);
		}

		page_meta = zone_pva_to_meta(z->pages_all_free, kind);
		page_count = page_meta->zm_page_count;
		free_count = zone_elem_count(z, ptoa(page_count), kind);

		/*
		 * Don't drain zones with async refill to below the refill
		 * threshold, as they need some reserve to function properly.
		 */
		if (!z->destroyed && z->prio_refill_count &&
		    (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
			break;
		}

		zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);

		if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
			zone_accounting_panic(z, "countfree wrap-around");
		}
		if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
			zone_accounting_panic(z, "countavail wrap-around");
		}
		if (os_sub_overflow(z->allfree_page_count, page_count,
		    &z->allfree_page_count)) {
			zone_accounting_panic(z, "allfree_page_count wrap-around");
		}
		if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
			zone_accounting_panic(z, "page_count wrap-around");
		}

		os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
		os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);

		bzero(page_meta, sizeof(*page_meta) * page_count);
		seq_meta = page_meta;
		page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */

		unlock_zone(z);

		/* Free the pages for metadata and account for them */
		total_freed_pages += page_count;
		size_to_free = ptoa(page_count);
#if KASAN_ZALLOC
		kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
#endif
#if VM_MAX_TAG_ZONES
		if (z->tags) {
			ztMemoryRemove(z, page_addr, size_to_free);
		}
#endif /* VM_MAX_TAG_ZONES */

		if (z->va_sequester && z->alloc_pages == page_count) {
			kernel_memory_depopulate(submap_for_zone(z), page_addr,
			    size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
		} else {
			kmem_free(submap_for_zone(z), page_addr, size_to_free);
			seq_meta = NULL;
		}
		thread_yield_to_preemption();

		lock_zone(z);

		if (seq_meta) {
			zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
			z->sequester_page_count += page_count;
		}
	}
	if (z->destroyed) {
		assert(zone_pva_is_null(z->pages_all_free));
		assert(z->allfree_page_count == 0);
	}
	unlock_zone(z);
	current_thread()->options &= ~TH_OPT_ZONE_PRIV;

#if DEBUG || DEVELOPMENT
	if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
		kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
		    zone_heap_name(z), z->z_name,
		    (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
		    total_freed_pages);
	}
#endif /* DEBUG || DEVELOPMENT */
}
/* Zone garbage collection
 *
 * zone_gc will walk through all the free elements in all the
 * zones that are marked collectable looking for reclaimable
 * pages. zone_gc is called by consider_zone_gc when the system
 * begins to run out of memory.
 *
 * We should ensure that zone_gc never blocks.
 */
void
zone_gc(boolean_t consider_jetsams)
{
	if (consider_jetsams) {
		kill_process_in_largest_zone();
		/*
		 * If we do end up jetsamming something, we need to do a zone_gc so that
		 * we can reclaim free zone elements and update the zone map size.
		 * Fall through.
		 */
	}

	lck_mtx_lock(&zone_gc_lock);

#if DEBUG || DEVELOPMENT
	if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
		kprintf("zone_gc() starting...\n");
	}
#endif /* DEBUG || DEVELOPMENT */

	zone_index_foreach(i) {
		zone_t z = &zone_array[i];

		if (!z->collectable) {
			continue;
		}
#if CONFIG_ZCACHE
		if (zone_caching_enabled(z)) {
			zcache_drain_depot(z);
		}
#endif /* CONFIG_ZCACHE */
		if (zone_pva_is_null(z->pages_all_free)) {
			continue;
		}

		zone_drop_free_elements(z);
	}

	lck_mtx_unlock(&zone_gc_lock);
}
/*
 * Called by the pageout daemon when the system needs more free pages.
 */
void
consider_zone_gc(boolean_t consider_jetsams)
{
	/*
	 * One-time reclaim of kernel_map resources we allocated in
	 * early boot.
	 *
	 * Use atomic exchange in case multiple threads race into here.
	 */
	vm_offset_t deallocate_kaddr;
	if (kmapoff_kaddr != 0 &&
	    (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
		vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
	}

	zone_gc(consider_jetsams);
}
/*
 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
 * requesting zone information.
 * Frees unused pages towards the end of the region, and zeroes out unused
 * space on the last page.
 */
static vm_map_copy_t
create_vm_map_copy(
	vm_offset_t     start_addr,
	vm_size_t       total_size,
	vm_size_t       used_size)
{
	kern_return_t   kr;
	vm_offset_t     end_addr;
	vm_size_t       free_size;
	vm_map_copy_t   copy;

	if (used_size != total_size) {
		end_addr = start_addr + used_size;
		free_size = total_size - (round_page(end_addr) - start_addr);

		if (free_size >= PAGE_SIZE) {
			kmem_free(ipc_kernel_map,
			    round_page(end_addr), free_size);
		}
		bzero((char *) end_addr, round_page(end_addr) - end_addr);
	}

	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
	    (vm_map_size_t)used_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	return copy;
}
static boolean_t
get_zone_info(
	zone_t                  z,
	mach_zone_name_t        *zn,
	mach_zone_info_t        *zi)
{
	struct zone zcopy;

	assert(z != ZONE_NULL);
	lock_zone(z);
	if (!z->z_self) {
		unlock_zone(z);
		return FALSE;
	}
	zcopy = *z;
	unlock_zone(z);

	if (zn != NULL) {
		/*
		 * Append kalloc heap name to zone name (if zone is used by kalloc)
		 */
		char temp_zone_name[MAX_ZONE_NAME] = "";
		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
		    zone_heap_name(z), z->z_name);

		/* assuming here the name data is static */
		(void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
		    strlen(temp_zone_name) + 1);
	}

	if (zi != NULL) {
		*zi = (mach_zone_info_t) {
			.mzi_count = zone_count_allocated(&zcopy),
			.mzi_cur_size = ptoa_64(zcopy.page_count),
			// max_size for zprint is now high-watermark of pages used
			.mzi_max_size = ptoa_64(zcopy.page_count_hwm),
			.mzi_elem_size = zcopy.pcpu_elem_size,
			.mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
			.mzi_exhaustible = (uint64_t)zcopy.exhaustible,
		};
		zpercpu_foreach(zs, zcopy.z_stats) {
			zi->mzi_sum_size += zs->zs_mem_allocated;
		}
		if (zcopy.collectable) {
			SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
			    ptoa_64(zcopy.allfree_page_count));
			SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
		}
	}

	return TRUE;
}
kern_return_t
task_zone_info(
	__unused task_t                         task,
	__unused mach_zone_name_array_t         *namesp,
	__unused mach_msg_type_number_t         *namesCntp,
	__unused task_zone_info_array_t         *infop,
	__unused mach_msg_type_number_t         *infoCntp)
{
	return KERN_FAILURE;
}
kern_return_t
mach_zone_info(
	host_priv_t             host,
	mach_zone_name_array_t  *namesp,
	mach_msg_type_number_t  *namesCntp,
	mach_zone_info_array_t  *infop,
	mach_msg_type_number_t  *infoCntp)
{
	return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
}
kern_return_t
mach_memory_info(
	host_priv_t              host,
	mach_zone_name_array_t   *namesp,
	mach_msg_type_number_t   *namesCntp,
	mach_zone_info_array_t   *infop,
	mach_msg_type_number_t   *infoCntp,
	mach_memory_info_array_t *memoryInfop,
	mach_msg_type_number_t   *memoryInfoCntp)
{
	mach_zone_name_t        *names;
	vm_offset_t             names_addr;
	vm_size_t               names_size;

	mach_zone_info_t        *info;
	vm_offset_t             info_addr;
	vm_size_t               info_size;

	mach_memory_info_t      *memory_info;
	vm_offset_t             memory_info_addr;
	vm_size_t               memory_info_size;
	vm_size_t               memory_info_vmsize;
	unsigned int            num_info;

	unsigned int            max_zones, used_zones, i;
	mach_zone_name_t        *zn;
	mach_zone_info_t        *zi;
	kern_return_t           kr;

	uint64_t                zones_collectable_bytes = 0;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL)) {
		return KERN_INVALID_HOST;
	}
#endif

	/*
	 * We assume that zones aren't freed once allocated.
	 * We won't pick up any zones that are allocated later.
	 */

	max_zones = os_atomic_load(&num_zones, relaxed);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
	    &names_addr, names_size, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	names = (mach_zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc_pageable(ipc_kernel_map,
	    &info_addr, info_size, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		kmem_free(ipc_kernel_map,
		    names_addr, names_size);
		return kr;
	}
	info = (mach_zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	used_zones = max_zones;
	for (i = 0; i < max_zones; i++) {
		if (!get_zone_info(&(zone_array[i]), zn, zi)) {
			used_zones--;
			continue;
		}
		zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
		zn++;
		zi++;
	}

	*namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
	*namesCntp = used_zones;

	*infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
	*infoCntp = used_zones;

	num_info = 0;
	memory_info_addr = 0;

	if (memoryInfop && memoryInfoCntp) {
		vm_map_copy_t copy;

		num_info = vm_page_diagnose_estimate();
		memory_info_size = num_info * sizeof(*memory_info);
		memory_info_vmsize = round_page(memory_info_size);
		kr = kmem_alloc_pageable(ipc_kernel_map,
		    &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
		    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
		assert(kr == KERN_SUCCESS);

		memory_info = (mach_memory_info_t *) memory_info_addr;
		vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);

		kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
		assert(kr == KERN_SUCCESS);

		kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
		    (vm_map_size_t)memory_info_size, TRUE, &copy);
		assert(kr == KERN_SUCCESS);

		*memoryInfop = (mach_memory_info_t *) copy;
		*memoryInfoCntp = num_info;
	}

	return KERN_SUCCESS;
}
kern_return_t
mach_zone_info_for_zone(
	host_priv_t             host,
	mach_zone_name_t        name,
	mach_zone_info_t        *infop)
{
	zone_t zone_ptr;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL)) {
		return KERN_INVALID_HOST;
	}
#endif

	if (infop == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	zone_ptr = ZONE_NULL;
	zone_index_foreach(i) {
		zone_t z = &(zone_array[i]);
		assert(z != ZONE_NULL);

		/*
		 * Append kalloc heap name to zone name (if zone is used by kalloc)
		 */
		char temp_zone_name[MAX_ZONE_NAME] = "";
		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
		    zone_heap_name(z), z->z_name);

		/* Find the requested zone by name */
		if (track_this_zone(temp_zone_name, name.mzn_name)) {
			zone_ptr = z;
			break;
		}
	}

	/* No zones found with the requested zone name */
	if (zone_ptr == ZONE_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (get_zone_info(zone_ptr, NULL, infop)) {
		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}
kern_return_t
mach_zone_info_for_largest_zone(
	host_priv_t             host,
	mach_zone_name_t        *namep,
	mach_zone_info_t        *infop)
{
	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}
#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!PE_i_can_has_debugger(NULL)) {
		return KERN_INVALID_HOST;
	}
#endif

	if (namep == NULL || infop == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (get_zone_info(zone_find_largest(), namep, infop)) {
		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}
uint64_t
get_zones_collectable_bytes(void)
{
	uint64_t zones_collectable_bytes = 0;
	mach_zone_info_t zi;

	zone_index_foreach(i) {
		if (get_zone_info(&zone_array[i], NULL, &zi)) {
			zones_collectable_bytes +=
			    GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
		}
	}

	return zones_collectable_bytes;
}
kern_return_t
mach_zone_get_zlog_zones(
	host_priv_t                             host,
	mach_zone_name_array_t                  *namesp,
	mach_msg_type_number_t                  *namesCntp)
{
#if ZONE_ENABLE_LOGGING
	unsigned int max_zones, logged_zones, i;
	kern_return_t kr;
	zone_t zone_ptr;
	mach_zone_name_t *names;
	vm_offset_t names_addr;
	vm_size_t names_size;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

	if (namesp == NULL || namesCntp == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	max_zones = os_atomic_load(&num_zones, relaxed);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc_pageable(ipc_kernel_map,
	    &names_addr, names_size, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	names = (mach_zone_name_t *) names_addr;

	zone_ptr = ZONE_NULL;
	logged_zones = 0;
	for (i = 0; i < max_zones; i++) {
		zone_t z = &(zone_array[i]);
		assert(z != ZONE_NULL);

		/* Copy out the zone name if zone logging is enabled */
		if (z->zlog_btlog) {
			get_zone_info(z, &names[logged_zones], NULL);
			logged_zones++;
		}
	}

	*namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
	*namesCntp = logged_zones;

	return KERN_SUCCESS;

#else /* ZONE_ENABLE_LOGGING */
#pragma unused(host, namesp, namesCntp)
	return KERN_FAILURE;
#endif /* ZONE_ENABLE_LOGGING */
}
kern_return_t
mach_zone_get_btlog_records(
	host_priv_t                             host,
	mach_zone_name_t                        name,
	zone_btrecord_array_t                   *recsp,
	mach_msg_type_number_t                  *recsCntp)
{
#if DEBUG || DEVELOPMENT
	unsigned int numrecs = 0;
	zone_btrecord_t *recs;
	kern_return_t kr;
	zone_t zone_ptr;
	vm_offset_t recs_addr;
	vm_size_t recs_size;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

	if (recsp == NULL || recsCntp == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	zone_ptr = ZONE_NULL;
	zone_index_foreach(i) {
		zone_t z = &zone_array[i];

		/*
		 * Append kalloc heap name to zone name (if zone is used by kalloc)
		 */
		char temp_zone_name[MAX_ZONE_NAME] = "";
		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
		    zone_heap_name(z), z->z_name);

		/* Find the requested zone by name */
		if (track_this_zone(temp_zone_name, name.mzn_name)) {
			zone_ptr = z;
			break;
		}
	}

	/* No zones found with the requested zone name */
	if (zone_ptr == ZONE_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Logging not turned on for the requested zone */
	if (!DO_LOGGING(zone_ptr)) {
		return KERN_FAILURE;
	}

	/* Allocate memory for btlog records */
	numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
	recs_size = round_page(numrecs * sizeof *recs);

	kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/*
	 * We will call get_btlog_records() below which populates this region while holding a spinlock
	 * (the btlog lock). So these pages need to be wired.
	 */
	kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
	    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
	assert(kr == KERN_SUCCESS);

	recs = (zone_btrecord_t *)recs_addr;
	get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);

	kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
	assert(kr == KERN_SUCCESS);

	*recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
	*recsCntp = numrecs;

	return KERN_SUCCESS;

#else /* DEBUG || DEVELOPMENT */
#pragma unused(host, name, recsp, recsCntp)
	return KERN_FAILURE;
#endif /* DEBUG || DEVELOPMENT */
}
#if DEBUG || DEVELOPMENT

static kern_return_t
mach_memory_info_check(void)
{
	mach_memory_info_t * memory_info;
	mach_memory_info_t * info;
	unsigned int         num_info;
	vm_offset_t          memory_info_addr;
	kern_return_t        kr;
	size_t               memory_info_size, memory_info_vmsize;
	uint64_t             top_wired, zonestotal, total;

	num_info = vm_page_diagnose_estimate();
	memory_info_size = num_info * sizeof(*memory_info);
	memory_info_vmsize = round_page(memory_info_size);
	kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
	assert(kr == KERN_SUCCESS);

	memory_info = (mach_memory_info_t *) memory_info_addr;
	vm_page_diagnose(memory_info, num_info, 0);

	top_wired = total = zonestotal = 0;
	zone_index_foreach(idx) {
		zonestotal += zone_size_wired(&zone_array[idx]);
	}

	for (uint32_t idx = 0; idx < num_info; idx++) {
		info = &memory_info[idx];
		if (!info->size) {
			continue;
		}
		if (VM_KERN_COUNT_WIRED == info->site) {
			top_wired = info->size;
		}
		if (VM_KERN_SITE_HIDE & info->flags) {
			continue;
		}
		if (!(VM_KERN_SITE_WIRED & info->flags)) {
			continue;
		}
		total += info->size;
	}
	total += zonestotal;

	printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
	    total, top_wired, zonestotal, top_wired - total);

	kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);

	return kr;
}

extern boolean_t(*volatile consider_buffer_cache_collect)(int);

#endif /* DEBUG || DEVELOPMENT */
kern_return_t
mach_zone_force_gc(
	host_t host)
{
	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

#if DEBUG || DEVELOPMENT
	/* Callout to buffer cache GC to drop elements in the apfs zones */
	if (consider_buffer_cache_collect != NULL) {
		(void)(*consider_buffer_cache_collect)(0);
	}
	consider_zone_gc(FALSE);
#endif /* DEBUG || DEVELOPMENT */
	return KERN_SUCCESS;
}

zone_t
zone_find_largest(void)
{
	uint32_t largest_idx = 0;
	vm_offset_t largest_size = zone_size_wired(&zone_array[0]);

	zone_index_foreach(i) {
		vm_offset_t size = zone_size_wired(&zone_array[i]);
		if (size > largest_size) {
			largest_idx = i;
			largest_size = size;
		}
	}

	return &zone_array[largest_idx];
}
#pragma mark - tests
#if DEBUG || DEVELOPMENT

/*
 * Used for sysctl kern.run_zone_test which is not thread-safe. Ensure only one
 * thread goes through at a time. Or we can end up with multiple test zones (if
 * a second zinit() comes through before zdestroy()), which could lead us to
 * run out of zones.
 */
SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
static boolean_t zone_test_running = FALSE;
static zone_t test_zone_ptr = NULL;
static uintptr_t *
zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
    zone_pva_t page_index, zone_addr_kind_t kind)
{
	vm_offset_t free, first, end, page;
	struct zone_page_metadata *meta;

	while (!zone_pva_is_null(page_index)) {
		page  = zone_pva_to_addr(page_index);
		meta  = zone_pva_to_meta(page_index, kind);
		end   = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
		first = page + ZONE_PAGE_FIRST_OFFSET(kind);

		bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));

		// construct bitmap of all freed elements
		free = zone_page_meta_get_freelist(z, meta, page);
		while (free) {
			bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));

			// next free element
			free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
		}

		for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
			if (!bitmap_test(bits, i)) {
				*elems++ = INSTANCE_PUT(first);
			}
		}

		page_index = meta->zm_page_next;
	}
	return elems;
}
kern_return_t
zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
{
	uintptr_t     zbt[MAX_ZTRACE_DEPTH];
	zone_t        zone = NULL;
	uintptr_t    *array;
	uintptr_t    *next;
	uintptr_t     element, bt;
	uint32_t      idx, count, found;
	uint32_t      btidx, btcount, nobtcount, btfound;
	uint32_t      elemSize;
	uint32_t      maxElems;
	kern_return_t kr;
	bitmap_t     *bits;

	zone_index_foreach(i) {
		if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
			zone = &zone_array[i];
			break;
		}
	}
	if (zone == NULL) {
		return KERN_INVALID_NAME;
	}

	elemSize = zone_elem_size(zone);
	maxElems = (zone->countavail + 1) & ~1ul;

	if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
	    !zone_leaks_scan_enable) {
		return KERN_INVALID_CAPABILITY;
	}

	kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
	    maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
	    VM_KERN_MEMORY_DIAG);
	if (KERN_SUCCESS != kr) {
		return kr;
	}

	/* maxElems is a 2-multiple so we're always aligned */
	bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);

	next = array;
	next = zone_copy_allocations(zone, next, bits,
	    zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
	next = zone_copy_allocations(zone, next, bits,
	    zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
	next = zone_copy_allocations(zone, next, bits,
	    zone->pages_intermediate, ZONE_ADDR_NATIVE);
	next = zone_copy_allocations(zone, next, bits,
	    zone->pages_all_used, ZONE_ADDR_NATIVE);
	count = (uint32_t)(next - array);

	zone_leaks_scan(array, count, zone_elem_size(zone), &found);
	assert(found <= count);

	for (idx = 0; idx < count; idx++) {
		element = array[idx];
		if (kInstanceFlagReferenced & element) {
			continue;
		}
		element = INSTANCE_PUT(element) & ~kInstanceFlags;
	}

#if ZONE_ENABLE_LOGGING
	if (zone->zlog_btlog && !corruption_debug_flag) {
		// btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
		btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
	}
#endif /* ZONE_ENABLE_LOGGING */

	for (nobtcount = idx = 0; idx < count; idx++) {
		element = array[idx];
		if (kInstanceFlagReferenced & element) {
			continue;
		}
		element = INSTANCE_PUT(element) & ~kInstanceFlags;

		// see if we can find any backtrace left in the element
		btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t));
		if (btcount >= MAX_ZTRACE_DEPTH) {
			btcount = MAX_ZTRACE_DEPTH - 1;
		}
		for (btfound = btidx = 0; btidx < btcount; btidx++) {
			bt = ((uintptr_t *)element)[btcount - 1 - btidx];
			if (!VM_KERNEL_IS_SLID(bt)) {
				break;
			}
			zbt[btfound++] = bt;
		}
		if (btfound) {
			(*proc)(refCon, 1, elemSize, &zbt[0], btfound);
		} else {
			nobtcount++;
		}
	}
	if (nobtcount) {
		// fake backtrace when we found nothing
		zbt[0] = (uintptr_t) &zalloc;
		(*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
	}

	kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));

	return KERN_SUCCESS;
}

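/*
 * run_zone_test() is the body of the kern.run_zone_test sysctl.  It
 * repeatedly zinit()s, zalloc()s, zfree()s and zdestroy()s a zone named
 * "test_zone_sysctl", checking that the same zone pointer is handed back
 * each time, and then (when ZSECURITY_OPTIONS_SEQUESTER is enabled)
 * verifies that freed chunks keep their VA sequestered until it is reused.
 */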
boolean_t
run_zone_test(void)
{
	unsigned int i = 0, max_iter = 5;
	void *test_ptr;
	zone_t test_zone;

	simple_lock(&zone_test_lock, &zone_locks_grp);
	if (!zone_test_running) {
		zone_test_running = TRUE;
	} else {
		simple_unlock(&zone_test_lock);
		printf("run_zone_test: Test already running.\n");
		return FALSE;
	}
	simple_unlock(&zone_test_lock);

	printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");

	/* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
	do {
		test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
		if (test_zone == NULL) {
			printf("run_zone_test: zinit() failed\n");
			return FALSE;
		}

#if KASAN_ZALLOC
		if (test_zone_ptr == NULL && test_zone->countfree != 0) {
#else
		if (test_zone->countfree != 0) {
#endif
			printf("run_zone_test: free count is not zero\n");
			return FALSE;
		}

		if (test_zone_ptr == NULL) {
			/* Stash the zone pointer returned on the first zinit */
			printf("run_zone_test: zone created for the first time\n");
			test_zone_ptr = test_zone;
		} else if (test_zone != test_zone_ptr) {
			printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
			return FALSE;
		}

		test_ptr = zalloc(test_zone);
		if (test_ptr == NULL) {
			printf("run_zone_test: zalloc() failed\n");
			return FALSE;
		}
		zfree(test_zone, test_ptr);

		zdestroy(test_zone);
		i++;

		printf("run_zone_test: Iteration %d successful\n", i);
	} while (i < max_iter);

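	/*
	 * The block below exercises VA sequestering: once every element of the
	 * test zone has been freed and a garbage collection has run, the zone's
	 * pages should move to the sequester list (physical pages unmapped, VA
	 * kept), and allocating again should reuse that sequestered VA.
	 */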
	/* test Z_VA_SEQUESTER */
	if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
		int idx, num_allocs = 8;
		vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
		void *allocs[num_allocs];
		vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
		vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);

		test_zone = zone_create("test_zone_sysctl", elem_size,
		    ZC_DESTRUCTIBLE | ZC_SEQUESTER);
		if (test_zone == NULL) {
			printf("run_zone_test: zinit() failed\n");
			return FALSE;
		}

		for (idx = 0; idx < num_allocs; idx++) {
			allocs[idx] = zalloc(test_zone);
			assert(NULL != allocs[idx]);
			printf("alloc[%d] %p\n", idx, allocs[idx]);
		}
		for (idx = 0; idx < num_allocs; idx++) {
			zfree(test_zone, allocs[idx]);
		}
		assert(!zone_pva_is_null(test_zone->pages_all_free));

		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
		    vm_page_wire_count, vm_page_free_count,
		    (100ULL * ptoa_64(phys_pages)) / zone_map_size);
		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
		    vm_page_wire_count, vm_page_free_count,
		    (100ULL * ptoa_64(phys_pages)) / zone_map_size);
		unsigned int allva = 0;
		zone_index_foreach(zidx) {
			zone_t z = &zone_array[zidx];
			allva += z->page_count;
			if (!z->sequester_page_count) {
				continue;
			}
			unsigned int count = 0;
			uint64_t size;
			zone_pva_t pg = z->pages_sequester;
			struct zone_page_metadata *page_meta;
			while (pg.packed_address) {
				page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
				count += z->alloc_pages;
				pg = page_meta->zm_page_next;
			}
			assert(count == z->sequester_page_count);
			size = zone_size_wired(z);
			printf("%s%s: seq %d, res %d, %qd %%\n",
			    zone_heap_name(z), z->z_name, z->sequester_page_count,
			    z->page_count, zone_size_allocated(z) * 100ULL / size);
		}

		printf("total va: %d\n", allva);

		assert(zone_pva_is_null(test_zone->pages_all_free));
		assert(!zone_pva_is_null(test_zone->pages_sequester));
		assert(2 == test_zone->sequester_page_count);
		for (idx = 0; idx < num_allocs; idx++) {
			assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
		}
		for (idx = 0; idx < num_allocs; idx++) {
			allocs[idx] = zalloc(test_zone);
			assert(allocs[idx]);
			printf("alloc[%d] %p\n", idx, allocs[idx]);
		}
		assert(zone_pva_is_null(test_zone->pages_sequester));
		assert(0 == test_zone->sequester_page_count);
		for (idx = 0; idx < num_allocs; idx++) {
			zfree(test_zone, allocs[idx]);
		}
		zdestroy(test_zone);
	} else {
		printf("run_zone_test: skipping sequester test (not enabled)\n");
	}

	printf("run_zone_test: Test passed\n");

	simple_lock(&zone_test_lock, &zone_locks_grp);
	zone_test_running = FALSE;
	simple_unlock(&zone_test_lock);

	return TRUE;
}

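/*
 * Sketch only (an assumption, not code from this file): the
 * kern.run_zone_test sysctl handler is expected to call the routine above
 * and report the result, roughly:
 *
 *	if (!run_zone_test()) {
 *		printf("kern.run_zone_test: FAILED\n");
 *	}
 */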
/*
 * Routines to test that zone garbage collection and zone replenish threads
 * running at the same time don't cause problems.
 */

void
zone_gc_replenish_test(void)
{
	consider_zone_gc(FALSE);
}

void
zone_alloc_replenish_test(void)
{
	zone_t z = NULL;
	struct data { struct data *next; } *node, *list = NULL;

	/*
	 * Find a zone that has a replenish thread
	 */
	zone_index_foreach(i) {
		z = &zone_array[i];
		if (z->prio_refill_count &&
		    zone_elem_size(z) >= sizeof(struct data)) {
			break;
		}
		z = NULL;
	}
	if (z == NULL) {
		printf("Couldn't find a replenish zone\n");
		return;
	}

	for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
		node = zalloc(z);
		node->next = list;
		list = node;
	}

	/*
	 * release the memory we allocated
	 */
	while (list != NULL) {
		node = list;
		list = list->next;
		zfree(z, node);
	}
}

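/*
 * Intended use (an assumption based on the comment above, not shown in this
 * file): drive the two routines from separate kernel threads, e.g. one
 * thread repeatedly calling zone_gc_replenish_test() while another calls
 * zone_alloc_replenish_test(), so that the garbage collector and the
 * replenish thread race against each other.
 */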
#endif /* DEBUG || DEVELOPMENT */