[apple/xnu.git] / osfmk / kern / zalloc.c (xnu-7195.60.75)
/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	kern/zalloc.c
 *	Author:	Avadis Tevanian, Jr.
 *
 *	Zone-based memory allocator. A zone is a collection of fixed size
 *	data blocks for which quick allocation/deallocation is possible.
 */

#define ZALLOC_ALLOW_DEPRECATED 1
#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <mach/kern_return.h>
#include <mach/mach_host_server.h>
#include <mach/task_server.h>
#include <mach/machine/vm_types.h>
#include <mach/vm_map.h>
#include <mach/sdt.h>

#include <kern/bits.h>
#include <kern/startup.h>
#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/backtrace.h>
#include <kern/host.h>
#include <kern/macro_help.h>
#include <kern/sched.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/thread_call.h>
#include <kern/zalloc_internal.h>
#include <kern/kalloc.h>

#include <prng/random.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */

#include <pexpert/pexpert.h>

#include <machine/machparam.h>
#include <machine/machine_routines.h> /* ml_cpu_get_info */

#include <os/atomic.h>

#include <libkern/OSDebug.h>
#include <libkern/OSAtomic.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>

#include <san/kasan.h>

#if KASAN_ZALLOC
#define ZONE_ENABLE_LOGGING 0
#elif DEBUG || DEVELOPMENT
#define ZONE_ENABLE_LOGGING 1
#else
#define ZONE_ENABLE_LOGGING 0
#endif
120
121extern void vm_pageout_garbage_collect(int collect);
122
123/* Returns pid of the task with the largest number of VM map entries. */
124extern pid_t find_largest_process_vm_map_entries(void);
125
126/*
127 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
128 * For any other pid we try to kill that process synchronously.
129 */
130extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
131
132extern zone_t vm_map_entry_zone;
133extern zone_t vm_object_zone;
134extern vm_offset_t kmapoff_kaddr;
135extern unsigned int kmapoff_pgcnt;
136extern unsigned int stack_total;
137extern unsigned long long stack_allocs;
138
139/*
140 * The max # of elements in a chunk should fit into
141 * zone_page_metadata.free_count (uint16_t).
142 *
143 * Update this if the type of free_count changes.
144 */
145#define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX)
146
147#define ZONE_PAGECOUNT_BITS 14
148
149/* Zone elements must fit both a next pointer and a backup pointer */
150#define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t))
151#define ZONE_MAX_ALLOC_SIZE (32 * 1024)
152
153/* per-cpu zones are special because of counters */
154#define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
155
156struct zone_map_range {
157 vm_offset_t min_address;
158 vm_offset_t max_address;
159};
160
161struct zone_page_metadata {
162 /* The index of the zone this metadata page belongs to */
163 zone_id_t zm_index;
164
165 /*
166 * zm_secondary_page == 0: number of pages in this run
167 * zm_secondary_page == 1: offset to the chunk start
168 */
169 uint16_t zm_page_count : ZONE_PAGECOUNT_BITS;
170
171 /* Whether this page is part of a chunk run */
172 uint16_t zm_percpu : 1;
173 uint16_t zm_secondary_page : 1;
174
175 /*
176 * The start of the freelist can be maintained as a 16-bit
177 * offset instead of a pointer because the free elements would
178 * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
179 * of the allocation chunk.
180 *
181 * Offset from start of the allocation chunk to free element
182 * list head.
183 */
184 uint16_t zm_freelist_offs;
185
186 /*
187 * zm_secondary_page == 0: number of allocated elements in the chunk
188 * zm_secondary_page == 1: unused
189 *
190 * PAGE_METADATA_EMPTY_FREELIST indicates an empty freelist
191 */
192 uint16_t zm_alloc_count;
193#define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX
194
195 zone_pva_t zm_page_next;
196 zone_pva_t zm_page_prev;
197
198 /*
199 * This is only for the sake of debuggers
200 */
201#define ZONE_FOREIGN_COOKIE 0x123456789abcdef
202 uint64_t zm_foreign_cookie[];
203};
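
/*
 * For a multi-page allocation chunk, only the first page's metadata describes
 * the chunk (page count, allocation count, freelist offset). The metadata of
 * the trailing pages has zm_secondary_page set and reuses zm_page_count as the
 * distance back to the chunk start, which is how lookups that land on a
 * secondary page (see zone_allocated_element_resolve below) walk back to the
 * metadata that actually owns the element.
 */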
204
205
206/* Align elements that use the zone page list to 32 byte boundaries. */
207#define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
208
209static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
210
211static __security_const_late struct {
212 struct zone_map_range zi_map_range;
213 struct zone_map_range zi_general_range;
214 struct zone_map_range zi_meta_range;
215 struct zone_map_range zi_foreign_range;
216
	/*
	 * The metadata lives within the zi_meta_range address range.
	 *
	 * The correct formula to find a metadata index is:
	 *     absolute_page_index - page_index(zi_meta_range.min_address)
	 *
	 * And then this index is used to dereference zi_meta_range.min_address
	 * as a `struct zone_page_metadata` array.
	 *
	 * To avoid redoing that subtraction in the various fast-paths,
	 * zi_array_base is pre-offset by `page_index(zi_meta_range.min_address)`.
	 */
230 struct zone_page_metadata *zi_array_base;
231} zone_info;
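
/*
 * In other words, for a native element address `addr`, its metadata is
 * conceptually &zi_array_base[page_index(addr)]: because zi_array_base is
 * pre-biased by page_index(zi_meta_range.min_address), the fast path can
 * index it directly with the element's page index (see zone_pva_to_meta
 * below) without performing the subtraction on every lookup.
 */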
232
0a7de745
A
233/*
234 * The zone_locks_grp allows for collecting lock statistics.
235 * All locks are associated to this group in zinit.
236 * Look at tools/lockstat for debugging lock contention.
237 */
f427ee49
A
238LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
239LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
240
241/*
242 * Exclude more than one concurrent garbage collection
243 */
244LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
245LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
0a7de745 246
f427ee49
A
247boolean_t panic_include_zprint = FALSE;
248mach_memory_info_t *panic_kext_memory_info = NULL;
249vm_size_t panic_kext_memory_size = 0;
0a7de745 250
39236c6e 251/*
f427ee49
A
252 * Protects zone_array, num_zones, num_zones_in_use, and
253 * zone_destroyed_bitmap
39236c6e 254 */
f427ee49
A
255static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
256static unsigned int num_zones_in_use;
257unsigned int _Atomic num_zones;
258SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
39236c6e 259
f427ee49
A
260#if KASAN_ZALLOC
261#define MAX_ZONES 566
262#else /* !KASAN_ZALLOC */
263#define MAX_ZONES 402
264#endif/* !KASAN_ZALLOC */
265struct zone zone_array[MAX_ZONES];
266
267/* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
268static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
269
270/* Used to keep track of destroyed slots in the zone_array */
271static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
272
273/* number of pages used by all zones */
274static long _Atomic zones_phys_page_count;
275
276/* number of zone mapped pages used by all zones */
277static long _Atomic zones_phys_page_mapped_count;
278
/*
 * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not to
 * break third party kexts that haven't yet been recompiled
 * to use the new iokit macros.
 */
284#if XNU_TARGET_OS_OSX && __x86_64__
285#define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0
286#else
287#define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \
288 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE
289#endif
290
291#define ZSECURITY_DEFAULT ( \
2a1bd2d3 292 ZSECURITY_OPTIONS_SEQUESTER | \
f427ee49
A
293 ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \
294 ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \
295 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \
296 0)
297TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
298
299#if VM_MAX_TAG_ZONES
300/* enable tags for zones that ask for it */
301TUNABLE(bool, zone_tagging_on, "-zt", false);
302#endif /* VM_MAX_TAG_ZONES */
303
304#if DEBUG || DEVELOPMENT
305TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
306__options_decl(zalloc_debug_t, uint32_t, {
307 ZALLOC_DEBUG_ZONEGC = 0x00000001,
308 ZALLOC_DEBUG_ZCRAM = 0x00000002,
309});
310
311TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0);
312#endif /* DEBUG || DEVELOPMENT */
313#if CONFIG_ZLEAKS
314/* Making pointer scanning leaks detection possible for all zones */
315TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
316#else
317#define zone_leaks_scan_enable false
318#endif
319
/*
 * Async allocation of zones
 * This mechanism allows for bootstrapping an empty zone which is set up with
 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
 * This will prime the zone for the next use.
 *
 * Currently the thread_call callback (zalloc_async) will loop through all zones
 * looking for any zone with async_pending set and do the work for it.
 *
 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
 * then zalloc_noblock to an empty zone may succeed.
 */
333static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1);
334static thread_call_data_t call_async_alloc;
335static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
39236c6e
A
336
/*
 * Zone Corruption Debugging
 *
 * We use four techniques to detect modification of a zone element
 * after it's been freed.
 *
 * (1) Check the freelist next pointer for sanity.
 * (2) Store a backup of the next pointer at the end of the element,
 *     and compare it to the primary next pointer when the element is allocated
 *     to detect corruption of the freelist due to use-after-free bugs.
 *     The backup pointer is also XORed with a per-boot random cookie.
 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
 *     and check for that value when the element is being reused to make sure
 *     no part of the element has been modified while it was on the freelist.
 *     This will also help catch read-after-frees, as code will now dereference
 *     0xdeadbeef instead of a valid but freed pointer.
 * (4) If the zfree_clear_mem flag is set, clear the element on free and
 *     assert that it is still clear when it is allocated again.
 *
 * (1) and (2) occur for every allocation and free to a zone.
 * This is done to make it slightly more difficult for an attacker to
 * manipulate the freelist to behave in a specific way.
 *
 * Poisoning (3) occurs periodically, once every N frees (counted per-zone).
 * If -zp is passed as a boot arg, poisoning occurs for every free.
 *
 * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
 * flag on creation or if the element size is less than one cacheline.
 *
 * The performance cost scales with how often poisoning runs: expect a 4-5%
 * hit around N=1, down to ~0.3% at N=16, and just "noise" at N=32 and higher.
 * You can expect to find a 100% reproducible bug in an average of
 * N tries, with a standard deviation of about N, but you will want to set
 * "-zp" to always poison every free if you are attempting to reproduce
 * a known bug.
 *
 * For a more heavyweight, but finer-grained method of detecting misuse
 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
 *
 * Zone Corruption Logging
 *
 * You can also track where corruptions come from by using the boot-arguments
 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
 * in this file for more implementation and usage information.
 *
 * Zone Leak Detection
 *
 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
 * found later in this file via the showtopztrace and showz* macros in kgmacros,
 * or use zlog without the -zc argument.
 */
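
/*
 * Concretely, for techniques (1) and (2): the freelist next pointer stored at
 * the start of a free element is XORed with zp_nopoison_cookie, and the backup
 * copy stored in the last pointer-sized slot of the element is XORed with
 * either zp_nopoison_cookie or zp_poisoned_cookie, depending on whether the
 * element was poisoned on free. Recovering the plain pointer is just another
 * XOR with the same cookie (x ^ cookie ^ cookie == x), and the two copies are
 * compared on allocation; see get_backup_ptr() and backup_ptr_mismatch_panic()
 * below for how a mismatch is attributed.
 */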
389
f427ee49
A
390#define ZP_DEFAULT_SAMPLING_FACTOR 16
391#define ZP_DEFAULT_SCALE_FACTOR 4
fe8ab488 392
f427ee49
A
393/*
394 * set by zp-factor=N boot arg
395 *
396 * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by
397 * passing the -no-zp boot-arg.
398 *
399 * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
400 * set by passing the -zp boot-arg.
401 */
402static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR);
fe8ab488 403
f427ee49
A
404/* set by zp-scale=N boot arg, scales zp_factor by zone size */
405static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR);
fe8ab488 406
f427ee49
A
407/* initialized to a per-boot random value in zp_bootstrap */
408static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie;
409static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie;
410static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size;
411static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max;
316670eb 412
f427ee49
A
413static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
414static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx;
316670eb 415
f427ee49
A
416static struct bool_gen zone_bool_gen;
417static zone_t zone_find_largest(void);
418static void zone_drop_free_elements(zone_t z);
d9a64523 419
f427ee49
A
420#define submap_for_zone(z) zone_submaps[(z)->submap_idx]
421#define MAX_SUBMAP_NAME 16
316670eb 422
f427ee49
A
423/* Globals for random boolean generator for elements in free list */
424#define MAX_ENTROPY_PER_ZCRAM 4
425
426#if CONFIG_ZCACHE
39236c6e 427/*
f427ee49
A
428 * Specifies a single zone to enable CPU caching for.
429 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
316670eb 430 */
f427ee49
A
431static char cache_zone_name[MAX_ZONE_NAME];
432static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false);
39236c6e 433
f427ee49
A
434__header_always_inline bool
435zone_caching_enabled(zone_t z)
436{
437 return z->zcache.zcc_depot != NULL;
438}
d9a64523 439#else
f427ee49
A
440__header_always_inline bool
441zone_caching_enabled(zone_t z __unused)
442{
443 return false;
444}
445#endif /* CONFIG_ZCACHE */
39236c6e 446
f427ee49 447#pragma mark Zone metadata
fe8ab488 448
f427ee49
A
449__enum_closed_decl(zone_addr_kind_t, bool, {
450 ZONE_ADDR_NATIVE,
451 ZONE_ADDR_FOREIGN,
452});
39236c6e 453
f427ee49
A
454static inline zone_id_t
455zone_index(zone_t z)
456{
457 return (zone_id_t)(z - zone_array);
458}
39236c6e 459
f427ee49
A
460static inline bool
461zone_has_index(zone_t z, zone_id_t zid)
462{
463 return zone_array + zid == z;
464}
316670eb 465
f427ee49
A
466static inline vm_size_t
467zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind)
468{
469 if (kind == ZONE_ADDR_NATIVE) {
470 if (zone->percpu) {
471 return PAGE_SIZE / zone_elem_size(zone);
472 }
473 return alloc_size / zone_elem_size(zone);
474 } else {
475 assert(alloc_size == PAGE_SIZE);
476 return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone);
477 }
478}
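
/*
 * The foreign case above reserves the first ZONE_PAGE_FIRST_OFFSET (32) bytes
 * of the page: for foreign chunks the struct zone_page_metadata lives at the
 * start of the page itself (see zone_meta_from_addr), so that space is not
 * available for elements and is excluded from the element count.
 */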
5c9f4661 479
f427ee49
A
480__abortlike
481static void
482zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
483 const char *kind)
39236c6e 484{
f427ee49
A
485 panic("zone metadata corruption: %s (meta %p, zone %s%s)",
486 kind, meta, zone_heap_name(zone), zone->z_name);
487}
39236c6e 488
f427ee49
A
489__abortlike
490static void
491zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
492{
493 panic("zone element pointer validation failed (addr: %p, zone %s%s)",
494 (void *)addr, zone_heap_name(zone), zone->z_name);
495}
39236c6e 496
f427ee49
A
497__abortlike
498static void
499zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
500 struct zone_page_metadata *meta)
501{
502 panic("%p not in the expected zone %s%s (%d != %d)",
503 (void *)addr, zone_heap_name(zone), zone->z_name,
504 meta->zm_index, zone_index(zone));
505}
39236c6e 506
f427ee49
A
507__abortlike
508static void
509zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue)
510{
511 panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
512 queue->packed_address, queue, zone_heap_name(zone),
513 zone->z_name);
514}
39236c6e 515
f427ee49
A
516__abortlike
517static void
518zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
519{
520 panic("metadata list corruption through element %p detected in zone %s%s",
521 meta, zone_heap_name(zone), zone->z_name);
522}
39236c6e 523
f427ee49
A
524__abortlike
525static void
526zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue)
527{
528 panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
529 queue->packed_address, queue, zone_heap_name(zone), zone->z_name);
530}
39236c6e 531
f427ee49
A
532__abortlike
533static void
534zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr)
535{
536 panic("manipulating foreign address %p in a native-only zone %s%s",
537 (void *)addr, zone_heap_name(zone), zone->z_name);
538}
39236c6e 539
f427ee49
A
540__abortlike __unused
541static void
542zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
543{
544 panic("addr %p being freed to foreign zone %s%s not from foreign range",
545 (void *)addr, zone_heap_name(zone), zone->z_name);
546}
39236c6e 547
f427ee49
A
548__abortlike
549static void
550zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
551 const char *kind)
552{
553 panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
554 zone_heap_name(zone), zone->z_name, meta);
555}
39236c6e 556
f427ee49
A
557__abortlike
558static void
559zone_accounting_panic(zone_t zone, const char *kind)
560{
561 panic("accounting mismatch (%s) for zone %s%s", kind,
562 zone_heap_name(zone), zone->z_name);
563}
fe8ab488 564
f427ee49
A
565__abortlike
566static void
567zone_nofail_panic(zone_t zone)
568{
569 panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
570 zone_heap_name(zone), zone->z_name);
571}
39236c6e 572
f427ee49
A
573#if __arm64__
574// <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
575#define zone_range_load(r, rmin, rmax) \
576 asm("ldp %[rmin], %[rmax], [%[range]]" \
577 : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
578 : [range] "r"(r))
579#else
580#define zone_range_load(r, rmin, rmax) \
581 ({ rmin = (r)->min_address; rmax = (r)->max_address; })
39236c6e
A
582#endif
583
f427ee49
A
584__header_always_inline bool
585zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size)
586{
587 vm_offset_t rmin, rmax;
39236c6e 588
39236c6e 589 /*
f427ee49
A
590 * The `&` is not a typo: we really expect the check to pass,
591 * so encourage the compiler to eagerly load and test without branches
39236c6e 592 */
f427ee49
A
593 zone_range_load(r, rmin, rmax);
594 return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
595}
39236c6e 596
f427ee49
A
597__header_always_inline vm_size_t
598zone_range_size(const struct zone_map_range *r)
599{
600 vm_offset_t rmin, rmax;
39236c6e 601
f427ee49
A
602 zone_range_load(r, rmin, rmax);
603 return rmax - rmin;
39236c6e
A
604}
605
f427ee49
A
606#define from_zone_map(addr, size) \
607 zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
39236c6e 608
f427ee49
A
609#define from_general_submap(addr, size) \
610 zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
39236c6e 611
f427ee49
A
612#define from_foreign_range(addr, size) \
613 zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
39037602 614
f427ee49
A
615#define from_native_meta_map(addr) \
616 zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
617 sizeof(struct zone_page_metadata))
39236c6e 618
f427ee49
A
619#define zone_addr_kind(addr, size) \
620 (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
39236c6e 621
f427ee49
A
622__header_always_inline bool
623zone_pva_is_null(zone_pva_t page)
d9a64523 624{
f427ee49 625 return page.packed_address == 0;
d9a64523
A
626}
627
f427ee49
A
628__header_always_inline bool
629zone_pva_is_queue(zone_pva_t page)
630{
631 // actual kernel pages have the top bit set
632 return (int32_t)page.packed_address > 0;
633}
39037602 634
f427ee49
A
635__header_always_inline bool
636zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
637{
638 return pva1.packed_address == pva2.packed_address;
639}
39236c6e 640
f427ee49
A
641__header_always_inline void
642zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
643 struct zone_page_metadata *meta)
644{
645 zone_pva_t *queue_head = &((zone_pva_t *)zone_array)[queue.packed_address];
39037602 646
f427ee49
A
647 if (!zone_pva_is_equal(*queue_head, oldv)) {
648 zone_page_metadata_list_corruption(z, meta);
649 }
650 *queue_head = meta->zm_page_next;
651}
39037602 652
f427ee49
A
653__header_always_inline zone_pva_t
654zone_queue_encode(zone_pva_t *headp)
655{
656 return (zone_pva_t){ (uint32_t)(headp - (zone_pva_t *)zone_array) };
657}
39037602 658
f427ee49
A
659__header_always_inline zone_pva_t
660zone_pva_from_addr(vm_address_t addr)
661{
662 // cannot use atop() because we want to maintain the sign bit
663 return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
664}
39037602 665
f427ee49
A
666__header_always_inline vm_address_t
667zone_pva_to_addr(zone_pva_t page)
668{
669 // cause sign extension so that we end up with the right address
670 return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
671}
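
/*
 * Packed page addresses rely on the sign bit: kernel addresses keep the top
 * bit set after the shift, while queue heads (encoded by zone_queue_encode as
 * small indexes into zone_array) stay positive, which is what
 * zone_pva_is_queue tests. Illustrative round-trip, assuming 4K pages
 * (PAGE_SHIFT == 12):
 *     0xffffff8012345000 >> 12   -> 0xf8012345 (truncated to 32 bits, sign bit set)
 *     (int32_t)0xf8012345 << 12  -> 0xffffff8012345000 after sign extension
 */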
39037602 672
f427ee49
A
673__header_always_inline struct zone_page_metadata *
674zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind)
675{
676 if (kind == ZONE_ADDR_NATIVE) {
677 return &zone_info.zi_array_base[page.packed_address];
678 } else {
679 return (struct zone_page_metadata *)zone_pva_to_addr(page);
680 }
681}
39037602 682
f427ee49
A
683__header_always_inline zone_pva_t
684zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind)
685{
686 if (kind == ZONE_ADDR_NATIVE) {
687 uint32_t index = (uint32_t)(meta - zone_info.zi_array_base);
688 return (zone_pva_t){ index };
689 } else {
690 return zone_pva_from_addr((vm_address_t)meta);
691 }
692}
5ba3f43e 693
f427ee49
A
694__header_always_inline struct zone_page_metadata *
695zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind)
39037602 696{
f427ee49
A
697 if (kind == ZONE_ADDR_NATIVE) {
698 return zone_pva_to_meta(zone_pva_from_addr(addr), kind);
0a7de745 699 } else {
f427ee49 700 return (struct zone_page_metadata *)trunc_page(addr);
39037602
A
701 }
702}
703
f427ee49
A
704#define zone_native_meta_from_addr(addr) \
705 zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
706
707__header_always_inline vm_offset_t
708zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind)
39037602 709{
f427ee49
A
710 if (kind == ZONE_ADDR_NATIVE) {
711 return ptoa((int)(meta - zone_info.zi_array_base));
0a7de745 712 } else {
f427ee49 713 return (vm_offset_t)meta;
39037602
A
714 }
715}
716
f427ee49
A
717__header_always_inline void
718zone_meta_queue_push(zone_t z, zone_pva_t *headp,
719 struct zone_page_metadata *meta, zone_addr_kind_t kind)
39037602 720{
f427ee49
A
721 zone_pva_t head = *headp;
722 zone_pva_t queue_pva = zone_queue_encode(headp);
723 struct zone_page_metadata *tmp;
724
725 meta->zm_page_next = head;
726 if (!zone_pva_is_null(head)) {
727 tmp = zone_pva_to_meta(head, kind);
728 if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
729 zone_page_metadata_list_corruption(z, meta);
730 }
731 tmp->zm_page_prev = zone_pva_from_meta(meta, kind);
732 }
733 meta->zm_page_prev = queue_pva;
734 *headp = zone_pva_from_meta(meta, kind);
39037602
A
735}
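
/*
 * zone_meta_queue_push() above inserts `meta` at the head of the queue: queues
 * are doubly linked through zm_page_next/zm_page_prev, the head pointer itself
 * is encoded as a "queue" pva (an index into zone_array rather than a page
 * address, see zone_queue_encode), and the old head's back link is validated
 * before relinking so that a corrupted list is caught early.
 */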
736
f427ee49
A
737__header_always_inline struct zone_page_metadata *
738zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
739 vm_offset_t *page_addrp)
39037602 740{
f427ee49
A
741 zone_pva_t head = *headp;
742 struct zone_page_metadata *meta = zone_pva_to_meta(head, kind);
743 vm_offset_t page_addr = zone_pva_to_addr(head);
744 struct zone_page_metadata *tmp;
745
746 if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) {
747 zone_page_metadata_native_queue_corruption(z, headp);
748 }
749 if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) {
750 zone_page_metadata_foreign_queue_corruption(z, headp);
751 }
752
753 if (!zone_pva_is_null(meta->zm_page_next)) {
754 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
755 if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
756 zone_page_metadata_list_corruption(z, meta);
757 }
758 tmp->zm_page_prev = meta->zm_page_prev;
759 }
760 *headp = meta->zm_page_next;
761
762 *page_addrp = page_addr;
763 return meta;
39037602
A
764}
765
f427ee49
A
766__header_always_inline void
767zone_meta_requeue(zone_t z, zone_pva_t *headp,
768 struct zone_page_metadata *meta, zone_addr_kind_t kind)
39236c6e 769{
f427ee49
A
770 zone_pva_t meta_pva = zone_pva_from_meta(meta, kind);
771 struct zone_page_metadata *tmp;
772
773 if (!zone_pva_is_null(meta->zm_page_next)) {
774 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
775 if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
776 zone_page_metadata_list_corruption(z, meta);
777 }
778 tmp->zm_page_prev = meta->zm_page_prev;
779 }
780 if (zone_pva_is_queue(meta->zm_page_prev)) {
781 zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
782 } else {
783 tmp = zone_pva_to_meta(meta->zm_page_prev, kind);
784 if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
785 zone_page_metadata_list_corruption(z, meta);
786 }
787 tmp->zm_page_next = meta->zm_page_next;
788 }
789
790 zone_meta_queue_push(z, headp, meta, kind);
39236c6e
A
791}
792
0a7de745 793/*
39037602 794 * Routine to populate a page backing metadata in the zone_metadata_region.
0a7de745 795 * Must be called without the zone lock held as it might potentially block.
39037602 796 */
f427ee49
A
797static void
798zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to)
39037602 799{
f427ee49 800 vm_offset_t page_addr = trunc_page(from);
d9a64523 801
f427ee49
A
802 for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
803#if !KASAN_ZALLOC
d9a64523
A
804 /*
805 * This can race with another thread doing a populate on the same metadata
806 * page, where we see an updated pmap but unmapped KASan shadow, causing a
807 * fault in the shadow when we first access the metadata page. Avoid this
808 * by always synchronizing on the zone_metadata_region lock with KASan.
809 */
f427ee49 810 if (pmap_find_phys(kernel_pmap, page_addr)) {
39037602 811 continue;
0a7de745 812 }
d9a64523 813#endif
f427ee49
A
814
815 for (;;) {
816 kern_return_t ret = KERN_SUCCESS;
817
818 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
819 lck_mtx_lock(&zone_metadata_region_lck);
820 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
821 ret = kernel_memory_populate(kernel_map, page_addr,
822 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
823 VM_KERN_MEMORY_OSFMK);
824 }
825 lck_mtx_unlock(&zone_metadata_region_lck);
826
827 if (ret == KERN_SUCCESS) {
828 break;
829 }
830
831 /*
832 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
833 * to bad system deadlocks, so if the allocation failed,
834 * we need to do the VM_PAGE_WAIT() outside of the lock.
835 */
836 VM_PAGE_WAIT();
39037602 837 }
39037602 838 }
39037602
A
839}
840
f427ee49
A
841static inline bool
842zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr,
843 vm_offset_t page, zone_addr_kind_t kind)
39037602 844{
f427ee49
A
845 vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind);
846 vm_offset_t esize = zone_elem_size(zone);
847
848 if (esize & (esize - 1)) { /* not a power of 2 */
849 return (offs % esize) == 0;
850 } else {
851 return (offs & (esize - 1)) == 0;
852 }
39037602
A
853}
854
f427ee49
A
855__attribute__((always_inline))
856static struct zone_page_metadata *
857zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
858 vm_offset_t *pagep, zone_addr_kind_t *kindp)
39037602 859{
f427ee49
A
860 struct zone_page_metadata *meta;
861 zone_addr_kind_t kind;
862 vm_offset_t page;
863 vm_offset_t esize = zone_elem_size(zone);
864
865 kind = zone_addr_kind(addr, esize);
866 page = trunc_page(addr);
867 meta = zone_meta_from_addr(addr, kind);
39037602 868
f427ee49
A
869 if (kind == ZONE_ADDR_NATIVE) {
870 if (meta->zm_secondary_page) {
871 if (meta->zm_percpu) {
872 zone_invalid_element_addr_panic(zone, addr);
873 }
874 page -= ptoa(meta->zm_page_count);
875 meta -= meta->zm_page_count;
0a7de745 876 }
f427ee49
A
877 } else if (!zone->allows_foreign) {
878 zone_page_metadata_foreign_confusion_panic(zone, addr);
879#if __LP64__
880 } else if (!from_foreign_range(addr, esize)) {
881 zone_invalid_foreign_addr_panic(zone, addr);
882#else
883 } else if (!pmap_kernel_va(addr)) {
884 zone_invalid_element_addr_panic(zone, addr);
885#endif
886 }
887
888 if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) {
889 zone_invalid_element_addr_panic(zone, addr);
890 }
891
892 if (!zone_has_index(zone, meta->zm_index)) {
893 zone_page_metadata_index_confusion_panic(zone, addr, meta);
894 }
895
896 if (kindp) {
897 *kindp = kind;
898 }
899 if (pagep) {
900 *pagep = page;
901 }
902 return meta;
903}
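
/*
 * To summarize the validation above: native addresses are first redirected
 * from a secondary page to the chunk head, then checked for correct element
 * alignment within the chunk and for a zm_index matching the expected zone;
 * foreign addresses additionally require zone->allows_foreign and must come
 * from the foreign range (or at least be a kernel VA on 32-bit). Any failure
 * panics rather than returning.
 */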
904
905__attribute__((always_inline))
906void
907zone_allocated_element_validate(zone_t zone, vm_offset_t addr)
908{
909 zone_allocated_element_resolve(zone, addr, NULL, NULL);
910}
911
912__header_always_inline vm_offset_t
913zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta,
914 vm_offset_t page)
915{
916 assert(!meta->zm_secondary_page);
917 if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
918 return 0;
919 }
920
921 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
922 if (meta->zm_freelist_offs + zone_elem_size(zone) > size) {
923 zone_metadata_corruption(zone, meta, "freelist corruption");
924 }
925
926 return page + meta->zm_freelist_offs;
927}
928
929__header_always_inline void
930zone_page_meta_set_freelist(struct zone_page_metadata *meta,
931 vm_offset_t page, vm_offset_t addr)
932{
933 assert(!meta->zm_secondary_page);
934 if (addr) {
935 meta->zm_freelist_offs = (uint16_t)(addr - page);
39037602 936 } else {
f427ee49 937 meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
39037602 938 }
f427ee49
A
939}
940
941static bool
942zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta,
943 vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind)
944{
945 if (element == 0) {
946 /* ends of the freelist are NULL */
947 return true;
d9a64523 948 }
f427ee49
A
949 if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) {
950 return false;
951 }
952 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
953 if (element > page + size - zone_elem_size(zone)) {
954 return false;
955 }
956 return true;
39037602
A
957}
958
/*
 * Routine to get the size of a zone allocated address.
 * If the address doesn't belong to the zone maps, returns 0.
 */
962vm_size_t
963zone_element_size(void *addr, zone_t *z)
964{
965 struct zone_page_metadata *meta;
966 struct zone *src_zone;
967
968 if (from_zone_map(addr, sizeof(void *))) {
969 meta = zone_native_meta_from_addr(addr);
970 src_zone = &zone_array[meta->zm_index];
971 if (z) {
972 *z = src_zone;
973 }
974 return zone_elem_size(src_zone);
975 }
976#if CONFIG_GZALLOC
977 if (__improbable(gzalloc_enabled())) {
978 vm_size_t gzsize;
979 if (gzalloc_element_size(addr, z, &gzsize)) {
980 return gzsize;
981 }
982 }
983#endif /* CONFIG_GZALLOC */
984
985 return 0;
986}
987
988/* This function just formats the reason for the panics by redoing the checks */
989__abortlike
990static void
991zone_require_panic(zone_t zone, void *addr)
39236c6e 992{
f427ee49
A
993 uint32_t zindex;
994 zone_t other;
995
996 if (!from_zone_map(addr, zone_elem_size(zone))) {
997 panic("zone_require failed: address not in a zone (addr: %p)", addr);
998 }
999
1000 zindex = zone_native_meta_from_addr(addr)->zm_index;
1001 other = &zone_array[zindex];
1002 if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1003 panic("zone_require failed: invalid zone index %d "
1004 "(addr: %p, expected: %s%s)", zindex,
1005 addr, zone_heap_name(zone), zone->z_name);
0a7de745 1006 } else {
f427ee49
A
1007 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1008 "(addr: %p, expected: %s%s)",
1009 zindex, zone_heap_name(other), other->z_name,
1010 addr, zone_heap_name(zone), zone->z_name);
0a7de745 1011 }
39037602
A
1012}
1013
f427ee49
A
1014__abortlike
1015static void
1016zone_id_require_panic(zone_id_t zid, void *addr)
1017{
1018 zone_require_panic(&zone_array[zid], addr);
1019}
1020
cb323159 1021/*
f427ee49 1022 * Routines to panic if a pointer is not mapped to an expected zone.
cb323159
A
1023 * This can be used as a means of pinning an object to the zone it is expected
1024 * to be a part of. Causes a panic if the address does not belong to any
1025 * specified zone, does not belong to any zone, has been freed and therefore
1026 * unmapped from the zone, or the pointer contains an uninitialized value that
1027 * does not belong to any zone.
f427ee49
A
1028 *
1029 * Note that this can only work with collectable zones without foreign pages.
cb323159 1030 */
cb323159 1031void
f427ee49 1032zone_require(zone_t zone, void *addr)
cb323159 1033{
f427ee49
A
1034 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1035 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1036 return;
1037 }
1038#if CONFIG_GZALLOC
1039 if (__probable(gzalloc_enabled())) {
1040 return;
cb323159 1041 }
f427ee49
A
1042#endif
1043 zone_require_panic(zone, addr);
1044}
cb323159 1045
f427ee49
A
1046void
1047zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
1048{
1049 if (__probable(from_general_submap(addr, esize) &&
1050 (zid == zone_native_meta_from_addr(addr)->zm_index))) {
1051 return;
eb6b6ca3 1052 }
f427ee49
A
1053#if CONFIG_GZALLOC
1054 if (__probable(gzalloc_enabled())) {
1055 return;
cb323159 1056 }
f427ee49
A
1057#endif
1058 zone_id_require_panic(zid, addr);
cb323159
A
1059}
1060
f427ee49
A
1061bool
1062zone_owns(zone_t zone, void *addr)
1063{
1064 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1065 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1066 return true;
1067 }
1068#if CONFIG_GZALLOC
1069 if (__probable(gzalloc_enabled())) {
1070 return true;
1071 }
1072#endif
1073 return false;
1074}
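
/*
 * zone_owns() performs the same submap and zone-index check as zone_require(),
 * but returns a boolean instead of panicking, and it conservatively answers
 * true when gzalloc is enabled.
 */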
5ba3f43e 1075
f427ee49 1076#pragma mark ZTAGS
5ba3f43e
A
1077#if VM_MAX_TAG_ZONES
1078
1079// for zones with tagging enabled:
1080
1081// calculate a pointer to the tag base entry,
1082// holding either a uint32_t the first tag offset for a page in the zone map,
1083// or two uint16_t tags if the page can only hold one or two elements
1084
1085#define ZTAGBASE(zone, element) \
f427ee49 1086 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
5ba3f43e
A
1087
1088// pointer to the tag for an element
1089#define ZTAG(zone, element) \
1090 ({ \
0a7de745
A
1091 vm_tag_t * result; \
1092 if ((zone)->tags_inline) { \
1093 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
f427ee49 1094 if ((page_mask & element) >= zone_elem_size(zone)) result++; \
0a7de745 1095 } else { \
f427ee49 1096 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \
0a7de745
A
1097 } \
1098 result; \
5ba3f43e
A
1099 })
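
/*
 * ZTAGBASE yields one 32-bit slot per zone-map page. When the zone uses inline
 * tags, that slot directly holds one or two vm_tag_t values (the second is
 * selected for elements past the first one in the page); otherwise the slot
 * holds a base index into the zone_tags_map array, and the per-element tag is
 * found at that base plus the element's index within the page.
 */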
1100
1101
1102static vm_offset_t zone_tagbase_min;
1103static vm_offset_t zone_tagbase_max;
1104static vm_offset_t zone_tagbase_map_size;
1105static vm_map_t zone_tagbase_map;
1106
1107static vm_offset_t zone_tags_min;
1108static vm_offset_t zone_tags_max;
1109static vm_offset_t zone_tags_map_size;
1110static vm_map_t zone_tags_map;
1111
1112// simple heap allocator for allocating the tags for new memory
1113
f427ee49
A
1114LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
1115
0a7de745
A
1116enum{
1117 ztFreeIndexCount = 8,
1118 ztFreeIndexMax = (ztFreeIndexCount - 1),
1119 ztTagsPerBlock = 4
5ba3f43e
A
1120};
1121
0a7de745 1122struct ztBlock {
5ba3f43e 1123#if __LITTLE_ENDIAN__
0a7de745
A
1124 uint64_t free:1,
1125 next:21,
1126 prev:21,
1127 size:21;
5ba3f43e
A
1128#else
1129// ztBlock needs free bit least significant
1130#error !__LITTLE_ENDIAN__
1131#endif
1132};
1133typedef struct ztBlock ztBlock;
1134
1135static ztBlock * ztBlocks;
1136static uint32_t ztBlocksCount;
1137static uint32_t ztBlocksFree;
1138
1139static uint32_t
1140ztLog2up(uint32_t size)
1141{
0a7de745
A
1142 if (1 == size) {
1143 size = 0;
1144 } else {
1145 size = 32 - __builtin_clz(size - 1);
1146 }
1147 return size;
5ba3f43e
A
1148}
1149
1150static uint32_t
1151ztLog2down(uint32_t size)
1152{
0a7de745
A
1153 size = 31 - __builtin_clz(size);
1154 return size;
5ba3f43e
A
1155}
1156
1157static void
1158ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
1159{
0a7de745
A
1160 vm_map_offset_t addr = (vm_map_offset_t) address;
1161 vm_map_offset_t page, end;
1162
1163 page = trunc_page(addr);
1164 end = round_page(addr + size);
1165
1166 for (; page < end; page += page_size) {
1167 if (!pmap_find_phys(kernel_pmap, page)) {
1168 kern_return_t __unused
1169 ret = kernel_memory_populate(map, page, PAGE_SIZE,
1170 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
1171 assert(ret == KERN_SUCCESS);
1172 }
1173 }
5ba3f43e
A
1174}
1175
1176static boolean_t
1177ztPresent(const void * address, size_t size)
1178{
0a7de745
A
1179 vm_map_offset_t addr = (vm_map_offset_t) address;
1180 vm_map_offset_t page, end;
1181 boolean_t result;
1182
1183 page = trunc_page(addr);
1184 end = round_page(addr + size);
1185 for (result = TRUE; (page < end); page += page_size) {
1186 result = pmap_find_phys(kernel_pmap, page);
1187 if (!result) {
1188 break;
1189 }
1190 }
1191 return result;
5ba3f43e
A
1192}
1193
1194
1195void __unused
1196ztDump(boolean_t sanity);
1197void __unused
1198ztDump(boolean_t sanity)
1199{
0a7de745
A
1200 uint32_t q, cq, p;
1201
1202 for (q = 0; q <= ztFreeIndexMax; q++) {
1203 p = q;
1204 do{
1205 if (sanity) {
1206 cq = ztLog2down(ztBlocks[p].size);
1207 if (cq > ztFreeIndexMax) {
1208 cq = ztFreeIndexMax;
1209 }
1210 if (!ztBlocks[p].free
1211 || ((p != q) && (q != cq))
1212 || (ztBlocks[ztBlocks[p].next].prev != p)
1213 || (ztBlocks[ztBlocks[p].prev].next != p)) {
1214 kprintf("zterror at %d", p);
1215 ztDump(FALSE);
1216 kprintf("zterror at %d", p);
1217 assert(FALSE);
1218 }
1219 continue;
1220 }
1221 kprintf("zt[%03d]%c %d, %d, %d\n",
1222 p, ztBlocks[p].free ? 'F' : 'A',
1223 ztBlocks[p].next, ztBlocks[p].prev,
1224 ztBlocks[p].size);
1225 p = ztBlocks[p].next;
1226 if (p == q) {
1227 break;
1228 }
1229 }while (p != q);
1230 if (!sanity) {
1231 printf("\n");
1232 }
1233 }
1234 if (!sanity) {
1235 printf("-----------------------\n");
1236 }
5ba3f43e
A
1237}
1238
1239
1240
1241#define ZTBDEQ(idx) \
1242 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
1243 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
1244
1245static void
1246ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
1247{
0a7de745
A
1248 uint32_t q, w, p, size, merge;
1249
1250 assert(count);
1251 ztBlocksFree += count;
1252
	// merge with the following block
1254 merge = (index + count);
1255 if ((merge < ztBlocksCount)
1256 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1257 && ztBlocks[merge].free) {
1258 ZTBDEQ(merge);
1259 count += ztBlocks[merge].size;
1260 }
1261
	// merge with the preceding block
1263 merge = (index - 1);
1264 if ((merge > ztFreeIndexMax)
1265 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1266 && ztBlocks[merge].free) {
1267 size = ztBlocks[merge].size;
1268 count += size;
1269 index -= size;
1270 ZTBDEQ(index);
1271 }
1272
1273 q = ztLog2down(count);
1274 if (q > ztFreeIndexMax) {
1275 q = ztFreeIndexMax;
1276 }
1277 w = q;
1278 // queue in order of size
1279 while (TRUE) {
1280 p = ztBlocks[w].next;
1281 if (p == q) {
1282 break;
1283 }
1284 if (ztBlocks[p].size >= count) {
1285 break;
1286 }
1287 w = p;
1288 }
1289 ztBlocks[p].prev = index;
1290 ztBlocks[w].next = index;
1291
1292 // fault in first
1293 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1294
1295 // mark first & last with free flag and size
1296 ztBlocks[index].free = TRUE;
1297 ztBlocks[index].size = count;
1298 ztBlocks[index].prev = w;
1299 ztBlocks[index].next = p;
1300 if (count > 1) {
1301 index += (count - 1);
1302 // fault in last
1303 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1304 ztBlocks[index].free = TRUE;
1305 ztBlocks[index].size = count;
1306 }
5ba3f43e
A
1307}
1308
1309static uint32_t
1310ztAlloc(zone_t zone, uint32_t count)
1311{
0a7de745
A
1312 uint32_t q, w, p, leftover;
1313
1314 assert(count);
1315
1316 q = ztLog2up(count);
1317 if (q > ztFreeIndexMax) {
1318 q = ztFreeIndexMax;
1319 }
1320 do{
1321 w = q;
1322 while (TRUE) {
1323 p = ztBlocks[w].next;
1324 if (p == q) {
1325 break;
1326 }
1327 if (ztBlocks[p].size >= count) {
1328 // dequeue, mark both ends allocated
1329 ztBlocks[w].next = ztBlocks[p].next;
1330 ztBlocks[ztBlocks[p].next].prev = w;
1331 ztBlocks[p].free = FALSE;
1332 ztBlocksFree -= ztBlocks[p].size;
1333 if (ztBlocks[p].size > 1) {
1334 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
1335 }
1336
1337 // fault all the allocation
1338 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
1339 // mark last as allocated
1340 if (count > 1) {
1341 ztBlocks[p + count - 1].free = FALSE;
1342 }
1343 // free remainder
1344 leftover = ztBlocks[p].size - count;
1345 if (leftover) {
1346 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
1347 }
1348
1349 return p;
1350 }
1351 w = p;
1352 }
1353 q++;
1354 }while (q <= ztFreeIndexMax);
1355
1356 return -1U;
5ba3f43e
A
1357}
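
/*
 * ztAlloc()/ztFree() implement a small first-fit heap over the tag area:
 * entries 0..ztFreeIndexMax of ztBlocks serve as list heads for free runs
 * bucketed by log2(size). ztAlloc() scans buckets from ceil(log2(count))
 * upward, dequeues the first run that is large enough, marks both ends
 * allocated and returns any leftover to ztFree(), which in turn coalesces a
 * freed run with adjacent free neighbors before queueing it by size.
 */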
1358
f427ee49 1359__startup_func
5ba3f43e 1360static void
f427ee49 1361zone_tagging_init(vm_size_t max_zonemap_size)
5ba3f43e 1362{
0a7de745
A
1363 kern_return_t ret;
1364 vm_map_kernel_flags_t vmk_flags;
1365 uint32_t idx;
1366
0a7de745
A
1367 // allocate submaps VM_KERN_MEMORY_DIAG
1368
1369 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
1370 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1371 vmk_flags.vmkf_permanent = TRUE;
1372 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
1373 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1374 &zone_tagbase_map);
1375
1376 if (ret != KERN_SUCCESS) {
1377 panic("zone_init: kmem_suballoc failed");
1378 }
1379 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
1380
1381 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
1382 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1383 vmk_flags.vmkf_permanent = TRUE;
1384 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
1385 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1386 &zone_tags_map);
1387
1388 if (ret != KERN_SUCCESS) {
1389 panic("zone_init: kmem_suballoc failed");
1390 }
1391 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
1392
1393 ztBlocks = (ztBlock *) zone_tags_min;
1394 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
1395
1396 // initialize the qheads
1397 lck_mtx_lock(&ztLock);
1398
1399 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
1400 for (idx = 0; idx < ztFreeIndexCount; idx++) {
1401 ztBlocks[idx].free = TRUE;
1402 ztBlocks[idx].next = idx;
1403 ztBlocks[idx].prev = idx;
1404 ztBlocks[idx].size = 0;
1405 }
1406 // free remaining space
1407 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
1408
1409 lck_mtx_unlock(&ztLock);
5ba3f43e
A
1410}
1411
1412static void
1413ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1414{
0a7de745
A
1415 uint32_t * tagbase;
1416 uint32_t count, block, blocks, idx;
1417 size_t pages;
1418
1419 pages = atop(size);
1420 tagbase = ZTAGBASE(zone, mem);
1421
1422 lck_mtx_lock(&ztLock);
1423
1424 // fault tagbase
1425 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1426
1427 if (!zone->tags_inline) {
1428 // allocate tags
f427ee49 1429 count = (uint32_t)(size / zone_elem_size(zone));
0a7de745
A
1430 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1431 block = ztAlloc(zone, blocks);
1432 if (-1U == block) {
1433 ztDump(false);
1434 }
1435 assert(-1U != block);
1436 }
1437
1438 lck_mtx_unlock(&ztLock);
1439
1440 if (!zone->tags_inline) {
1441 // set tag base for each page
1442 block *= ztTagsPerBlock;
1443 for (idx = 0; idx < pages; idx++) {
f427ee49
A
1444 vm_offset_t esize = zone_elem_size(zone);
1445 tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
0a7de745
A
1446 }
1447 }
5ba3f43e
A
1448}
1449
1450static void
1451ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1452{
0a7de745
A
1453 uint32_t * tagbase;
1454 uint32_t count, block, blocks, idx;
1455 size_t pages;
1456
1457 // set tag base for each page
1458 pages = atop(size);
1459 tagbase = ZTAGBASE(zone, mem);
1460 block = tagbase[0];
1461 for (idx = 0; idx < pages; idx++) {
1462 tagbase[idx] = 0xFFFFFFFF;
1463 }
1464
1465 lck_mtx_lock(&ztLock);
1466 if (!zone->tags_inline) {
f427ee49 1467 count = (uint32_t)(size / zone_elem_size(zone));
0a7de745
A
1468 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1469 assert(block != 0xFFFFFFFF);
1470 block /= ztTagsPerBlock;
1471 ztFree(NULL /* zone is unlocked */, block, blocks);
1472 }
1473
1474 lck_mtx_unlock(&ztLock);
5ba3f43e
A
1475}
1476
1477uint32_t
1478zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1479{
0a7de745 1480 simple_lock(&all_zones_lock, &zone_locks_grp);
5ba3f43e 1481
f427ee49
A
1482 zone_index_foreach(idx) {
1483 zone_t z = &zone_array[idx];
0a7de745
A
1484 if (!z->tags) {
1485 continue;
1486 }
1487 if (tag_zone_index != z->tag_zone_index) {
1488 continue;
1489 }
f427ee49
A
1490
1491 *elem_size = zone_elem_size(z);
1492 simple_unlock(&all_zones_lock);
1493 return idx;
0a7de745 1494 }
5ba3f43e 1495
0a7de745 1496 simple_unlock(&all_zones_lock);
5ba3f43e 1497
f427ee49 1498 return -1U;
5ba3f43e
A
1499}
1500
1501#endif /* VM_MAX_TAG_ZONES */
f427ee49 1502#pragma mark zalloc helpers
5ba3f43e 1503
f427ee49
A
1504const char *
1505zone_name(zone_t z)
39037602 1506{
f427ee49
A
1507 return z->z_name;
1508}
39037602 1509
f427ee49
A
1510const char *
1511zone_heap_name(zone_t z)
1512{
1513 if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
1514 return kalloc_heap_names[z->kalloc_heap];
39037602 1515 }
f427ee49 1516 return "invalid";
39236c6e
A
1517}
1518
f427ee49
A
1519static inline vm_size_t
1520zone_submaps_approx_size(void)
5ba3f43e 1521{
f427ee49 1522 vm_size_t size = 0;
5ba3f43e 1523
f427ee49
A
1524 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
1525 size += zone_submaps[idx]->size;
5ba3f43e 1526 }
f427ee49 1527
5ba3f43e
A
1528 return size;
1529}
1530
f427ee49
A
1531bool
1532zone_maps_owned(vm_address_t addr, vm_size_t size)
1533{
1534 return from_zone_map(addr, size);
1535}
5ba3f43e 1536
f427ee49
A
1537void
1538zone_map_sizes(
1539 vm_map_size_t *psize,
1540 vm_map_size_t *pfree,
1541 vm_map_size_t *plargest_free)
39236c6e 1542{
f427ee49
A
1543 vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free);
1544}
39236c6e 1545
f427ee49
A
1546vm_map_t
1547zone_submap(zone_t zone)
1548{
1549 return submap_for_zone(zone);
1550}
39236c6e 1551
f427ee49
A
1552unsigned
1553zpercpu_count(void)
1554{
1555 return zpercpu_early_count;
1556}
1557
1558int
1559track_this_zone(const char *zonename, const char *logname)
1560{
1561 unsigned int len;
1562 const char *zc = zonename;
1563 const char *lc = logname;
1564
1565 /*
1566 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
39236c6e 1567 */
f427ee49
A
1568
1569 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1570 /*
		 * If the current characters don't match, check for a space in
		 * the zone name and a corresponding period in the log name.
1573 * If that's not there, then the strings don't match.
1574 */
1575
1576 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1577 break;
0a7de745 1578 }
39236c6e 1579
f427ee49
A
1580 /*
1581 * The strings are equal so far. If we're at the end, then it's a match.
1582 */
1583
1584 if (*zc == '\0') {
1585 return TRUE;
1586 }
39236c6e
A
1587 }
1588
f427ee49 1589 return FALSE;
39236c6e
A
1590}
1591
f427ee49 1592#if DEBUG || DEVELOPMENT
39236c6e 1593
f427ee49
A
1594vm_size_t
1595zone_element_info(void *addr, vm_tag_t * ptag)
39236c6e 1596{
f427ee49
A
1597 vm_size_t size = 0;
1598 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1599 struct zone_page_metadata *meta;
1600 struct zone *src_zone;
1601
1602 if (from_zone_map(addr, sizeof(void *))) {
1603 meta = zone_native_meta_from_addr(addr);
1604 src_zone = &zone_array[meta->zm_index];
1605#if VM_MAX_TAG_ZONES
1606 if (__improbable(src_zone->tags)) {
1607 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1608 }
1609#endif /* VM_MAX_TAG_ZONES */
1610 size = zone_elem_size(src_zone);
1611 } else {
1612#if CONFIG_GZALLOC
1613 gzalloc_element_size(addr, NULL, &size);
1614#endif /* CONFIG_GZALLOC */
0a7de745 1615 }
f427ee49
A
1616 *ptag = tag;
1617 return size;
39236c6e 1618}
0a7de745 1619
f427ee49
A
1620#endif /* DEBUG || DEVELOPMENT */
1621
39236c6e 1622/* Someone wrote to freed memory. */
f427ee49
A
1623__abortlike
1624static void
1625zone_element_was_modified_panic(
1626 zone_t zone,
1627 vm_offset_t element,
1628 vm_offset_t found,
1629 vm_offset_t expected,
1630 vm_offset_t offset)
1631{
1632 panic("a freed zone element has been modified in zone %s%s: "
1633 "expected %p but found %p, bits changed %p, "
1634 "at offset %d of %d in element %p, cookies %p %p",
1635 zone_heap_name(zone),
1636 zone->z_name,
0a7de745
A
1637 (void *) expected,
1638 (void *) found,
1639 (void *) (expected ^ found),
1640 (uint32_t) offset,
f427ee49 1641 (uint32_t) zone_elem_size(zone),
0a7de745
A
1642 (void *) element,
1643 (void *) zp_nopoison_cookie,
1644 (void *) zp_poisoned_cookie);
39236c6e
A
1645}
1646
f427ee49
A
1647/* The backup pointer is stored in the last pointer-sized location in an element. */
1648__header_always_inline vm_offset_t *
1649get_backup_ptr(vm_size_t elem_size, vm_offset_t *element)
1650{
1651 return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
1652}
1653
39236c6e
A
1654/*
1655 * The primary and backup pointers don't match.
1656 * Determine which one was likely the corrupted pointer, find out what it
1657 * probably should have been, and panic.
39236c6e 1658 */
f427ee49 1659__abortlike
0a7de745 1660static void
f427ee49
A
1661backup_ptr_mismatch_panic(
1662 zone_t zone,
1663 struct zone_page_metadata *page_meta,
1664 vm_offset_t page,
1665 vm_offset_t element)
39236c6e 1666{
f427ee49
A
1667 vm_offset_t primary = *(vm_offset_t *)element;
1668 vm_offset_t backup = *get_backup_ptr(zone_elem_size(zone), &element);
39236c6e 1669 vm_offset_t likely_backup;
39037602 1670 vm_offset_t likely_primary;
f427ee49 1671 zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone));
39236c6e 1672
39037602 1673 likely_primary = primary ^ zp_nopoison_cookie;
39236c6e 1674 boolean_t sane_backup;
f427ee49
A
1675 boolean_t sane_primary = zone_page_meta_is_sane_element(zone, page_meta,
1676 page, likely_primary, kind);
1677 boolean_t element_was_poisoned = (backup & 0x1);
39236c6e 1678
fe8ab488
A
1679#if defined(__LP64__)
1680 /* We can inspect the tag in the upper bits for additional confirmation */
0a7de745 1681 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
fe8ab488 1682 element_was_poisoned = TRUE;
0a7de745 1683 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
fe8ab488 1684 element_was_poisoned = FALSE;
0a7de745 1685 }
fe8ab488
A
1686#endif
1687
39236c6e
A
1688 if (element_was_poisoned) {
1689 likely_backup = backup ^ zp_poisoned_cookie;
316670eb 1690 } else {
39236c6e 1691 likely_backup = backup ^ zp_nopoison_cookie;
316670eb 1692 }
f427ee49
A
1693 sane_backup = zone_page_meta_is_sane_element(zone, page_meta,
1694 page, likely_backup, kind);
39236c6e
A
1695
1696 /* The primary is definitely the corrupted one */
0a7de745 1697 if (!sane_primary && sane_backup) {
39037602 1698 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
0a7de745 1699 }
39236c6e
A
1700
1701 /* The backup is definitely the corrupted one */
0a7de745 1702 if (sane_primary && !sane_backup) {
fe8ab488 1703 zone_element_was_modified_panic(zone, element, backup,
0a7de745 1704 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
f427ee49 1705 zone_elem_size(zone) - sizeof(vm_offset_t));
0a7de745 1706 }
39236c6e
A
1707
1708 /*
1709 * Not sure which is the corrupted one.
1710 * It's less likely that the backup pointer was overwritten with
1711 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1712 * primary pointer has been overwritten with a sane but incorrect address.
1713 */
0a7de745 1714 if (sane_primary && sane_backup) {
5ba3f43e 1715 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
0a7de745 1716 }
39236c6e
A
1717
1718 /* Neither are sane, so just guess. */
5ba3f43e 1719 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
316670eb
A
1720}
1721
39236c6e 1722/*
f427ee49
A
1723 * zone_sequestered_page_get
1724 * z is locked
39236c6e 1725 */
f427ee49
A
1726static struct zone_page_metadata *
1727zone_sequestered_page_get(zone_t z, vm_offset_t *page)
39236c6e 1728{
f427ee49 1729 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
39236c6e 1730
f427ee49
A
1731 if (!zone_pva_is_null(z->pages_sequester)) {
1732 if (os_sub_overflow(z->sequester_page_count, z->alloc_pages,
1733 &z->sequester_page_count)) {
1734 zone_accounting_panic(z, "sequester_page_count wrap-around");
1735 }
1736 return zone_meta_queue_pop(z, &z->pages_sequester, kind, page);
0a7de745 1737 }
39236c6e 1738
f427ee49
A
1739 return NULL;
1740}
39236c6e 1741
f427ee49
A
1742/*
1743 * zone_sequestered_page_populate
1744 * z is unlocked
1745 * page_meta is invalid on failure
1746 */
1747static kern_return_t
1748zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta,
1749 vm_offset_t space, vm_size_t alloc_size, int zflags)
1750{
1751 kern_return_t retval;
39236c6e 1752
f427ee49
A
1753 assert(alloc_size == ptoa(z->alloc_pages));
1754 retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size,
1755 zflags, VM_KERN_MEMORY_ZONE);
1756 if (retval != KERN_SUCCESS) {
1757 lock_zone(z);
1758 zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE);
1759 z->sequester_page_count += z->alloc_pages;
1760 unlock_zone(z);
39236c6e 1761 }
f427ee49 1762 return retval;
39236c6e
A
1763}
1764
f427ee49 1765#pragma mark Zone poisoning/zeroing
39236c6e
A
1766
1767/*
f427ee49
A
1768 * Initialize zone poisoning
1769 * called from zone_bootstrap before any allocations are made from zalloc
39236c6e 1770 */
f427ee49
A
1771__startup_func
1772static void
1773zp_bootstrap(void)
39236c6e 1774{
f427ee49 1775 char temp_buf[16];
39236c6e 1776
f427ee49
A
1777 /*
1778 * Initialize backup pointer random cookie for poisoned elements
1779 * Try not to call early_random() back to back; it may return
1780 * the same value if mach_absolute_time doesn't have sufficient time
1781 * to tick over between calls. <rdar://problem/11597395>
1782 * (This is only a problem on embedded devices)
1783 */
1784 zp_poisoned_cookie = (uintptr_t) early_random();
fe8ab488 1785
f427ee49
A
1786 /* -zp: enable poisoning for every alloc and free */
1787 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
1788 zp_factor = 1;
39236c6e 1789 }
f427ee49
A
1790
1791 /* -no-zp: disable poisoning */
1792 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
1793 zp_factor = 0;
1794 printf("Zone poisoning disabled\n");
0a7de745 1795 }
39236c6e 1796
f427ee49
A
1797 /* Initialize backup pointer random cookie for unpoisoned elements */
1798 zp_nopoison_cookie = (uintptr_t) early_random();
1799
1800#if MACH_ASSERT
1801 if (zp_poisoned_cookie == zp_nopoison_cookie) {
1802 panic("early_random() is broken: %p and %p are not random\n",
1803 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
0a7de745 1804 }
f427ee49 1805#endif
39236c6e 1806
f427ee49
A
1807 /*
1808 * Use the last bit in the backup pointer to hint poisoning state
1809 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
1810 * the low bits are zero.
1811 */
1812 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
1813 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
39236c6e 1814
f427ee49 1815#if defined(__LP64__)
0a7de745 1816 /*
f427ee49
A
1817 * Make backup pointers more obvious in GDB for 64 bit
1818 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
1819 * (0xFACADE = 0xFFFFFF ^ 0x053521)
1820 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
1821 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
1822 * by the sanity check, so it's OK for that part of the cookie to be predictable.
1823 *
1824 * TODO: Use #defines, xors, and shifts
39037602 1825 */
f427ee49
A
1826
1827 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
1828 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
1829
1830 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
1831 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
1832#endif
39236c6e
A
1833
1834 /*
f427ee49
A
1835 * Initialize zp_min_size to two cachelines. Elements smaller than this will
1836 * be zeroed.
39236c6e 1837 */
f427ee49
A
1838 ml_cpu_info_t cpu_info;
1839 ml_cpu_get_info(&cpu_info);
1840 zp_min_size = 2 * cpu_info.cache_line_size;
1841}
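/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: on LP64, for a canonical kernel pointer whose top 3 bytes are
 * 0xFFFFFF, the cookies chosen above turn the stored backup value into the
 * 0xFACADE.../0xC0FFEE... prefixes that backup_ptr_mismatch_panic() checks.
 * zp_cookie_prefix_example() is a hypothetical name.
 */
#if 0
#if defined(__LP64__)
static void
zp_cookie_prefix_example(vm_offset_t next_elem)
{
	/* assumes (next_elem & 0xFFFFFF0000000000) == 0xFFFFFF0000000000 */
	assert(((next_elem ^ zp_poisoned_cookie) & 0xFFFFFF0000000000) ==
	    0xFACADE0000000000);
	assert(((next_elem ^ zp_nopoison_cookie) & 0xFFFFFF0000000000) ==
	    0xC0FFEE0000000000);
}
#endif /* __LP64__ */
#endif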
39236c6e 1842
f427ee49
A
1843inline uint32_t
1844zone_poison_count_init(zone_t zone)
1845{
1846 return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
1847 (mach_absolute_time() & 0x7);
1848}
1849
1850#if ZALLOC_ENABLE_POISONING
1851static bool
1852zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1853{
1854 bool poison = false;
1855 uint32_t zp_count_local;
39236c6e 1856
f427ee49
A
1857 assert(!zone->percpu);
1858 if (zp_factor != 0) {
39236c6e 1859 /*
f427ee49
A
1860 * Poison the memory of every zp_count-th element before it ends up
1861 * on the freelist to catch use-after-free and use of uninitialized
1862 * memory.
1863 *
1864 * Every element is poisoned when zp_factor is set to 1.
1865 *
39236c6e 1866 */
f427ee49
A
1867 zp_count_local = os_atomic_load(zp_count, relaxed);
1868 if (__improbable(zp_count_local == 0 || zp_factor == 1)) {
1869 poison = true;
39037602 1870
f427ee49 1871 os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed);
39236c6e 1872
f427ee49
A
1873 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
1874 vm_offset_t *element_cursor = ((vm_offset_t *) elem);
1875 vm_offset_t *end_cursor = (vm_offset_t *)(elem + zone_elem_size(zone));
39236c6e 1876
f427ee49
A
1877 for (; element_cursor < end_cursor; element_cursor++) {
1878 *element_cursor = ZONE_POISON;
316670eb 1879 }
f427ee49
A
1880 } else {
1881 os_atomic_store(zp_count, zp_count_local - 1, relaxed);
1882 /*
1883 * Zero the first zp_min_size bytes of elements that aren't being poisoned.
1884 * Element size is larger than zp_min_size in this path, as smaller
1885 * elements are always zeroed in full.
1886 */
1887 bzero((void *) elem, zp_min_size);
316670eb 1888 }
316670eb 1889 }
f427ee49
A
1890 return poison;
1891}
1892#else
1893static bool
1894zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1895{
1896#pragma unused(zone, zp_count, elem)
1897 assert(!zone->percpu);
1898 return false;
1899}
1900#endif
39236c6e 1901
f427ee49
A
1902__attribute__((always_inline))
1903static bool
1904zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
1905{
1906 assert(zone->zfree_clear_mem);
1907 if (zone->percpu) {
1908 zpercpu_foreach_cpu(i) {
1909 bzero((void *)(addr + ptoa(i)), elem_size);
1910 }
1911 } else {
1912 bzero((void *)addr, elem_size);
0a7de745 1913 }
5ba3f43e 1914
f427ee49 1915 return true;
316670eb 1916}
1c79356b 1917
39236c6e 1918/*
f427ee49
A
1919 * Zero the element if zone has zfree_clear_mem flag set else poison
1920 * the element if zp_count hits 0.
6d2010ae 1921 */
f427ee49
A
1922__attribute__((always_inline))
1923bool
1924zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr)
1925{
1926 vm_size_t elem_size = zone_elem_size(zone);
39236c6e 1927
f427ee49
A
1928 if (zone->zfree_clear_mem) {
1929 return zfree_clear(zone, addr, elem_size);
1930 }
0b4e3aa0 1931
f427ee49
A
1932 return zfree_poison_element(zone, zp_count, (vm_offset_t)addr);
1933}
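/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: what the scrub decision above amounts to for a non-per-cpu zone.
 * The returned flag records whether the element was fully poisoned, which
 * the allocation path later matches against in zalloc_validate_element().
 * zfree_scrub_example() is a hypothetical name.
 */
#if 0
static void
zfree_scrub_example(zone_t z, vm_offset_t elem)
{
	bool poisoned = zfree_clear_or_poison(z, &z->zp_count, elem);

	if (poisoned) {
		/* every word of the element now holds ZONE_POISON */
	} else if (z->zfree_clear_mem) {
		/* the whole element was zeroed */
	} else {
		/* only the first zp_min_size bytes were zeroed */
	}
}
#endif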
0b4e3aa0 1934
3e170ce0 1935/*
f427ee49
A
1936 * Clear out the old next pointer and backup to avoid leaking the zone
1937 * poisoning cookie and so that only values on the freelist have a valid
1938 * cookie.
3e170ce0 1939 */
f427ee49
A
1940void
1941zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr)
1942{
1943 vm_offset_t perm_value = 0;
1c79356b 1944
f427ee49
A
1945 if (!zone->zfree_clear_mem) {
1946 perm_value = ZONE_POISON;
1947 }
2d21ac55 1948
f427ee49
A
1949 vm_offset_t *primary = (vm_offset_t *) addr;
1950 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
9bccf70c 1951
f427ee49
A
1952 *primary = perm_value;
1953 *backup = perm_value;
1954}
1c79356b 1955
f427ee49
A
1956#if ZALLOC_ENABLE_POISONING
1957__abortlike
1958static void
1959zone_element_not_clear_panic(zone_t zone, void *addr)
1960{
1961 panic("Zone element %p was modified after free for zone %s%s: "
1962 "Expected element to be cleared", addr, zone_heap_name(zone),
1963 zone->z_name);
1964}
1c79356b 1965
1c79356b 1966/*
f427ee49
A
1967 * Validate that the element was not tampered with while it was in the
1968 * freelist.
1c79356b 1969 */
f427ee49
A
1970void
1971zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate)
1972{
1973 if (zone->percpu) {
1974 assert(zone->zfree_clear_mem);
1975 zpercpu_foreach_cpu(i) {
1976 if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) {
1977 zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i)));
1978 }
1979 }
1980 } else if (zone->zfree_clear_mem) {
1981 if (memcmp_zero_ptr_aligned((void *)addr, size)) {
1982 zone_element_not_clear_panic(zone, (void *)addr);
1983 }
1984 } else if (__improbable(validate)) {
1985 const vm_offset_t *p = (vm_offset_t *)addr;
1986 const vm_offset_t *end = (vm_offset_t *)(addr + size);
1987
1988 for (; p < end; p++) {
1989 if (*p != ZONE_POISON) {
1990 zone_element_was_modified_panic(zone, addr,
1991 *p, ZONE_POISON, (vm_offset_t)p - addr);
1992 }
1993 }
1994 } else {
1995 /*
1996 * If element wasn't poisoned or entirely cleared, validate that the
1997 * minimum bytes that were cleared on free haven't been corrupted.
1998 * addr is advanced by ptr size as we have already validated and cleared
1999 * the freelist pointer/zcache canary.
2000 */
2001 if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)),
2002 zp_min_size - sizeof(vm_offset_t))) {
2003 zone_element_not_clear_panic(zone, (void *)addr);
2004 }
2005 }
2006}
2007#endif /* ZALLOC_ENABLE_POISONING */
2008
2009#pragma mark Zone Leak Detection
39236c6e 2010
c910b4d9
A
2011/*
2012 * Zone leak debugging code
2013 *
2014 * When enabled, this code keeps a log to track allocations to a particular zone that have not
2015 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
2016 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
2017 * off by default.
2018 *
2019 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
0a7de745 2020 * is the name of the zone you wish to log.
c910b4d9
A
2021 *
2022 * This code only tracks one zone, so you need to identify which one is leaking first.
2023 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
2024 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
2025 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
2026 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
2027 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
2028 * See the help in the kgmacros for usage info.
2029 *
2030 *
2031 * Zone corruption logging
2032 *
2033 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
2034 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
2035 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
2036 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
2037 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
2038 * corrupted to examine its history. This should lead to the source of the corruption.
2039 */
2040
f427ee49
A
2041/* Returns TRUE if we rolled over the counter at factor */
2042__header_always_inline bool
2043sample_counter(volatile uint32_t *count_p, uint32_t factor)
2044{
2045 uint32_t old_count, new_count = 0;
2046 if (count_p != NULL) {
2047 os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
2048 new_count = old_count + 1;
2049 if (new_count >= factor) {
2050 new_count = 0;
2051 }
2052 });
2053 }
39037602 2054
f427ee49
A
2055 return new_count == 0;
2056}
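/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: sample_counter() reports a roll-over once every `factor` calls,
 * which is how the leak detector below samples roughly one allocation out
 * of every zleak_sample_factor. The names here are hypothetical.
 */
#if 0
static uint32_t example_sample_count;

static void
example_sampled_event(void)
{
	if (sample_counter(&example_sample_count, 1000)) {
		/* reached on roughly every 1000th call */
	}
}
#endif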
c910b4d9 2057
f427ee49 2058#if ZONE_ENABLE_LOGGING
39236c6e 2059/* Log allocations and frees to help debug a zone element corruption */
f427ee49 2060TUNABLE(bool, corruption_debug_flag, "-zc", false);
39037602 2061
f427ee49 2062#define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
39037602 2063
f427ee49
A
2064static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
2065static int num_zones_logged = 0;
39236c6e 2066
c910b4d9 2067/*
0a7de745 2068 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
39037602
A
2069 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
2070 * is the number of stacks suspected of leaking, we don't need many records.
c910b4d9 2071 */
316670eb 2072
f427ee49 2073#if defined(__LP64__)
0a7de745 2074#define ZRECORDS_MAX 2560 /* Max records allowed in the log */
6d2010ae 2075#else
0a7de745 2076#define ZRECORDS_MAX 1536 /* Max records allowed in the log */
6d2010ae 2077#endif
0a7de745 2078#define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
0b4e3aa0 2079
f427ee49 2080static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
c910b4d9 2081
f427ee49
A
2082static void
2083zone_enable_logging(zone_t z)
2084{
2085 z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
2086 (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
c910b4d9 2087
f427ee49
A
2088 if (z->zlog_btlog) {
2089 printf("zone: logging started for zone %s%s\n",
2090 zone_heap_name(z), z->z_name);
2091 } else {
2092 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2093 z->zone_logging = false;
2094 }
2095}
c910b4d9 2096
f427ee49
A
2097/**
2098 * @function zone_setup_logging
2099 *
2100 * @abstract
2101 * Optionally sets up a zone for logging.
2102 *
2103 * @discussion
2104 * We recognized two boot-args:
2105 *
2106 * zlog=<zone_to_log>
2107 * zrecs=<num_records_in_log>
2108 *
2109 * The zlog arg is used to specify the zone name that should be logged,
2110 * and zrecs is used to control the size of the log.
2111 *
2112 * If zrecs is not specified, a default value is used.
2113 */
2114static void
2115zone_setup_logging(zone_t z)
c910b4d9 2116{
f427ee49
A
2117 char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
2118 char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2119 char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */
c910b4d9
A
2120
2121 /*
f427ee49
A
2122 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2123 *
2124 * This prevents accidentally hogging too much kernel memory
2125 * and making the system unusable.
c910b4d9 2126 */
f427ee49
A
2127 if (log_records > ZRECORDS_MAX) {
2128 log_records = ZRECORDS_MAX;
2129 }
c910b4d9 2130
f427ee49
A
2131 /*
2132 * Append kalloc heap name to zone name (if zone is used by kalloc)
2133 */
2134 snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
c910b4d9 2135
f427ee49
A
2136 /* zlog0 isn't allowed. */
2137 for (int i = 1; i <= max_num_zones_to_log; i++) {
2138 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2139
2140 if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
2141 track_this_zone(zone_name, zlog_val)) {
2142 z->zone_logging = true;
2143 num_zones_logged++;
c910b4d9 2144 break;
0a7de745 2145 }
f427ee49 2146 }
c910b4d9 2147
f427ee49
A
2148 /*
2149 * Backwards compat. with the old boot-arg used to specify single zone
2150 * logging, i.e. "zlog". This needs to happen after the newer zlogN checks
2151 * because the "zlog" prefix will also match all of the zlogN
2152 * boot-args.
2153 */
2154 if (!z->zone_logging &&
2155 PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
2156 track_this_zone(zone_name, zlog_val)) {
2157 z->zone_logging = true;
2158 num_zones_logged++;
c910b4d9
A
2159 }
2160
f427ee49
A
2161
2162 /*
2163 * If we want to log a zone, see if we need to allocate buffer space for
2164 * the log.
2165 *
2166 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
2167 * we have to defer allocation in that case.
2168 *
2169 * zone_init() will finish the job.
2170 *
2171 * If we want to log one of the VM related zones that's set up early on,
2172 * we will skip allocation of the log until zinit is called again later
2173 * on some other zone.
2174 */
2175 if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
2176 zone_enable_logging(z);
2177 }
c910b4d9
A
2178}
2179
f427ee49
A
2180/*
2181 * Each record in the log contains a pointer to the zone element it refers to,
2182 * and a small array to hold the pc's from the stack trace. A
2183 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
2184 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
2185 * If the log fills, old records are replaced as if it were a circular buffer.
2186 */
2187
2188
2189/*
2190 * Decide if we want to log this zone by doing a string compare between a zone name and the name
2191 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
2192 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
2193 * match a space in the zone name.
2194 */
c910b4d9
A
2195
2196/*
2197 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
2198 * the buffer for the records has been allocated.
2199 */
2200
f427ee49
A
2201#define DO_LOGGING(z) (z->zlog_btlog != NULL)
2202#else /* !ZONE_ENABLE_LOGGING */
2203#define DO_LOGGING(z) 0
2204#endif /* !ZONE_ENABLE_LOGGING */
c910b4d9 2205
6d2010ae 2206#if CONFIG_ZLEAKS
6d2010ae 2207
0a7de745 2208/*
6d2010ae 2209 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
316670eb 2210 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
0a7de745 2211 * backtrace. Every free, we examine the table and determine if the allocation was being tracked,
6d2010ae
A
2212 * and stop tracking it if it was being tracked.
2213 *
0a7de745 2214 * We track the allocations in the zallocations hash table, which stores the address that was returned from
6d2010ae
A
2215 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
2216 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
2217 * backtraces - we don't store them more than once.
2218 *
2219 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
2220 * a large amount of virtual space.
2221 */
0a7de745
A
2222#define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
2223#define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
2224#define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
2225#define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
2226uint32_t zleak_state = 0; /* State of collection, as above */
6d2010ae 2227
0a7de745
A
2228boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
2229vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
2230vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
2231unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
6d2010ae
A
2232
2233/*
2234 * Counters for allocation statistics.
0a7de745 2235 */
6d2010ae
A
2236
2237/* Times two active records want to occupy the same spot */
2238unsigned int z_alloc_collisions = 0;
2239unsigned int z_trace_collisions = 0;
2240
2241/* Times a new record lands on a spot previously occupied by a freed allocation */
2242unsigned int z_alloc_overwrites = 0;
2243unsigned int z_trace_overwrites = 0;
2244
2245/* Times a new alloc or trace is put into the hash table */
0a7de745
A
2246unsigned int z_alloc_recorded = 0;
2247unsigned int z_trace_recorded = 0;
6d2010ae
A
2248
2249/* Times zleak_log returned false due to not being able to acquire the lock */
0a7de745 2250unsigned int z_total_conflicts = 0;
6d2010ae 2251
6d2010ae
A
2252/*
2253 * Structure for keeping track of an allocation
2254 * An allocation bucket is in use if its element is not NULL
2255 */
2256struct zallocation {
0a7de745
A
2257 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
2258 vm_size_t za_size; /* how much memory did this allocation take up? */
2259 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
6d2010ae 2260 /* TODO: #if this out */
0a7de745 2261 uint32_t za_hit_count; /* for determining effectiveness of hash function */
6d2010ae
A
2262};
2263
2264/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
316670eb
A
2265uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
2266uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
6d2010ae
A
2267
2268vm_size_t zleak_max_zonemap_size;
2269
2270/* Hashmaps of allocations and their corresponding traces */
0a7de745
A
2271static struct zallocation* zallocations;
2272static struct ztrace* ztraces;
6d2010ae
A
2273
2274/* not static so that panic can see this, see kern/debug.c */
0a7de745 2275struct ztrace* top_ztrace;
6d2010ae
A
2276
2277/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
f427ee49
A
2278LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
2279LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
6d2010ae
A
2280
2281/*
2282 * Initializes the zone leak monitor. Called from zone_init()
2283 */
f427ee49 2284__startup_func
0a7de745
A
2285static void
2286zleak_init(vm_size_t max_zonemap_size)
6d2010ae 2287{
0a7de745
A
2288 char scratch_buf[16];
2289 boolean_t zleak_enable_flag = FALSE;
6d2010ae
A
2290
2291 zleak_max_zonemap_size = max_zonemap_size;
0a7de745 2292 zleak_global_tracking_threshold = max_zonemap_size / 2;
6d2010ae
A
2293 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
2294
5ba3f43e
A
2295#if CONFIG_EMBEDDED
2296 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
2297 zleak_enable_flag = TRUE;
2298 printf("zone leak detection enabled\n");
2299 } else {
2300 zleak_enable_flag = FALSE;
2301 printf("zone leak detection disabled\n");
2302 }
2303#else /* CONFIG_EMBEDDED */
6d2010ae
A
2304 /* -zleakoff (flag to disable zone leak monitor) */
2305 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
2306 zleak_enable_flag = FALSE;
2307 printf("zone leak detection disabled\n");
2308 } else {
2309 zleak_enable_flag = TRUE;
2310 printf("zone leak detection enabled\n");
2311 }
5ba3f43e 2312#endif /* CONFIG_EMBEDDED */
0a7de745 2313
6d2010ae 2314 /* zfactor=XXXX (override how often to sample the zone allocator) */
316670eb 2315 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
39236c6e 2316 printf("Zone leak factor override: %u\n", zleak_sample_factor);
6d2010ae 2317 }
316670eb 2318
6d2010ae
A
2319 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
2320 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
39236c6e 2321 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
6d2010ae 2322 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
0a7de745 2323 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
39236c6e 2324 printf("Override isn't a power of two, bad things might happen!\n");
6d2010ae
A
2325 }
2326 }
0a7de745 2327
6d2010ae
A
2328 /* zleak-traces=XXXX (override number of buckets in ztraces) */
2329 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
39236c6e 2330 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
6d2010ae 2331 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
0a7de745 2332 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
39236c6e 2333 printf("Override isn't a power of two, bad things might happen!\n");
6d2010ae
A
2334 }
2335 }
0a7de745 2336
6d2010ae
A
2337 if (zleak_enable_flag) {
2338 zleak_state = ZLEAK_STATE_ENABLED;
2339 }
2340}
2341
6d2010ae
A
2342/*
2343 * Support for kern.zleak.active sysctl - a simplified
316670eb 2344 * version of the zleak_state variable.
6d2010ae
A
2345 */
2346int
2347get_zleak_state(void)
2348{
0a7de745
A
2349 if (zleak_state & ZLEAK_STATE_FAILED) {
2350 return -1;
2351 }
2352 if (zleak_state & ZLEAK_STATE_ACTIVE) {
2353 return 1;
2354 }
2355 return 0;
6d2010ae
A
2356}
2357
6d2010ae
A
2358kern_return_t
2359zleak_activate(void)
2360{
2361 kern_return_t retval;
2362 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
2363 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
2364 void *allocations_ptr = NULL;
2365 void *traces_ptr = NULL;
2366
2367 /* Only one thread attempts to activate at a time */
2368 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2369 return KERN_SUCCESS;
2370 }
2371
2372 /* Indicate that we're doing the setup */
316670eb 2373 lck_spin_lock(&zleak_lock);
6d2010ae 2374 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
316670eb 2375 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2376 return KERN_SUCCESS;
2377 }
2378
2379 zleak_state |= ZLEAK_STATE_ACTIVATING;
316670eb 2380 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2381
2382 /* Allocate and zero tables */
3e170ce0 2383 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
6d2010ae
A
2384 if (retval != KERN_SUCCESS) {
2385 goto fail;
2386 }
2387
3e170ce0 2388 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
6d2010ae
A
2389 if (retval != KERN_SUCCESS) {
2390 goto fail;
2391 }
2392
2393 bzero(allocations_ptr, z_alloc_size);
2394 bzero(traces_ptr, z_trace_size);
2395
2396 /* Everything's set. Install tables, mark active. */
2397 zallocations = allocations_ptr;
2398 ztraces = traces_ptr;
2399
2400 /*
0a7de745 2401 * Initialize the top_ztrace to the first entry in ztraces,
6d2010ae
A
2402 * so we don't have to check for null in zleak_log
2403 */
2404 top_ztrace = &ztraces[0];
2405
2406 /*
2407 * Note that we do need a barrier between installing
2408 * the tables and setting the active flag, because the zfree()
2409 * path accesses the table without a lock if we're active.
2410 */
316670eb 2411 lck_spin_lock(&zleak_lock);
6d2010ae
A
2412 zleak_state |= ZLEAK_STATE_ACTIVE;
2413 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
316670eb 2414 lck_spin_unlock(&zleak_lock);
0a7de745 2415
6d2010ae
A
2416 return 0;
2417
0a7de745 2418fail:
6d2010ae
A
2419 /*
2420 * If we fail to allocate memory, don't further tax
2421 * the system by trying again.
2422 */
316670eb 2423 lck_spin_lock(&zleak_lock);
6d2010ae
A
2424 zleak_state |= ZLEAK_STATE_FAILED;
2425 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
316670eb 2426 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2427
2428 if (allocations_ptr != NULL) {
2429 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
2430 }
2431
2432 if (traces_ptr != NULL) {
2433 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
2434 }
2435
2436 return retval;
2437}
2438
2439/*
0a7de745 2440 * TODO: What about allocations that never get deallocated,
6d2010ae 2441 * especially ones with unique backtraces? Should we wait to record
0a7de745 2442 * until after boot has completed?
6d2010ae
A
2443 * (How many persistent zallocs are there?)
2444 */
2445
2446/*
0a7de745
A
2447 * This function records the allocation in the allocations table,
2448 * and stores the associated backtrace in the traces table
6d2010ae
A
2449 * (or just increments the refcount if the trace is already recorded)
2450 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
2451 * the associated trace's refcount is decremented.
2452 * If the trace slot is in use, it returns.
2453 * The refcount is incremented by the amount of memory the allocation consumes.
2454 * The return value indicates whether to try again next time.
2455 */
2456static boolean_t
2457zleak_log(uintptr_t* bt,
0a7de745
A
2458 uintptr_t addr,
2459 uint32_t depth,
2460 vm_size_t allocation_size)
6d2010ae
A
2461{
2462 /* Quit if there's someone else modifying the hash tables */
316670eb 2463 if (!lck_spin_try_lock(&zleak_lock)) {
6d2010ae
A
2464 z_total_conflicts++;
2465 return FALSE;
2466 }
0a7de745
A
2467
2468 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2469
6d2010ae
A
2470 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
2471 struct ztrace* trace = &ztraces[trace_index];
0a7de745 2472
6d2010ae
A
2473 allocation->za_hit_count++;
2474 trace->zt_hit_count++;
0a7de745
A
2475
2476 /*
6d2010ae 2477 * If the allocation bucket we want to be in is occupied, and if the occupier
0a7de745 2478 * has the same trace as us, just bail.
6d2010ae
A
2479 */
2480 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
2481 z_alloc_collisions++;
0a7de745 2482
316670eb 2483 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2484 return TRUE;
2485 }
0a7de745 2486
6d2010ae
A
2487 /* STEP 1: Store the backtrace in the traces array. */
2488 /* A size of zero indicates that the trace bucket is free. */
0a7de745
A
2489
2490 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
2491 /*
6d2010ae
A
2492 * Different unique trace with same hash!
2493 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
2494 * and get out of the way for later chances
2495 */
2496 trace->zt_collisions++;
2497 z_trace_collisions++;
0a7de745 2498
316670eb 2499 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2500 return TRUE;
2501 } else if (trace->zt_size > 0) {
2502 /* Same trace, already added, so increment refcount */
2503 trace->zt_size += allocation_size;
2504 } else {
2505 /* Found an unused trace bucket, record the trace here! */
0a7de745 2506 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
6d2010ae 2507 z_trace_overwrites++;
0a7de745
A
2508 }
2509
6d2010ae 2510 z_trace_recorded++;
0a7de745
A
2511 trace->zt_size = allocation_size;
2512 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2513
2514 trace->zt_depth = depth;
2515 trace->zt_collisions = 0;
6d2010ae 2516 }
0a7de745 2517
6d2010ae 2518 /* STEP 2: Store the allocation record in the allocations array. */
0a7de745 2519
6d2010ae 2520 if (allocation->za_element != (uintptr_t) 0) {
0a7de745 2521 /*
6d2010ae 2522 * Straight up replace any allocation record that was there. We don't want to do the work
0a7de745 2523 * to preserve the allocation entries that were there, because we only record a subset of the
6d2010ae
A
2524 * allocations anyways.
2525 */
0a7de745 2526
6d2010ae 2527 z_alloc_collisions++;
0a7de745 2528
6d2010ae
A
2529 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2530 /* Knock off old allocation's size, not the new allocation */
2531 associated_trace->zt_size -= allocation->za_size;
2532 } else if (allocation->za_trace_index != 0) {
2533 /* Slot previously used but not currently in use */
2534 z_alloc_overwrites++;
2535 }
2536
0a7de745
A
2537 allocation->za_element = addr;
2538 allocation->za_trace_index = trace_index;
2539 allocation->za_size = allocation_size;
2540
6d2010ae 2541 z_alloc_recorded++;
0a7de745
A
2542
2543 if (top_ztrace->zt_size < trace->zt_size) {
6d2010ae 2544 top_ztrace = trace;
0a7de745
A
2545 }
2546
316670eb 2547 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2548 return TRUE;
2549}
2550
2551/*
2552 * Free the allocation record and release the stacktrace.
2553 * This should be as fast as possible because it will be called for every free.
2554 */
f427ee49 2555__attribute__((noinline))
6d2010ae
A
2556static void
2557zleak_free(uintptr_t addr,
0a7de745 2558 vm_size_t allocation_size)
6d2010ae 2559{
0a7de745 2560 if (addr == (uintptr_t) 0) {
6d2010ae 2561 return;
0a7de745
A
2562 }
2563
6d2010ae 2564 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
0a7de745 2565
6d2010ae
A
2566 /* Double-checked locking: check to find out if we're interested, lock, check to make
2567 * sure it hasn't changed, then modify it, and release the lock.
2568 */
0a7de745 2569
6d2010ae
A
2570 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2571 /* if the allocation was the one, grab the lock, check again, then delete it */
316670eb 2572 lck_spin_lock(&zleak_lock);
0a7de745 2573
6d2010ae
A
2574 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2575 struct ztrace *trace;
2576
2577 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2578 if (allocation->za_size != allocation_size) {
0a7de745
A
2579 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2580 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
6d2010ae 2581 }
0a7de745 2582
6d2010ae 2583 trace = &ztraces[allocation->za_trace_index];
0a7de745 2584
6d2010ae
A
2585 /* size of 0 indicates trace bucket is unused */
2586 if (trace->zt_size > 0) {
2587 trace->zt_size -= allocation_size;
2588 }
0a7de745 2589
6d2010ae
A
2590 /* A NULL element means the allocation bucket is unused */
2591 allocation->za_element = 0;
2592 }
316670eb 2593 lck_spin_unlock(&zleak_lock);
6d2010ae
A
2594 }
2595}
2596
2597#endif /* CONFIG_ZLEAKS */
2598
2599/* These functions are kept outside of CONFIG_ZLEAKS because they are also used in
2600 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
2601 */
2602
6d2010ae
A
2603/* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2604uintptr_t
2605hash_mix(uintptr_t x)
2606{
2607#ifndef __LP64__
2608 x += ~(x << 15);
2609 x ^= (x >> 10);
0a7de745
A
2610 x += (x << 3);
2611 x ^= (x >> 6);
6d2010ae
A
2612 x += ~(x << 11);
2613 x ^= (x >> 16);
2614#else
2615 x += ~(x << 32);
2616 x ^= (x >> 22);
2617 x += ~(x << 13);
0a7de745
A
2618 x ^= (x >> 8);
2619 x += (x << 3);
6d2010ae
A
2620 x ^= (x >> 15);
2621 x += ~(x << 27);
2622 x ^= (x >> 31);
2623#endif
2624 return x;
2625}
2626
2627uint32_t
2628hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2629{
6d2010ae
A
2630 uintptr_t hash = 0;
2631 uintptr_t mask = max_size - 1;
2632
316670eb
A
2633 while (depth) {
2634 hash += bt[--depth];
6d2010ae
A
2635 }
2636
2637 hash = hash_mix(hash) & mask;
2638
2639 assert(hash < max_size);
2640
2641 return (uint32_t) hash;
2642}
2643
2644/*
2645 * TODO: Determine how well distributed this is
2646 * max_size must be a power of 2, e.g. 0x10000, because 0x10000-1 is 0x0FFFF, which makes a clean bitmask
2647 */
2648uint32_t
2649hashaddr(uintptr_t pt, uint32_t max_size)
2650{
2651 uintptr_t hash = 0;
2652 uintptr_t mask = max_size - 1;
2653
2654 hash = hash_mix(pt) & mask;
2655
2656 assert(hash < max_size);
2657
2658 return (uint32_t) hash;
2659}
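/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: because max_size is a power of two, "& (max_size - 1)" stands in
 * for the modulo, and the usual (n & (n - 1)) == 0 trick (the same check
 * zleak_init() applies to its boot-arg overrides) verifies the
 * precondition. example_bucket_for() is a hypothetical name.
 */
#if 0
static uint32_t
example_bucket_for(uintptr_t addr, uint32_t nbuckets)
{
	assert(nbuckets != 0 && (nbuckets & (nbuckets - 1)) == 0);
	return hashaddr(addr, nbuckets);
}
#endif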
2660
2661/* End of all leak-detection code */
f427ee49 2662#pragma mark zone creation, configuration, destruction
39037602 2663
f427ee49
A
2664static zone_t
2665zone_init_defaults(zone_id_t zid)
5ba3f43e 2666{
f427ee49 2667 zone_t z = &zone_array[zid];
5ba3f43e 2668
f427ee49
A
2669 z->page_count_max = ~0u;
2670 z->collectable = true;
2671 z->expandable = true;
2672 z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
5ba3f43e 2673
f427ee49 2674 simple_lock_init(&z->lock, 0);
5ba3f43e 2675
f427ee49 2676 return z;
5ba3f43e
A
2677}
2678
f427ee49
A
2679static bool
2680zone_is_initializing(zone_t z)
2681{
2682 return !z->z_self && !z->destroyed;
2683}
d9a64523 2684
d9a64523 2685static void
f427ee49
A
2686zone_set_max(zone_t z, vm_size_t max)
2687{
2688#if KASAN_ZALLOC
2689 if (z->kasan_redzone) {
2690 /*
2691 * Adjust the max memory for the kasan redzones
2692 */
2693 max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
2694 }
2695#endif
2696 if (max < z->percpu ? 1 : z->alloc_pages) {
2697 max = z->percpu ? 1 : z->alloc_pages;
d9a64523 2698 } else {
f427ee49 2699 max = atop(round_page(max));
d9a64523 2700 }
f427ee49 2701 z->page_count_max = max;
d9a64523
A
2702}
2703
f427ee49
A
2704void
2705zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
d9a64523 2706{
f427ee49
A
2707 if (!zone_is_initializing(zone)) {
2708 panic("%s: called after zone_create()", __func__);
d9a64523 2709 }
f427ee49
A
2710 if (sub_map_idx > zone_last_submap_idx) {
2711 panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
2712 }
2713 zone->submap_idx = sub_map_idx;
d9a64523
A
2714}
2715
f427ee49
A
2716void
2717zone_set_noexpand(
2718 zone_t zone,
2719 vm_size_t max)
d9a64523 2720{
f427ee49
A
2721 if (!zone_is_initializing(zone)) {
2722 panic("%s: called after zone_create()", __func__);
d9a64523 2723 }
f427ee49
A
2724 zone->expandable = false;
2725 zone_set_max(zone, max);
d9a64523
A
2726}
2727
f427ee49
A
2728void
2729zone_set_exhaustible(
2730 zone_t zone,
2731 vm_size_t max)
2732{
2733 if (!zone_is_initializing(zone)) {
2734 panic("%s: called after zone_create()", __func__);
2735 }
2736 zone->expandable = false;
2737 zone->exhaustible = true;
2738 zone_set_max(zone, max);
2739}
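/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: the setters above must run while the zone is still initializing,
 * typically from the extra_setup block passed to zone_create_ext() defined
 * further below. "example.capped" is a hypothetical zone name and ZC_NONE
 * is assumed to be the empty flag set.
 */
#if 0
static void
example_capped_zone(void)
{
	(void)zone_create_ext("example.capped", 128, ZC_NONE, ZONE_ID_ANY,
	    ^(zone_t z) { zone_set_exhaustible(z, 64 * 1024); });
}
#endif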
d9a64523 2740
f427ee49
A
2741/**
2742 * @function zone_create_find
2743 *
2744 * @abstract
2745 * Finds an unused zone for the given name and element size.
2746 *
2747 * @param name the zone name
2748 * @param size the element size (including redzones, ...)
2749 * @param flags the flags passed to @c zone_create*
2750 * @param zid the desired zone ID or ZONE_ID_ANY
2751 *
2752 * @returns a zone to initialize further.
1c79356b 2753 */
f427ee49
A
2754static zone_t
2755zone_create_find(
2756 const char *name,
2757 vm_size_t size,
2758 zone_create_flags_t flags,
2759 zone_id_t zid)
1c79356b 2760{
f427ee49
A
2761 zone_id_t nzones;
2762 zone_t z;
7ddcb079 2763
0a7de745 2764 simple_lock(&all_zones_lock, &zone_locks_grp);
5ba3f43e 2765
f427ee49
A
2766 nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
2767 assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
5ba3f43e 2768
f427ee49 2769 if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
5ba3f43e 2770 /*
f427ee49
A
2771 * The first time around, make sure the reserved zone IDs
2772 * have an initialized lock as zone_index_foreach() will
2773 * enumerate them.
5ba3f43e 2774 */
f427ee49
A
2775 while (nzones < ZONE_ID__FIRST_DYNAMIC) {
2776 zone_init_defaults(nzones++);
5ba3f43e 2777 }
5ba3f43e 2778
f427ee49
A
2779 os_atomic_store(&num_zones, nzones, release);
2780 }
5ba3f43e 2781
f427ee49
A
2782 if (zid != ZONE_ID_ANY) {
2783 if (zid >= ZONE_ID__FIRST_DYNAMIC) {
2784 panic("zone_create: invalid desired zone ID %d for %s",
2785 zid, name);
2786 }
2787 if (flags & ZC_DESTRUCTIBLE) {
2788 panic("zone_create: ID %d (%s) must be permanent", zid, name);
2789 }
2790 if (zone_array[zid].z_self) {
2791 panic("zone_create: creating zone ID %d (%s) twice", zid, name);
2792 }
2793 z = &zone_array[zid];
2794 } else {
2795 if (flags & ZC_DESTRUCTIBLE) {
2796 /*
2797 * If possible, find a previously zdestroy'ed zone in the
2798 * zone_array that we can reuse.
2799 */
2800 for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
2801 i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
2802 z = &zone_array[i];
5ba3f43e 2803
f427ee49
A
2804 /*
2805 * If the zone name and the element size are the
2806 * same, we can just reuse the old zone struct.
2807 */
2808 if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
2809 continue;
2810 }
2811 bitmap_clear(zone_destroyed_bitmap, i);
2812 z->destroyed = false;
2813 z->z_self = z;
2814 zid = (zone_id_t)i;
2815 goto out;
2816 }
2817 }
1c79356b 2818
f427ee49
A
2819 zid = nzones++;
2820 z = zone_init_defaults(zid);
39236c6e 2821
f427ee49
A
2822 /*
2823 * The release barrier pairs with the acquire in
2824 * zone_index_foreach() and makes sure that enumeration loops
2825 * always see an initialized zone lock.
2826 */
2827 os_atomic_store(&num_zones, nzones, release);
2828 }
5ba3f43e 2829
f427ee49
A
2830out:
2831 num_zones_in_use++;
5ba3f43e 2832 simple_unlock(&all_zones_lock);
39236c6e 2833
f427ee49
A
2834 return z;
2835}
5ba3f43e 2836
f427ee49
A
2837__abortlike
2838static void
2839zone_create_panic(const char *name, const char *f1, const char *f2)
2840{
2841 panic("zone_create: creating zone %s: flag %s and %s are incompatible",
2842 name, f1, f2);
2843}
2844#define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
2845 if ((flags) & forbidden_flag) { \
2846 zone_create_panic(name, #current_flag, #forbidden_flag); \
1c79356b 2847 }
5ba3f43e 2848
f427ee49
A
2849/*
2850 * Adjusts the size of the element based on minimum size, alignment
2851 * and kasan redzones
2852 */
2853static vm_size_t
2854zone_elem_adjust_size(
2855 const char *name __unused,
2856 vm_size_t elem_size,
2857 zone_create_flags_t flags,
2858 vm_size_t *redzone __unused)
2859{
2860 vm_size_t size;
5ba3f43e 2861 /*
f427ee49 2862 * Adjust element size for minimum size and pointer alignment
5ba3f43e 2863 */
f427ee49
A
2864 size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
2865 if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
2866 size = ZONE_MIN_ELEM_SIZE;
5ba3f43e 2867 }
1c79356b 2868
f427ee49 2869#if KASAN_ZALLOC
c910b4d9 2870 /*
f427ee49 2871 * Expand the zone allocation size to include the redzones.
39037602 2872 *
f427ee49
A
2873 * For page-multiple zones add a full guard page because they
2874 * likely require alignment.
c910b4d9 2875 */
f427ee49
A
2876 vm_size_t redzone_tmp;
2877 if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
2878 redzone_tmp = 0;
2879 } else if ((size & PAGE_MASK) == 0) {
2880 if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
2881 panic("zone_create: zone %s can't provide more than PAGE_SIZE"
2882 "alignment", name);
39037602 2883 }
f427ee49
A
2884 redzone_tmp = PAGE_SIZE;
2885 } else if (flags & ZC_ALIGNMENT_REQUIRED) {
2886 redzone_tmp = 0;
2887 } else {
2888 redzone_tmp = KASAN_GUARD_SIZE;
2889 }
2890 size += redzone_tmp * 2;
2891 if (redzone) {
2892 *redzone = redzone_tmp;
2893 }
2894#endif
2895 return size;
2896}
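/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: the pointer-alignment round-up above, worked for a hypothetical
 * 20-byte request assuming LP64 (sizeof(vm_offset_t) == 8). The result is
 * then raised to ZONE_MIN_ELEM_SIZE for non-per-cpu zones.
 */
#if 0
static void
zone_elem_round_example(void)
{
	/* (20 + 7) & ~7 == 24 */
	vm_size_t rounded = (20 + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);

	assert(rounded == 24);
}
#endif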
2897
2898/*
2899 * Returns the allocation chunk size that has the least fragmentation
2900 */
2901static vm_size_t
2902zone_get_min_alloc_granule(
2903 vm_size_t elem_size,
2904 zone_create_flags_t flags)
2905{
2906 vm_size_t alloc_granule = PAGE_SIZE;
2907 if (flags & ZC_PERCPU) {
2908 alloc_granule = PAGE_SIZE * zpercpu_count();
2909 if (PAGE_SIZE % elem_size > 256) {
2910 panic("zone_create: per-cpu zone has too much fragmentation");
2911 }
2912 } else if ((elem_size & PAGE_MASK) == 0) {
2913 /* zero fragmentation by definition */
2914 alloc_granule = elem_size;
2915 } else if (alloc_granule % elem_size == 0) {
2916 /* zero fragmentation by definition */
2917 } else {
2918 vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
2919 vm_size_t alloc_tmp = PAGE_SIZE;
2920 while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
2921 vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
2922 if (frag_tmp < frag) {
2923 frag = frag_tmp;
2924 alloc_granule = alloc_tmp;
39037602
A
2925 }
2926 }
f427ee49
A
2927 }
2928 return alloc_granule;
2929}
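/*
 * Illustrative arithmetic, not part of the original source and kept
 * compiled out: for a hypothetical 96-byte element with 4K pages, a
 * one-page chunk wastes 64 bytes while a two-page chunk wastes only 32,
 * so the loop above settles on the first chunk size that reaches the
 * lowest (integer) waste percentage.
 */
#if 0
static void
zone_granule_example(void)
{
	assert(4096 % 96 == 64);        /* 1 page:  64 bytes of slack */
	assert(8192 % 96 == 32);        /* 2 pages: 32 bytes of slack */
}
#endif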
39037602 2930
f427ee49
A
2931vm_size_t
2932zone_get_foreign_alloc_size(
2933 const char *name __unused,
2934 vm_size_t elem_size,
2935 zone_create_flags_t flags,
2936 uint16_t min_pages)
2937{
2938 vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
2939 NULL);
2940 vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
2941 flags);
2942 vm_size_t min_size = min_pages * PAGE_SIZE;
2943 /*
2944 * Round up min_size to a multiple of alloc_granule
2945 */
2946 return ((min_size + alloc_granule - 1) / alloc_granule)
2947 * alloc_granule;
2948}
39037602 2949
f427ee49
A
2950zone_t
2951zone_create_ext(
2952 const char *name,
2953 vm_size_t size,
2954 zone_create_flags_t flags,
2955 zone_id_t desired_zid,
2956 void (^extra_setup)(zone_t))
2957{
2958 vm_size_t alloc;
2959 vm_size_t redzone;
2960 zone_t z;
2961
2962 if (size > ZONE_MAX_ALLOC_SIZE) {
2963 panic("zone_create: element size too large: %zd", (size_t)size);
2964 }
2965
2966 size = zone_elem_adjust_size(name, size, flags, &redzone);
2967 /*
2968 * Allocate the zone slot, return early if we found an older match.
2969 */
2970 z = zone_create_find(name, size, flags, desired_zid);
2971 if (__improbable(z->z_self)) {
2972 /* We found a zone to reuse */
2973 return z;
2974 }
2975
2976 /*
2977 * Initialize the zone properly.
2978 */
2979
2980 /*
2981 * If the kernel is post lockdown, copy the zone name passed in.
2982 * Else simply maintain a pointer to the name string as it can only
2983 * be a core XNU zone (no unloadable kext exists before lockdown).
2984 */
2985 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2986 size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2987 char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
2988 strlcpy(buf, name, nsz);
2989 z->z_name = buf;
2990 } else {
2991 z->z_name = name;
2992 }
2993 /*
2994 * If zone_init() hasn't run yet, the permanent zones do not exist.
2995 * We can limp along without properly initialized stats for a while,
2996 * zone_init() will rebuild the missing stats when it runs.
2997 */
2998 if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
2999 z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
3000 }
3001
3002 alloc = zone_get_min_alloc_granule(size, flags);
3003
3004 if (flags & ZC_KALLOC_HEAP) {
3005 size_t rem = (alloc % size) / (alloc / size);
39037602
A
3006
3007 /*
f427ee49
A
3008 * Try to grow the element size and spread the elements out more if the
3009 * remaining space is large enough.
39037602 3010 */
f427ee49
A
3011 size += rem & ~(KALLOC_MINALIGN - 1);
3012 }
39037602 3013
f427ee49
A
3014 z->pcpu_elem_size = z->z_elem_size = (uint16_t)size;
3015 z->alloc_pages = (uint16_t)atop(alloc);
3016#if KASAN_ZALLOC
3017 z->kasan_redzone = redzone;
3018 if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
3019 z->kasan_fakestacks = true;
3020 }
3021#endif
39037602 3022
f427ee49
A
3023 /*
3024 * Handle KPI flags
3025 */
3026#if __LP64__
3027 if (flags & ZC_SEQUESTER) {
3028 z->va_sequester = true;
3029 }
3030#endif
3031 /* ZC_CACHING applied after all configuration is done */
39037602 3032
f427ee49
A
3033 if (flags & ZC_PERCPU) {
3034 /*
3035 * ZC_CACHING is disallowed because it uses per-cpu zones for its
3036 * implementation and it would be circular. These allocations are
3037 * also quite expensive, so caching feels dangerous memory wise too.
3038 *
3039 * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
3040 * pointer-sized allocations which poisoning doesn't support.
3041 */
3042 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING);
3043 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
3044 z->percpu = true;
3045 z->gzalloc_exempt = true;
3046 z->zfree_clear_mem = true;
3047 z->pcpu_elem_size *= zpercpu_count();
3048 }
3049 if (flags & ZC_ZFREE_CLEARMEM) {
3050 z->zfree_clear_mem = true;
3051 }
3052 if (flags & ZC_NOGC) {
3053 z->collectable = false;
3054 }
3055 if (flags & ZC_NOENCRYPT) {
3056 z->noencrypt = true;
3057 }
3058 if (flags & ZC_ALIGNMENT_REQUIRED) {
3059 z->alignment_required = true;
3060 }
3061 if (flags & ZC_NOGZALLOC) {
3062 z->gzalloc_exempt = true;
3063 }
3064 if (flags & ZC_NOCALLOUT) {
3065 z->no_callout = true;
3066 }
3067 if (flags & ZC_DESTRUCTIBLE) {
3068 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING);
3069 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
3070 z->destructible = true;
3071 }
5ba3f43e 3072
f427ee49
A
3073 /*
3074 * Handle Internal flags
3075 */
3076 if (flags & ZC_ALLOW_FOREIGN) {
3077 z->allows_foreign = true;
3078 }
3079 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3080 (flags & ZC_DATA_BUFFERS)) {
3081 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3082 }
3083 if (flags & ZC_KASAN_NOQUARANTINE) {
3084 z->kasan_noquarantine = true;
3085 }
3086 /* ZC_KASAN_NOREDZONE already handled */
3087
3088 /*
3089 * Then if there's extra tuning, do it
3090 */
3091 if (extra_setup) {
3092 extra_setup(z);
39037602
A
3093 }
3094
f427ee49
A
3095 /*
3096 * Configure debugging features
3097 */
3098#if CONFIG_GZALLOC
3099 gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
3100#endif
3101#if ZONE_ENABLE_LOGGING
3102 if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
3103 /*
3104 * Check for and set up zone leak detection if requested via boot-args.
3105 * might set z->zone_logging
3106 */
3107 zone_setup_logging(z);
3108 }
3109#endif /* ZONE_ENABLE_LOGGING */
3110#if VM_MAX_TAG_ZONES
3111 if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
3112 static int tag_zone_index;
3113 vm_offset_t esize = zone_elem_size(z);
3114 z->tags = true;
3115 z->tags_inline = (((page_size + esize - 1) / esize) <=
3116 (sizeof(uint32_t) / sizeof(uint16_t)));
3117 z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
3118 assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
3119 }
39037602 3120#endif
5ba3f43e 3121
f427ee49
A
3122 /*
3123 * Finally, fixup properties based on security policies, boot-args, ...
3124 */
3125 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3126 z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
3127 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3128 }
3129#if __LP64__
3130 if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
3131 (flags & ZC_NOSEQUESTER) == 0 &&
3132 z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) {
3133 z->va_sequester = true;
d9a64523
A
3134 }
3135#endif
f427ee49
A
3136 /*
3137 * Always clear zone elements smaller than a cacheline,
3138 * because it's pretty close to free.
3139 */
3140 if (size <= zp_min_size) {
3141 z->zfree_clear_mem = true;
3142 }
3143 if (zp_factor != 0 && !z->zfree_clear_mem) {
3144 z->zp_count = zone_poison_count_init(z);
3145 }
3146
3147#if CONFIG_ZCACHE
3148 if ((flags & ZC_NOCACHING) == 0) {
3149 /*
3150 * Append kalloc heap name to zone name (if zone is used by kalloc)
3151 */
3152 char temp_zone_name[MAX_ZONE_NAME] = "";
3153 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3154
3155 /* Check if boot-arg specified it should have a cache */
3156 if (track_this_zone(temp_zone_name, cache_zone_name)) {
3157 flags |= ZC_CACHING;
3158 } else if (zcc_kalloc && z->kalloc_heap) {
3159 flags |= ZC_CACHING;
3160 }
3161 }
3162 if ((flags & ZC_CACHING) &&
3163 !z->tags && !z->zone_logging && !z->gzalloc_tracked) {
3164 zcache_init(z);
3165 }
3166#endif /* CONFIG_ZCACHE */
3167
3168 lock_zone(z);
3169 z->z_self = z;
3170 unlock_zone(z);
3171
3172 return z;
3173}
3174
3175__startup_func
3176void
3177zone_create_startup(struct zone_create_startup_spec *spec)
3178{
3179 *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
3180 spec->z_flags, spec->z_zid, spec->z_setup);
3181}
3182
3183/*
3184 * The first 4 fields of a zone_view and a zone alias each other, so that the
3185 * zone_or_view_t union works. Trust, but verify.
3186 */
3187#define zalloc_check_zov_alias(f1, f2) \
3188 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
3189zalloc_check_zov_alias(z_self, zv_zone);
3190zalloc_check_zov_alias(z_stats, zv_stats);
3191zalloc_check_zov_alias(z_name, zv_name);
3192zalloc_check_zov_alias(z_views, zv_next);
3193#undef zalloc_check_zov_alias
3194
3195__startup_func
3196void
3197zone_view_startup_init(struct zone_view_startup_spec *spec)
3198{
3199 struct kalloc_heap *heap = NULL;
3200 zone_view_t zv = spec->zv_view;
3201 zone_t z;
3202
3203 switch (spec->zv_heapid) {
3204 case KHEAP_ID_DEFAULT:
3205 heap = KHEAP_DEFAULT;
3206 break;
3207 case KHEAP_ID_DATA_BUFFERS:
3208 heap = KHEAP_DATA_BUFFERS;
3209 break;
3210 case KHEAP_ID_KEXT:
3211 heap = KHEAP_KEXT;
3212 break;
3213 default:
3214 heap = NULL;
3215 }
3216
3217 if (heap) {
3218 z = kalloc_heap_zone_for_size(heap, spec->zv_size);
3219 assert(z);
3220 } else {
3221 z = spec->zv_zone;
3222 assert(spec->zv_size <= zone_elem_size(z));
3223 }
3224
3225 zv->zv_zone = z;
3226 zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
3227 zv->zv_next = z->z_views;
3228 if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
3229 /*
3230 * count the raw view for zones not in a heap,
3231 * kalloc_heap_init() already counts it for its members.
3232 */
3233 zone_view_count += 2;
3234 } else {
3235 zone_view_count += 1;
3236 }
3237 z->z_views = zv;
3238}
3239
3240zone_t
3241zone_create(
3242 const char *name,
3243 vm_size_t size,
3244 zone_create_flags_t flags)
3245{
3246 return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
3247}
d9a64523 3248
f427ee49
A
3249zone_t
3250zinit(
3251 vm_size_t size, /* the size of an element */
3252 vm_size_t max, /* maximum memory to use */
3253 vm_size_t alloc __unused, /* allocation size */
3254 const char *name) /* a name for the zone */
3255{
3256 zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
3257 zone_set_max(z, max);
0a7de745 3258 return z;
39037602 3259}
eb6b6ca3 3260
f427ee49
A
3261void
3262zdestroy(zone_t z)
3263{
3264 unsigned int zindex = zone_index(z);
3265
3266 lock_zone(z);
3267
3268 if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) {
3269 panic("zdestroy: Zone %s%s isn't destructible",
3270 zone_heap_name(z), z->z_name);
3271 }
3272
3273 if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv ||
3274 z->async_pending || z->waiting) {
3275 panic("zdestroy: Zone %s%s in an invalid state for destruction",
3276 zone_heap_name(z), z->z_name);
3277 }
3278
3279#if !KASAN_ZALLOC
3280 /*
3281 * Unset the valid bit. We'll hit an assert failure on further operations
3282 * on this zone, until zinit() is called again.
3283 *
3284 * Leave the zone valid for KASan as we will see zfree's on quarantined free
3285 * elements even after the zone is destroyed.
3286 */
3287 z->z_self = NULL;
3288#endif
3289 z->destroyed = true;
3290 unlock_zone(z);
3291
3292 /* Dump all the free elements */
3293 zone_drop_free_elements(z);
3294
3295#if CONFIG_GZALLOC
3296 if (__improbable(z->gzalloc_tracked)) {
3297 /* If the zone is gzalloc managed dump all the elements in the free cache */
3298 gzalloc_empty_free_cache(z);
3299 }
3300#endif
3301
3302 lock_zone(z);
3303
3304 while (!zone_pva_is_null(z->pages_sequester)) {
3305 struct zone_page_metadata *page_meta;
3306 vm_offset_t free_addr;
3307
3308 page_meta = zone_sequestered_page_get(z, &free_addr);
3309 unlock_zone(z);
3310 kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages));
3311 lock_zone(z);
3312 }
3313
3314#if !KASAN_ZALLOC
3315 /* Assert that all counts are zero */
3316 if (z->countavail || z->countfree || zone_size_wired(z) ||
3317 z->allfree_page_count || z->sequester_page_count) {
3318 panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
3319 zone_heap_name(z), z->z_name);
3320 }
3321
3322 /* consistency check: make sure everything is indeed empty */
3323 assert(zone_pva_is_null(z->pages_any_free_foreign));
3324 assert(zone_pva_is_null(z->pages_all_used_foreign));
3325 assert(zone_pva_is_null(z->pages_all_free));
3326 assert(zone_pva_is_null(z->pages_intermediate));
3327 assert(zone_pva_is_null(z->pages_all_used));
3328 assert(zone_pva_is_null(z->pages_sequester));
3329#endif
3330
3331 unlock_zone(z);
3332
3333 simple_lock(&all_zones_lock, &zone_locks_grp);
3334
3335 assert(!bitmap_test(zone_destroyed_bitmap, zindex));
3336 /* Mark the zone as empty in the bitmap */
3337 bitmap_set(zone_destroyed_bitmap, zindex);
3338 num_zones_in_use--;
3339 assert(num_zones_in_use > 0);
3340
3341 simple_unlock(&all_zones_lock);
3342}
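/*
 * Illustrative sketch, not part of the original source and kept compiled
 * out: the lifecycle a ZC_DESTRUCTIBLE zone is expected to follow.
 * zdestroy() panics unless every element has already been freed back to
 * the zone. "example.tmp" is a hypothetical zone name.
 */
#if 0
static void
example_destructible_zone(void)
{
	zone_t z = zone_create("example.tmp", 64, ZC_DESTRUCTIBLE);

	/* ... matching zalloc()/zfree() pairs ... */

	zdestroy(z);
}
#endif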
3343
3344#pragma mark zone (re)fill, jetsam
3345
eb6b6ca3
A
3346/*
3347 * Dealing with zone allocations from the mach VM code.
3348 *
3349 * The implementation of the mach VM itself uses the zone allocator
3350 * for things like the vm_map_entry data structure. In order to prevent
3351 * an infinite recursion problem when adding more pages to a zone, zalloc
3352 * uses a replenish thread to refill the VM layer's zones before they have
3353 * too few remaining free entries. The reserved remaining free entries
3354 * guarantee that the VM routines can get entries from already mapped pages.
3355 *
 3356 * In order for that to work, the number of allocations in the nested
 3357 * case has to be bounded. There are currently 2 replenish zones, and
 3358 * if each needs 1 element of each zone to add a new page to itself, that
 3359 * gives us a minimum reserve of 2 elements.
3360 *
3361 * There is also a deadlock issue with the zone garbage collection thread,
3362 * or any thread that is trying to free zone pages. While holding
3363 * the kernel's map lock they may need to allocate new VM map entries, hence
3364 * we need enough reserve to allow them to get past the point of holding the
3365 * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
3366 * until the replenish threads can finish. Since there's only 1 GC thread at a time,
3367 * that adds a minimum of 1 to the reserve size.
3368 *
 3369 * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (the ARM page size)
3370 * as the refill size on all platforms.
3371 *
 3372 * When a refill zone drops to half of its target reserve, i.e. ZONE_REPLENISH_TARGET / 2,
f427ee49 3373 * zalloc_ext() will wake the replenish thread. The replenish thread runs
eb6b6ca3
A
 3374 * until at least ZONE_REPLENISH_TARGET worth of free elements exist, before sleeping again.
 3375 * In the meantime threads may continue to use the reserve until there are only
 3376 * ZONE_REPLENISH_TARGET / 4 elements left. Below that point only the replenish threads
 3377 * themselves and the GC thread may continue to use from the reserve.
3378 */
3379static unsigned zone_replenish_loops;
3380static unsigned zone_replenish_wakeups;
3381static unsigned zone_replenish_wakeups_initiated;
3382static unsigned zone_replenish_throttle_count;
3383
3384#define ZONE_REPLENISH_TARGET (16 * 1024)
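/*
 * Worked example (illustrative, for a hypothetical 128-byte element):
 * zone_prio_refill_configure() below sets prio_refill_count to
 * ZONE_REPLENISH_TARGET / 128 = 128 elements. zalloc_ext() wakes the
 * replenish thread once countfree drops to 128 / 2 = 64, and once fewer
 * than 128 / 4 = 32 elements remain, only TH_OPT_ZONE_PRIV and
 * VM-privileged threads may keep drawing from the reserve.
 */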
f427ee49
A
3385static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */
3386static unsigned zone_replenish_max_threads = 0;
39037602 3387
f427ee49
A
3388LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock");
3389LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp);
39037602 3390
f427ee49 3391__abortlike
39037602 3392static void
f427ee49 3393zone_replenish_panic(zone_t zone, kern_return_t kr)
39037602 3394{
f427ee49
A
3395 panic_include_zprint = TRUE;
3396#if CONFIG_ZLEAKS
3397 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3398 panic_include_ztrace = TRUE;
3399 }
3400#endif /* CONFIG_ZLEAKS */
3401 if (kr == KERN_NO_SPACE) {
3402 zone_t zone_largest = zone_find_largest();
3403 panic("zalloc: zone map exhausted while allocating from zone %s%s, "
3404 "likely due to memory leak in zone %s%s "
3405 "(%lu total bytes, %d elements allocated)",
3406 zone_heap_name(zone), zone->z_name,
3407 zone_heap_name(zone_largest), zone_largest->z_name,
3408 (unsigned long)zone_size_wired(zone_largest),
3409 zone_count_allocated(zone_largest));
3410 }
3411 panic("zalloc: %s%s (%d elements) retry fail %d",
3412 zone_heap_name(zone), zone->z_name,
3413 zone_count_allocated(zone), kr);
3414}
3415
3416static void
3417zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously)
3418{
3419 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3420 vm_offset_t space, alloc_size;
3421 uint32_t retry = 0;
3422 kern_return_t kr;
3423
3424 if (z->noencrypt) {
3425 kmaflags |= KMA_NOENCRYPT;
3426 }
3427 if (flags & Z_NOPAGEWAIT) {
3428 kmaflags |= KMA_NOPAGEWAIT;
3429 }
3430 if (z->permanent) {
3431 kmaflags |= KMA_PERMANENT;
3432 }
39037602
A
3433
3434 for (;;) {
f427ee49 3435 struct zone_page_metadata *page_meta = NULL;
7ddcb079 3436
f427ee49
A
3437 /*
3438 * Try to allocate our regular chunk of pages,
3439 * unless the system is under massive pressure
3440 * and we're looking for more than 2 pages.
3441 */
3442 if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) {
3443 alloc_size = round_page(zone_elem_size(z));
3444 } else {
3445 alloc_size = ptoa(z->alloc_pages);
3446 page_meta = zone_sequestered_page_get(z, &space);
3447 }
0a7de745 3448
f427ee49 3449 unlock_zone(z);
0a7de745 3450
f427ee49
A
3451#if CONFIG_ZLEAKS
3452 /*
3453 * Do the zone leak activation here because zleak_activate()
3454 * may block, and can't be done on the way out.
3455 */
3456 if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) {
3457 if (!(zleak_state & ZLEAK_STATE_ACTIVE) &&
3458 zone_submaps_approx_size() >= zleak_global_tracking_threshold) {
3459 kr = zleak_activate();
3460 if (kr != KERN_SUCCESS) {
3461 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3462 }
0a7de745 3463 }
f427ee49
A
3464 }
3465#endif /* CONFIG_ZLEAKS */
0a7de745 3466
f427ee49
A
3467 /*
3468 * Trigger jetsams via the vm_pageout_garbage_collect thread if
3469 * we're running out of zone memory
3470 */
3471 if (is_zone_map_nearing_exhaustion()) {
3472 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3473 }
cb323159 3474
f427ee49
A
3475 if (page_meta) {
3476 kr = zone_sequestered_page_populate(z, page_meta, space,
3477 alloc_size, kmaflags);
3478 } else {
3479 if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) {
3480 kmaflags |= KMA_KHEAP;
5ba3f43e 3481 }
f427ee49
A
3482 kr = kernel_memory_allocate(submap_for_zone(z),
3483 &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3484 }
5ba3f43e 3485
f427ee49
A
3486#if !__LP64__
3487 if (kr == KERN_NO_SPACE && z->allows_foreign) {
3488 /*
3489 * For zones allowing foreign pages, fallback to the kernel map
3490 */
3491 kr = kernel_memory_allocate(kernel_map, &space,
3492 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3493 }
3494#endif
3495
3496 if (kr == KERN_SUCCESS) {
3497 break;
3498 }
7ddcb079 3499
f427ee49 3500 if (flags & Z_NOPAGEWAIT) {
7ddcb079 3501 lock_zone(z);
f427ee49 3502 return;
7ddcb079
A
3503 }
3504
f427ee49
A
3505 if (asynchronously) {
3506 assert_wait_timeout(&z->prio_refill_count,
3507 THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
3508 thread_block(THREAD_CONTINUE_NULL);
3509 } else if (++retry == 3) {
3510 zone_replenish_panic(z, kr);
3511 }
39236c6e 3512
f427ee49
A
3513 lock_zone(z);
3514 }
eb6b6ca3 3515
f427ee49
A
3516 zcram_and_lock(z, space, alloc_size);
3517
3518#if CONFIG_ZLEAKS
3519 if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
3520 if (!z->zleak_on &&
3521 zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
3522 z->zleak_on = true;
3523 }
3524 }
3525#endif /* CONFIG_ZLEAKS */
3526}
3527
3528/*
3529 * High priority VM privileged thread used to asynchronously refill a given zone.
3530 * These are needed for data structures used by the lower level VM itself. The
3531 * replenish thread maintains a reserve of elements, so that the VM will never
3532 * block in the zone allocator.
3533 */
3534__dead2
3535static void
3536zone_replenish_thread(void *_z, wait_result_t __unused wr)
3537{
3538 zone_t z = _z;
3539
3540 current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV);
3541
3542 for (;;) {
3543 lock_zone(z);
3544 assert(z->z_self == z);
3545 assert(z->zone_replenishing);
3546 assert(z->prio_refill_count != 0);
3547
3548 while (z->countfree < z->prio_refill_count) {
3549 assert(!z->expanding_no_vm_priv);
3550 assert(!z->expanding_vm_priv);
3551
3552 zone_replenish_locked(z, Z_WAITOK, true);
3553
3554 assert(z->z_self == z);
3555 zone_replenish_loops++;
3556 }
3557
3558 /* Wakeup any potentially throttled allocations. */
3559 thread_wakeup(z);
3560
3561 assert_wait(&z->prio_refill_count, THREAD_UNINT);
3562
3563 /*
3564 * We finished refilling the zone, so decrement the active count
3565 * and wake up any waiting GC threads.
3566 */
3567 lck_spin_lock(&zone_replenish_lock);
eb6b6ca3
A
3568 assert(zone_replenish_active > 0);
3569 if (--zone_replenish_active == 0) {
3570 thread_wakeup((event_t)&zone_replenish_active);
3571 }
3572 lck_spin_unlock(&zone_replenish_lock);
3573
f427ee49 3574 z->zone_replenishing = false;
39037602 3575 unlock_zone(z);
f427ee49 3576
7ddcb079
A
3577 thread_block(THREAD_CONTINUE_NULL);
3578 zone_replenish_wakeups++;
3579 }
3580}
3581
3582void
eb6b6ca3 3583zone_prio_refill_configure(zone_t z)
0a7de745 3584{
f427ee49
A
3585 thread_t th;
3586 kern_return_t tres;
3587
3588 lock_zone(z);
3589 assert(!z->prio_refill_count && !z->destructible);
3590 z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
3591 z->zone_replenishing = true;
3592 unlock_zone(z);
7ddcb079 3593
eb6b6ca3
A
3594 lck_spin_lock(&zone_replenish_lock);
3595 ++zone_replenish_max_threads;
3596 ++zone_replenish_active;
3597 lck_spin_unlock(&zone_replenish_lock);
7ddcb079 3598 OSMemoryBarrier();
7ddcb079 3599
f427ee49
A
3600 tres = kernel_thread_start_priority(zone_replenish_thread, z,
3601 MAXPRI_KERNEL, &th);
7ddcb079
A
3602 if (tres != KERN_SUCCESS) {
3603 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
3604 }
3605
f427ee49 3606 thread_deallocate(th);
5ba3f43e
A
3607}
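/*
 * Usage sketch (illustrative, not code from this file): the VM layer is
 * expected to configure replenishment for the handful of zones it needs to
 * implement zalloc() itself, right after creating them. The zone name and
 * element type below are hypothetical.
 */
#if 0
struct hypothetical_map_entry {
	uint64_t hme_links[4];
};

__startup_func
static void
vm_critical_zone_setup(void)
{
	zone_t z = zone_create("hypothetical.map.entries",
	    sizeof(struct hypothetical_map_entry), ZC_NOENCRYPT);

	/* keep roughly ZONE_REPLENISH_TARGET bytes of elements in reserve */
	zone_prio_refill_configure(z);
}
#endif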
3608
f427ee49
A
3609static void
3610zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta,
3611 vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer)
5ba3f43e 3612{
f427ee49
A
3613 const vm_size_t elem_size = zone_elem_size(zone);
3614 vm_offset_t left, right, head, base;
3615 vm_offset_t element;
3616
3617 left = ZONE_PAGE_FIRST_OFFSET(kind);
3618 right = size - ((size - left) % elem_size);
3619 head = 0;
3620 base = zone_meta_to_addr(meta, kind);
3621
3622 while (left < right) {
3623 if (zone_leaks_scan_enable || __improbable(zone->tags) ||
3624 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
3625 element = base + left;
3626 left += elem_size;
3627 } else {
3628 right -= elem_size;
3629 element = base + right;
3630 }
5ba3f43e 3631
f427ee49
A
3632 vm_offset_t *primary = (vm_offset_t *)element;
3633 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
5ba3f43e 3634
f427ee49
A
3635 *primary = *backup = head ^ zp_nopoison_cookie;
3636 head = element;
39037602 3637 }
39037602 3638
f427ee49 3639 meta->zm_freelist_offs = (uint16_t)(head - base);
4bd07ac2
A
3640}
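/*
 * Worked example of the free list encoding built above (addresses are
 * illustrative): if the current head is 0xffffff8012345000 and the element
 * being threaded is at 0xffffff8012345080, the randomizer stores
 * 0xffffff8012345000 ^ zp_nopoison_cookie in both the primary and backup
 * slots of the new element, which then becomes the head.
 * zalloc_direct_locked() recovers the next pointer by xoring with
 * zp_nopoison_cookie again; an element poisoned at free time instead
 * carries next ^ zp_poisoned_cookie in its backup slot.
 */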
3641
1c79356b 3642/*
3e170ce0 3643 * Cram the given memory into the specified zone. Update the zone page count accordingly.
1c79356b 3644 */
f427ee49
A
3645static void
3646zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size)
1c79356b 3647{
5c9f4661 3648 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
f427ee49
A
3649 struct zone_page_metadata *meta;
3650 zone_addr_kind_t kind;
3651 uint32_t pg_count = (uint32_t)atop(size);
3652 uint32_t zindex = zone_index(zone);
3653 uint32_t free_count;
3654 uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
1c79356b
A
3655
3656 /* Basic sanity checks */
3657 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
f427ee49
A
3658 assert((newmem & PAGE_MASK) == 0);
3659 assert((size & PAGE_MASK) == 0);
7ddcb079 3660
f427ee49
A
3661 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START,
3662 zindex, size);
39037602 3663
f427ee49 3664 kind = zone_addr_kind(newmem, size);
cb323159 3665#if DEBUG || DEVELOPMENT
0a7de745 3666 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
f427ee49
A
3667 kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone,
3668 zone_heap_name(zone), zone->z_name, (uintptr_t)newmem,
3669 kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size);
0a7de745 3670 }
cb323159 3671#endif /* DEBUG || DEVELOPMENT */
39236c6e 3672
5c9f4661
A
3673 /*
 3674 * Initialize the metadata for all pages. We don't need the zone lock
39037602 3675 * here because we are not manipulating any zone related state yet.
f427ee49
A
3676 *
3677 * This includes randomizing the freelists as the metadata isn't
3678 * published yet.
39037602 3679 */
39236c6e 3680
f427ee49
A
3681 if (kind == ZONE_ADDR_NATIVE) {
3682 /*
3683 * We're being called by zfill,
3684 * zone_replenish_thread or vm_page_more_fictitious,
3685 *
 3686 * each of which allocates either a single page or `alloc_pages`
 3687 * worth at a time.
3688 */
3689 assert(pg_count <= zone->alloc_pages);
39236c6e 3690
f427ee49
A
3691 /*
3692 * Make sure the range of metadata entries we're about to init
3693 * have proper physical backing, then initialize them.
3694 */
3695 meta = zone_meta_from_addr(newmem, kind);
3696 zone_meta_populate(meta, meta + pg_count);
3697
3698 if (zone->permanent) {
3699 empty_freelist_offs = 0;
3700 }
3701
3702 meta[0] = (struct zone_page_metadata){
3703 .zm_index = zindex,
3704 .zm_page_count = pg_count,
3705 .zm_percpu = zone->percpu,
3706 .zm_freelist_offs = empty_freelist_offs,
3707 };
3708
3709 for (uint32_t i = 1; i < pg_count; i++) {
3710 meta[i] = (struct zone_page_metadata){
3711 .zm_index = zindex,
3712 .zm_page_count = i,
3713 .zm_percpu = zone->percpu,
3714 .zm_secondary_page = true,
3715 .zm_freelist_offs = empty_freelist_offs,
3716 };
3717 }
39236c6e 3718
f427ee49
A
3719 if (!zone->permanent) {
3720 zone_randomize_freelist(zone, meta,
3721 zone->percpu ? PAGE_SIZE : size, kind, entropy_buffer);
3722 }
3723 } else {
3724 if (!zone->allows_foreign || !from_foreign_range(newmem, size)) {
3725 panic("zcram_and_lock: foreign memory [%lx] being crammed is "
3726 "outside of foreign range", (uintptr_t)newmem);
3727 }
39236c6e 3728
f427ee49
A
3729 /*
3730 * We cannot support elements larger than page size for foreign
3731 * memory because we put metadata on the page itself for each
3732 * page of foreign memory.
3733 *
3734 * We need to do this in order to be able to reach the metadata
3735 * when any element is freed.
3736 */
3737 assert(!zone->percpu && !zone->permanent);
3738 assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata));
3739
3740 bzero((void *)newmem, size);
3741
3742 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3743 meta = (struct zone_page_metadata *)(newmem + offs);
3744 *meta = (struct zone_page_metadata){
3745 .zm_index = zindex,
3746 .zm_page_count = 1,
3747 .zm_freelist_offs = empty_freelist_offs,
3748 };
3749 meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE;
3750 zone_randomize_freelist(zone, meta, PAGE_SIZE, kind,
3751 entropy_buffer);
3752 }
3753 }
39236c6e 3754
5ba3f43e 3755#if VM_MAX_TAG_ZONES
0a7de745 3756 if (__improbable(zone->tags)) {
f427ee49 3757 assert(kind == ZONE_ADDR_NATIVE && !zone->percpu);
0a7de745
A
3758 ztMemoryAdd(zone, newmem, size);
3759 }
5ba3f43e
A
3760#endif /* VM_MAX_TAG_ZONES */
3761
f427ee49
A
3762 /*
3763 * Insert the initialized pages / metadatas into the right lists.
3764 */
3765
39037602 3766 lock_zone(zone);
f427ee49 3767 assert(zone->z_self == zone);
39037602 3768
f427ee49
A
3769 zone->page_count += pg_count;
3770 if (zone->page_count_hwm < zone->page_count) {
3771 zone->page_count_hwm = zone->page_count;
3772 }
3773 os_atomic_add(&zones_phys_page_count, pg_count, relaxed);
39037602 3774
f427ee49
A
3775 if (kind == ZONE_ADDR_NATIVE) {
3776 os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed);
3777 if (zone->permanent) {
3778 zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind);
3779 } else {
3780 zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind);
3781 zone->allfree_page_count += meta->zm_page_count;
39236c6e 3782 }
f427ee49
A
3783 free_count = zone_elem_count(zone, size, kind);
3784 zone->countfree += free_count;
3785 zone->countavail += free_count;
39037602 3786 } else {
f427ee49
A
3787 free_count = zone_elem_count(zone, PAGE_SIZE, kind);
3788 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3789 meta = (struct zone_page_metadata *)(newmem + offs);
3790 zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind);
3791 zone->countfree += free_count;
3792 zone->countavail += free_count;
3793 }
1c79356b 3794 }
4bd07ac2 3795
f427ee49
A
3796 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex);
3797}
3798
3799void
3800zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
3801{
3802 zcram_and_lock(zone, newmem, size);
3803 unlock_zone(zone);
1c79356b
A
3804}
3805
1c79356b
A
3806/*
3807 * Fill a zone with enough memory to contain at least nelem elements.
1c79356b
A
3808 * Return the number of elements actually put into the zone, which may
3809 * be more than the caller asked for since the memory allocation is
5ba3f43e 3810 * rounded up to the next zone allocation size.
1c79356b
A
3811 */
3812int
3813zfill(
0a7de745
A
3814 zone_t zone,
3815 int nelem)
1c79356b 3816{
5ba3f43e 3817 kern_return_t kr;
f427ee49 3818 vm_offset_t memory;
1c79356b 3819
f427ee49
A
3820 vm_size_t alloc_size = ptoa(zone->alloc_pages);
3821 vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE);
3822 vm_size_t nalloc = 0, goal = MAX(0, nelem);
3823 int kmaflags = KMA_KOBJECT | KMA_ZERO;
cb323159 3824
f427ee49
A
3825 if (zone->noencrypt) {
3826 kmaflags |= KMA_NOENCRYPT;
cb323159 3827 }
5ba3f43e 3828
f427ee49 3829 assert(!zone->allows_foreign && !zone->permanent);
5ba3f43e 3830
f427ee49
A
3831 /*
3832 * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
3833 * running out of zone memory
3834 */
5ba3f43e
A
3835 if (is_zone_map_nearing_exhaustion()) {
3836 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3837 }
3838
f427ee49
A
3839 if (zone->va_sequester) {
3840 lock_zone(zone);
316670eb 3841
f427ee49
A
3842 do {
3843 struct zone_page_metadata *page_meta;
3844 page_meta = zone_sequestered_page_get(zone, &memory);
3845 if (NULL == page_meta) {
3846 break;
3847 }
3848 unlock_zone(zone);
c910b4d9 3849
f427ee49
A
3850 kr = zone_sequestered_page_populate(zone, page_meta,
3851 memory, alloc_size, kmaflags);
3852 if (KERN_SUCCESS != kr) {
3853 goto out_nolock;
3854 }
4bd07ac2 3855
f427ee49
A
3856 zcram_and_lock(zone, memory, alloc_size);
3857 nalloc += nalloc_inc;
3858 } while (nalloc < goal);
6d2010ae 3859
f427ee49 3860 unlock_zone(zone);
2d21ac55 3861 }
1c79356b 3862
f427ee49
A
3863out_nolock:
3864 while (nalloc < goal) {
3865 kr = kernel_memory_allocate(submap_for_zone(zone), &memory,
3866 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3867 if (kr != KERN_SUCCESS) {
3868 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
3869 __func__, (unsigned long)(nalloc * alloc_size));
3870 break;
3871 }
eb6b6ca3 3872
f427ee49
A
3873 zcram(zone, memory, alloc_size);
3874 nalloc += nalloc_inc;
d9a64523
A
3875 }
3876
f427ee49 3877 return (int)nalloc;
1c79356b
A
3878}
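/*
 * Usage sketch (illustrative, hypothetical zone and count): a subsystem that
 * expects an immediate burst of allocations can pre-populate its zone at init
 * time. The return value may exceed the request because memory is added in
 * whole allocation chunks, or fall short if kernel_memory_allocate() fails.
 */
#if 0
int filled = zfill(hypothetical_zone, 512);
if (filled < 512) {
	printf("hypothetical: only pre-filled %d of 512 elements\n", filled);
}
#endif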
3879
5ba3f43e
A
3880/*
3881 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3882 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3883 */
3884#define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3885
3886/*
3887 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3888 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3889 */
f427ee49
A
3890TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
3891 ZONE_MAP_JETSAM_LIMIT_DEFAULT);
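/*
 * Example (illustrative): booting with `zone_map_jetsam_limit=90` makes
 * is_zone_map_nearing_exhaustion() return true once more than 90% of
 * zone_phys_mapped_max is mapped; out-of-range values (0 or > 100) are
 * reset to the 95% default by zone_tunables_fixup() below.
 */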
5ba3f43e 3892
0a7de745
A
3893void
3894get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
5ba3f43e 3895{
f427ee49
A
3896 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3897 *current_size = ptoa_64(phys_pages);
3898 *capacity = zone_phys_mapped_max;
5ba3f43e
A
3899}
3900
0a7de745
A
3901void
3902get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
5ba3f43e
A
3903{
3904 zone_t largest_zone = zone_find_largest();
f427ee49
A
3905
3906 /*
3907 * Append kalloc heap name to zone name (if zone is used by kalloc)
3908 */
3909 snprintf(zone_name, zone_name_len, "%s%s",
3910 zone_heap_name(largest_zone), largest_zone->z_name);
3911
3912 *zone_size = zone_size_wired(largest_zone);
5ba3f43e
A
3913}
3914
0a7de745
A
3915boolean_t
3916is_zone_map_nearing_exhaustion(void)
5ba3f43e 3917{
f427ee49
A
3918 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3919 return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
5ba3f43e
A
3920}
3921
5ba3f43e
A
3922
3923#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3924
3925/*
3926 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3927 * to walk through the jetsam priority bands and kill processes.
3928 */
0a7de745
A
3929static void
3930kill_process_in_largest_zone(void)
5ba3f43e
A
3931{
3932 pid_t pid = -1;
3933 zone_t largest_zone = zone_find_largest();
3934
f427ee49
A
3935 printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
3936 ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max),
3937 ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)),
3938 (uint64_t)zone_submaps_approx_size(),
3939 (uint64_t)zone_range_size(&zone_info.zi_map_range),
3940 zone_map_jetsam_limit);
3941 printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
3942 largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
5ba3f43e
A
3943
3944 /*
f427ee49
A
 3945 * We want to make sure we don't call this function from userspace;
 3946 * otherwise we could end up trying to synchronously kill the process
5ba3f43e
A
3947 * whose context we're in, causing the system to hang.
3948 */
3949 assert(current_task() == kernel_task);
3950
3951 /*
f427ee49
A
3952 * If vm_object_zone is the largest, check to see if the number of
3953 * elements in vm_map_entry_zone is comparable.
3954 *
3955 * If so, consider vm_map_entry_zone as the largest. This lets us target
3956 * a specific process to jetsam to quickly recover from the zone map
3957 * bloat.
5ba3f43e
A
3958 */
3959 if (largest_zone == vm_object_zone) {
f427ee49
A
3960 unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
3961 unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
5ba3f43e
A
3962 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3963 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3964 largest_zone = vm_map_entry_zone;
f427ee49
A
3965 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
3966 (uintptr_t)zone_size_wired(largest_zone));
5ba3f43e
A
3967 }
3968 }
3969
3970 /* TODO: Extend this to check for the largest process in other zones as well. */
3971 if (largest_zone == vm_map_entry_zone) {
3972 pid = find_largest_process_vm_map_entries();
3973 } else {
f427ee49
A
3974 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
3975 "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
3976 largest_zone->z_name);
5ba3f43e
A
3977 }
3978 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3979 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3980 }
3981}
3982
f427ee49
A
3983#pragma mark zalloc module init
3984
3985/*
3986 * Initialize the "zone of zones" which uses fixed memory allocated
3987 * earlier in memory initialization. zone_bootstrap is called
3988 * before zone_init.
39236c6e 3989 */
f427ee49 3990__startup_func
1c79356b 3991void
f427ee49
A
3992zone_bootstrap(void)
3993{
3994 /* Validate struct zone_page_metadata expectations */
3995 if ((1U << ZONE_PAGECOUNT_BITS) <
3996 atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) {
3997 panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
3998 }
5ba3f43e 3999
f427ee49
A
4000 /* Validate struct zone_packed_virtual_address expectations */
4001 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
4002 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
4003 panic("zone_pva_t can't pack a kernel page address in 31 bits");
0a7de745 4004 }
1c79356b 4005
f427ee49 4006 zpercpu_early_count = ml_early_cpu_max_number() + 1;
91447636 4007
f427ee49
A
4008 /* Set up zone element poisoning */
4009 zp_bootstrap();
0a7de745 4010
f427ee49 4011 random_bool_init(&zone_bool_gen);
d9a64523 4012
1c79356b 4013 /*
f427ee49
A
4014 * the KASAN quarantine for kalloc doesn't understand heaps
4015 * and trips the heap confusion panics. At the end of the day,
 4016 * all these security measures duplicate what KASAN already provides.
4017 *
4018 * On 32bit kernels, these protections are just too expensive.
1c79356b 4019 */
f427ee49
A
4020#if !defined(__LP64__) || KASAN_ZALLOC
4021 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
4022 zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
4023 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
4024#endif
7ddcb079 4025
f427ee49 4026 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
39037602 4027
f427ee49
A
4028#if CONFIG_ZCACHE
4029 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
4030 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
4031 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
0a7de745 4032 }
f427ee49
A
4033#endif /* CONFIG_ZCACHE */
4034}
fe8ab488 4035
f427ee49
A
4036#if __LP64__
4037#if CONFIG_EMBEDDED
4038#define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024)
4039#else
4040#define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024)
fe8ab488 4041#endif
f427ee49 4042#endif /* __LP64__ */
fe8ab488 4043
f427ee49
A
4044#define SINGLE_GUARD 16384
4045#define MULTI_GUARD (3 * SINGLE_GUARD)
0a7de745 4046
f427ee49
A
4047#if __LP64__
4048static inline vm_offset_t
4049zone_restricted_va_max(void)
4050{
4051 vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
4052 vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
5ba3f43e 4053
f427ee49
A
4054 return trunc_page(MIN(compressor_max, vm_page_max));
4055}
5ba3f43e
A
4056#endif
4057
f427ee49
A
4058__startup_func
4059static void
4060zone_tunables_fixup(void)
4061{
4062 if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
4063 zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
0a7de745 4064 }
1c79356b 4065}
f427ee49 4066STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
1c79356b 4067
f427ee49
A
4068__startup_func
4069static vm_size_t
4070zone_phys_size_max(void)
4071{
4072 mach_vm_size_t zsize;
4073 vm_size_t zsizearg;
6d2010ae 4074
f427ee49
A
4075 if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
4076 zsize = zsizearg * (1024ULL * 1024);
4077 } else {
4078 zsize = sane_size >> 2; /* Set target zone size as 1/4 of physical memory */
4079#if defined(__LP64__)
4080 zsize += zsize >> 1;
4081#endif /* __LP64__ */
4082 }
5ba3f43e 4083
f427ee49
A
4084 if (zsize < CONFIG_ZONE_MAP_MIN) {
4085 zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */
4086 }
4087 if (zsize > sane_size >> 1) {
4088 zsize = sane_size >> 1; /* Clamp to half of RAM max */
4089 }
4090 if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
4091 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
4092 vm_size_t orig_zsize = zsize;
4093 zsize = ZONE_MAP_MAX;
4094 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
4095 (uintptr_t)orig_zsize, (uintptr_t)zsize);
d9a64523
A
4096 }
4097
f427ee49
A
4098 assert((vm_size_t) zsize == zsize);
4099 return (vm_size_t)trunc_page(zsize);
4100}
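/*
 * Worked example (illustrative): on a hypothetical LP64 device with 8GB of
 * usable memory and no "zsize" boot-arg, the target starts at
 * sane_size / 4 = 2GB, gains another 50% for a total of 3GB, and is then
 * clamped to [CONFIG_ZONE_MAP_MIN, MIN(sane_size / 2, ZONE_MAP_MAX)].
 */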
4101
4102__startup_func
4103static struct zone_map_range
4104zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
4105{
4106 struct zone_map_range r;
4107 kern_return_t kr;
d9a64523 4108
f427ee49
A
4109 if (guard) {
4110 vm_map_offset_t addr = *submap_min;
4111 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
d9a64523 4112
f427ee49
A
4113 vmk_flags.vmkf_permanent = TRUE;
4114 kr = vm_map_enter(kernel_map, &addr, size, 0,
4115 VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
4116 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
4117 *submap_min = (vm_offset_t)addr;
4118 } else {
4119 kr = kernel_memory_allocate(kernel_map, submap_min, size,
4120 0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
d9a64523 4121 }
f427ee49
A
4122 if (kr != KERN_SUCCESS) {
4123 panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
4124 (uintptr_t)*submap_min, (size_t)size, kr);
4125 }
4126
4127 r.min_address = *submap_min;
4128 *submap_min += size;
4129 r.max_address = *submap_min;
4130
4131 return r;
d9a64523
A
4132}
4133
f427ee49
A
4134__startup_func
4135static void
4136zone_submap_init(
4137 vm_offset_t *submap_min,
4138 unsigned idx,
4139 uint64_t zone_sub_map_numer,
4140 uint64_t *remaining_denom,
4141 vm_offset_t *remaining_size,
4142 vm_size_t guard_size)
4143{
4144 vm_offset_t submap_start, submap_end;
4145 vm_size_t submap_size;
4146 vm_map_t submap;
4147 kern_return_t kr;
4148
4149 submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
4150 *remaining_denom);
4151 submap_start = *submap_min;
4152 submap_end = submap_start + submap_size;
4153
4154#if defined(__LP64__)
4155 if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
4156 vm_offset_t restricted_va_max = zone_restricted_va_max();
4157 if (submap_end > restricted_va_max) {
4158#if DEBUG || DEVELOPMENT
4159 printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
4160 (size_t)(restricted_va_max - submap_start) >> 20,
4161 (size_t)submap_size >> 20);
4162#endif /* DEBUG || DEVELOPMENT */
4163 guard_size += submap_end - restricted_va_max;
4164 *remaining_size -= submap_end - restricted_va_max;
4165 submap_end = restricted_va_max;
4166 submap_size = restricted_va_max - submap_start;
4167 }
4168
4169 vm_packing_verify_range("vm_compressor",
4170 submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
4171 vm_packing_verify_range("vm_page",
4172 submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
4173 }
4174#endif /* defined(__LP64__) */
4175
4176 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4177 vmk_flags.vmkf_permanent = TRUE;
4178 kr = kmem_suballoc(kernel_map, submap_min, submap_size,
4179 FALSE, VM_FLAGS_FIXED, vmk_flags,
4180 VM_KERN_MEMORY_ZONE, &submap);
4181 if (kr != KERN_SUCCESS) {
4182 panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
4183 idx, (void *)submap_start, (void *)submap_end, kr);
4184 }
4185
4186#if DEBUG || DEVELOPMENT
4187 printf("zone_init: submap[%d] %p:%p (%zuM)\n",
4188 idx, (void *)submap_start, (void *)submap_end,
4189 (size_t)submap_size >> 20);
4190#endif /* DEBUG || DEVELOPMENT */
4191
4192 zone_submaps[idx] = submap;
4193 *submap_min = submap_end;
4194 *remaining_size -= submap_size;
4195 *remaining_denom -= zone_sub_map_numer;
4196
4197 zone_init_allocate_va(submap_min, guard_size, true);
4198}
4199
4200/* Global initialization of Zone Allocator.
4201 * Runs after zone_bootstrap.
1c79356b 4202 */
f427ee49
A
4203__startup_func
4204static void
4205zone_init(void)
1c79356b 4206{
f427ee49
A
4207 vm_size_t zone_meta_size;
4208 vm_size_t zone_map_size;
4209 vm_size_t remaining_size;
4210 vm_offset_t submap_min = 0;
6d2010ae 4211
f427ee49
A
4212 if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
4213 zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
4214 } else {
4215 zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
4216 }
4217 zone_phys_mapped_max = zone_phys_size_max();
4218
4219#if __LP64__
4220 zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
4221#else
4222 zone_map_size = zone_phys_mapped_max;
4223#endif
4224 zone_meta_size = round_page(atop(zone_map_size) *
4225 sizeof(struct zone_page_metadata));
1c79356b 4226
5ba3f43e 4227 /*
f427ee49
A
4228 * Zone "map" setup:
4229 *
4230 * [ VA_RESTRICTED ] <-- LP64 only
4231 * [ SINGLE_GUARD ] <-- LP64 only
4232 * [ meta ]
4233 * [ SINGLE_GUARD ]
4234 * [ map<i> ] \ for each extra map
4235 * [ MULTI_GUARD ] /
5ba3f43e 4236 */
f427ee49
A
4237 remaining_size = zone_map_size;
4238#if defined(__LP64__)
4239 remaining_size -= SINGLE_GUARD;
5ba3f43e 4240#endif
f427ee49
A
4241 remaining_size -= zone_meta_size + SINGLE_GUARD;
4242 remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
4243 Z_SUBMAP_IDX_GENERAL_MAP + 1);
5ba3f43e 4244
f427ee49
A
4245#if VM_MAX_TAG_ZONES
4246 if (zone_tagging_on) {
4247 zone_tagging_init(zone_map_size);
4248 }
4249#endif
316670eb 4250
f427ee49
A
4251 uint64_t remaining_denom = 0;
4252 uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
4253#ifdef __LP64__
4254 [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
4255#endif /* defined(__LP64__) */
4256 [Z_SUBMAP_IDX_GENERAL_MAP] = 40,
4257 [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP] = 40,
4258 };
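	/*
	 * Worked example (illustrative, LP64 with the default 20/40/40 split
	 * and no submap<i> boot-arg overrides): remaining_denom starts at 100,
	 * so the VA-restricted submap gets 20/100 of the remaining VA, the
	 * general map then gets 40/80 of what is left, and the bag-of-bytes
	 * map 40/40 of the rest, i.e. a 20% / 40% / 40% carve-up of the zone
	 * map (minus guards and metadata).
	 */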
4259
4260 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
4261#if DEBUG || DEVELOPMENT
4262 char submap_name[MAX_SUBMAP_NAME];
4263 snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
4264 PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
316670eb 4265#endif
f427ee49
A
4266 remaining_denom += zone_sub_map_numer[idx];
4267 }
4268
c910b4d9 4269 /*
f427ee49
A
4270 * And now allocate the various pieces of VA and submaps.
4271 *
4272 * Make a first allocation of contiguous VA, that we'll deallocate,
4273 * and we'll carve-out memory in that range again linearly.
 4274 * The kernel is still single threaded at this stage.
c910b4d9 4275 */
39236c6e 4276
f427ee49
A
4277 struct zone_map_range *map_range = &zone_info.zi_map_range;
4278
4279 *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
4280 submap_min = map_range->min_address;
4281 kmem_free(kernel_map, submap_min, zone_map_size);
4282
4283#if defined(__LP64__)
fe8ab488 4284 /*
f427ee49
A
4285 * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
4286 * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
6d2010ae 4287 */
f427ee49
A
4288 zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
4289 zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
4290 &remaining_size, SINGLE_GUARD);
4291#endif /* defined(__LP64__) */
1c79356b 4292
f427ee49
A
4293 /*
4294 * Allocate metadata array
4295 */
4296 zone_info.zi_meta_range =
4297 zone_init_allocate_va(&submap_min, zone_meta_size, true);
4298 zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);
4299
4300 zone_info.zi_array_base =
4301 (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
4302 zone_pva_from_addr(map_range->min_address).packed_address;
4303
4304 /*
4305 * Allocate other submaps
4306 */
4307 for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
4308 zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
4309 &remaining_denom, &remaining_size, MULTI_GUARD);
0a7de745 4310 }
5ba3f43e 4311
f427ee49
A
4312 vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
4313 zone_info.zi_general_range.min_address = vm_map_min(general_map);
4314 zone_info.zi_general_range.max_address = vm_map_max(general_map);
4315
4316 assert(submap_min == map_range->max_address);
4317
4318#if CONFIG_GZALLOC
4319 gzalloc_init(zone_map_size);
d9a64523 4320#endif
f427ee49
A
4321
4322 zone_create_flags_t kma_flags = ZC_NOCACHING |
4323 ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
4324 ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
4325
4326 (void)zone_create_ext("vm.permanent", 1, kma_flags,
4327 ZONE_ID_PERMANENT, ^(zone_t z){
4328 z->permanent = true;
4329 z->z_elem_size = 1;
4330 z->pcpu_elem_size = 1;
4331#if defined(__LP64__)
4332 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4333#endif
4334 });
4335 (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
4336 ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
4337 z->permanent = true;
4338 z->z_elem_size = 1;
4339 z->pcpu_elem_size = zpercpu_count();
4340#if defined(__LP64__)
4341 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4342#endif
4343 });
4344
4345 /*
 4346 * Now fix the zones that are missing their zone stats.
 4347 * We don't really know if zfree()s happened, so our stats
 4348 * are slightly off for early boot. ¯\_(ツ)_/¯
4349 */
4350 zone_index_foreach(idx) {
4351 zone_t tz = &zone_array[idx];
4352
4353 if (tz->z_self) {
4354 zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
4355
4356 zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
4357 (tz->countavail - tz->countfree) *
4358 zone_elem_size(tz);
4359 assert(tz->z_stats == NULL);
4360 tz->z_stats = zs;
4361#if ZONE_ENABLE_LOGGING
4362 if (tz->zone_logging && !tz->zlog_btlog) {
4363 zone_enable_logging(tz);
d9a64523 4364 }
f427ee49 4365#endif
d9a64523
A
4366 }
4367 }
fe8ab488 4368
f427ee49 4369#if CONFIG_ZLEAKS
cb323159 4370 /*
f427ee49 4371 * Initialize the zone leak monitor
cb323159 4372 */
f427ee49
A
4373 zleak_init(zone_map_size);
4374#endif /* CONFIG_ZLEAKS */
5ba3f43e 4375
f427ee49
A
4376#if VM_MAX_TAG_ZONES
4377 if (zone_tagging_on) {
4378 vm_allocation_zones_init();
4379 }
4380#endif
4381}
4382STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
cb323159 4383
f427ee49
A
4384__startup_func
4385static void
4386zone_set_foreign_range(
4387 vm_offset_t range_min,
4388 vm_offset_t range_max)
4389{
4390 zone_info.zi_foreign_range.min_address = range_min;
4391 zone_info.zi_foreign_range.max_address = range_max;
4392}
cb323159 4393
f427ee49
A
4394__startup_func
4395vm_offset_t
4396zone_foreign_mem_init(vm_size_t size)
4397{
4398 vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);
4399 zone_set_foreign_range(mem, mem + size);
4400 return mem;
4401}
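/*
 * Usage sketch (illustrative): very early boot code steals wired memory once,
 * registers it as the foreign range, and later crams pieces of it into zones
 * created to accept foreign pages. The flag name ZC_ALLOW_FOREIGN, the zone
 * name and the sizes below are assumptions about the callers, not code from
 * this file.
 */
#if 0
vm_offset_t base = zone_foreign_mem_init(4 * PAGE_SIZE);
zone_t early_zone = zone_create("hypothetical.early", 128, ZC_ALLOW_FOREIGN);
zcram(early_zone, base, 4 * PAGE_SIZE);
#endif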
eb6b6ca3 4402
f427ee49 4403#pragma mark zalloc
0a7de745 4404
f427ee49
A
4405#if KASAN_ZALLOC
4406/*
4407 * Called from zfree() to add the element being freed to the KASan quarantine.
4408 *
4409 * Returns true if the newly-freed element made it into the quarantine without
4410 * displacing another, false otherwise. In the latter case, addrp points to the
4411 * address of the displaced element, which will be freed by the zone.
4412 */
4413static bool
4414kasan_quarantine_freed_element(
4415 zone_t *zonep, /* the zone the element is being freed to */
4416 void **addrp) /* address of the element being freed */
4417{
4418 zone_t zone = *zonep;
4419 void *addr = *addrp;
4420
4421 /*
4422 * Resize back to the real allocation size and hand off to the KASan
4423 * quarantine. `addr` may then point to a different allocation, if the
4424 * current element replaced another in the quarantine. The zone then
4425 * takes ownership of the swapped out free element.
4426 */
4427 vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
4428 vm_size_t sz = usersz;
4429
4430 if (addr && zone->kasan_redzone) {
4431 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
4432 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
4433 assert(sz == zone_elem_size(zone));
4434 }
4435 if (addr && !zone->kasan_noquarantine) {
4436 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
4437 if (!addr) {
4438 return TRUE;
cb323159 4439 }
0a7de745 4440 }
f427ee49
A
4441 if (addr && zone->kasan_noquarantine) {
4442 kasan_unpoison(addr, zone_elem_size(zone));
0a7de745 4443 }
f427ee49
A
4444 *addrp = addr;
4445 return FALSE;
4446}
0b4e3aa0 4447
f427ee49
A
4448#endif /* KASAN_ZALLOC */
4449
4450static inline bool
4451zone_needs_async_refill(zone_t zone)
4452{
4453 if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
4454 return false;
0a7de745 4455 }
a3d08fcd 4456
f427ee49
A
4457 return zone->expandable || zone->page_count < zone->page_count_max;
4458}
4459
4460__attribute__((noinline))
4461static void
4462zone_refill_synchronously_locked(
4463 zone_t zone,
4464 zalloc_flags_t flags)
4465{
4466 thread_t thr = current_thread();
4467 bool set_expanding_vm_priv = false;
4468 zone_pva_t orig = zone->pages_intermediate;
4469
4470 while ((flags & Z_NOWAIT) == 0 && (zone->permanent
4471 ? zone_pva_is_equal(zone->pages_intermediate, orig)
4472 : zone->countfree == 0)) {
1c79356b 4473 /*
0a7de745
A
4474 * zone is empty, try to expand it
4475 *
f427ee49
A
 4476 * Note that we now allow up to 2 threads (1 vm_privileged and
 4477 * 1 non-vm_privileged) to expand the zone concurrently...
 4478 *
 4479 * this is necessary to keep vm_privileged threads
 4480 * running critical code necessary to continue
 4481 * compressing/swapping pages (i.e. making new free pages) from
 4482 * stalling behind non-vm_privileged threads waiting to acquire
4483 * free pages when the vm_page_free_count is below the
3e170ce0 4484 * vm_page_free_reserved limit.
1c79356b 4485 */
f427ee49
A
4486 if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
4487 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
1c79356b 4488 /*
3e170ce0
A
4489 * This is a non-vm_privileged thread and a non-vm_privileged or
4490 * a vm_privileged thread is already expanding the zone...
4491 * OR
4492 * this is a vm_privileged thread and a vm_privileged thread is
4493 * already expanding the zone...
4494 *
4495 * In either case wait for a thread to finish, then try again.
1c79356b 4496 */
f427ee49
A
4497 zone->waiting = true;
4498 assert_wait(zone, THREAD_UNINT);
4499 unlock_zone(zone);
4500 thread_block(THREAD_CONTINUE_NULL);
4501 lock_zone(zone);
4502 continue;
4503 }
4504
4505 if (zone->page_count >= zone->page_count_max) {
4506 if (zone->exhaustible) {
4507 break;
4508 }
4509 if (zone->expandable) {
4510 /*
4511 * If we're expandable, just don't go through this again.
4512 */
4513 zone->page_count_max = ~0u;
4514 } else {
4515 unlock_zone(zone);
4516
4517 panic_include_zprint = true;
316670eb 4518#if CONFIG_ZLEAKS
f427ee49
A
4519 if (zleak_state & ZLEAK_STATE_ACTIVE) {
4520 panic_include_ztrace = true;
1c79356b 4521 }
f427ee49
A
4522#endif /* CONFIG_ZLEAKS */
4523 panic("zalloc: zone \"%s\" empty.", zone->z_name);
1c79356b 4524 }
f427ee49
A
4525 }
4526
4527 /*
4528 * It is possible that a BG thread is refilling/expanding the zone
4529 * and gets pre-empted during that operation. That blocks all other
4530 * threads from making progress leading to a watchdog timeout. To
4531 * avoid that, boost the thread priority using the rwlock boost
4532 */
4533 set_thread_rwlock_boost();
4534
4535 if ((thr->options & TH_OPT_VMPRIV)) {
4536 zone->expanding_vm_priv = true;
4537 set_expanding_vm_priv = true;
4538 } else {
4539 zone->expanding_no_vm_priv = true;
4540 }
4541
4542 zone_replenish_locked(zone, flags, false);
4543
4544 if (set_expanding_vm_priv == true) {
4545 zone->expanding_vm_priv = false;
4546 } else {
4547 zone->expanding_no_vm_priv = false;
4548 }
4549
4550 if (zone->waiting) {
4551 zone->waiting = false;
4552 thread_wakeup(zone);
4553 }
4554 clear_thread_rwlock_boost();
4555
4556 if (zone->countfree == 0) {
4557 assert(flags & Z_NOPAGEWAIT);
4558 break;
4559 }
4560 }
4561
4562 if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
4563 zone_needs_async_refill(zone) && !vm_pool_low()) {
4564 zone->async_pending = true;
4565 unlock_zone(zone);
4566 thread_call_enter(&call_async_alloc);
4567 lock_zone(zone);
4568 assert(zone->z_self == zone);
4569 }
4570}
4571
4572__attribute__((noinline))
4573static void
4574zone_refill_asynchronously_locked(zone_t zone)
4575{
4576 uint32_t min_free = zone->prio_refill_count / 2;
4577 uint32_t resv_free = zone->prio_refill_count / 4;
4578 thread_t thr = current_thread();
4579
4580 /*
4581 * Nothing to do if there are plenty of elements.
4582 */
4583 while (zone->countfree <= min_free) {
4584 /*
4585 * Wakeup the replenish thread if not running.
4586 */
4587 if (!zone->zone_replenishing) {
4588 lck_spin_lock(&zone_replenish_lock);
4589 assert(zone_replenish_active < zone_replenish_max_threads);
4590 ++zone_replenish_active;
4591 lck_spin_unlock(&zone_replenish_lock);
4592 zone->zone_replenishing = true;
4593 zone_replenish_wakeups_initiated++;
4594 thread_wakeup(&zone->prio_refill_count);
4595 }
4596
4597 /*
 4598 * We'll let VM_PRIV threads continue to allocate until the
4599 * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
4600 * may continue.
4601 *
4602 * TH_OPT_ZONE_PRIV threads are the GC thread and a replenish thread itself.
4603 * Replenish threads *need* to use the reserve. GC threads need to
4604 * get through the current allocation, but then will wait at a higher
4605 * level after they've dropped any locks which would deadlock the
4606 * replenish thread.
4607 */
4608 if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
4609 (thr->options & TH_OPT_ZONE_PRIV)) {
4610 break;
4611 }
4612
4613 /*
4614 * Wait for the replenish threads to add more elements for us to allocate from.
4615 */
4616 zone_replenish_throttle_count++;
4617 unlock_zone(zone);
4618 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
4619 thread_block(THREAD_CONTINUE_NULL);
4620 lock_zone(zone);
4621
4622 assert(zone->z_self == zone);
4623 }
4624
4625 /*
4626 * If we're here because of zone_gc(), we didn't wait for
4627 * zone_replenish_thread to finish. So we need to ensure that
 4628 * we will successfully grab an element. This only applies to
 4629 * zones that have a replenish thread configured.
 4630 *
 4631 * The value of (prio_refill_count / 2) in the previous bit of code should have
4632 * given us headroom even though this thread didn't wait.
4633 */
4634 if (thr->options & TH_OPT_ZONE_PRIV) {
4635 assert(zone->countfree != 0);
4636 }
4637}
4638
4639#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
4640__attribute__((noinline))
4641static void
4642zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
4643{
4644 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
4645 unsigned int numsaved = 0;
4646
4647#if ZONE_ENABLE_LOGGING
4648 if (DO_LOGGING(zone)) {
2a1bd2d3
A
4649 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4650 __builtin_frame_address(0), NULL);
f427ee49
A
4651 btlog_add_entry(zone->zlog_btlog, (void *)addr,
4652 ZOP_ALLOC, (void **)zbt, numsaved);
4653 }
4654#endif
4655
4656#if CONFIG_ZLEAKS
4657 /*
4658 * Zone leak detection: capture a backtrace every zleak_sample_factor
4659 * allocations in this zone.
4660 */
4661 if (__improbable(zone->zleak_on)) {
4662 if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
4663 /* Avoid backtracing twice if zone logging is on */
4664 if (numsaved == 0) {
2a1bd2d3
A
4665 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4666 __builtin_frame_address(1), NULL);
f427ee49
A
4667 }
4668 /* Sampling can fail if another sample is happening at the same time in a different zone. */
4669 if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
4670 /* If it failed, roll back the counter so we sample the next allocation instead. */
4671 zone->zleak_capture = zleak_sample_factor;
4672 }
4673 }
4674 }
4675
4676 if (__improbable(zone_leaks_scan_enable &&
4677 !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
4678 unsigned int count, idx;
4679 /* Fill element, from tail, with backtrace in reverse order */
4680 if (numsaved == 0) {
2a1bd2d3
A
4681 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4682 __builtin_frame_address(1), NULL);
f427ee49
A
4683 }
4684 count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
4685 if (count >= numsaved) {
4686 count = numsaved - 1;
4687 }
4688 for (idx = 0; idx < count; idx++) {
4689 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
4690 }
4691 }
4692#endif /* CONFIG_ZLEAKS */
4693}
4694
4695static inline bool
4696zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
4697{
4698#if ZONE_ENABLE_LOGGING
4699 if (DO_LOGGING(zone)) {
4700 return true;
4701 }
4702#endif
4703#if CONFIG_ZLEAKS
4704 /*
4705 * Zone leak detection: capture a backtrace every zleak_sample_factor
4706 * allocations in this zone.
4707 */
4708 if (zone->zleak_on) {
4709 return true;
4710 }
4711 if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
4712 return true;
4713 }
4714#endif /* CONFIG_ZLEAKS */
4715 return false;
4716}
4717#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
4718#if ZONE_ENABLE_LOGGING
4719
4720__attribute__((noinline))
4721static void
4722zfree_log_trace(zone_t zone, vm_offset_t addr)
4723{
4724 /*
4725 * See if we're doing logging on this zone.
4726 *
4727 * There are two styles of logging used depending on
4728 * whether we're trying to catch a leak or corruption.
4729 */
4730 if (__improbable(DO_LOGGING(zone))) {
4731 if (corruption_debug_flag) {
4732 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4733 unsigned int numsaved;
0a7de745 4734 /*
f427ee49
A
4735 * We're logging to catch a corruption.
4736 *
4737 * Add a record of this zfree operation to log.
4738 */
2a1bd2d3
A
4739 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4740 __builtin_frame_address(1), NULL);
f427ee49
A
4741 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
4742 (void **)zbt, numsaved);
4743 } else {
4744 /*
4745 * We're logging to catch a leak.
4746 *
4747 * Remove any record we might have for this element
4748 * since it's being freed. Note that we may not find it
4749 * if the buffer overflowed and that's OK.
4750 *
4751 * Since the log is of a limited size, old records get
4752 * overwritten if there are more zallocs than zfrees.
39037602 4753 */
f427ee49
A
4754 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
4755 }
4756 }
4757}
4758#endif /* ZONE_ENABLE_LOGGING */
0a7de745 4759
f427ee49
A
4760/*
 4761 * Removes an element from the zone's free list; the caller must guarantee the zone has free elements.
4762 * Verifies that the next-pointer and backup next-pointer are intact,
4763 * and verifies that a poisoned element hasn't been modified.
4764 */
4765vm_offset_t
4766zalloc_direct_locked(
4767 zone_t zone,
4768 zalloc_flags_t flags __unused,
4769 vm_size_t waste __unused)
4770{
4771 struct zone_page_metadata *page_meta;
4772 zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
4773 vm_offset_t element, page, validate_bit = 0;
1c79356b 4774
f427ee49
A
 4775 /* pick the page queue to allocate from: foreign, intermediate, then all-free */
4776 if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
4777 kind = ZONE_ADDR_FOREIGN;
4778 page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
4779 page = (vm_offset_t)page_meta;
4780 } else if (!zone_pva_is_null(zone->pages_intermediate)) {
4781 page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
4782 page = zone_pva_to_addr(zone->pages_intermediate);
4783 } else if (!zone_pva_is_null(zone->pages_all_free)) {
4784 page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
4785 page = zone_pva_to_addr(zone->pages_all_free);
4786 if (os_sub_overflow(zone->allfree_page_count,
4787 page_meta->zm_page_count, &zone->allfree_page_count)) {
4788 zone_accounting_panic(zone, "allfree_page_count wrap-around");
4789 }
4790 } else {
4791 zone_accounting_panic(zone, "countfree corruption");
4792 }
fe8ab488 4793
f427ee49
A
4794 if (!zone_has_index(zone, page_meta->zm_index)) {
4795 zone_page_metadata_index_confusion_panic(zone, page, page_meta);
4796 }
0a7de745 4797
f427ee49 4798 element = zone_page_meta_get_freelist(zone, page_meta, page);
0a7de745 4799
f427ee49
A
4800 vm_offset_t *primary = (vm_offset_t *) element;
4801 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
cb323159 4802
f427ee49
A
4803 /*
4804 * since the primary next pointer is xor'ed with zp_nopoison_cookie
4805 * for obfuscation, retrieve the original value back
4806 */
4807 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
4808 vm_offset_t next_element_primary = *primary;
4809 vm_offset_t next_element_backup = *backup;
5ba3f43e 4810
f427ee49
A
4811 /*
4812 * backup_ptr_mismatch_panic will determine what next_element
4813 * should have been, and print it appropriately
4814 */
4815 if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
4816 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4817 }
0a7de745 4818
f427ee49
A
4819 /* Check the backup pointer for the regular cookie */
4820 if (__improbable(next_element_primary != next_element_backup)) {
4821 /* Check for the poisoned cookie instead */
4822 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
4823 /* Neither cookie is valid, corruption has occurred */
4824 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4825 }
0a7de745 4826
f427ee49
A
4827 /*
4828 * Element was marked as poisoned, so check its integrity before using it.
4829 */
4830 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4831 } else if (zone->zfree_clear_mem) {
4832 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4833 }
3e170ce0 4834
f427ee49
A
4835 /* Remove this element from the free list */
4836 zone_page_meta_set_freelist(page_meta, page, next_element);
0a7de745 4837
f427ee49
A
4838 if (kind == ZONE_ADDR_FOREIGN) {
4839 if (next_element == 0) {
4840 /* last foreign element allocated on page, move to all_used_foreign */
4841 zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
4842 }
4843 } else if (next_element == 0) {
4844 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
4845 } else if (page_meta->zm_alloc_count == 0) {
4846 /* remove from free, move to intermediate */
4847 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
4848 }
39037602 4849
f427ee49
A
4850 if (os_add_overflow(page_meta->zm_alloc_count, 1,
4851 &page_meta->zm_alloc_count)) {
4852 /*
 4853 * This will not catch a lot of errors; the proper check
 4854 * would be against the number of elements this run should
 4855 * have, which is expensive to count.
 4856 *
 4857 * But zm_alloc_count is a 16 bit number which an attacker
 4858 * could theoretically find valuable to wrap around,
 4859 * so catch this.
4860 */
4861 zone_page_meta_accounting_panic(zone, page_meta,
4862 "zm_alloc_count overflow");
4863 }
4864 if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
4865 zone_accounting_panic(zone, "countfree wrap-around");
4866 }
fe8ab488 4867
f427ee49
A
4868#if VM_MAX_TAG_ZONES
4869 if (__improbable(zone->tags)) {
4870 vm_tag_t tag = zalloc_flags_get_tag(flags);
4871 // set the tag with b0 clear so the block remains inuse
4872 ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
4873 vm_tag_update_zone_size(tag, zone->tag_zone_index,
4874 zone_elem_size(zone), waste);
4875 }
4876#endif /* VM_MAX_TAG_ZONES */
4877#if KASAN_ZALLOC
4878 if (zone->percpu) {
4879 zpercpu_foreach_cpu(i) {
4880 kasan_poison_range(element + ptoa(i),
4881 zone_elem_size(zone), ASAN_VALID);
0a7de745 4882 }
f427ee49
A
4883 } else {
4884 kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
4885 }
4886#endif
4887
4888 return element | validate_bit;
4889}
4890
4891/*
4892 * zalloc returns an element from the specified zone.
2a1bd2d3
A
4893 *
4894 * The function is noinline when zlog can be used so that the backtracing can
4895 * reliably skip the zalloc_ext() and zalloc_log_or_trace_leaks()
4896 * boring frames.
f427ee49 4897 */
2a1bd2d3
A
4898#if ZONE_ENABLE_LOGGING
4899__attribute__((noinline))
4900#endif
f427ee49
A
4901void *
4902zalloc_ext(
4903 zone_t zone,
4904 zone_stats_t zstats,
4905 zalloc_flags_t flags,
4906 vm_size_t waste)
4907{
4908 vm_offset_t addr = 0;
4909 vm_size_t elem_size = zone_elem_size(zone);
4910
4911 /*
4912 * KASan uses zalloc() for fakestack, which can be called anywhere.
4913 * However, we make sure these calls can never block.
4914 */
4915 assert(zone->kasan_fakestacks ||
4916 ml_get_interrupts_enabled() ||
4917 ml_is_quiescing() ||
4918 debug_mode_active() ||
4919 startup_phase < STARTUP_SUB_EARLY_BOOT);
4920
4921 /*
4922 * Make sure Z_NOFAIL was not obviously misused
4923 */
4924 if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
4925 assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
1c79356b
A
4926 }
4927
f427ee49
A
4928#if CONFIG_ZCACHE
4929 /*
4930 * Note: if zone caching is on, gzalloc and tags aren't used
4931 * so we can always check this first
6d2010ae 4932 */
f427ee49
A
4933 if (zone_caching_enabled(zone)) {
4934 addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
4935 if (__probable(addr)) {
4936 goto allocated_from_cache;
6d2010ae
A
4937 }
4938 }
f427ee49 4939#endif /* CONFIG_ZCACHE */
0a7de745 4940
f427ee49
A
4941#if CONFIG_GZALLOC
4942 if (__improbable(zone->gzalloc_tracked)) {
4943 addr = gzalloc_alloc(zone, zstats, flags);
4944 goto allocated_from_gzalloc;
39236c6e 4945 }
f427ee49 4946#endif /* CONFIG_GZALLOC */
5ba3f43e 4947#if VM_MAX_TAG_ZONES
f427ee49
A
4948 if (__improbable(zone->tags)) {
4949 vm_tag_t tag = zalloc_flags_get_tag(flags);
4950 if (tag == VM_KERN_MEMORY_NONE) {
4951 /*
4952 * zone views into heaps can lead to a site-less call
 4953 * and we fall back to KALLOC as a tag for those.
4954 */
4955 tag = VM_KERN_MEMORY_KALLOC;
4956 flags |= Z_VM_TAG(tag);
0a7de745 4957 }
f427ee49 4958 vm_tag_will_update_zone(tag, zone->tag_zone_index);
0a7de745 4959 }
5ba3f43e 4960#endif /* VM_MAX_TAG_ZONES */
1c79356b 4961
f427ee49
A
4962 lock_zone(zone);
4963 assert(zone->z_self == zone);
0b4e3aa0 4964
f427ee49
A
4965 /*
4966 * Check if we need another thread to replenish the zone or
4967 * if we have to wait for a replenish thread to finish.
4968 * This is used for elements, like vm_map_entry, which are
4969 * needed themselves to implement zalloc().
4970 */
4971 if (__improbable(zone->prio_refill_count &&
4972 zone->countfree <= zone->prio_refill_count / 2)) {
4973 zone_refill_asynchronously_locked(zone);
4974 } else if (__improbable(zone->countfree == 0)) {
4975 zone_refill_synchronously_locked(zone, flags);
4976 if (__improbable(zone->countfree == 0)) {
4977 unlock_zone(zone);
4978 if (__improbable(flags & Z_NOFAIL)) {
4979 zone_nofail_panic(zone);
4980 }
4981 goto out_nomem;
4982 }
39037602
A
4983 }
4984
f427ee49
A
4985 addr = zalloc_direct_locked(zone, flags, waste);
4986 if (__probable(zstats != NULL)) {
4987 /*
4988 * The few vm zones used before zone_init() runs do not have
4989 * per-cpu stats yet
4990 */
4991 int cpu = cpu_number();
4992 zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
4993#if ZALLOC_DETAILED_STATS
4994 if (waste) {
4995 zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
39037602 4996 }
f427ee49 4997#endif /* ZALLOC_DETAILED_STATS */
fe8ab488
A
4998 }
4999
f427ee49
A
5000 unlock_zone(zone);
5001
5002#if ZALLOC_ENABLE_POISONING
5003 bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
5004#endif
5005 addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
5006 zone_clear_freelist_pointers(zone, addr);
5007#if ZALLOC_ENABLE_POISONING
5008 /*
5009 * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
5010 * so we will check the first word even if we just
5011 * cleared it.
5012 */
5013 zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
5014 validate);
5015#endif /* ZALLOC_ENABLE_POISONING */
5ba3f43e 5016
f427ee49
A
5017allocated_from_cache:
5018#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
5019 if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
5020 zalloc_log_or_trace_leaks(zone, addr);
5021 }
5022#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
d9a64523 5023
f427ee49
A
5024#if CONFIG_GZALLOC
5025allocated_from_gzalloc:
5ba3f43e 5026#endif
f427ee49
A
5027#if KASAN_ZALLOC
5028 if (zone->kasan_redzone) {
5029 addr = kasan_alloc(addr, elem_size,
5030 elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
5031 elem_size -= 2 * zone->kasan_redzone;
5032 }
5033 /*
5034 * Initialize buffer with unique pattern only if memory
5035 * wasn't expected to be zeroed.
5036 */
5037 if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
5038 kasan_leak_init(addr, elem_size);
5039 }
5040#endif /* KASAN_ZALLOC */
5041 if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
5042 bzero((void *)addr, elem_size);
5043 }
5ba3f43e 5044
f427ee49 5045 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
a39ff7e2 5046
f427ee49
A
5047out_nomem:
5048 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
0a7de745 5049 return (void *)addr;
1c79356b
A
5050}
5051
91447636 5052void *
f427ee49 5053zalloc(union zone_or_view zov)
fe8ab488 5054{
f427ee49 5055 return zalloc_flags(zov, Z_WAITOK);
fe8ab488
A
5056}
5057
5058void *
f427ee49 5059zalloc_noblock(union zone_or_view zov)
fe8ab488 5060{
f427ee49 5061 return zalloc_flags(zov, Z_NOWAIT);
fe8ab488
A
5062}
5063
5064void *
f427ee49 5065zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
1c79356b 5066{
f427ee49
A
5067 zone_t zone = zov.zov_view->zv_zone;
5068 zone_stats_t zstats = zov.zov_view->zv_stats;
5069 assert(!zone->percpu);
5070 return zalloc_ext(zone, zstats, flags, 0);
5ba3f43e
A
5071}
5072
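/*
 * Illustrative usage sketch for the wrappers above; the zone name, the
 * creation flags and the "struct widget" type are hypothetical, not taken
 * from this file. A typical caller pairs zone_create() with zalloc_flags()
 * and zfree():
 *
 *	static zone_t widget_zone;
 *
 *	widget_zone = zone_create("widgets", sizeof(struct widget),
 *	    ZC_DESTRUCTIBLE);
 *
 *	struct widget *w = zalloc_flags(widget_zone, Z_WAITOK | Z_ZERO);
 *	if (w != NULL) {
 *		// ... use the element ...
 *		zfree(widget_zone, w);
 *	}
 */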
5073void *
f427ee49 5074zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
5ba3f43e 5075{
f427ee49
A
5076 zone_t zone = zov.zov_view->zv_zone;
5077 zone_stats_t zstats = zov.zov_view->zv_stats;
5078 assert(zone->percpu);
5079 return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
1c79356b
A
5080}
5081
f427ee49
A
5082static void *
5083_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
1c79356b 5084{
f427ee49
A
5085 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5086 struct zone_page_metadata *page_meta;
5087 vm_offset_t offs, addr;
5088 zone_pva_t pva;
5089
5090 assert(ml_get_interrupts_enabled() ||
5091 ml_is_quiescing() ||
5092 debug_mode_active() ||
5093 startup_phase < STARTUP_SUB_EARLY_BOOT);
5094
5095 size = (size + mask) & ~mask;
5096 assert(size <= PAGE_SIZE);
5097
5098 lock_zone(zone);
5099 assert(zone->z_self == zone);
5100
5101 for (;;) {
5102 pva = zone->pages_intermediate;
5103 while (!zone_pva_is_null(pva)) {
5104 page_meta = zone_pva_to_meta(pva, kind);
5105 if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
5106 goto found;
5107 }
5108 pva = page_meta->zm_page_next;
5109 }
5110
5111 zone_refill_synchronously_locked(zone, Z_WAITOK);
5112 }
5113
5114found:
5115 offs = (page_meta->zm_freelist_offs + mask) & ~mask;
5116 page_meta->zm_freelist_offs = offs + size;
5117 page_meta->zm_alloc_count += size;
5118 zone->countfree -= size;
5119 if (__probable(zone->z_stats)) {
5120 zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
5121 }
5122
5123 if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
5124 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
5125 }
5126
5127 unlock_zone(zone);
5128
5129 addr = offs + zone_pva_to_addr(pva);
5130
5131 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5132 return (void *)addr;
1c79356b
A
5133}
5134
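/*
 * Worked example of the alignment arithmetic in _zalloc_permanent() above
 * (illustrative numbers): with mask = 7, i.e. 8-byte alignment, a request
 * of size 12 rounds up as (12 + 7) & ~7 = 16, and a freelist offset of 20
 * is bumped to (20 + 7) & ~7 = 24 before those 16 bytes are carved out of
 * the page.
 */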
f427ee49
A
5135static void *
5136_zalloc_permanent_large(size_t size, vm_offset_t mask)
d9a64523 5137{
f427ee49
A
5138 kern_return_t kr;
5139 vm_offset_t addr;
5140
5141 kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
5142 KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
5143 VM_KERN_MEMORY_KALLOC);
5144 if (kr != 0) {
5145 panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
5146 size, kr);
5147 }
d9a64523
A
5148 return (void *)addr;
5149}
5150
f427ee49
A
5151void *
5152zalloc_permanent(vm_size_t size, vm_offset_t mask)
d9a64523 5153{
f427ee49
A
5154 if (size <= PAGE_SIZE) {
5155 zone_t zone = &zone_array[ZONE_ID_PERMANENT];
5156 return _zalloc_permanent(zone, size, mask);
5157 }
5158 return _zalloc_permanent_large(size, mask);
d9a64523
A
5159}
5160
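/*
 * Illustrative call (the structure name and the use of _Alignof are
 * assumptions for the sketch): the mask argument is "alignment - 1",
 * matching the (size + mask) & ~mask rounding above. Permanent allocations
 * are never freed, and the large-size path explicitly requests KMA_ZERO.
 *
 *	struct boot_table *tbl = zalloc_permanent(sizeof(struct boot_table),
 *	    _Alignof(struct boot_table) - 1);
 */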
f427ee49
A
5161void *
5162zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
5163{
5164 zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
5165 return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
5166}
fe8ab488 5167
0b4e3aa0 5168void
f427ee49 5169zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
0b4e3aa0 5170{
f427ee49
A
5171 zone_index_foreach(i) {
5172 zone_t z = &zone_array[i];
5ba3f43e 5173
f427ee49 5174 if (z->no_callout) {
5ba3f43e
A
5175 /* async_pending will never be set */
5176 continue;
5177 }
5178
f427ee49
A
5179 lock_zone(z);
5180 if (z->z_self && z->async_pending) {
5181 z->async_pending = false;
5182 zone_refill_synchronously_locked(z, Z_WAITOK);
39236c6e 5183 }
f427ee49 5184 unlock_zone(z);
39236c6e 5185 }
0b4e3aa0
A
5186}
5187
1c79356b 5188/*
f427ee49
A
5189 * Adds the element to the head of the zone's free list
5190 * Keeps a backup next-pointer at the end of the element
1c79356b 5191 */
f427ee49
A
5192void
5193zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
d9a64523 5194{
f427ee49
A
5195 struct zone_page_metadata *page_meta;
5196 vm_offset_t page, old_head;
5197 zone_addr_kind_t kind;
5198 vm_size_t elem_size = zone_elem_size(zone);
d9a64523 5199
f427ee49
A
5200 vm_offset_t *primary = (vm_offset_t *) element;
5201 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
d9a64523 5202
f427ee49
A
5203 page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
5204 old_head = zone_page_meta_get_freelist(zone, page_meta, page);
d9a64523 5205
f427ee49
A
5206 if (__improbable(old_head == element)) {
5207 panic("zfree: double free of %p to zone %s%s\n",
5208 (void *) element, zone_heap_name(zone), zone->z_name);
d9a64523 5209 }
c910b4d9 5210
f427ee49
A
5211#if ZALLOC_ENABLE_POISONING
5212 if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
5213 assert(zone->percpu);
5214 poison = false;
5ba3f43e 5215 }
f427ee49
A
5216#else
5217 poison = false;
5ba3f43e
A
5218#endif
5219
c910b4d9 5220 /*
f427ee49
A
5221	 * Always write a redundant next pointer.
5222	 * So that it is more difficult to forge, xor it with a random cookie.
5223	 * A poisoned element is indicated by using zp_poisoned_cookie
5224	 * instead of zp_nopoison_cookie.
5225 */
5226
5227 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
5228
5229 /*
5230 * Insert this element at the head of the free list. We also xor the
5231 * primary pointer with the zp_nopoison_cookie to make sure a free
5232 * element does not provide the location of the next free element directly.
c910b4d9 5233 */
f427ee49 5234 *primary = old_head ^ zp_nopoison_cookie;
c910b4d9 5235
f427ee49
A
5236#if VM_MAX_TAG_ZONES
5237 if (__improbable(zone->tags)) {
5238 vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
5239 // set the tag with b0 clear so the block remains inuse
5240 ZTAG(zone, element)[0] = 0xFFFE;
5241 vm_tag_update_zone_size(tag, zone->tag_zone_index,
5242 -((int64_t)elem_size), 0);
0a7de745 5243 }
f427ee49 5244#endif /* VM_MAX_TAG_ZONES */
1c79356b 5245
f427ee49
A
5246 zone_page_meta_set_freelist(page_meta, page, element);
5247 if (os_sub_overflow(page_meta->zm_alloc_count, 1,
5248 &page_meta->zm_alloc_count)) {
5249 zone_page_meta_accounting_panic(zone, page_meta,
5250 "alloc_count wrap-around");
0a7de745 5251 }
f427ee49 5252 zone->countfree++;
55e303ae 5253
f427ee49
A
5254 if (kind == ZONE_ADDR_FOREIGN) {
5255 if (old_head == 0) {
5256 /* first foreign element freed on page, move from all_used_foreign */
5257 zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
5258 }
5259 } else if (page_meta->zm_alloc_count == 0) {
5260		} else if (page_meta->zm_alloc_count == 0) {
		/* whether the page was on the intermediate or all_used queue, move it to free */
5261 zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
5262 zone->allfree_page_count += page_meta->zm_page_count;
5263 } else if (old_head == 0) {
5264 /* first free element on page, move from all_used */
5265 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
5266 }
316670eb 5267
f427ee49
A
5268#if KASAN_ZALLOC
5269 if (zone->percpu) {
5270 zpercpu_foreach_cpu(i) {
5271 kasan_poison_range(element + ptoa(i), elem_size,
5272 ASAN_HEAP_FREED);
39037602 5273 }
f427ee49
A
5274 } else {
5275 kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
39037602 5276 }
f427ee49
A
5277#endif
5278}
39037602 5279
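/*
 * Sketch of the freelist encoding written above (illustrative): for a
 * freed element E whose next free element is N,
 *
 *	primary (first word of E) = N ^ zp_nopoison_cookie
 *	backup  (last word of E)  = N ^ (poisoned ? zp_poisoned_cookie
 *	                                          : zp_nopoison_cookie)
 *
 * so a consumer walking the list recovers the next element with
 *
 *	next = *(vm_offset_t *)E ^ zp_nopoison_cookie;
 *
 * and detects a poisoned element by which cookie makes the backup pointer
 * agree with the primary one.
 */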
2a1bd2d3
A
5280/*
5281 * The function is noinline when zlog can be used so that the backtracing can
5282 * reliably skip the zfree_ext() and zfree_log_trace()
5283 * boring frames.
5284 */
5285#if ZONE_ENABLE_LOGGING
5286__attribute__((noinline))
5287#endif
f427ee49
A
5288void
5289zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
5290{
5291 vm_offset_t elem = (vm_offset_t)addr;
5292 vm_size_t elem_size = zone_elem_size(zone);
5293 bool poison = false;
2d21ac55 5294
f427ee49
A
5295 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
5296 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
1c79356b 5297
f427ee49
A
5298#if KASAN_ZALLOC
5299 if (kasan_quarantine_freed_element(&zone, &addr)) {
5300 return;
fe8ab488 5301 }
c910b4d9 5302 /*
f427ee49
A
5303 * kasan_quarantine_freed_element() might return a different
5304 * {zone, addr} than the one being freed for kalloc heaps.
5305 *
5306 * Make sure we reload everything.
c910b4d9 5307 */
f427ee49
A
5308 elem = (vm_offset_t)addr;
5309 elem_size = zone_elem_size(zone);
5310#endif
c910b4d9 5311
f427ee49
A
5312#if CONFIG_ZLEAKS
5313 /*
5314 * Zone leak detection: un-track the allocation
5315 */
5316 if (__improbable(zone->zleak_on)) {
5317 zleak_free(elem, elem_size);
c910b4d9 5318 }
f427ee49 5319#endif /* CONFIG_ZLEAKS */
c910b4d9 5320
d9a64523 5321#if CONFIG_ZCACHE
f427ee49
A
5322 /*
5323 * Note: if zone caching is on, gzalloc and tags aren't used
5324 * so we can always check this first
5325 */
0a7de745 5326 if (zone_caching_enabled(zone)) {
f427ee49 5327 return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
0a7de745 5328 }
d9a64523
A
5329#endif /* CONFIG_ZCACHE */
5330
f427ee49
A
5331#if CONFIG_GZALLOC
5332 if (__improbable(zone->gzalloc_tracked)) {
5333 return gzalloc_free(zone, zstats, addr);
1c79356b 5334 }
f427ee49 5335#endif /* CONFIG_GZALLOC */
316670eb 5336
f427ee49
A
5337#if ZONE_ENABLE_LOGGING
5338 if (__improbable(DO_LOGGING(zone))) {
5339 zfree_log_trace(zone, elem);
5ba3f43e 5340 }
f427ee49 5341#endif /* ZONE_ENABLE_LOGGING */
316670eb 5342
f427ee49
A
5343 if (zone->zfree_clear_mem) {
5344 poison = zfree_clear(zone, elem, elem_size);
d9a64523 5345 }
0b4e3aa0 5346
f427ee49
A
5347 lock_zone(zone);
5348 assert(zone->z_self == zone);
5349
5350 if (!poison) {
5351 poison = zfree_poison_element(zone, &zone->zp_count, elem);
6d2010ae 5352 }
0a7de745 5353
f427ee49
A
5354 if (__probable(zstats != NULL)) {
5355 /*
5356 * The few vm zones used before zone_init() runs do not have
5357 * per-cpu stats yet
5358 */
5359 zpercpu_get(zstats)->zs_mem_freed += elem_size;
5ba3f43e 5360 }
f427ee49
A
5361
5362 zfree_direct_locked(zone, elem, poison);
5ba3f43e 5363
1c79356b
A
5364 unlock_zone(zone);
5365}
5366
1c79356b 5367void
f427ee49 5368(zfree)(union zone_or_view zov, void *addr)
1c79356b 5369{
f427ee49
A
5370 zone_t zone = zov.zov_view->zv_zone;
5371 zone_stats_t zstats = zov.zov_view->zv_stats;
5372 assert(!zone->percpu);
5373 zfree_ext(zone, zstats, addr);
1c79356b
A
5374}
5375
f427ee49
A
5376void
5377zfree_percpu(union zone_or_view zov, void *addr)
1c79356b 5378{
f427ee49
A
5379 zone_t zone = zov.zov_view->zv_zone;
5380 zone_stats_t zstats = zov.zov_view->zv_stats;
5381 assert(zone->percpu);
5382 zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
1c79356b
A
5383}
5384
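/*
 * Illustrative usage sketch for per-cpu zones (the "counters_zone" and
 * "struct counters" names are hypothetical, and the zone is assumed to
 * have been created as a per-cpu zone elsewhere): the returned pointer is
 * mangled, so callers go through the zpercpu accessors instead of
 * dereferencing it directly.
 *
 *	uint64_t total = 0;
 *	struct counters *cp = zalloc_percpu(counters_zone, Z_WAITOK | Z_ZERO);
 *
 *	zpercpu_get(cp)->hits++;           // this CPU's slot
 *	zpercpu_foreach(it, cp) {          // every CPU's slot
 *		total += it->hits;
 *	}
 *
 *	zfree_percpu(counters_zone, cp);
 */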
f427ee49
A
5385#pragma mark vm integration, MIG routines
5386
cb323159
A
5387/*
5388 * Drops (i.e. frees) the elements in the all free pages queue of a zone.
5389 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
5390 */
f427ee49
A
5391static void
5392zone_drop_free_elements(zone_t z)
5ba3f43e 5393{
f427ee49 5394 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
cb323159 5395 unsigned int total_freed_pages = 0;
f427ee49
A
5396 struct zone_page_metadata *page_meta, *seq_meta;
5397 vm_address_t page_addr;
cb323159 5398 vm_size_t size_to_free;
f427ee49
A
5399 vm_size_t free_count;
5400 uint32_t page_count;
5ba3f43e 5401
eb6b6ca3 5402 current_thread()->options |= TH_OPT_ZONE_PRIV;
5ba3f43e 5403 lock_zone(z);
5ba3f43e 5404
f427ee49 5405 while (!zone_pva_is_null(z->pages_all_free)) {
eb6b6ca3 5406 /*
f427ee49
A
5407 * If any replenishment threads are running, defer to them,
5408 * so that we don't deplete reserved zones.
5409 *
5410 * The timing of the check isn't super important, as there are
5411 * enough reserves to allow freeing an extra page_meta.
5412 *
5413 * Hence, we can check without grabbing the lock every time
5414 * through the loop. We do need the lock however to avoid
5415 * missing a wakeup when we decide to block.
eb6b6ca3
A
5416 */
5417 if (zone_replenish_active > 0) {
5418 lck_spin_lock(&zone_replenish_lock);
5419 if (zone_replenish_active > 0) {
5420 assert_wait(&zone_replenish_active, THREAD_UNINT);
5421 lck_spin_unlock(&zone_replenish_lock);
5422 unlock_zone(z);
5423 thread_block(THREAD_CONTINUE_NULL);
5424 lock_zone(z);
5425 continue;
5426 }
5427 lck_spin_unlock(&zone_replenish_lock);
5428 }
f427ee49
A
5429
5430 page_meta = zone_pva_to_meta(z->pages_all_free, kind);
5431 page_count = page_meta->zm_page_count;
5432 free_count = zone_elem_count(z, ptoa(page_count), kind);
5433
cb323159 5434 /*
f427ee49
A
5435 * Don't drain zones with async refill to below the refill
5436 * threshold, as they need some reserve to function properly.
cb323159 5437 */
f427ee49
A
5438 if (!z->destroyed && z->prio_refill_count &&
5439 (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
cb323159
A
5440 break;
5441 }
5ba3f43e 5442
f427ee49 5443 zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);
cb323159 5444
f427ee49
A
5445 if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
5446 zone_accounting_panic(z, "countfree wrap-around");
5447 }
5448 if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
5449 zone_accounting_panic(z, "countavail wrap-around");
5450 }
5451 if (os_sub_overflow(z->allfree_page_count, page_count,
5452 &z->allfree_page_count)) {
5453 zone_accounting_panic(z, "allfree_page_count wrap-around");
5454 }
5455 if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
5456 zone_accounting_panic(z, "page_count wrap-around");
5457 }
cb323159 5458
f427ee49
A
5459 os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
5460 os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
cb323159 5461
f427ee49
A
5462 bzero(page_meta, sizeof(*page_meta) * page_count);
5463 seq_meta = page_meta;
5464 page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */
cb323159 5465
cb323159 5466 unlock_zone(z);
5ba3f43e 5467
5ba3f43e 5468 /* Free the pages for metadata and account for them */
f427ee49
A
5469 total_freed_pages += page_count;
5470 size_to_free = ptoa(page_count);
5ba3f43e 5471#if KASAN_ZALLOC
f427ee49 5472 kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
5ba3f43e
A
5473#endif
5474#if VM_MAX_TAG_ZONES
0a7de745 5475 if (z->tags) {
f427ee49 5476 ztMemoryRemove(z, page_addr, size_to_free);
0a7de745 5477 }
5ba3f43e 5478#endif /* VM_MAX_TAG_ZONES */
f427ee49
A
5479
5480 if (z->va_sequester && z->alloc_pages == page_count) {
5481 kernel_memory_depopulate(submap_for_zone(z), page_addr,
5482 size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
5483 } else {
5484 kmem_free(submap_for_zone(z), page_addr, size_to_free);
5485 seq_meta = NULL;
5486 }
eb6b6ca3 5487 thread_yield_to_preemption();
f427ee49 5488
cb323159 5489 lock_zone(z);
f427ee49
A
5490
5491 if (seq_meta) {
5492 zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
5493 z->sequester_page_count += page_count;
5494 }
cb323159 5495 }
f427ee49
A
5496 if (z->destroyed) {
5497 assert(zone_pva_is_null(z->pages_all_free));
5498 assert(z->allfree_page_count == 0);
5ba3f43e 5499 }
cb323159 5500 unlock_zone(z);
eb6b6ca3 5501 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
5ba3f43e 5502
cb323159 5503#if DEBUG || DEVELOPMENT
0a7de745 5504 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
f427ee49
A
5505 kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
5506 zone_heap_name(z), z->z_name,
5507 (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
5508 total_freed_pages);
0a7de745 5509 }
cb323159 5510#endif /* DEBUG || DEVELOPMENT */
5ba3f43e
A
5511}
5512
1c79356b
A
5513/* Zone garbage collection
5514 *
5515 * zone_gc will walk through all the free elements in all the
5516 * zones that are marked collectable looking for reclaimable
5517 * pages. zone_gc is called by consider_zone_gc when the system
5518 * begins to run out of memory.
5ba3f43e
A
5519 *
5520 * We should ensure that zone_gc never blocks.
1c79356b
A
5521 */
5522void
5ba3f43e 5523zone_gc(boolean_t consider_jetsams)
1c79356b 5524{
5ba3f43e
A
5525 if (consider_jetsams) {
5526 kill_process_in_largest_zone();
5527 /*
5528 * If we do end up jetsamming something, we need to do a zone_gc so that
5529 * we can reclaim free zone elements and update the zone map size.
5530 * Fall through.
5531 */
5532 }
1c79356b 5533
b0d623f7 5534 lck_mtx_lock(&zone_gc_lock);
1c79356b 5535
cb323159 5536#if DEBUG || DEVELOPMENT
0a7de745 5537 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
39037602 5538 kprintf("zone_gc() starting...\n");
0a7de745 5539 }
cb323159 5540#endif /* DEBUG || DEVELOPMENT */
1c79356b 5541
f427ee49
A
5542 zone_index_foreach(i) {
5543 zone_t z = &zone_array[i];
1c79356b 5544
5ba3f43e 5545 if (!z->collectable) {
316670eb 5546 continue;
39037602 5547 }
d9a64523
A
5548#if CONFIG_ZCACHE
5549 if (zone_caching_enabled(z)) {
5550 zcache_drain_depot(z);
5551 }
5552#endif /* CONFIG_ZCACHE */
f427ee49 5553 if (zone_pva_is_null(z->pages_all_free)) {
1c79356b
A
5554 continue;
5555 }
0a7de745 5556
f427ee49 5557 zone_drop_free_elements(z);
39037602 5558 }
316670eb 5559
39037602
A
5560 lck_mtx_unlock(&zone_gc_lock);
5561}
316670eb 5562
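/*
 * Call-path sketch (illustrative): on DEBUG/DEVELOPMENT kernels the
 * collector can be forced through the MIG interface further below,
 *
 *	mach_zone_force_gc() -> consider_zone_gc(FALSE) -> zone_gc(FALSE)
 *
 * while the pageout daemon reaches it through consider_zone_gc() when the
 * system starts running out of free pages.
 */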
39037602
A
5563/*
5564 * consider_zone_gc:
5565 *
5566 * Called by the pageout daemon when the system needs more free pages.
5567 */
1c79356b
A
5568
5569void
5ba3f43e 5570consider_zone_gc(boolean_t consider_jetsams)
1c79356b 5571{
f427ee49
A
5572 /*
5573 * One-time reclaim of kernel_map resources we allocated in
5574 * early boot.
5575 *
5576 * Use atomic exchange in case multiple threads race into here.
5577 */
5578 vm_offset_t deallocate_kaddr;
5579 if (kmapoff_kaddr != 0 &&
5580 (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
5581 vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
316670eb 5582 }
1c79356b 5583
f427ee49 5584 zone_gc(consider_jetsams);
6d2010ae 5585}
2d21ac55 5586
d9a64523
A
5587/*
5588 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
5589 * requesting zone information.
5590 * Frees unused pages towards the end of the region, and zeroes out unused
5591 * space on the last page.
5592 */
f427ee49 5593static vm_map_copy_t
d9a64523 5594create_vm_map_copy(
0a7de745
A
5595 vm_offset_t start_addr,
5596 vm_size_t total_size,
5597 vm_size_t used_size)
d9a64523 5598{
0a7de745
A
5599 kern_return_t kr;
5600 vm_offset_t end_addr;
5601 vm_size_t free_size;
5602 vm_map_copy_t copy;
d9a64523
A
5603
5604 if (used_size != total_size) {
5605 end_addr = start_addr + used_size;
5606 free_size = total_size - (round_page(end_addr) - start_addr);
5607
5608 if (free_size >= PAGE_SIZE) {
5609 kmem_free(ipc_kernel_map,
0a7de745 5610 round_page(end_addr), free_size);
d9a64523
A
5611 }
5612 bzero((char *) end_addr, round_page(end_addr) - end_addr);
5613 }
5614
5615 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
0a7de745 5616 (vm_map_size_t)used_size, TRUE, &copy);
d9a64523
A
5617 assert(kr == KERN_SUCCESS);
5618
5619 return copy;
5620}
a39ff7e2 5621
f427ee49 5622static boolean_t
a39ff7e2 5623get_zone_info(
f427ee49 5624 zone_t z,
0a7de745
A
5625 mach_zone_name_t *zn,
5626 mach_zone_info_t *zi)
a39ff7e2
A
5627{
5628 struct zone zcopy;
5629
5630 assert(z != ZONE_NULL);
5631 lock_zone(z);
f427ee49 5632 if (!z->z_self) {
a39ff7e2
A
5633 unlock_zone(z);
5634 return FALSE;
5635 }
5636 zcopy = *z;
5637 unlock_zone(z);
5638
5639 if (zn != NULL) {
f427ee49
A
5640 /*
5641 * Append kalloc heap name to zone name (if zone is used by kalloc)
5642 */
5643 char temp_zone_name[MAX_ZONE_NAME] = "";
5644 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5645 zone_heap_name(z), z->z_name);
5646
a39ff7e2 5647 /* assuming here the name data is static */
f427ee49
A
5648 (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
5649 strlen(temp_zone_name) + 1);
a39ff7e2
A
5650 }
5651
5652 if (zi != NULL) {
f427ee49
A
5653 *zi = (mach_zone_info_t) {
5654 .mzi_count = zone_count_allocated(&zcopy),
5655 .mzi_cur_size = ptoa_64(zcopy.page_count),
5656 // max_size for zprint is now high-watermark of pages used
5657 .mzi_max_size = ptoa_64(zcopy.page_count_hwm),
5658 .mzi_elem_size = zcopy.pcpu_elem_size,
5659 .mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
5660 .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
5661 };
5662 zpercpu_foreach(zs, zcopy.z_stats) {
5663 zi->mzi_sum_size += zs->zs_mem_allocated;
5664 }
a39ff7e2 5665 if (zcopy.collectable) {
f427ee49
A
5666 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
5667 ptoa_64(zcopy.allfree_page_count));
a39ff7e2
A
5668 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
5669 }
5670 }
5671
5672 return TRUE;
5673}
5674
6d2010ae
A
5675kern_return_t
5676task_zone_info(
0a7de745
A
5677 __unused task_t task,
5678 __unused mach_zone_name_array_t *namesp,
316670eb 5679 __unused mach_msg_type_number_t *namesCntp,
0a7de745 5680 __unused task_zone_info_array_t *infop,
316670eb
A
5681 __unused mach_msg_type_number_t *infoCntp)
5682{
5683 return KERN_FAILURE;
5684}
5685
6d2010ae
A
5686kern_return_t
5687mach_zone_info(
0a7de745
A
5688 host_priv_t host,
5689 mach_zone_name_array_t *namesp,
6d2010ae 5690 mach_msg_type_number_t *namesCntp,
0a7de745 5691 mach_zone_info_array_t *infop,
6d2010ae 5692 mach_msg_type_number_t *infoCntp)
3e170ce0 5693{
0a7de745 5694 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
3e170ce0
A
5695}
5696
39037602 5697
3e170ce0
A
5698kern_return_t
5699mach_memory_info(
0a7de745
A
5700 host_priv_t host,
5701 mach_zone_name_array_t *namesp,
3e170ce0 5702 mach_msg_type_number_t *namesCntp,
0a7de745 5703 mach_zone_info_array_t *infop,
3e170ce0
A
5704 mach_msg_type_number_t *infoCntp,
5705 mach_memory_info_array_t *memoryInfop,
5706 mach_msg_type_number_t *memoryInfoCntp)
6d2010ae 5707{
0a7de745
A
5708 mach_zone_name_t *names;
5709 vm_offset_t names_addr;
5710 vm_size_t names_size;
5711
5712 mach_zone_info_t *info;
5713 vm_offset_t info_addr;
5714 vm_size_t info_size;
5715
5716 mach_memory_info_t *memory_info;
5717 vm_offset_t memory_info_addr;
5718 vm_size_t memory_info_size;
5719 vm_size_t memory_info_vmsize;
5720 unsigned int num_info;
5721
5722 unsigned int max_zones, used_zones, i;
5723 mach_zone_name_t *zn;
5724 mach_zone_info_t *zi;
5725 kern_return_t kr;
5726
5727 uint64_t zones_collectable_bytes = 0;
5728
5729 if (host == HOST_NULL) {
6d2010ae 5730 return KERN_INVALID_HOST;
0a7de745 5731 }
316670eb 5732#if CONFIG_DEBUGGER_FOR_ZONE_INFO
0a7de745 5733 if (!PE_i_can_has_debugger(NULL)) {
316670eb 5734 return KERN_INVALID_HOST;
0a7de745 5735 }
316670eb 5736#endif
6d2010ae
A
5737
5738 /*
5739 * We assume that zones aren't freed once allocated.
5740 * We won't pick up any zones that are allocated later.
5741 */
5742
f427ee49 5743 max_zones = os_atomic_load(&num_zones, relaxed);
6d2010ae
A
5744
5745 names_size = round_page(max_zones * sizeof *names);
5746 kr = kmem_alloc_pageable(ipc_kernel_map,
0a7de745
A
5747 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5748 if (kr != KERN_SUCCESS) {
6d2010ae 5749 return kr;
0a7de745 5750 }
6d2010ae
A
5751 names = (mach_zone_name_t *) names_addr;
5752
5753 info_size = round_page(max_zones * sizeof *info);
5754 kr = kmem_alloc_pageable(ipc_kernel_map,
0a7de745 5755 &info_addr, info_size, VM_KERN_MEMORY_IPC);
6d2010ae
A
5756 if (kr != KERN_SUCCESS) {
5757 kmem_free(ipc_kernel_map,
0a7de745 5758 names_addr, names_size);
6d2010ae
A
5759 return kr;
5760 }
6d2010ae
A
5761 info = (mach_zone_info_t *) info_addr;
5762
5763 zn = &names[0];
5764 zi = &info[0];
5765
5ba3f43e 5766 used_zones = max_zones;
39037602 5767 for (i = 0; i < max_zones; i++) {
a39ff7e2 5768 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
5ba3f43e
A
5769 used_zones--;
5770 continue;
5771 }
a39ff7e2 5772 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
6d2010ae
A
5773 zn++;
5774 zi++;
5775 }
5776
d9a64523 5777 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
5ba3f43e 5778 *namesCntp = used_zones;
6d2010ae 5779
d9a64523 5780 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
5ba3f43e 5781 *infoCntp = used_zones;
0a7de745 5782
5ba3f43e 5783 num_info = 0;
39037602 5784 memory_info_addr = 0;
6d2010ae 5785
0a7de745
A
5786 if (memoryInfop && memoryInfoCntp) {
5787 vm_map_copy_t copy;
5ba3f43e
A
5788 num_info = vm_page_diagnose_estimate();
5789 memory_info_size = num_info * sizeof(*memory_info);
39037602
A
5790 memory_info_vmsize = round_page(memory_info_size);
5791 kr = kmem_alloc_pageable(ipc_kernel_map,
0a7de745 5792 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
39037602 5793 if (kr != KERN_SUCCESS) {
39037602
A
5794 return kr;
5795 }
5796
5ba3f43e 5797 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
0a7de745 5798 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
39037602
A
5799 assert(kr == KERN_SUCCESS);
5800
5801 memory_info = (mach_memory_info_t *) memory_info_addr;
5ba3f43e 5802 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
39037602
A
5803
5804 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
5805 assert(kr == KERN_SUCCESS);
0a7de745 5806
3e170ce0 5807 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
0a7de745 5808 (vm_map_size_t)memory_info_size, TRUE, &copy);
3e170ce0
A
5809 assert(kr == KERN_SUCCESS);
5810
5811 *memoryInfop = (mach_memory_info_t *) copy;
5ba3f43e 5812 *memoryInfoCntp = num_info;
3e170ce0
A
5813 }
5814
6d2010ae
A
5815 return KERN_SUCCESS;
5816}
5817
a39ff7e2
A
5818kern_return_t
5819mach_zone_info_for_zone(
0a7de745
A
5820 host_priv_t host,
5821 mach_zone_name_t name,
5822 mach_zone_info_t *infop)
a39ff7e2 5823{
a39ff7e2
A
5824 zone_t zone_ptr;
5825
0a7de745 5826 if (host == HOST_NULL) {
a39ff7e2 5827 return KERN_INVALID_HOST;
0a7de745 5828 }
a39ff7e2 5829#if CONFIG_DEBUGGER_FOR_ZONE_INFO
0a7de745 5830 if (!PE_i_can_has_debugger(NULL)) {
a39ff7e2 5831 return KERN_INVALID_HOST;
0a7de745 5832 }
a39ff7e2
A
5833#endif
5834
5835 if (infop == NULL) {
5836 return KERN_INVALID_ARGUMENT;
5837 }
5838
a39ff7e2 5839 zone_ptr = ZONE_NULL;
f427ee49 5840 zone_index_foreach(i) {
a39ff7e2
A
5841 zone_t z = &(zone_array[i]);
5842 assert(z != ZONE_NULL);
5843
f427ee49
A
5844 /*
5845 * Append kalloc heap name to zone name (if zone is used by kalloc)
5846 */
5847 char temp_zone_name[MAX_ZONE_NAME] = "";
5848 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5849 zone_heap_name(z), z->z_name);
5850
a39ff7e2 5851 /* Find the requested zone by name */
f427ee49 5852 if (track_this_zone(temp_zone_name, name.mzn_name)) {
a39ff7e2
A
5853 zone_ptr = z;
5854 break;
5855 }
5856 }
5857
5858 /* No zones found with the requested zone name */
5859 if (zone_ptr == ZONE_NULL) {
5860 return KERN_INVALID_ARGUMENT;
5861 }
5862
5863 if (get_zone_info(zone_ptr, NULL, infop)) {
5864 return KERN_SUCCESS;
5865 }
5866 return KERN_FAILURE;
5867}
5868
5869kern_return_t
5870mach_zone_info_for_largest_zone(
0a7de745
A
5871 host_priv_t host,
5872 mach_zone_name_t *namep,
5873 mach_zone_info_t *infop)
a39ff7e2 5874{
0a7de745 5875 if (host == HOST_NULL) {
a39ff7e2 5876 return KERN_INVALID_HOST;
0a7de745 5877 }
a39ff7e2 5878#if CONFIG_DEBUGGER_FOR_ZONE_INFO
0a7de745 5879 if (!PE_i_can_has_debugger(NULL)) {
a39ff7e2 5880 return KERN_INVALID_HOST;
0a7de745 5881 }
a39ff7e2
A
5882#endif
5883
5884 if (namep == NULL || infop == NULL) {
5885 return KERN_INVALID_ARGUMENT;
5886 }
5887
5888 if (get_zone_info(zone_find_largest(), namep, infop)) {
5889 return KERN_SUCCESS;
5890 }
5891 return KERN_FAILURE;
5892}
5893
5ba3f43e
A
5894uint64_t
5895get_zones_collectable_bytes(void)
5896{
5ba3f43e 5897 uint64_t zones_collectable_bytes = 0;
a39ff7e2 5898 mach_zone_info_t zi;
5ba3f43e 5899
f427ee49
A
5900 zone_index_foreach(i) {
5901 if (get_zone_info(&zone_array[i], NULL, &zi)) {
5902 zones_collectable_bytes +=
5903 GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
a39ff7e2 5904 }
5ba3f43e
A
5905 }
5906
5907 return zones_collectable_bytes;
5908}
5909
d9a64523
A
5910kern_return_t
5911mach_zone_get_zlog_zones(
0a7de745
A
5912 host_priv_t host,
5913 mach_zone_name_array_t *namesp,
d9a64523
A
5914 mach_msg_type_number_t *namesCntp)
5915{
f427ee49 5916#if ZONE_ENABLE_LOGGING
d9a64523
A
5917 unsigned int max_zones, logged_zones, i;
5918 kern_return_t kr;
5919 zone_t zone_ptr;
5920 mach_zone_name_t *names;
5921 vm_offset_t names_addr;
5922 vm_size_t names_size;
5923
0a7de745 5924 if (host == HOST_NULL) {
d9a64523 5925 return KERN_INVALID_HOST;
0a7de745 5926 }
d9a64523 5927
0a7de745 5928 if (namesp == NULL || namesCntp == NULL) {
d9a64523 5929 return KERN_INVALID_ARGUMENT;
0a7de745 5930 }
d9a64523 5931
f427ee49 5932 max_zones = os_atomic_load(&num_zones, relaxed);
d9a64523
A
5933
5934 names_size = round_page(max_zones * sizeof *names);
5935 kr = kmem_alloc_pageable(ipc_kernel_map,
0a7de745
A
5936 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5937 if (kr != KERN_SUCCESS) {
d9a64523 5938 return kr;
0a7de745 5939 }
d9a64523
A
5940 names = (mach_zone_name_t *) names_addr;
5941
5942 zone_ptr = ZONE_NULL;
5943 logged_zones = 0;
5944 for (i = 0; i < max_zones; i++) {
5945 zone_t z = &(zone_array[i]);
5946 assert(z != ZONE_NULL);
5947
5948 /* Copy out the zone name if zone logging is enabled */
0a7de745 5949 if (z->zlog_btlog) {
d9a64523
A
5950 get_zone_info(z, &names[logged_zones], NULL);
5951 logged_zones++;
5952 }
5953 }
5954
5955 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
5956 *namesCntp = logged_zones;
5957
5958 return KERN_SUCCESS;
5959
f427ee49 5960#else /* ZONE_ENABLE_LOGGING */
d9a64523
A
5961#pragma unused(host, namesp, namesCntp)
5962 return KERN_FAILURE;
f427ee49 5963#endif /* ZONE_ENABLE_LOGGING */
d9a64523
A
5964}
5965
5966kern_return_t
5967mach_zone_get_btlog_records(
0a7de745
A
5968 host_priv_t host,
5969 mach_zone_name_t name,
5970 zone_btrecord_array_t *recsp,
5971 mach_msg_type_number_t *recsCntp)
d9a64523
A
5972{
5973#if DEBUG || DEVELOPMENT
f427ee49 5974 unsigned int numrecs = 0;
d9a64523
A
5975 zone_btrecord_t *recs;
5976 kern_return_t kr;
5977 zone_t zone_ptr;
5978 vm_offset_t recs_addr;
5979 vm_size_t recs_size;
5980
0a7de745 5981 if (host == HOST_NULL) {
d9a64523 5982 return KERN_INVALID_HOST;
0a7de745 5983 }
d9a64523 5984
0a7de745 5985 if (recsp == NULL || recsCntp == NULL) {
d9a64523 5986 return KERN_INVALID_ARGUMENT;
0a7de745 5987 }
d9a64523 5988
d9a64523 5989 zone_ptr = ZONE_NULL;
f427ee49
A
5990 zone_index_foreach(i) {
5991 zone_t z = &zone_array[i];
5992
5993 /*
5994 * Append kalloc heap name to zone name (if zone is used by kalloc)
5995 */
5996 char temp_zone_name[MAX_ZONE_NAME] = "";
5997 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5998 zone_heap_name(z), z->z_name);
d9a64523
A
5999
6000 /* Find the requested zone by name */
f427ee49 6001 if (track_this_zone(temp_zone_name, name.mzn_name)) {
d9a64523
A
6002 zone_ptr = z;
6003 break;
6004 }
6005 }
6006
6007 /* No zones found with the requested zone name */
6008 if (zone_ptr == ZONE_NULL) {
6009 return KERN_INVALID_ARGUMENT;
6010 }
6011
6012 /* Logging not turned on for the requested zone */
6013 if (!DO_LOGGING(zone_ptr)) {
6014 return KERN_FAILURE;
6015 }
6016
6017 /* Allocate memory for btlog records */
6018 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
6019 recs_size = round_page(numrecs * sizeof *recs);
6020
6021 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
6022 if (kr != KERN_SUCCESS) {
6023 return kr;
6024 }
6025
6026 /*
6027 * We will call get_btlog_records() below which populates this region while holding a spinlock
6028 * (the btlog lock). So these pages need to be wired.
6029 */
6030 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
0a7de745 6031 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
d9a64523
A
6032 assert(kr == KERN_SUCCESS);
6033
6034 recs = (zone_btrecord_t *)recs_addr;
6035 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
6036
6037 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
6038 assert(kr == KERN_SUCCESS);
6039
6040 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
6041 *recsCntp = numrecs;
6042
6043 return KERN_SUCCESS;
6044
6045#else /* DEBUG || DEVELOPMENT */
6046#pragma unused(host, name, recsp, recsCntp)
6047 return KERN_FAILURE;
6048#endif /* DEBUG || DEVELOPMENT */
6049}
6050
6051
5ba3f43e
A
6052#if DEBUG || DEVELOPMENT
6053
6054kern_return_t
6055mach_memory_info_check(void)
6056{
0a7de745
A
6057 mach_memory_info_t * memory_info;
6058 mach_memory_info_t * info;
f427ee49
A
6059 unsigned int num_info;
6060 vm_offset_t memory_info_addr;
5ba3f43e 6061 kern_return_t kr;
0a7de745 6062 size_t memory_info_size, memory_info_vmsize;
5ba3f43e
A
6063 uint64_t top_wired, zonestotal, total;
6064
6065 num_info = vm_page_diagnose_estimate();
6066 memory_info_size = num_info * sizeof(*memory_info);
6067 memory_info_vmsize = round_page(memory_info_size);
6068 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
0a7de745 6069 assert(kr == KERN_SUCCESS);
5ba3f43e
A
6070
6071 memory_info = (mach_memory_info_t *) memory_info_addr;
6072 vm_page_diagnose(memory_info, num_info, 0);
6073
0a7de745 6074 top_wired = total = zonestotal = 0;
f427ee49
A
6075 zone_index_foreach(idx) {
6076 zonestotal += zone_size_wired(&zone_array[idx]);
5ba3f43e 6077 }
f427ee49
A
6078
6079 for (uint32_t idx = 0; idx < num_info; idx++) {
5ba3f43e 6080 info = &memory_info[idx];
0a7de745
A
6081 if (!info->size) {
6082 continue;
6083 }
6084 if (VM_KERN_COUNT_WIRED == info->site) {
6085 top_wired = info->size;
6086 }
6087 if (VM_KERN_SITE_HIDE & info->flags) {
6088 continue;
6089 }
6090 if (!(VM_KERN_SITE_WIRED & info->flags)) {
6091 continue;
6092 }
5ba3f43e 6093 total += info->size;
0a7de745 6094 }
5ba3f43e
A
6095 total += zonestotal;
6096
f427ee49
A
6097 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
6098 total, top_wired, zonestotal, top_wired - total);
5ba3f43e 6099
0a7de745 6100 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
5ba3f43e 6101
0a7de745 6102 return kr;
5ba3f43e
A
6103}
6104
0a7de745 6105extern boolean_t(*volatile consider_buffer_cache_collect)(int);
d9a64523 6106
5ba3f43e
A
6107#endif /* DEBUG || DEVELOPMENT */
6108
316670eb
A
6109kern_return_t
6110mach_zone_force_gc(
6111 host_t host)
6112{
0a7de745 6113 if (host == HOST_NULL) {
316670eb 6114 return KERN_INVALID_HOST;
0a7de745 6115 }
316670eb 6116
5ba3f43e 6117#if DEBUG || DEVELOPMENT
d9a64523
A
6118 /* Callout to buffer cache GC to drop elements in the apfs zones */
6119 if (consider_buffer_cache_collect != NULL) {
6120 (void)(*consider_buffer_cache_collect)(0);
6121 }
5ba3f43e
A
6122 consider_zone_gc(FALSE);
6123#endif /* DEBUG || DEVELOPMENT */
0a7de745 6124 return KERN_SUCCESS;
316670eb
A
6125}
6126
39236c6e
A
6127zone_t
6128zone_find_largest(void)
6129{
f427ee49
A
6130 uint32_t largest_idx = 0;
6131 vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
6132
6133 zone_index_foreach(i) {
6134 vm_offset_t size = zone_size_wired(&zone_array[i]);
6135 if (size > largest_size) {
6136 largest_idx = i;
6137 largest_size = size;
39236c6e 6138 }
39236c6e 6139 }
39236c6e 6140
f427ee49
A
6141 return &zone_array[largest_idx];
6142}
1c79356b 6143
f427ee49
A
6144#pragma mark - tests
6145#if DEBUG || DEVELOPMENT
1c79356b 6146
f427ee49
A
6147/*
6148 * Used for sysctl kern.run_zone_test which is not thread-safe. Ensure only one
6149 * thread goes through at a time. Otherwise we can end up with multiple test zones (if
6150 * a second zinit() comes through before zdestroy()), which could lead us to
6151 * run out of zones.
6152 */
6153SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
6154static boolean_t zone_test_running = FALSE;
6155static zone_t test_zone_ptr = NULL;
1c79356b 6156
f427ee49
A
6157static uintptr_t *
6158zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
6159 zone_pva_t page_index, zone_addr_kind_t kind)
6160{
6161 vm_offset_t free, first, end, page;
6162 struct zone_page_metadata *meta;
39037602 6163
f427ee49
A
6164 while (!zone_pva_is_null(page_index)) {
6165 page = zone_pva_to_addr(page_index);
6166 meta = zone_pva_to_meta(page_index, kind);
6167 end = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
6168 first = page + ZONE_PAGE_FIRST_OFFSET(kind);
39037602 6169
f427ee49 6170 bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));
39037602 6171
f427ee49
A
6172 // construct bitmap of all freed elements
6173 free = zone_page_meta_get_freelist(z, meta, page);
6174 while (free) {
6175 bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));
39037602 6176
f427ee49
A
6177 // next free element
6178 free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
6179 }
39037602 6180
f427ee49
A
6181 for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
6182 if (!bitmap_test(bits, i)) {
6183 *elems++ = INSTANCE_PUT(first);
0a7de745 6184 }
0a7de745 6185 }
0a7de745 6186
f427ee49
A
6187 page_index = meta->zm_page_next;
6188 }
0a7de745 6189 return elems;
39037602
A
6190}
6191
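/*
 * Worked example for zone_copy_allocations() above (illustrative): on a
 * chunk holding four elements at first + {0, 1, 2, 3} * elem_size, a free
 * list containing elements 1 and 3 sets bits 1 and 3 in "bits", so the
 * final loop reports only the still-allocated elements 0 and 2 through
 * *elems.
 */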
6192kern_return_t
6193zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
6194{
f427ee49
A
6195 uintptr_t zbt[MAX_ZTRACE_DEPTH];
6196 zone_t zone = NULL;
0a7de745
A
6197 uintptr_t * array;
6198 uintptr_t * next;
6199 uintptr_t element, bt;
6200 uint32_t idx, count, found;
6201 uint32_t btidx, btcount, nobtcount, btfound;
6202 uint32_t elemSize;
6203 uint64_t maxElems;
5ba3f43e 6204 kern_return_t kr;
f427ee49 6205 bitmap_t *bits;
39037602 6206
f427ee49
A
6207 zone_index_foreach(i) {
6208 if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
6209 zone = &zone_array[i];
0a7de745
A
6210 break;
6211 }
6212 }
f427ee49 6213 if (zone == NULL) {
0a7de745
A
6214 return KERN_INVALID_NAME;
6215 }
0a7de745 6216
f427ee49
A
6217 elemSize = zone_elem_size(zone);
6218 maxElems = (zone->countavail + 1) & ~1ul;
0a7de745 6219
f427ee49
A
6220 if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
6221 !zone_leaks_scan_enable) {
0a7de745
A
6222 return KERN_INVALID_CAPABILITY;
6223 }
6224
6225 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
f427ee49
A
6226 maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
6227 VM_KERN_MEMORY_DIAG);
0a7de745
A
6228 if (KERN_SUCCESS != kr) {
6229 return kr;
6230 }
6231
f427ee49
A
6232	/* maxElems is a multiple of 2, so we're always aligned */
6233 bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);
6234
0a7de745
A
6235 lock_zone(zone);
6236
6237 next = array;
f427ee49
A
6238 next = zone_copy_allocations(zone, next, bits,
6239 zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
6240 next = zone_copy_allocations(zone, next, bits,
6241 zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
6242 next = zone_copy_allocations(zone, next, bits,
6243 zone->pages_intermediate, ZONE_ADDR_NATIVE);
6244 next = zone_copy_allocations(zone, next, bits,
6245 zone->pages_all_used, ZONE_ADDR_NATIVE);
0a7de745
A
6246 count = (uint32_t)(next - array);
6247
6248 unlock_zone(zone);
6249
f427ee49 6250 zone_leaks_scan(array, count, zone_elem_size(zone), &found);
0a7de745
A
6251 assert(found <= count);
6252
6253 for (idx = 0; idx < count; idx++) {
6254 element = array[idx];
6255 if (kInstanceFlagReferenced & element) {
6256 continue;
6257 }
6258 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6259 }
6260
f427ee49 6261#if ZONE_ENABLE_LOGGING
0a7de745
A
6262 if (zone->zlog_btlog && !corruption_debug_flag) {
6263 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
6264 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
6265 }
f427ee49 6266#endif /* ZONE_ENABLE_LOGGING */
0a7de745
A
6267
6268 for (nobtcount = idx = 0; idx < count; idx++) {
6269 element = array[idx];
6270 if (!element) {
6271 continue;
6272 }
6273 if (kInstanceFlagReferenced & element) {
6274 continue;
6275 }
6276 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6277
6278 // see if we can find any backtrace left in the element
f427ee49 6279 btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t));
0a7de745
A
6280 if (btcount >= MAX_ZTRACE_DEPTH) {
6281 btcount = MAX_ZTRACE_DEPTH - 1;
6282 }
6283 for (btfound = btidx = 0; btidx < btcount; btidx++) {
6284 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
6285 if (!VM_KERNEL_IS_SLID(bt)) {
6286 break;
6287 }
6288 zbt[btfound++] = bt;
6289 }
6290 if (btfound) {
6291 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
6292 } else {
6293 nobtcount++;
6294 }
6295 }
6296 if (nobtcount) {
6297 // fake backtrace when we found nothing
6298 zbt[0] = (uintptr_t) &zalloc;
6299 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
6300 }
6301
6302 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
6303
6304 return KERN_SUCCESS;
1c79356b
A
6305}
6306
5ba3f43e
A
6307boolean_t
6308run_zone_test(void)
6309{
d9a64523 6310 unsigned int i = 0, max_iter = 5;
5ba3f43e
A
6311 void * test_ptr;
6312 zone_t test_zone;
b0d623f7 6313
0a7de745 6314 simple_lock(&zone_test_lock, &zone_locks_grp);
5ba3f43e
A
6315 if (!zone_test_running) {
6316 zone_test_running = TRUE;
6317 } else {
6318 simple_unlock(&zone_test_lock);
6319 printf("run_zone_test: Test already running.\n");
6320 return FALSE;
6321 }
6322 simple_unlock(&zone_test_lock);
39037602 6323
5ba3f43e 6324 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
39037602 6325
5ba3f43e
A
6326 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
6327 do {
6328 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
6329 if (test_zone == NULL) {
6330 printf("run_zone_test: zinit() failed\n");
6331 return FALSE;
6332 }
39037602 6333
5ba3f43e 6334#if KASAN_ZALLOC
f427ee49 6335 if (test_zone_ptr == NULL && test_zone->countfree != 0) {
5ba3f43e 6336#else
f427ee49 6337 if (test_zone->countfree != 0) {
5ba3f43e
A
6338#endif
6339 printf("run_zone_test: free count is not zero\n");
6340 return FALSE;
6341 }
6342
6343 if (test_zone_ptr == NULL) {
6344			/* Stash the zone pointer returned on the first zinit */
6345 printf("run_zone_test: zone created for the first time\n");
6346 test_zone_ptr = test_zone;
6347 } else if (test_zone != test_zone_ptr) {
6348 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
6349 return FALSE;
6350 }
6351
6352 test_ptr = zalloc(test_zone);
6353 if (test_ptr == NULL) {
6354 printf("run_zone_test: zalloc() failed\n");
6355 return FALSE;
6356 }
6357 zfree(test_zone, test_ptr);
6358
6359 zdestroy(test_zone);
6360 i++;
6361
6362 printf("run_zone_test: Iteration %d successful\n", i);
6363 } while (i < max_iter);
6364
f427ee49
A
6365 /* test Z_VA_SEQUESTER */
6366 if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
6367 int idx, num_allocs = 8;
6368 vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
6369 void *allocs[num_allocs];
6370 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
6371 vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);
6372
6373 test_zone = zone_create("test_zone_sysctl", elem_size,
6374 ZC_DESTRUCTIBLE | ZC_SEQUESTER);
6375 if (test_zone == NULL) {
6376 printf("run_zone_test: zinit() failed\n");
6377 return FALSE;
6378 }
6379
6380 for (idx = 0; idx < num_allocs; idx++) {
6381 allocs[idx] = zalloc(test_zone);
6382 assert(NULL != allocs[idx]);
6383 printf("alloc[%d] %p\n", idx, allocs[idx]);
6384 }
6385 for (idx = 0; idx < num_allocs; idx++) {
6386 zfree(test_zone, allocs[idx]);
6387 }
6388 assert(!zone_pva_is_null(test_zone->pages_all_free));
6389
6390 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6391 vm_page_wire_count, vm_page_free_count,
6392 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6393 zone_gc(FALSE);
6394 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6395 vm_page_wire_count, vm_page_free_count,
6396 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6397 unsigned int allva = 0;
6398 zone_index_foreach(zidx) {
6399 zone_t z = &zone_array[zidx];
6400 lock_zone(z);
6401 allva += z->page_count;
6402 if (!z->sequester_page_count) {
6403 unlock_zone(z);
6404 continue;
6405 }
6406 unsigned count = 0;
6407 uint64_t size;
6408 zone_pva_t pg = z->pages_sequester;
6409 struct zone_page_metadata *page_meta;
6410 while (pg.packed_address) {
6411 page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
6412 count += z->alloc_pages;
6413 pg = page_meta->zm_page_next;
6414 }
6415 assert(count == z->sequester_page_count);
6416 size = zone_size_wired(z);
6417 if (!size) {
6418 size = 1;
6419 }
6420 printf("%s%s: seq %d, res %d, %qd %%\n",
6421 zone_heap_name(z), z->z_name, z->sequester_page_count,
6422 z->page_count, zone_size_allocated(z) * 100ULL / size);
6423 unlock_zone(z);
6424 }
6425
6426 printf("total va: %d\n", allva);
6427
6428 assert(zone_pva_is_null(test_zone->pages_all_free));
6429 assert(!zone_pva_is_null(test_zone->pages_sequester));
6430 assert(2 == test_zone->sequester_page_count);
6431 for (idx = 0; idx < num_allocs; idx++) {
6432 assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
6433 }
6434 for (idx = 0; idx < num_allocs; idx++) {
6435 allocs[idx] = zalloc(test_zone);
6436 assert(allocs[idx]);
6437 printf("alloc[%d] %p\n", idx, allocs[idx]);
6438 }
6439 assert(zone_pva_is_null(test_zone->pages_sequester));
6440 assert(0 == test_zone->sequester_page_count);
6441 for (idx = 0; idx < num_allocs; idx++) {
6442 zfree(test_zone, allocs[idx]);
6443 }
6444 zdestroy(test_zone);
6445 } else {
6446 printf("run_zone_test: skipping sequester test (not enabled)\n");
6447 }
6448
5ba3f43e
A
6449 printf("run_zone_test: Test passed\n");
6450
0a7de745 6451 simple_lock(&zone_test_lock, &zone_locks_grp);
5ba3f43e
A
6452 zone_test_running = FALSE;
6453 simple_unlock(&zone_test_lock);
6454
6455 return TRUE;
813fb2f6
A
6456}
6457
f427ee49
A
6458/*
6459 * Routines to test that zone garbage collection and zone replenish threads
6460 * running at the same time don't cause problems.
6461 */
6462
6463void
6464zone_gc_replenish_test(void)
6465{
6466 zone_gc(FALSE);
6467}
6468
6469
6470void
6471zone_alloc_replenish_test(void)
6472{
6473 zone_t z = NULL;
6474 struct data { struct data *next; } *node, *list = NULL;
6475
6476 /*
6477 * Find a zone that has a replenish thread
6478 */
6479 zone_index_foreach(i) {
6480 z = &zone_array[i];
6481 if (z->prio_refill_count &&
6482 zone_elem_size(z) >= sizeof(struct data)) {
6483 z = &zone_array[i];
6484 break;
6485 }
6486 }
6487 if (z == NULL) {
6488 printf("Couldn't find a replenish zone\n");
6489 return;
6490 }
6491
6492 for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
6493 node = zalloc(z);
6494 node->next = list;
6495 list = node;
6496 }
6497
6498 /*
6499 * release the memory we allocated
6500 */
6501 while (list != NULL) {
6502 node = list;
6503 list = list->next;
6504 zfree(z, node);
6505 }
6506}
6507
39037602 6508#endif /* DEBUG || DEVELOPMENT */