1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/startup.h>
78 #include <kern/kern_types.h>
79 #include <kern/assert.h>
80 #include <kern/backtrace.h>
81 #include <kern/host.h>
82 #include <kern/macro_help.h>
83 #include <kern/sched.h>
84 #include <kern/locks.h>
85 #include <kern/sched_prim.h>
86 #include <kern/misc_protos.h>
87 #include <kern/thread_call.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/kalloc.h>
90
91 #include <prng/random.h>
92
93 #include <vm/pmap.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
98
99 #include <pexpert/pexpert.h>
100
101 #include <machine/machparam.h>
102 #include <machine/machine_routines.h> /* ml_cpu_get_info */
103
104 #include <os/atomic.h>
105
106 #include <libkern/OSDebug.h>
107 #include <libkern/OSAtomic.h>
108 #include <libkern/section_keywords.h>
109 #include <sys/kdebug.h>
110
111 #include <san/kasan.h>
112
113 #if KASAN_ZALLOC
114 #define ZONE_ENABLE_LOGGING 0
115 #elif DEBUG || DEVELOPMENT
116 #define ZONE_ENABLE_LOGGING 1
117 #else
118 #define ZONE_ENABLE_LOGGING 0
119 #endif
120
121 extern void vm_pageout_garbage_collect(int collect);
122
123 /* Returns pid of the task with the largest number of VM map entries. */
124 extern pid_t find_largest_process_vm_map_entries(void);
125
126 /*
127 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
128 * For any other pid we try to kill that process synchronously.
129 */
130 extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
131
132 extern zone_t vm_map_entry_zone;
133 extern zone_t vm_object_zone;
134 extern vm_offset_t kmapoff_kaddr;
135 extern unsigned int kmapoff_pgcnt;
136 extern unsigned int stack_total;
137 extern unsigned long long stack_allocs;
138
139 /*
140 * The max # of elements in a chunk should fit into
141 * zone_page_metadata.zm_alloc_count (uint16_t).
142 *
143 * Update this if the type of zm_alloc_count changes.
144 */
145 #define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX)
146
147 #define ZONE_PAGECOUNT_BITS 14
148
149 /* Zone elements must fit both a next pointer and a backup pointer */
150 #define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t))
151 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
152
153 /* per-cpu zones are special because of counters */
154 #define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
155
156 struct zone_map_range {
157 vm_offset_t min_address;
158 vm_offset_t max_address;
159 };
160
161 struct zone_page_metadata {
162 /* The index of the zone this metadata page belongs to */
163 zone_id_t zm_index;
164
165 /*
166 * zm_secondary_page == 0: number of pages in this run
167 * zm_secondary_page == 1: offset to the chunk start
168 */
169 uint16_t zm_page_count : ZONE_PAGECOUNT_BITS;
170
171 /* Whether this page is part of a chunk run */
172 uint16_t zm_percpu : 1;
173 uint16_t zm_secondary_page : 1;
174
175 /*
176 * The start of the freelist can be maintained as a 16-bit
177 * offset instead of a pointer because the free elements would
178 * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
179 * of the allocation chunk.
180 *
181 * Offset from start of the allocation chunk to free element
182 * list head.
183 */
184 uint16_t zm_freelist_offs;
185
186 /*
187 * zm_secondary_page == 0: number of allocated elements in the chunk
188 * zm_secondary_page == 1: unused
189 *
190 * PAGE_METADATA_EMPTY_FREELIST indicates an empty freelist
191 */
192 uint16_t zm_alloc_count;
193 #define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX
194
195 zone_pva_t zm_page_next;
196 zone_pva_t zm_page_prev;
197
198 /*
199 * This is only for the sake of debuggers
200 */
201 #define ZONE_FOREIGN_COOKIE 0x123456789abcdef
202 uint64_t zm_foreign_cookie[];
203 };
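/*
 * Illustrative sketch of how zm_freelist_offs relates to element addresses;
 * it mirrors zone_page_meta_get_freelist() further down, assuming `meta` is
 * the primary (non-secondary) metadata of a native chunk:
 *
 *     vm_offset_t chunk = zone_pva_to_addr(zone_pva_from_meta(meta, ZONE_ADDR_NATIVE));
 *     vm_offset_t head  = (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST)
 *         ? 0 : chunk + meta->zm_freelist_offs;
 *
 * Because an allocation chunk is at most ZONE_MAX_ALLOC_SIZE (32KB) bytes,
 * the offset always fits in a uint16_t.
 */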
204
205
206 /* Align elements that use the zone page list to 32 byte boundaries. */
207 #define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
208
209 static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
210
211 static __security_const_late struct {
212 struct zone_map_range zi_map_range;
213 struct zone_map_range zi_general_range;
214 struct zone_map_range zi_meta_range;
215 struct zone_map_range zi_foreign_range;
216
217 /*
218 * The metadata lives within the zi_meta_range address range.
219 *
220 * The correct formula to find a metadata index is:
221 * absolute_page_index - page_index(zi_meta_range.min_address)
222 *
223 * And then this index is used to dereference zi_meta_range.min_address
224 * as a `struct zone_page_metadata` array.
225 *
226 * To avoid doing that subtraction in the various fast paths,
227 * zi_array_base is pre-offset by `page_index(zi_meta_range.min_address)`
228 * so that the math doesn't have to be redone on every lookup.
229 */
230 struct zone_page_metadata *zi_array_base;
231 } zone_info;
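/*
 * A minimal sketch of the lookup this biasing enables, assuming `addr` is a
 * native zone-map address (this is effectively what zone_pva_from_addr() and
 * zone_pva_to_meta() below compute):
 *
 *     // unbiased form: subtract the base page index on every lookup
 *     //   idx  = page_index(addr) - page_index(zi_meta_range.min_address);
 *     //   meta = (struct zone_page_metadata *)zi_meta_range.min_address + idx;
 *     // biased form: one shift, no subtraction
 *     struct zone_page_metadata *meta =
 *         &zone_info.zi_array_base[(uint32_t)((intptr_t)addr >> PAGE_SHIFT)];
 */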
232
233 /*
234 * The zone_locks_grp allows for collecting lock statistics.
235 * All locks are associated with this group in zinit.
236 * Look at tools/lockstat for debugging lock contention.
237 */
238 LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
239 LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
240
241 /*
242 * Exclude more than one concurrent garbage collection
243 */
244 LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
245 LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
246
247 boolean_t panic_include_zprint = FALSE;
248 mach_memory_info_t *panic_kext_memory_info = NULL;
249 vm_size_t panic_kext_memory_size = 0;
250
251 /*
252 * Protects zone_array, num_zones, num_zones_in_use, and
253 * zone_destroyed_bitmap
254 */
255 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
256 static unsigned int num_zones_in_use;
257 unsigned int _Atomic num_zones;
258 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
259
260 #if KASAN_ZALLOC
261 #define MAX_ZONES 566
262 #else /* !KASAN_ZALLOC */
263 #define MAX_ZONES 402
264 #endif/* !KASAN_ZALLOC */
265 struct zone zone_array[MAX_ZONES];
266
267 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
268 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
269
270 /* Used to keep track of destroyed slots in the zone_array */
271 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
272
273 /* number of pages used by all zones */
274 static long _Atomic zones_phys_page_count;
275
276 /* number of zone mapped pages used by all zones */
277 static long _Atomic zones_phys_page_mapped_count;
278
279 #if CONFIG_ZALLOC_SEQUESTER
280 #define ZSECURITY_OPTIONS_SEQUESTER_DEFAULT ZSECURITY_OPTIONS_SEQUESTER
281 #else
282 #define ZSECURITY_OPTIONS_SEQUESTER_DEFAULT 0
283 #endif
284 /*
285 * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not to
286 * break third party kexts that haven't yet been recompiled
287 * to use the new iokit macros.
288 */
289 #if XNU_TARGET_OS_OSX && __x86_64__
290 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0
291 #else
292 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \
293 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE
294 #endif
295
296 #define ZSECURITY_DEFAULT ( \
297 ZSECURITY_OPTIONS_SEQUESTER_DEFAULT | \
298 ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \
299 ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \
300 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \
301 0)
302 TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
303
304 #if VM_MAX_TAG_ZONES
305 /* enable tags for zones that ask for it */
306 TUNABLE(bool, zone_tagging_on, "-zt", false);
307 #endif /* VM_MAX_TAG_ZONES */
308
309 #if DEBUG || DEVELOPMENT
310 TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
311 __options_decl(zalloc_debug_t, uint32_t, {
312 ZALLOC_DEBUG_ZONEGC = 0x00000001,
313 ZALLOC_DEBUG_ZCRAM = 0x00000002,
314 });
315
316 TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0);
317 #endif /* DEBUG || DEVELOPMENT */
318 #if CONFIG_ZLEAKS
319 /* Makes pointer-scanning leak detection possible for all zones */
320 TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
321 #else
322 #define zone_leaks_scan_enable false
323 #endif
324
325 /*
326 * Async allocation of zones
327 * This mechanism allows for bootstrapping an empty zone which is set up with
328 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
329 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
330 * This will prime the zone for the next use.
331 *
332 * Currently the thread_call callout (zalloc_async) will loop through all zones
333 * looking for any zone with async_pending set and do the work for it.
334 *
335 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
336 * then zalloc_noblock to an empty zone may succeed.
337 */
338 static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1);
339 static thread_call_data_t call_async_alloc;
340 static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
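/*
 * Rough sketch of the sequence described above (hypothetical caller; not an
 * exact trace of the implementation):
 *
 *     elt = zalloc_noblock(z);   // empty zone: returns NULL, marks the zone
 *                                // async_pending and arms call_async_alloc
 *     ...
 *     // later, from the thread_call, zalloc_async() finds the pending zone
 *     // and primes it:
 *     //     tmp = zalloc(z);    // may block to grow the zone
 *     //     zfree(z, tmp);
 *     // so a subsequent zalloc_noblock(z) can succeed.
 */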
341
342 /*
343 * Zone Corruption Debugging
344 *
345 * We use four techniques to detect modification of a zone element
346 * after it's been freed.
347 *
348 * (1) Check the freelist next pointer for sanity.
349 * (2) Store a backup of the next pointer at the end of the element,
350 * and compare it to the primary next pointer when the element is allocated
351 * to detect corruption of the freelist due to use-after-free bugs.
352 * The backup pointer is also XORed with a per-boot random cookie.
353 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
354 * and check for that value when the element is being reused to make sure
355 * no part of the element has been modified while it was on the freelist.
356 * This will also help catch read-after-frees, as code will now dereference
357 * 0xdeadbeef instead of a valid but freed pointer.
358 * (4) If the zfree_clear_mem flag is set clear the element on free and
359 * assert that it is still clear when alloc-ed.
360 *
361 * (1) and (2) occur for every allocation and free to a zone.
362 * This is done to make it slightly more difficult for an attacker to
363 * manipulate the freelist to behave in a specific way.
364 *
365 * Poisoning (3) occurs periodically for every N frees (counted per-zone).
366 * If -zp is passed as a boot arg, poisoning occurs for every free.
367 *
368 * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
369 * flag on creation or if the element size is less than one cacheline.
370 *
371 * Performance slowdown is inversely proportional to the poisoning period N,
372 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
373 * and higher. You can expect to find a 100% reproducible bug in an average of
374 * N tries, with a standard deviation of about N, but you will want to set
375 * "-zp" to always poison every free if you are attempting to reproduce
376 * a known bug.
377 *
378 * For a more heavyweight, but finer-grained method of detecting misuse
379 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
380 *
381 * Zone Corruption Logging
382 *
383 * You can also track where corruptions come from by using the boot-arguments
384 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
385 * in this file for more implementation and usage information.
386 *
387 * Zone Leak Detection
388 *
389 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
390 * found later in this file via the showtopztrace and showz* macros in kgmacros,
391 * or use zlog without the -zc argument.
392 *
393 */
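/*
 * Conceptual sketch of checks (1) and (2) above; the real checks live in the
 * allocation path further down, and `z`, `meta`, `page` and `elem` are
 * hypothetical locals describing a free element:
 *
 *     vm_offset_t next   = *(vm_offset_t *)elem;
 *     vm_offset_t backup = *get_backup_ptr(zone_elem_size(z), (vm_offset_t *)elem);
 *
 *     // unpoisoned element: backup == next ^ zp_nopoison_cookie
 *     // poisoned element:   backup == next ^ zp_poisoned_cookie (low bit set)
 *     if (next != (backup ^ zp_nopoison_cookie) &&
 *         next != (backup ^ zp_poisoned_cookie)) {
 *             backup_ptr_mismatch_panic(z, meta, page, elem);
 *     }
 */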
394
395 #define ZP_DEFAULT_SAMPLING_FACTOR 16
396 #define ZP_DEFAULT_SCALE_FACTOR 4
397
398 /*
399 * set by zp-factor=N boot arg
400 *
401 * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by
402 * passing the -no-zp boot-arg.
403 *
404 * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
405 * set by passing the -zp boot-arg.
406 */
407 static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR);
408
409 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
410 static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR);
411
412 /* initialized to a per-boot random value in zp_bootstrap */
413 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie;
414 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie;
415 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size;
416 static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max;
417
418 static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
419 static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx;
420
421 static struct bool_gen zone_bool_gen;
422 static zone_t zone_find_largest(void);
423 static void zone_drop_free_elements(zone_t z);
424
425 #define submap_for_zone(z) zone_submaps[(z)->submap_idx]
426 #define MAX_SUBMAP_NAME 16
427
428 /* Globals for random boolean generator for elements in free list */
429 #define MAX_ENTROPY_PER_ZCRAM 4
430
431 #if CONFIG_ZCACHE
432 /*
433 * Specifies a single zone to enable CPU caching for.
434 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
435 */
436 static char cache_zone_name[MAX_ZONE_NAME];
437 static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false);
438
439 __header_always_inline bool
440 zone_caching_enabled(zone_t z)
441 {
442 return z->zcache.zcc_depot != NULL;
443 }
444 #else
445 __header_always_inline bool
446 zone_caching_enabled(zone_t z __unused)
447 {
448 return false;
449 }
450 #endif /* CONFIG_ZCACHE */
451
452 #pragma mark Zone metadata
453
454 __enum_closed_decl(zone_addr_kind_t, bool, {
455 ZONE_ADDR_NATIVE,
456 ZONE_ADDR_FOREIGN,
457 });
458
459 static inline zone_id_t
460 zone_index(zone_t z)
461 {
462 return (zone_id_t)(z - zone_array);
463 }
464
465 static inline bool
466 zone_has_index(zone_t z, zone_id_t zid)
467 {
468 return zone_array + zid == z;
469 }
470
471 static inline vm_size_t
472 zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind)
473 {
474 if (kind == ZONE_ADDR_NATIVE) {
475 if (zone->percpu) {
476 return PAGE_SIZE / zone_elem_size(zone);
477 }
478 return alloc_size / zone_elem_size(zone);
479 } else {
480 assert(alloc_size == PAGE_SIZE);
481 return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone);
482 }
483 }
484
485 __abortlike
486 static void
487 zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
488 const char *kind)
489 {
490 panic("zone metadata corruption: %s (meta %p, zone %s%s)",
491 kind, meta, zone_heap_name(zone), zone->z_name);
492 }
493
494 __abortlike
495 static void
496 zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
497 {
498 panic("zone element pointer validation failed (addr: %p, zone %s%s)",
499 (void *)addr, zone_heap_name(zone), zone->z_name);
500 }
501
502 __abortlike
503 static void
504 zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
505 struct zone_page_metadata *meta)
506 {
507 panic("%p not in the expected zone %s%s (%d != %d)",
508 (void *)addr, zone_heap_name(zone), zone->z_name,
509 meta->zm_index, zone_index(zone));
510 }
511
512 __abortlike
513 static void
514 zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue)
515 {
516 panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
517 queue->packed_address, queue, zone_heap_name(zone),
518 zone->z_name);
519 }
520
521 __abortlike
522 static void
523 zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
524 {
525 panic("metadata list corruption through element %p detected in zone %s%s",
526 meta, zone_heap_name(zone), zone->z_name);
527 }
528
529 __abortlike
530 static void
531 zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue)
532 {
533 panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
534 queue->packed_address, queue, zone_heap_name(zone), zone->z_name);
535 }
536
537 __abortlike
538 static void
539 zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr)
540 {
541 panic("manipulating foreign address %p in a native-only zone %s%s",
542 (void *)addr, zone_heap_name(zone), zone->z_name);
543 }
544
545 __abortlike __unused
546 static void
547 zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
548 {
549 panic("addr %p being freed to foreign zone %s%s not from foreign range",
550 (void *)addr, zone_heap_name(zone), zone->z_name);
551 }
552
553 __abortlike
554 static void
555 zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
556 const char *kind)
557 {
558 panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
559 zone_heap_name(zone), zone->z_name, meta);
560 }
561
562 __abortlike
563 static void
564 zone_accounting_panic(zone_t zone, const char *kind)
565 {
566 panic("accounting mismatch (%s) for zone %s%s", kind,
567 zone_heap_name(zone), zone->z_name);
568 }
569
570 __abortlike
571 static void
572 zone_nofail_panic(zone_t zone)
573 {
574 panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
575 zone_heap_name(zone), zone->z_name);
576 }
577
578 #if __arm64__
579 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
580 #define zone_range_load(r, rmin, rmax) \
581 asm("ldp %[rmin], %[rmax], [%[range]]" \
582 : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
583 : [range] "r"(r))
584 #else
585 #define zone_range_load(r, rmin, rmax) \
586 ({ rmin = (r)->min_address; rmax = (r)->max_address; })
587 #endif
588
589 __header_always_inline bool
590 zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size)
591 {
592 vm_offset_t rmin, rmax;
593
594 /*
595 * The `&` is not a typo: we really expect the check to pass,
596 * so encourage the compiler to eagerly load and test without branches
597 */
598 zone_range_load(r, rmin, rmax);
599 return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
600 }
601
602 __header_always_inline vm_size_t
603 zone_range_size(const struct zone_map_range *r)
604 {
605 vm_offset_t rmin, rmax;
606
607 zone_range_load(r, rmin, rmax);
608 return rmax - rmin;
609 }
610
611 #define from_zone_map(addr, size) \
612 zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
613
614 #define from_general_submap(addr, size) \
615 zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
616
617 #define from_foreign_range(addr, size) \
618 zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
619
620 #define from_native_meta_map(addr) \
621 zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
622 sizeof(struct zone_page_metadata))
623
624 #define zone_addr_kind(addr, size) \
625 (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
626
627 __header_always_inline bool
628 zone_pva_is_null(zone_pva_t page)
629 {
630 return page.packed_address == 0;
631 }
632
633 __header_always_inline bool
634 zone_pva_is_queue(zone_pva_t page)
635 {
636 // actual kernel pages have the top bit set
637 return (int32_t)page.packed_address > 0;
638 }
639
640 __header_always_inline bool
641 zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
642 {
643 return pva1.packed_address == pva2.packed_address;
644 }
645
646 __header_always_inline void
647 zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
648 struct zone_page_metadata *meta)
649 {
650 zone_pva_t *queue_head = &((zone_pva_t *)zone_array)[queue.packed_address];
651
652 if (!zone_pva_is_equal(*queue_head, oldv)) {
653 zone_page_metadata_list_corruption(z, meta);
654 }
655 *queue_head = meta->zm_page_next;
656 }
657
658 __header_always_inline zone_pva_t
659 zone_queue_encode(zone_pva_t *headp)
660 {
661 return (zone_pva_t){ (uint32_t)(headp - (zone_pva_t *)zone_array) };
662 }
663
664 __header_always_inline zone_pva_t
665 zone_pva_from_addr(vm_address_t addr)
666 {
667 // cannot use atop() because we want to maintain the sign bit
668 return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
669 }
670
671 __header_always_inline vm_address_t
672 zone_pva_to_addr(zone_pva_t page)
673 {
674 // cause sign extension so that we end up with the right address
675 return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
676 }
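/*
 * Worked example of the packed encoding, assuming `addr` is a kernel virtual
 * address inside the zone map:
 *
 *     zone_pva_t pva = zone_pva_from_addr(addr); // addr >> PAGE_SHIFT, sign kept
 *     assert(zone_pva_to_addr(pva) == trunc_page(addr));
 *     assert(!zone_pva_is_queue(pva));           // kernel VAs sign-extend negative
 *
 * Queue heads, by contrast, are small positive indices into zone_array (see
 * zone_queue_encode() above), so the sign bit alone tells the two apart.
 */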
677
678 __header_always_inline struct zone_page_metadata *
679 zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind)
680 {
681 if (kind == ZONE_ADDR_NATIVE) {
682 return &zone_info.zi_array_base[page.packed_address];
683 } else {
684 return (struct zone_page_metadata *)zone_pva_to_addr(page);
685 }
686 }
687
688 __header_always_inline zone_pva_t
689 zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind)
690 {
691 if (kind == ZONE_ADDR_NATIVE) {
692 uint32_t index = (uint32_t)(meta - zone_info.zi_array_base);
693 return (zone_pva_t){ index };
694 } else {
695 return zone_pva_from_addr((vm_address_t)meta);
696 }
697 }
698
699 __header_always_inline struct zone_page_metadata *
700 zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind)
701 {
702 if (kind == ZONE_ADDR_NATIVE) {
703 return zone_pva_to_meta(zone_pva_from_addr(addr), kind);
704 } else {
705 return (struct zone_page_metadata *)trunc_page(addr);
706 }
707 }
708
709 #define zone_native_meta_from_addr(addr) \
710 zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
711
712 __header_always_inline vm_offset_t
713 zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind)
714 {
715 if (kind == ZONE_ADDR_NATIVE) {
716 return ptoa((int)(meta - zone_info.zi_array_base));
717 } else {
718 return (vm_offset_t)meta;
719 }
720 }
721
722 __header_always_inline void
723 zone_meta_queue_push(zone_t z, zone_pva_t *headp,
724 struct zone_page_metadata *meta, zone_addr_kind_t kind)
725 {
726 zone_pva_t head = *headp;
727 zone_pva_t queue_pva = zone_queue_encode(headp);
728 struct zone_page_metadata *tmp;
729
730 meta->zm_page_next = head;
731 if (!zone_pva_is_null(head)) {
732 tmp = zone_pva_to_meta(head, kind);
733 if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
734 zone_page_metadata_list_corruption(z, meta);
735 }
736 tmp->zm_page_prev = zone_pva_from_meta(meta, kind);
737 }
738 meta->zm_page_prev = queue_pva;
739 *headp = zone_pva_from_meta(meta, kind);
740 }
741
742 __header_always_inline struct zone_page_metadata *
743 zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
744 vm_offset_t *page_addrp)
745 {
746 zone_pva_t head = *headp;
747 struct zone_page_metadata *meta = zone_pva_to_meta(head, kind);
748 vm_offset_t page_addr = zone_pva_to_addr(head);
749 struct zone_page_metadata *tmp;
750
751 if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) {
752 zone_page_metadata_native_queue_corruption(z, headp);
753 }
754 if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) {
755 zone_page_metadata_foreign_queue_corruption(z, headp);
756 }
757
758 if (!zone_pva_is_null(meta->zm_page_next)) {
759 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
760 if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
761 zone_page_metadata_list_corruption(z, meta);
762 }
763 tmp->zm_page_prev = meta->zm_page_prev;
764 }
765 *headp = meta->zm_page_next;
766
767 *page_addrp = page_addr;
768 return meta;
769 }
770
771 __header_always_inline void
772 zone_meta_requeue(zone_t z, zone_pva_t *headp,
773 struct zone_page_metadata *meta, zone_addr_kind_t kind)
774 {
775 zone_pva_t meta_pva = zone_pva_from_meta(meta, kind);
776 struct zone_page_metadata *tmp;
777
778 if (!zone_pva_is_null(meta->zm_page_next)) {
779 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
780 if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
781 zone_page_metadata_list_corruption(z, meta);
782 }
783 tmp->zm_page_prev = meta->zm_page_prev;
784 }
785 if (zone_pva_is_queue(meta->zm_page_prev)) {
786 zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
787 } else {
788 tmp = zone_pva_to_meta(meta->zm_page_prev, kind);
789 if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
790 zone_page_metadata_list_corruption(z, meta);
791 }
792 tmp->zm_page_next = meta->zm_page_next;
793 }
794
795 zone_meta_queue_push(z, headp, meta, kind);
796 }
797
798 /*
799 * Routine to populate a page backing metadata in the zone_metadata_region.
800 * Must be called without the zone lock held as it might potentially block.
801 */
802 static void
803 zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to)
804 {
805 vm_offset_t page_addr = trunc_page(from);
806
807 for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
808 #if !KASAN_ZALLOC
809 /*
810 * This can race with another thread doing a populate on the same metadata
811 * page, where we see an updated pmap but unmapped KASan shadow, causing a
812 * fault in the shadow when we first access the metadata page. Avoid this
813 * by always synchronizing on the zone_metadata_region lock with KASan.
814 */
815 if (pmap_find_phys(kernel_pmap, page_addr)) {
816 continue;
817 }
818 #endif
819
820 for (;;) {
821 kern_return_t ret = KERN_SUCCESS;
822
823 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
824 lck_mtx_lock(&zone_metadata_region_lck);
825 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
826 ret = kernel_memory_populate(kernel_map, page_addr,
827 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
828 VM_KERN_MEMORY_OSFMK);
829 }
830 lck_mtx_unlock(&zone_metadata_region_lck);
831
832 if (ret == KERN_SUCCESS) {
833 break;
834 }
835
836 /*
837 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
838 * to bad system deadlocks, so if the allocation failed,
839 * we need to do the VM_PAGE_WAIT() outside of the lock.
840 */
841 VM_PAGE_WAIT();
842 }
843 }
844 }
845
846 static inline bool
847 zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr,
848 vm_offset_t page, zone_addr_kind_t kind)
849 {
850 vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind);
851 vm_offset_t esize = zone_elem_size(zone);
852
853 if (esize & (esize - 1)) { /* not a power of 2 */
854 return (offs % esize) == 0;
855 } else {
856 return (offs & (esize - 1)) == 0;
857 }
858 }
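/*
 * For example (hypothetical element sizes): a 48-byte element is not a power
 * of two, so the check is `(offs % 48) == 0`; a 64-byte element takes the
 * cheaper mask path, `(offs & 63) == 0`.
 */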
859
860 __attribute__((always_inline))
861 static struct zone_page_metadata *
862 zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
863 vm_offset_t *pagep, zone_addr_kind_t *kindp)
864 {
865 struct zone_page_metadata *meta;
866 zone_addr_kind_t kind;
867 vm_offset_t page;
868 vm_offset_t esize = zone_elem_size(zone);
869
870 kind = zone_addr_kind(addr, esize);
871 page = trunc_page(addr);
872 meta = zone_meta_from_addr(addr, kind);
873
874 if (kind == ZONE_ADDR_NATIVE) {
875 if (meta->zm_secondary_page) {
876 if (meta->zm_percpu) {
877 zone_invalid_element_addr_panic(zone, addr);
878 }
879 page -= ptoa(meta->zm_page_count);
880 meta -= meta->zm_page_count;
881 }
882 } else if (!zone->allows_foreign) {
883 zone_page_metadata_foreign_confusion_panic(zone, addr);
884 #if __LP64__
885 } else if (!from_foreign_range(addr, esize)) {
886 zone_invalid_foreign_addr_panic(zone, addr);
887 #else
888 } else if (!pmap_kernel_va(addr)) {
889 zone_invalid_element_addr_panic(zone, addr);
890 #endif
891 }
892
893 if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) {
894 zone_invalid_element_addr_panic(zone, addr);
895 }
896
897 if (!zone_has_index(zone, meta->zm_index)) {
898 zone_page_metadata_index_confusion_panic(zone, addr, meta);
899 }
900
901 if (kindp) {
902 *kindp = kind;
903 }
904 if (pagep) {
905 *pagep = page;
906 }
907 return meta;
908 }
909
910 __attribute__((always_inline))
911 void
912 zone_allocated_element_validate(zone_t zone, vm_offset_t addr)
913 {
914 zone_allocated_element_resolve(zone, addr, NULL, NULL);
915 }
916
917 __header_always_inline vm_offset_t
918 zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta,
919 vm_offset_t page)
920 {
921 assert(!meta->zm_secondary_page);
922 if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
923 return 0;
924 }
925
926 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
927 if (meta->zm_freelist_offs + zone_elem_size(zone) > size) {
928 zone_metadata_corruption(zone, meta, "freelist corruption");
929 }
930
931 return page + meta->zm_freelist_offs;
932 }
933
934 __header_always_inline void
935 zone_page_meta_set_freelist(struct zone_page_metadata *meta,
936 vm_offset_t page, vm_offset_t addr)
937 {
938 assert(!meta->zm_secondary_page);
939 if (addr) {
940 meta->zm_freelist_offs = (uint16_t)(addr - page);
941 } else {
942 meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
943 }
944 }
945
946 static bool
947 zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta,
948 vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind)
949 {
950 if (element == 0) {
951 /* ends of the freelist are NULL */
952 return true;
953 }
954 if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) {
955 return false;
956 }
957 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
958 if (element > page + size - zone_elem_size(zone)) {
959 return false;
960 }
961 return true;
962 }
963
964 /* Routine to get the size of a zone allocated address.
965 * If the address doesn't belong to the zone maps, returns 0.
966 */
967 vm_size_t
968 zone_element_size(void *addr, zone_t *z)
969 {
970 struct zone_page_metadata *meta;
971 struct zone *src_zone;
972
973 if (from_zone_map(addr, sizeof(void *))) {
974 meta = zone_native_meta_from_addr(addr);
975 src_zone = &zone_array[meta->zm_index];
976 if (z) {
977 *z = src_zone;
978 }
979 return zone_elem_size(src_zone);
980 }
981 #if CONFIG_GZALLOC
982 if (__improbable(gzalloc_enabled())) {
983 vm_size_t gzsize;
984 if (gzalloc_element_size(addr, z, &gzsize)) {
985 return gzsize;
986 }
987 }
988 #endif /* CONFIG_GZALLOC */
989
990 return 0;
991 }
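/*
 * Example use (hypothetical caller): discover the zone and size backing an
 * arbitrary pointer, treating 0 as "not zone memory".
 *
 *     zone_t z = ZONE_NULL;
 *     vm_size_t sz = zone_element_size(ptr, &z);
 *     if (sz == 0) {
 *         // `ptr` is neither zone-map memory nor a gzalloc allocation
 *     }
 */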
992
993 /* This function just formats the reason for the panics by redoing the checks */
994 __abortlike
995 static void
996 zone_require_panic(zone_t zone, void *addr)
997 {
998 uint32_t zindex;
999 zone_t other;
1000
1001 if (!from_zone_map(addr, zone_elem_size(zone))) {
1002 panic("zone_require failed: address not in a zone (addr: %p)", addr);
1003 }
1004
1005 zindex = zone_native_meta_from_addr(addr)->zm_index;
1006 other = &zone_array[zindex];
1007 if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1008 panic("zone_require failed: invalid zone index %d "
1009 "(addr: %p, expected: %s%s)", zindex,
1010 addr, zone_heap_name(zone), zone->z_name);
1011 } else {
1012 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1013 "(addr: %p, expected: %s%s)",
1014 zindex, zone_heap_name(other), other->z_name,
1015 addr, zone_heap_name(zone), zone->z_name);
1016 }
1017 }
1018
1019 __abortlike
1020 static void
1021 zone_id_require_panic(zone_id_t zid, void *addr)
1022 {
1023 zone_require_panic(&zone_array[zid], addr);
1024 }
1025
1026 /*
1027 * Routines to panic if a pointer is not mapped to an expected zone.
1028 * This can be used as a means of pinning an object to the zone it is expected
1029 * to be a part of. Causes a panic if the address does not belong to the
1030 * specified zone, does not belong to any zone, has been freed and therefore
1031 * unmapped from the zone, or the pointer contains an uninitialized value that
1032 * does not belong to any zone.
1033 *
1034 * Note that this can only work with collectable zones without foreign pages.
1035 */
1036 void
1037 zone_require(zone_t zone, void *addr)
1038 {
1039 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1040 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1041 return;
1042 }
1043 #if CONFIG_GZALLOC
1044 if (__probable(gzalloc_enabled())) {
1045 return;
1046 }
1047 #endif
1048 zone_require_panic(zone, addr);
1049 }
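/*
 * Example use (hypothetical caller): pin an object to the zone it is expected
 * to come from before trusting it; vm_map_entry_zone is the extern declared
 * near the top of this file.
 *
 *     zone_require(vm_map_entry_zone, (void *)entry);
 *     // only reached if `entry` is backed by vm_map_entry_zone (or gzalloc
 *     // is enabled); otherwise zone_require_panic() fires
 */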
1050
1051 void
1052 zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
1053 {
1054 if (__probable(from_general_submap(addr, esize) &&
1055 (zid == zone_native_meta_from_addr(addr)->zm_index))) {
1056 return;
1057 }
1058 #if CONFIG_GZALLOC
1059 if (__probable(gzalloc_enabled())) {
1060 return;
1061 }
1062 #endif
1063 zone_id_require_panic(zid, addr);
1064 }
1065
1066 bool
1067 zone_owns(zone_t zone, void *addr)
1068 {
1069 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1070 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1071 return true;
1072 }
1073 #if CONFIG_GZALLOC
1074 if (__probable(gzalloc_enabled())) {
1075 return true;
1076 }
1077 #endif
1078 return false;
1079 }
1080
1081 #pragma mark ZTAGS
1082 #if VM_MAX_TAG_ZONES
1083
1084 // for zones with tagging enabled:
1085
1086 // calculate a pointer to the tag base entry,
1087 // holding either a uint32_t with the first tag offset for a page in the zone map,
1088 // or two uint16_t tags if the page can only hold one or two elements
1089
1090 #define ZTAGBASE(zone, element) \
1091 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
1092
1093 // pointer to the tag for an element
1094 #define ZTAG(zone, element) \
1095 ({ \
1096 vm_tag_t * result; \
1097 if ((zone)->tags_inline) { \
1098 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
1099 if ((page_mask & element) >= zone_elem_size(zone)) result++; \
1100 } else { \
1101 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \
1102 } \
1103 result; \
1104 })
1105
1106
1107 static vm_offset_t zone_tagbase_min;
1108 static vm_offset_t zone_tagbase_max;
1109 static vm_offset_t zone_tagbase_map_size;
1110 static vm_map_t zone_tagbase_map;
1111
1112 static vm_offset_t zone_tags_min;
1113 static vm_offset_t zone_tags_max;
1114 static vm_offset_t zone_tags_map_size;
1115 static vm_map_t zone_tags_map;
1116
1117 // simple heap allocator for allocating the tags for new memory
1118
1119 LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
1120
1121 enum{
1122 ztFreeIndexCount = 8,
1123 ztFreeIndexMax = (ztFreeIndexCount - 1),
1124 ztTagsPerBlock = 4
1125 };
1126
1127 struct ztBlock {
1128 #if __LITTLE_ENDIAN__
1129 uint64_t free:1,
1130 next:21,
1131 prev:21,
1132 size:21;
1133 #else
1134 // ztBlock needs free bit least significant
1135 #error !__LITTLE_ENDIAN__
1136 #endif
1137 };
1138 typedef struct ztBlock ztBlock;
1139
1140 static ztBlock * ztBlocks;
1141 static uint32_t ztBlocksCount;
1142 static uint32_t ztBlocksFree;
1143
1144 static uint32_t
1145 ztLog2up(uint32_t size)
1146 {
1147 if (1 == size) {
1148 size = 0;
1149 } else {
1150 size = 32 - __builtin_clz(size - 1);
1151 }
1152 return size;
1153 }
1154
1155 static uint32_t
1156 ztLog2down(uint32_t size)
1157 {
1158 size = 31 - __builtin_clz(size);
1159 return size;
1160 }
1161
1162 static void
1163 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
1164 {
1165 vm_map_offset_t addr = (vm_map_offset_t) address;
1166 vm_map_offset_t page, end;
1167
1168 page = trunc_page(addr);
1169 end = round_page(addr + size);
1170
1171 for (; page < end; page += page_size) {
1172 if (!pmap_find_phys(kernel_pmap, page)) {
1173 kern_return_t __unused
1174 ret = kernel_memory_populate(map, page, PAGE_SIZE,
1175 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
1176 assert(ret == KERN_SUCCESS);
1177 }
1178 }
1179 }
1180
1181 static boolean_t
1182 ztPresent(const void * address, size_t size)
1183 {
1184 vm_map_offset_t addr = (vm_map_offset_t) address;
1185 vm_map_offset_t page, end;
1186 boolean_t result;
1187
1188 page = trunc_page(addr);
1189 end = round_page(addr + size);
1190 for (result = TRUE; (page < end); page += page_size) {
1191 result = pmap_find_phys(kernel_pmap, page);
1192 if (!result) {
1193 break;
1194 }
1195 }
1196 return result;
1197 }
1198
1199
1200 void __unused
1201 ztDump(boolean_t sanity);
1202 void __unused
1203 ztDump(boolean_t sanity)
1204 {
1205 uint32_t q, cq, p;
1206
1207 for (q = 0; q <= ztFreeIndexMax; q++) {
1208 p = q;
1209 do{
1210 if (sanity) {
1211 cq = ztLog2down(ztBlocks[p].size);
1212 if (cq > ztFreeIndexMax) {
1213 cq = ztFreeIndexMax;
1214 }
1215 if (!ztBlocks[p].free
1216 || ((p != q) && (q != cq))
1217 || (ztBlocks[ztBlocks[p].next].prev != p)
1218 || (ztBlocks[ztBlocks[p].prev].next != p)) {
1219 kprintf("zterror at %d", p);
1220 ztDump(FALSE);
1221 kprintf("zterror at %d", p);
1222 assert(FALSE);
1223 }
1224 continue;
1225 }
1226 kprintf("zt[%03d]%c %d, %d, %d\n",
1227 p, ztBlocks[p].free ? 'F' : 'A',
1228 ztBlocks[p].next, ztBlocks[p].prev,
1229 ztBlocks[p].size);
1230 p = ztBlocks[p].next;
1231 if (p == q) {
1232 break;
1233 }
1234 }while (p != q);
1235 if (!sanity) {
1236 printf("\n");
1237 }
1238 }
1239 if (!sanity) {
1240 printf("-----------------------\n");
1241 }
1242 }
1243
1244
1245
1246 #define ZTBDEQ(idx) \
1247 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
1248 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
1249
1250 static void
1251 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
1252 {
1253 uint32_t q, w, p, size, merge;
1254
1255 assert(count);
1256 ztBlocksFree += count;
1257
1258 // merge with the following block (at index + count)
1259 merge = (index + count);
1260 if ((merge < ztBlocksCount)
1261 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1262 && ztBlocks[merge].free) {
1263 ZTBDEQ(merge);
1264 count += ztBlocks[merge].size;
1265 }
1266
1267 // merge with the preceding block (ending at index - 1)
1268 merge = (index - 1);
1269 if ((merge > ztFreeIndexMax)
1270 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1271 && ztBlocks[merge].free) {
1272 size = ztBlocks[merge].size;
1273 count += size;
1274 index -= size;
1275 ZTBDEQ(index);
1276 }
1277
1278 q = ztLog2down(count);
1279 if (q > ztFreeIndexMax) {
1280 q = ztFreeIndexMax;
1281 }
1282 w = q;
1283 // queue in order of size
1284 while (TRUE) {
1285 p = ztBlocks[w].next;
1286 if (p == q) {
1287 break;
1288 }
1289 if (ztBlocks[p].size >= count) {
1290 break;
1291 }
1292 w = p;
1293 }
1294 ztBlocks[p].prev = index;
1295 ztBlocks[w].next = index;
1296
1297 // fault in first
1298 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1299
1300 // mark first & last with free flag and size
1301 ztBlocks[index].free = TRUE;
1302 ztBlocks[index].size = count;
1303 ztBlocks[index].prev = w;
1304 ztBlocks[index].next = p;
1305 if (count > 1) {
1306 index += (count - 1);
1307 // fault in last
1308 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1309 ztBlocks[index].free = TRUE;
1310 ztBlocks[index].size = count;
1311 }
1312 }
1313
1314 static uint32_t
1315 ztAlloc(zone_t zone, uint32_t count)
1316 {
1317 uint32_t q, w, p, leftover;
1318
1319 assert(count);
1320
1321 q = ztLog2up(count);
1322 if (q > ztFreeIndexMax) {
1323 q = ztFreeIndexMax;
1324 }
1325 do{
1326 w = q;
1327 while (TRUE) {
1328 p = ztBlocks[w].next;
1329 if (p == q) {
1330 break;
1331 }
1332 if (ztBlocks[p].size >= count) {
1333 // dequeue, mark both ends allocated
1334 ztBlocks[w].next = ztBlocks[p].next;
1335 ztBlocks[ztBlocks[p].next].prev = w;
1336 ztBlocks[p].free = FALSE;
1337 ztBlocksFree -= ztBlocks[p].size;
1338 if (ztBlocks[p].size > 1) {
1339 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
1340 }
1341
1342 // fault all the allocation
1343 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
1344 // mark last as allocated
1345 if (count > 1) {
1346 ztBlocks[p + count - 1].free = FALSE;
1347 }
1348 // free remainder
1349 leftover = ztBlocks[p].size - count;
1350 if (leftover) {
1351 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
1352 }
1353
1354 return p;
1355 }
1356 w = p;
1357 }
1358 q++;
1359 }while (q <= ztFreeIndexMax);
1360
1361 return -1U;
1362 }
1363
1364 __startup_func
1365 static void
1366 zone_tagging_init(vm_size_t max_zonemap_size)
1367 {
1368 kern_return_t ret;
1369 vm_map_kernel_flags_t vmk_flags;
1370 uint32_t idx;
1371
1372 // allocate submaps VM_KERN_MEMORY_DIAG
1373
1374 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
1375 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1376 vmk_flags.vmkf_permanent = TRUE;
1377 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
1378 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1379 &zone_tagbase_map);
1380
1381 if (ret != KERN_SUCCESS) {
1382 panic("zone_init: kmem_suballoc failed");
1383 }
1384 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
1385
1386 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
1387 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1388 vmk_flags.vmkf_permanent = TRUE;
1389 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
1390 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1391 &zone_tags_map);
1392
1393 if (ret != KERN_SUCCESS) {
1394 panic("zone_init: kmem_suballoc failed");
1395 }
1396 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
1397
1398 ztBlocks = (ztBlock *) zone_tags_min;
1399 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
1400
1401 // initialize the qheads
1402 lck_mtx_lock(&ztLock);
1403
1404 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
1405 for (idx = 0; idx < ztFreeIndexCount; idx++) {
1406 ztBlocks[idx].free = TRUE;
1407 ztBlocks[idx].next = idx;
1408 ztBlocks[idx].prev = idx;
1409 ztBlocks[idx].size = 0;
1410 }
1411 // free remaining space
1412 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
1413
1414 lck_mtx_unlock(&ztLock);
1415 }
1416
1417 static void
1418 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1419 {
1420 uint32_t * tagbase;
1421 uint32_t count, block, blocks, idx;
1422 size_t pages;
1423
1424 pages = atop(size);
1425 tagbase = ZTAGBASE(zone, mem);
1426
1427 lck_mtx_lock(&ztLock);
1428
1429 // fault tagbase
1430 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1431
1432 if (!zone->tags_inline) {
1433 // allocate tags
1434 count = (uint32_t)(size / zone_elem_size(zone));
1435 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1436 block = ztAlloc(zone, blocks);
1437 if (-1U == block) {
1438 ztDump(false);
1439 }
1440 assert(-1U != block);
1441 }
1442
1443 lck_mtx_unlock(&ztLock);
1444
1445 if (!zone->tags_inline) {
1446 // set tag base for each page
1447 block *= ztTagsPerBlock;
1448 for (idx = 0; idx < pages; idx++) {
1449 vm_offset_t esize = zone_elem_size(zone);
1450 tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
1451 }
1452 }
1453 }
1454
1455 static void
1456 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1457 {
1458 uint32_t * tagbase;
1459 uint32_t count, block, blocks, idx;
1460 size_t pages;
1461
1462 // set tag base for each page
1463 pages = atop(size);
1464 tagbase = ZTAGBASE(zone, mem);
1465 block = tagbase[0];
1466 for (idx = 0; idx < pages; idx++) {
1467 tagbase[idx] = 0xFFFFFFFF;
1468 }
1469
1470 lck_mtx_lock(&ztLock);
1471 if (!zone->tags_inline) {
1472 count = (uint32_t)(size / zone_elem_size(zone));
1473 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1474 assert(block != 0xFFFFFFFF);
1475 block /= ztTagsPerBlock;
1476 ztFree(NULL /* zone is unlocked */, block, blocks);
1477 }
1478
1479 lck_mtx_unlock(&ztLock);
1480 }
1481
1482 uint32_t
1483 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1484 {
1485 simple_lock(&all_zones_lock, &zone_locks_grp);
1486
1487 zone_index_foreach(idx) {
1488 zone_t z = &zone_array[idx];
1489 if (!z->tags) {
1490 continue;
1491 }
1492 if (tag_zone_index != z->tag_zone_index) {
1493 continue;
1494 }
1495
1496 *elem_size = zone_elem_size(z);
1497 simple_unlock(&all_zones_lock);
1498 return idx;
1499 }
1500
1501 simple_unlock(&all_zones_lock);
1502
1503 return -1U;
1504 }
1505
1506 #endif /* VM_MAX_TAG_ZONES */
1507 #pragma mark zalloc helpers
1508
1509 const char *
1510 zone_name(zone_t z)
1511 {
1512 return z->z_name;
1513 }
1514
1515 const char *
1516 zone_heap_name(zone_t z)
1517 {
1518 if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
1519 return kalloc_heap_names[z->kalloc_heap];
1520 }
1521 return "invalid";
1522 }
1523
1524 static inline vm_size_t
1525 zone_submaps_approx_size(void)
1526 {
1527 vm_size_t size = 0;
1528
1529 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
1530 size += zone_submaps[idx]->size;
1531 }
1532
1533 return size;
1534 }
1535
1536 bool
1537 zone_maps_owned(vm_address_t addr, vm_size_t size)
1538 {
1539 return from_zone_map(addr, size);
1540 }
1541
1542 void
1543 zone_map_sizes(
1544 vm_map_size_t *psize,
1545 vm_map_size_t *pfree,
1546 vm_map_size_t *plargest_free)
1547 {
1548 vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free);
1549 }
1550
1551 vm_map_t
1552 zone_submap(zone_t zone)
1553 {
1554 return submap_for_zone(zone);
1555 }
1556
1557 unsigned
1558 zpercpu_count(void)
1559 {
1560 return zpercpu_early_count;
1561 }
1562
1563 int
1564 track_this_zone(const char *zonename, const char *logname)
1565 {
1566 unsigned int len;
1567 const char *zc = zonename;
1568 const char *lc = logname;
1569
1570 /*
1571 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1572 */
1573
1574 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1575 /*
1576 * If the current characters don't match, check for a space in
1577 * the zone name and a corresponding period in the log name.
1578 * If that's not there, then the strings don't match.
1579 */
1580
1581 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1582 break;
1583 }
1584
1585 /*
1586 * The strings are equal so far. If we're at the end, then it's a match.
1587 */
1588
1589 if (*zc == '\0') {
1590 return TRUE;
1591 }
1592 }
1593
1594 return FALSE;
1595 }
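/*
 * Example, assuming a zone named "vm map entries": spaces in the zone name
 * may match periods in the boot-arg, so
 * track_this_zone("vm map entries", "vm.map.entries") returns TRUE, while
 * track_this_zone("vm map entries", "vm.objects") returns FALSE.
 */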
1596
1597 #if DEBUG || DEVELOPMENT
1598
1599 vm_size_t
1600 zone_element_info(void *addr, vm_tag_t * ptag)
1601 {
1602 vm_size_t size = 0;
1603 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1604 struct zone_page_metadata *meta;
1605 struct zone *src_zone;
1606
1607 if (from_zone_map(addr, sizeof(void *))) {
1608 meta = zone_native_meta_from_addr(addr);
1609 src_zone = &zone_array[meta->zm_index];
1610 #if VM_MAX_TAG_ZONES
1611 if (__improbable(src_zone->tags)) {
1612 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1613 }
1614 #endif /* VM_MAX_TAG_ZONES */
1615 size = zone_elem_size(src_zone);
1616 } else {
1617 #if CONFIG_GZALLOC
1618 gzalloc_element_size(addr, NULL, &size);
1619 #endif /* CONFIG_GZALLOC */
1620 }
1621 *ptag = tag;
1622 return size;
1623 }
1624
1625 #endif /* DEBUG || DEVELOPMENT */
1626
1627 /* Someone wrote to freed memory. */
1628 __abortlike
1629 static void
1630 zone_element_was_modified_panic(
1631 zone_t zone,
1632 vm_offset_t element,
1633 vm_offset_t found,
1634 vm_offset_t expected,
1635 vm_offset_t offset)
1636 {
1637 panic("a freed zone element has been modified in zone %s%s: "
1638 "expected %p but found %p, bits changed %p, "
1639 "at offset %d of %d in element %p, cookies %p %p",
1640 zone_heap_name(zone),
1641 zone->z_name,
1642 (void *) expected,
1643 (void *) found,
1644 (void *) (expected ^ found),
1645 (uint32_t) offset,
1646 (uint32_t) zone_elem_size(zone),
1647 (void *) element,
1648 (void *) zp_nopoison_cookie,
1649 (void *) zp_poisoned_cookie);
1650 }
1651
1652 /* The backup pointer is stored in the last pointer-sized location in an element. */
1653 __header_always_inline vm_offset_t *
1654 get_backup_ptr(vm_size_t elem_size, vm_offset_t *element)
1655 {
1656 return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
1657 }
1658
1659 /*
1660 * The primary and backup pointers don't match.
1661 * Determine which one was likely the corrupted pointer, find out what it
1662 * probably should have been, and panic.
1663 */
1664 __abortlike
1665 static void
1666 backup_ptr_mismatch_panic(
1667 zone_t zone,
1668 struct zone_page_metadata *page_meta,
1669 vm_offset_t page,
1670 vm_offset_t element)
1671 {
1672 vm_offset_t primary = *(vm_offset_t *)element;
1673 vm_offset_t backup = *get_backup_ptr(zone_elem_size(zone), &element);
1674 vm_offset_t likely_backup;
1675 vm_offset_t likely_primary;
1676 zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone));
1677
1678 likely_primary = primary ^ zp_nopoison_cookie;
1679 boolean_t sane_backup;
1680 boolean_t sane_primary = zone_page_meta_is_sane_element(zone, page_meta,
1681 page, likely_primary, kind);
1682 boolean_t element_was_poisoned = (backup & 0x1);
1683
1684 #if defined(__LP64__)
1685 /* We can inspect the tag in the upper bits for additional confirmation */
1686 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1687 element_was_poisoned = TRUE;
1688 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1689 element_was_poisoned = FALSE;
1690 }
1691 #endif
1692
1693 if (element_was_poisoned) {
1694 likely_backup = backup ^ zp_poisoned_cookie;
1695 } else {
1696 likely_backup = backup ^ zp_nopoison_cookie;
1697 }
1698 sane_backup = zone_page_meta_is_sane_element(zone, page_meta,
1699 page, likely_backup, kind);
1700
1701 /* The primary is definitely the corrupted one */
1702 if (!sane_primary && sane_backup) {
1703 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1704 }
1705
1706 /* The backup is definitely the corrupted one */
1707 if (sane_primary && !sane_backup) {
1708 zone_element_was_modified_panic(zone, element, backup,
1709 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1710 zone_elem_size(zone) - sizeof(vm_offset_t));
1711 }
1712
1713 /*
1714 * Not sure which is the corrupted one.
1715 * It's less likely that the backup pointer was overwritten with
1716 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1717 * primary pointer has been overwritten with a sane but incorrect address.
1718 */
1719 if (sane_primary && sane_backup) {
1720 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1721 }
1722
1723 /* Neither are sane, so just guess. */
1724 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1725 }
1726
1727 /*
1728 * zone_sequestered_page_get
1729 * z is locked
1730 */
1731 static struct zone_page_metadata *
1732 zone_sequestered_page_get(zone_t z, vm_offset_t *page)
1733 {
1734 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
1735
1736 if (!zone_pva_is_null(z->pages_sequester)) {
1737 if (os_sub_overflow(z->sequester_page_count, z->alloc_pages,
1738 &z->sequester_page_count)) {
1739 zone_accounting_panic(z, "sequester_page_count wrap-around");
1740 }
1741 return zone_meta_queue_pop(z, &z->pages_sequester, kind, page);
1742 }
1743
1744 return NULL;
1745 }
1746
1747 /*
1748 * zone_sequestered_page_populate
1749 * z is unlocked
1750 * page_meta is invalid on failure
1751 */
1752 static kern_return_t
1753 zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta,
1754 vm_offset_t space, vm_size_t alloc_size, int zflags)
1755 {
1756 kern_return_t retval;
1757
1758 assert(alloc_size == ptoa(z->alloc_pages));
1759 retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size,
1760 zflags, VM_KERN_MEMORY_ZONE);
1761 if (retval != KERN_SUCCESS) {
1762 lock_zone(z);
1763 zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE);
1764 z->sequester_page_count += z->alloc_pages;
1765 unlock_zone(z);
1766 }
1767 return retval;
1768 }
1769
1770 #pragma mark Zone poisoning/zeroing
1771
1772 /*
1773 * Initialize zone poisoning
1774 * called from zone_bootstrap before any allocations are made from zalloc
1775 */
1776 __startup_func
1777 static void
1778 zp_bootstrap(void)
1779 {
1780 char temp_buf[16];
1781
1782 /*
1783 * Initialize backup pointer random cookie for poisoned elements
1784 * Try not to call early_random() back to back; it may return
1785 * the same value if mach_absolute_time doesn't have sufficient time
1786 * to tick over between calls. <rdar://problem/11597395>
1787 * (This is only a problem on embedded devices)
1788 */
1789 zp_poisoned_cookie = (uintptr_t) early_random();
1790
1791 /* -zp: enable poisoning for every alloc and free */
1792 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
1793 zp_factor = 1;
1794 }
1795
1796 /* -no-zp: disable poisoning */
1797 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
1798 zp_factor = 0;
1799 printf("Zone poisoning disabled\n");
1800 }
1801
1802 /* Initialize backup pointer random cookie for unpoisoned elements */
1803 zp_nopoison_cookie = (uintptr_t) early_random();
1804
1805 #if MACH_ASSERT
1806 if (zp_poisoned_cookie == zp_nopoison_cookie) {
1807 panic("early_random() is broken: %p and %p are not random\n",
1808 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
1809 }
1810 #endif
1811
1812 /*
1813 * Use the last bit in the backup pointer to hint poisoning state
1814 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
1815 * the low bits are zero.
1816 */
1817 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
1818 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
1819
1820 #if defined(__LP64__)
1821 /*
1822 * Make backup pointers more obvious in GDB for 64 bit
1823 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
1824 * (0xFACADE = 0xFFFFFF ^ 0x053521)
1825 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
1826 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
1827 * by the sanity check, so it's OK for that part of the cookie to be predictable.
1828 *
1829 * TODO: Use #defines, xors, and shifts
1830 */
1831
1832 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
1833 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
1834
1835 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
1836 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
1837 #endif
1838
1839 /*
1840 * Initialize zp_min_size to two cachelines. Elements smaller than this will
1841 * be zeroed.
1842 */
1843 ml_cpu_info_t cpu_info;
1844 ml_cpu_get_info(&cpu_info);
1845 zp_min_size = 2 * cpu_info.cache_line_size;
1846 }
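
/*
 * Illustrative sketch (not part of XNU, added for exposition): the LP64
 * masking above makes backup pointers easy to spot in a debugger. The top
 * three bytes of a valid zone pointer are 0xFFFFFF, so XOR-ing with a cookie
 * whose top three bytes were forced to 0x053521 (resp. 0x3f0011) yields a
 * value that starts with 0xFACADE (resp. 0xC0FFEE). The function name and
 * every concrete value below are made up for the demonstration.
 */
#if defined(__LP64__)
__attribute__((unused))
static void
zp_cookie_demo(void)
{
	uintptr_t cookie = (0x1122334455667788ULL & 0x000000FFFFFFFFFF) |
	    0x0535210000000000;                   /* poisoned-style cookie */
	uintptr_t elem   = 0xFFFFFF8012345678ULL; /* hypothetical zone pointer */
	uintptr_t backup = elem ^ cookie;         /* value stored as the backup */

	/* 0xFFFFFF ^ 0x053521 == 0xFACADE, so the tag lands in the top bytes */
	assert((backup >> 40) == 0xFACADE);
}
#endif /* __LP64__ */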
1847
1848 inline uint32_t
1849 zone_poison_count_init(zone_t zone)
1850 {
1851 return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
1852 (mach_absolute_time() & 0x7);
1853 }
1854
1855 #if ZALLOC_ENABLE_POISONING
1856 static bool
1857 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1858 {
1859 bool poison = false;
1860 uint32_t zp_count_local;
1861
1862 assert(!zone->percpu);
1863 if (zp_factor != 0) {
1864 /*
1865 * Poison the memory of every zp_count-th element before it ends up
1866 * on the freelist to catch use-after-free and use of uninitialized
1867 * memory.
1868 *
1869 * Every element is poisoned when zp_factor is set to 1.
1870 *
1871 */
1872 zp_count_local = os_atomic_load(zp_count, relaxed);
1873 if (__improbable(zp_count_local == 0 || zp_factor == 1)) {
1874 poison = true;
1875
1876 os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed);
1877
1878 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
1879 vm_offset_t *element_cursor = ((vm_offset_t *) elem);
1880 vm_offset_t *end_cursor = (vm_offset_t *)(elem + zone_elem_size(zone));
1881
1882 for (; element_cursor < end_cursor; element_cursor++) {
1883 *element_cursor = ZONE_POISON;
1884 }
1885 } else {
1886 os_atomic_store(zp_count, zp_count_local - 1, relaxed);
1887 /*
1888 * Zero the first zp_min_size bytes of elements that aren't being poisoned.
1889 * Element size is larger than zp_min_size in this path, as elements
1890 * that are smaller are always fully zeroed.
1891 */
1892 bzero((void *) elem, zp_min_size);
1893 }
1894 }
1895 return poison;
1896 }
1897 #else
1898 static bool
1899 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1900 {
1901 #pragma unused(zone, zp_count, elem)
1902 assert(!zone->percpu);
1903 return false;
1904 }
1905 #endif
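
/*
 * Minimal model (illustration only, not used by the allocator) of the
 * poison-then-verify cycle that zfree_poison_element() above and
 * zalloc_validate_element() below implement: on free, fill the element with
 * the ZONE_POISON pattern; on the next allocation, any word that no longer
 * matches the pattern indicates the element was modified while free.
 */
__attribute__((unused))
static bool
zp_poison_model(vm_offset_t elem, vm_size_t elem_size)
{
	vm_offset_t *cur = (vm_offset_t *)elem;
	vm_offset_t *end = (vm_offset_t *)(elem + elem_size);

	for (; cur < end; cur++) {              /* "free": poison every word */
		*cur = ZONE_POISON;
	}
	for (cur = (vm_offset_t *)elem; cur < end; cur++) {
		if (*cur != ZONE_POISON) {      /* "alloc": verify nothing changed */
			return false;           /* the real code panics here */
		}
	}
	return true;
}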
1906
1907 __attribute__((always_inline))
1908 static bool
1909 zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
1910 {
1911 assert(zone->zfree_clear_mem);
1912 if (zone->percpu) {
1913 zpercpu_foreach_cpu(i) {
1914 bzero((void *)(addr + ptoa(i)), elem_size);
1915 }
1916 } else {
1917 bzero((void *)addr, elem_size);
1918 }
1919
1920 return true;
1921 }
1922
1923 /*
1924 * Zero the element if zone has zfree_clear_mem flag set else poison
1925 * the element if zp_count hits 0.
1926 */
1927 __attribute__((always_inline))
1928 bool
1929 zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr)
1930 {
1931 vm_size_t elem_size = zone_elem_size(zone);
1932
1933 if (zone->zfree_clear_mem) {
1934 return zfree_clear(zone, addr, elem_size);
1935 }
1936
1937 return zfree_poison_element(zone, zp_count, (vm_offset_t)addr);
1938 }
1939
1940 /*
1941 * Clear out the old next pointer and backup to avoid leaking the zone
1942 * poisoning cookie and so that only values on the freelist have a valid
1943 * cookie.
1944 */
1945 void
1946 zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr)
1947 {
1948 vm_offset_t perm_value = 0;
1949
1950 if (!zone->zfree_clear_mem) {
1951 perm_value = ZONE_POISON;
1952 }
1953
1954 vm_offset_t *primary = (vm_offset_t *) addr;
1955 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
1956
1957 *primary = perm_value;
1958 *backup = perm_value;
1959 }
1960
1961 #if ZALLOC_ENABLE_POISONING
1962 __abortlike
1963 static void
1964 zone_element_not_clear_panic(zone_t zone, void *addr)
1965 {
1966 panic("Zone element %p was modified after free for zone %s%s: "
1967 "Expected element to be cleared", addr, zone_heap_name(zone),
1968 zone->z_name);
1969 }
1970
1971 /*
1972 * Validate that the element was not tampered with while it was in the
1973 * freelist.
1974 */
1975 void
1976 zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate)
1977 {
1978 if (zone->percpu) {
1979 assert(zone->zfree_clear_mem);
1980 zpercpu_foreach_cpu(i) {
1981 if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) {
1982 zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i)));
1983 }
1984 }
1985 } else if (zone->zfree_clear_mem) {
1986 if (memcmp_zero_ptr_aligned((void *)addr, size)) {
1987 zone_element_not_clear_panic(zone, (void *)addr);
1988 }
1989 } else if (__improbable(validate)) {
1990 const vm_offset_t *p = (vm_offset_t *)addr;
1991 const vm_offset_t *end = (vm_offset_t *)(addr + size);
1992
1993 for (; p < end; p++) {
1994 if (*p != ZONE_POISON) {
1995 zone_element_was_modified_panic(zone, addr,
1996 *p, ZONE_POISON, (vm_offset_t)p - addr);
1997 }
1998 }
1999 } else {
2000 /*
2001 * If element wasn't poisoned or entirely cleared, validate that the
2002 * minimum bytes that were cleared on free haven't been corrupted.
2003 * addr is advanced by ptr size as we have already validated and cleared
2004 * the freelist pointer/zcache canary.
2005 */
2006 if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)),
2007 zp_min_size - sizeof(vm_offset_t))) {
2008 zone_element_not_clear_panic(zone, (void *)addr);
2009 }
2010 }
2011 }
2012 #endif /* ZALLOC_ENABLE_POISONING */
2013
2014 #pragma mark Zone Leak Detection
2015
2016 /*
2017 * Zone leak debugging code
2018 *
2019 * When enabled, this code keeps a log to track allocations to a particular zone that have not
2020 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
2021 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
2022 * off by default.
2023 *
2024 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
2025 * is the name of the zone you wish to log.
2026 *
2027 * This code only tracks one zone, so you need to identify which one is leaking first.
2028 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
2029 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
2030 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
2031 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
2032 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
2033 * See the help in the kgmacros for usage info.
2034 *
2035 *
2036 * Zone corruption logging
2037 *
2038 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
2039 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
2040 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
2041 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
2042 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
2043 * corrupted to examine its history. This should lead to the source of the corruption.
2044 */
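
/*
 * Example (added for illustration; the zone names are hypothetical): to chase
 * a leak in a zone that zprint shows as bloated, one might boot with
 *
 *	zlog=vm.map.entries zrecs=2000
 *
 * and to investigate a corruption instead, switch the log to alloc+free
 * tracking with
 *
 *	-zc zlog=kalloc.48
 *
 * Both arguments are parsed below with PE_parse_boot_argn().
 */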
2045
2046 /* Returns TRUE if we rolled over the counter at factor */
2047 __header_always_inline bool
2048 sample_counter(volatile uint32_t *count_p, uint32_t factor)
2049 {
2050 uint32_t old_count, new_count = 0;
2051 if (count_p != NULL) {
2052 os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
2053 new_count = old_count + 1;
2054 if (new_count >= factor) {
2055 new_count = 0;
2056 }
2057 });
2058 }
2059
2060 return new_count == 0;
2061 }
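
/*
 * Usage sketch (illustration only, not called anywhere): sample_counter()
 * returns true once every `factor` calls, which is how the leak detector
 * samples roughly one out of every zleak_sample_factor allocations instead
 * of backtracing all of them.
 */
__attribute__((unused))
static uint32_t
sample_counter_demo(void)
{
	static volatile uint32_t counter;
	uint32_t sampled = 0;

	for (uint32_t i = 0; i < 1000; i++) {
		if (sample_counter(&counter, 100)) {
			sampled++;      /* taken on every 100th iteration */
		}
	}
	return sampled;                 /* 10 of the 1000 calls sample */
}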
2062
2063 #if ZONE_ENABLE_LOGGING
2064 /* Log allocations and frees to help debug a zone element corruption */
2065 TUNABLE(bool, corruption_debug_flag, "-zc", false);
2066
2067 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
2068
2069 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
2070 static int num_zones_logged = 0;
2071
2072 /*
2073 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
2074 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
2075 * is the number of stacks suspected of leaking, we don't need many records.
2076 */
2077
2078 #if defined(__LP64__)
2079 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
2080 #else
2081 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
2082 #endif
2083 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
2084
2085 static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
2086
2087 static void
2088 zone_enable_logging(zone_t z)
2089 {
2090 z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
2091 (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2092
2093 if (z->zlog_btlog) {
2094 printf("zone: logging started for zone %s%s\n",
2095 zone_heap_name(z), z->z_name);
2096 } else {
2097 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2098 z->zone_logging = false;
2099 }
2100 }
2101
2102 /**
2103 * @function zone_setup_logging
2104 *
2105 * @abstract
2106 * Optionally sets up a zone for logging.
2107 *
2108 * @discussion
2109 * We recognize two boot-args:
2110 *
2111 * zlog=<zone_to_log>
2112 * zrecs=<num_records_in_log>
2113 *
2114 * The zlog arg is used to specify the zone name that should be logged,
2115 * and zrecs is used to control the size of the log.
2116 *
2117 * If zrecs is not specified, a default value is used.
2118 */
2119 static void
2120 zone_setup_logging(zone_t z)
2121 {
2122 char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
2123 char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2124 char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */
2125
2126 /*
2127 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2128 *
2129 * This prevents accidentally hogging too much kernel memory
2130 * and making the system unusable.
2131 */
2132 if (log_records > ZRECORDS_MAX) {
2133 log_records = ZRECORDS_MAX;
2134 }
2135
2136 /*
2137 * Append kalloc heap name to zone name (if zone is used by kalloc)
2138 */
2139 snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
2140
2141 /* zlog0 isn't allowed. */
2142 for (int i = 1; i <= max_num_zones_to_log; i++) {
2143 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2144
2145 if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
2146 track_this_zone(zone_name, zlog_val)) {
2147 z->zone_logging = true;
2148 num_zones_logged++;
2149 break;
2150 }
2151 }
2152
2153 /*
2154 * Backwards compatibility with the old boot-arg used to specify single-zone
2155 * logging, i.e. zlog. This needs to happen after the newer zlogN checks
2156 * because the "zlog" prefix will also match all of the zlogN
2157 * boot-args.
2158 */
2159 if (!z->zone_logging &&
2160 PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
2161 track_this_zone(zone_name, zlog_val)) {
2162 z->zone_logging = true;
2163 num_zones_logged++;
2164 }
2165
2166
2167 /*
2168 * If we want to log a zone, see if we need to allocate buffer space for
2169 * the log.
2170 *
2171 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
2172 * we have to defer allocation in that case.
2173 *
2174 * zone_init() will finish the job.
2175 *
2176 * If we want to log one of the VM related zones that's set up early on,
2177 * we will skip allocation of the log until zinit is called again later
2178 * on some other zone.
2179 */
2180 if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
2181 zone_enable_logging(z);
2182 }
2183 }
2184
2185 /*
2186 * Each record in the log contains a pointer to the zone element it refers to,
2187 * and a small array to hold the pc's from the stack trace. A
2188 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
2189 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
2190 * If the log fills, old records are replaced as if it were a circular buffer.
2191 */
2192
2193
2194 /*
2195 * Decide if we want to log this zone by doing a string compare between a zone name and the name
2196 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
2197 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
2198 * match a space in the zone name.
2199 */
2200
2201 /*
2202 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
2203 * the buffer for the records has been allocated.
2204 */
2205
2206 #define DO_LOGGING(z) (z->zlog_btlog != NULL)
2207 #else /* !ZONE_ENABLE_LOGGING */
2208 #define DO_LOGGING(z) 0
2209 #endif /* !ZONE_ENABLE_LOGGING */
2210
2211 #if CONFIG_ZLEAKS
2212
2213 /*
2214 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
2215 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
2216 * backtrace. On every free, we examine the table to determine whether the allocation was being tracked,
2217 * and stop tracking it if it was.
2218 *
2219 * We track the allocations in the zallocations hash table, which stores the address that was returned from
2220 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
2221 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
2222 * backtraces - we don't store them more than once.
2223 *
2224 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
2225 * a large amount of virtual space.
2226 */
2227 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
2228 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
2229 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
2230 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
2231 uint32_t zleak_state = 0; /* State of collection, as above */
2232
2233 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
2234 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
2235 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
2236 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
2237
2238 /*
2239 * Counters for allocation statistics.
2240 */
2241
2242 /* Times two active records want to occupy the same spot */
2243 unsigned int z_alloc_collisions = 0;
2244 unsigned int z_trace_collisions = 0;
2245
2246 /* Times a new record lands on a spot previously occupied by a freed allocation */
2247 unsigned int z_alloc_overwrites = 0;
2248 unsigned int z_trace_overwrites = 0;
2249
2250 /* Times a new alloc or trace is put into the hash table */
2251 unsigned int z_alloc_recorded = 0;
2252 unsigned int z_trace_recorded = 0;
2253
2254 /* Times zleak_log returned false due to not being able to acquire the lock */
2255 unsigned int z_total_conflicts = 0;
2256
2257 /*
2258 * Structure for keeping track of an allocation
2259 * An allocation bucket is in use if its element is not NULL
2260 */
2261 struct zallocation {
2262 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
2263 vm_size_t za_size; /* how much memory did this allocation take up? */
2264 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
2265 /* TODO: #if this out */
2266 uint32_t za_hit_count; /* for determining effectiveness of hash function */
2267 };
2268
2269 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
2270 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
2271 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
2272
2273 vm_size_t zleak_max_zonemap_size;
2274
2275 /* Hashmaps of allocations and their corresponding traces */
2276 static struct zallocation* zallocations;
2277 static struct ztrace* ztraces;
2278
2279 /* not static so that panic can see this, see kern/debug.c */
2280 struct ztrace* top_ztrace;
2281
2282 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
2283 LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
2284 LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
2285
2286 /*
2287 * Initializes the zone leak monitor. Called from zone_init()
2288 */
2289 __startup_func
2290 static void
2291 zleak_init(vm_size_t max_zonemap_size)
2292 {
2293 char scratch_buf[16];
2294 boolean_t zleak_enable_flag = FALSE;
2295
2296 zleak_max_zonemap_size = max_zonemap_size;
2297 zleak_global_tracking_threshold = max_zonemap_size / 2;
2298 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
2299
2300 #if CONFIG_EMBEDDED
2301 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
2302 zleak_enable_flag = TRUE;
2303 printf("zone leak detection enabled\n");
2304 } else {
2305 zleak_enable_flag = FALSE;
2306 printf("zone leak detection disabled\n");
2307 }
2308 #else /* CONFIG_EMBEDDED */
2309 /* -zleakoff (flag to disable zone leak monitor) */
2310 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
2311 zleak_enable_flag = FALSE;
2312 printf("zone leak detection disabled\n");
2313 } else {
2314 zleak_enable_flag = TRUE;
2315 printf("zone leak detection enabled\n");
2316 }
2317 #endif /* CONFIG_EMBEDDED */
2318
2319 /* zfactor=XXXX (override how often to sample the zone allocator) */
2320 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
2321 printf("Zone leak factor override: %u\n", zleak_sample_factor);
2322 }
2323
2324 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
2325 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
2326 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
2327 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
2328 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
2329 printf("Override isn't a power of two, bad things might happen!\n");
2330 }
2331 }
2332
2333 /* zleak-traces=XXXX (override number of buckets in ztraces) */
2334 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
2335 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
2336 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
2337 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
2338 printf("Override isn't a power of two, bad things might happen!\n");
2339 }
2340 }
2341
2342 if (zleak_enable_flag) {
2343 zleak_state = ZLEAK_STATE_ENABLED;
2344 }
2345 }
2346
2347 /*
2348 * Support for kern.zleak.active sysctl - a simplified
2349 * version of the zleak_state variable.
2350 */
2351 int
2352 get_zleak_state(void)
2353 {
2354 if (zleak_state & ZLEAK_STATE_FAILED) {
2355 return -1;
2356 }
2357 if (zleak_state & ZLEAK_STATE_ACTIVE) {
2358 return 1;
2359 }
2360 return 0;
2361 }
2362
2363 kern_return_t
2364 zleak_activate(void)
2365 {
2366 kern_return_t retval;
2367 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
2368 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
2369 void *allocations_ptr = NULL;
2370 void *traces_ptr = NULL;
2371
2372 /* Only one thread attempts to activate at a time */
2373 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2374 return KERN_SUCCESS;
2375 }
2376
2377 /* Indicate that we're doing the setup */
2378 lck_spin_lock(&zleak_lock);
2379 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2380 lck_spin_unlock(&zleak_lock);
2381 return KERN_SUCCESS;
2382 }
2383
2384 zleak_state |= ZLEAK_STATE_ACTIVATING;
2385 lck_spin_unlock(&zleak_lock);
2386
2387 /* Allocate and zero tables */
2388 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
2389 if (retval != KERN_SUCCESS) {
2390 goto fail;
2391 }
2392
2393 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
2394 if (retval != KERN_SUCCESS) {
2395 goto fail;
2396 }
2397
2398 bzero(allocations_ptr, z_alloc_size);
2399 bzero(traces_ptr, z_trace_size);
2400
2401 /* Everything's set. Install tables, mark active. */
2402 zallocations = allocations_ptr;
2403 ztraces = traces_ptr;
2404
2405 /*
2406 * Initialize the top_ztrace to the first entry in ztraces,
2407 * so we don't have to check for null in zleak_log
2408 */
2409 top_ztrace = &ztraces[0];
2410
2411 /*
2412 * Note that we do need a barrier between installing
2413 * the tables and setting the active flag, because the zfree()
2414 * path accesses the table without a lock if we're active.
2415 */
2416 lck_spin_lock(&zleak_lock);
2417 zleak_state |= ZLEAK_STATE_ACTIVE;
2418 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2419 lck_spin_unlock(&zleak_lock);
2420
2421 return 0;
2422
2423 fail:
2424 /*
2425 * If we fail to allocate memory, don't further tax
2426 * the system by trying again.
2427 */
2428 lck_spin_lock(&zleak_lock);
2429 zleak_state |= ZLEAK_STATE_FAILED;
2430 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2431 lck_spin_unlock(&zleak_lock);
2432
2433 if (allocations_ptr != NULL) {
2434 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
2435 }
2436
2437 if (traces_ptr != NULL) {
2438 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
2439 }
2440
2441 return retval;
2442 }
2443
2444 /*
2445 * TODO: What about allocations that never get deallocated,
2446 * especially ones with unique backtraces? Should we wait to record
2447 * until after boot has completed?
2448 * (How many persistent zallocs are there?)
2449 */
2450
2451 /*
2452 * This function records the allocation in the allocations table,
2453 * and stores the associated backtrace in the traces table
2454 * (or just increments the refcount if the trace is already recorded).
2455 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
2456 * the associated trace's refcount is decremented.
2457 * If the trace slot is occupied by a different trace, it returns without recording.
2458 * The refcount is incremented by the amount of memory the allocation consumes.
2459 * The return value indicates whether to try again next time.
2460 */
2461 static boolean_t
2462 zleak_log(uintptr_t* bt,
2463 uintptr_t addr,
2464 uint32_t depth,
2465 vm_size_t allocation_size)
2466 {
2467 /* Quit if there's someone else modifying the hash tables */
2468 if (!lck_spin_try_lock(&zleak_lock)) {
2469 z_total_conflicts++;
2470 return FALSE;
2471 }
2472
2473 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2474
2475 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
2476 struct ztrace* trace = &ztraces[trace_index];
2477
2478 allocation->za_hit_count++;
2479 trace->zt_hit_count++;
2480
2481 /*
2482 * If the allocation bucket we want to be in is occupied, and if the occupier
2483 * has the same trace as us, just bail.
2484 */
2485 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
2486 z_alloc_collisions++;
2487
2488 lck_spin_unlock(&zleak_lock);
2489 return TRUE;
2490 }
2491
2492 /* STEP 1: Store the backtrace in the traces array. */
2493 /* A size of zero indicates that the trace bucket is free. */
2494
2495 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
2496 /*
2497 * Different unique trace with same hash!
2498 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
2499 * and get out of the way for later chances
2500 */
2501 trace->zt_collisions++;
2502 z_trace_collisions++;
2503
2504 lck_spin_unlock(&zleak_lock);
2505 return TRUE;
2506 } else if (trace->zt_size > 0) {
2507 /* Same trace, already added, so increment refcount */
2508 trace->zt_size += allocation_size;
2509 } else {
2510 /* Found an unused trace bucket, record the trace here! */
2511 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
2512 z_trace_overwrites++;
2513 }
2514
2515 z_trace_recorded++;
2516 trace->zt_size = allocation_size;
2517 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2518
2519 trace->zt_depth = depth;
2520 trace->zt_collisions = 0;
2521 }
2522
2523 /* STEP 2: Store the allocation record in the allocations array. */
2524
2525 if (allocation->za_element != (uintptr_t) 0) {
2526 /*
2527 * Straight up replace any allocation record that was there. We don't want to do the work
2528 * to preserve the allocation entries that were there, because we only record a subset of the
2529 * allocations anyways.
2530 */
2531
2532 z_alloc_collisions++;
2533
2534 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2535 /* Knock off old allocation's size, not the new allocation */
2536 associated_trace->zt_size -= allocation->za_size;
2537 } else if (allocation->za_trace_index != 0) {
2538 /* Slot previously used but not currently in use */
2539 z_alloc_overwrites++;
2540 }
2541
2542 allocation->za_element = addr;
2543 allocation->za_trace_index = trace_index;
2544 allocation->za_size = allocation_size;
2545
2546 z_alloc_recorded++;
2547
2548 if (top_ztrace->zt_size < trace->zt_size) {
2549 top_ztrace = trace;
2550 }
2551
2552 lck_spin_unlock(&zleak_lock);
2553 return TRUE;
2554 }
2555
2556 /*
2557 * Free the allocation record and release the stacktrace.
2558 * This should be as fast as possible because it will be called for every free.
2559 */
2560 __attribute__((noinline))
2561 static void
2562 zleak_free(uintptr_t addr,
2563 vm_size_t allocation_size)
2564 {
2565 if (addr == (uintptr_t) 0) {
2566 return;
2567 }
2568
2569 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2570
2571 /* Double-checked locking: check to find out if we're interested, lock, check to make
2572 * sure it hasn't changed, then modify it, and release the lock.
2573 */
2574
2575 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2576 /* if the allocation was the one, grab the lock, check again, then delete it */
2577 lck_spin_lock(&zleak_lock);
2578
2579 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2580 struct ztrace *trace;
2581
2582 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2583 if (allocation->za_size != allocation_size) {
2584 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2585 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2586 }
2587
2588 trace = &ztraces[allocation->za_trace_index];
2589
2590 /* size of 0 indicates trace bucket is unused */
2591 if (trace->zt_size > 0) {
2592 trace->zt_size -= allocation_size;
2593 }
2594
2595 /* A NULL element means the allocation bucket is unused */
2596 allocation->za_element = 0;
2597 }
2598 lck_spin_unlock(&zleak_lock);
2599 }
2600 }
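
/*
 * Shape of the double-checked locking used above, with a hypothetical record
 * type (illustration only): the unlocked check keeps the common case of an
 * untracked address free of lock traffic, and the re-check under the lock
 * protects against a racing update between the two reads.
 */
struct dcl_record {
	uintptr_t key;
	uintptr_t value;
};

__attribute__((unused))
static void
double_checked_clear(struct dcl_record *rec, uintptr_t key, lck_spin_t *lock)
{
	if (rec->key != key) {          /* cheap unlocked filter */
		return;
	}
	lck_spin_lock(lock);
	if (rec->key == key) {          /* re-check now that the lock is held */
		rec->value = 0;
		rec->key = 0;
	}
	lck_spin_unlock(lock);
}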
2601
2602 #endif /* CONFIG_ZLEAKS */
2603
2604 /* These functions are outside of CONFIG_ZLEAKS because they are also used in
2605 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
2606 */
2607
2608 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2609 uintptr_t
2610 hash_mix(uintptr_t x)
2611 {
2612 #ifndef __LP64__
2613 x += ~(x << 15);
2614 x ^= (x >> 10);
2615 x += (x << 3);
2616 x ^= (x >> 6);
2617 x += ~(x << 11);
2618 x ^= (x >> 16);
2619 #else
2620 x += ~(x << 32);
2621 x ^= (x >> 22);
2622 x += ~(x << 13);
2623 x ^= (x >> 8);
2624 x += (x << 3);
2625 x ^= (x >> 15);
2626 x += ~(x << 27);
2627 x ^= (x >> 31);
2628 #endif
2629 return x;
2630 }
2631
2632 uint32_t
2633 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2634 {
2635 uintptr_t hash = 0;
2636 uintptr_t mask = max_size - 1;
2637
2638 while (depth) {
2639 hash += bt[--depth];
2640 }
2641
2642 hash = hash_mix(hash) & mask;
2643
2644 assert(hash < max_size);
2645
2646 return (uint32_t) hash;
2647 }
2648
2649 /*
2650 * TODO: Determine how well distributed this is
2651 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0xFFFF, which makes a clean bitmask
2652 */
2653 uint32_t
2654 hashaddr(uintptr_t pt, uint32_t max_size)
2655 {
2656 uintptr_t hash = 0;
2657 uintptr_t mask = max_size - 1;
2658
2659 hash = hash_mix(pt) & mask;
2660
2661 assert(hash < max_size);
2662
2663 return (uint32_t) hash;
2664 }
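
/*
 * Illustration (not used by the allocator): for a power-of-two table size,
 * masking with (size - 1) is equivalent to reducing modulo the size, which
 * is why hashaddr() and hashbacktrace() require power-of-two bucket counts
 * and can mask instead of dividing.
 */
__attribute__((unused))
static void
hash_mask_demo(void)
{
	uint32_t  max_size = 0x10000;           /* 65536 buckets */
	uintptr_t h = hash_mix(0x12345678);

	assert((h & (max_size - 1)) == (h % max_size));
	assert((h & (max_size - 1)) < max_size);
}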
2665
2666 /* End of all leak-detection code */
2667 #pragma mark zone creation, configuration, destruction
2668
2669 static zone_t
2670 zone_init_defaults(zone_id_t zid)
2671 {
2672 zone_t z = &zone_array[zid];
2673
2674 z->page_count_max = ~0u;
2675 z->collectable = true;
2676 z->expandable = true;
2677 z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
2678
2679 simple_lock_init(&z->lock, 0);
2680
2681 return z;
2682 }
2683
2684 static bool
2685 zone_is_initializing(zone_t z)
2686 {
2687 return !z->z_self && !z->destroyed;
2688 }
2689
2690 static void
2691 zone_set_max(zone_t z, vm_size_t max)
2692 {
2693 #if KASAN_ZALLOC
2694 if (z->kasan_redzone) {
2695 /*
2696 * Adjust the max memory for the kasan redzones
2697 */
2698 max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
2699 }
2700 #endif
2701 if (max < z->percpu ? 1 : z->alloc_pages) {
2702 max = z->percpu ? 1 : z->alloc_pages;
2703 } else {
2704 max = atop(round_page(max));
2705 }
2706 z->page_count_max = max;
2707 }
2708
2709 void
2710 zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
2711 {
2712 if (!zone_is_initializing(zone)) {
2713 panic("%s: called after zone_create()", __func__);
2714 }
2715 if (sub_map_idx > zone_last_submap_idx) {
2716 panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
2717 }
2718 zone->submap_idx = sub_map_idx;
2719 }
2720
2721 void
2722 zone_set_noexpand(
2723 zone_t zone,
2724 vm_size_t max)
2725 {
2726 if (!zone_is_initializing(zone)) {
2727 panic("%s: called after zone_create()", __func__);
2728 }
2729 zone->expandable = false;
2730 zone_set_max(zone, max);
2731 }
2732
2733 void
2734 zone_set_exhaustible(
2735 zone_t zone,
2736 vm_size_t max)
2737 {
2738 if (!zone_is_initializing(zone)) {
2739 panic("%s: called after zone_create()", __func__);
2740 }
2741 zone->expandable = false;
2742 zone->exhaustible = true;
2743 zone_set_max(zone, max);
2744 }
2745
2746 /**
2747 * @function zone_create_find
2748 *
2749 * @abstract
2750 * Finds an unused zone for the given name and element size.
2751 *
2752 * @param name the zone name
2753 * @param size the element size (including redzones, ...)
2754 * @param flags the flags passed to @c zone_create*
2755 * @param zid the desired zone ID or ZONE_ID_ANY
2756 *
2757 * @returns a zone to initialize further.
2758 */
2759 static zone_t
2760 zone_create_find(
2761 const char *name,
2762 vm_size_t size,
2763 zone_create_flags_t flags,
2764 zone_id_t zid)
2765 {
2766 zone_id_t nzones;
2767 zone_t z;
2768
2769 simple_lock(&all_zones_lock, &zone_locks_grp);
2770
2771 nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
2772 assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
2773
2774 if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
2775 /*
2776 * The first time around, make sure the reserved zone IDs
2777 * have an initialized lock as zone_index_foreach() will
2778 * enumerate them.
2779 */
2780 while (nzones < ZONE_ID__FIRST_DYNAMIC) {
2781 zone_init_defaults(nzones++);
2782 }
2783
2784 os_atomic_store(&num_zones, nzones, release);
2785 }
2786
2787 if (zid != ZONE_ID_ANY) {
2788 if (zid >= ZONE_ID__FIRST_DYNAMIC) {
2789 panic("zone_create: invalid desired zone ID %d for %s",
2790 zid, name);
2791 }
2792 if (flags & ZC_DESTRUCTIBLE) {
2793 panic("zone_create: ID %d (%s) must be permanent", zid, name);
2794 }
2795 if (zone_array[zid].z_self) {
2796 panic("zone_create: creating zone ID %d (%s) twice", zid, name);
2797 }
2798 z = &zone_array[zid];
2799 } else {
2800 if (flags & ZC_DESTRUCTIBLE) {
2801 /*
2802 * If possible, find a previously zdestroy'ed zone in the
2803 * zone_array that we can reuse.
2804 */
2805 for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
2806 i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
2807 z = &zone_array[i];
2808
2809 /*
2810 * If the zone name and the element size are the
2811 * same, we can just reuse the old zone struct.
2812 */
2813 if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
2814 continue;
2815 }
2816 bitmap_clear(zone_destroyed_bitmap, i);
2817 z->destroyed = false;
2818 z->z_self = z;
2819 zid = (zone_id_t)i;
2820 goto out;
2821 }
2822 }
2823
2824 zid = nzones++;
2825 z = zone_init_defaults(zid);
2826
2827 /*
2828 * The release barrier pairs with the acquire in
2829 * zone_index_foreach() and makes sure that enumeration loops
2830 * always see an initialized zone lock.
2831 */
2832 os_atomic_store(&num_zones, nzones, release);
2833 }
2834
2835 out:
2836 num_zones_in_use++;
2837 simple_unlock(&all_zones_lock);
2838
2839 return z;
2840 }
2841
2842 __abortlike
2843 static void
2844 zone_create_panic(const char *name, const char *f1, const char *f2)
2845 {
2846 panic("zone_create: creating zone %s: flag %s and %s are incompatible",
2847 name, f1, f2);
2848 }
2849 #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
2850 if ((flags) & forbidden_flag) { \
2851 zone_create_panic(name, #current_flag, #forbidden_flag); \
2852 }
2853
2854 /*
2855 * Adjusts the size of the element based on minimum size, alignment
2856 * and kasan redzones
2857 */
2858 static vm_size_t
2859 zone_elem_adjust_size(
2860 const char *name __unused,
2861 vm_size_t elem_size,
2862 zone_create_flags_t flags,
2863 vm_size_t *redzone __unused)
2864 {
2865 vm_size_t size;
2866 /*
2867 * Adjust element size for minimum size and pointer alignment
2868 */
2869 size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
2870 if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
2871 size = ZONE_MIN_ELEM_SIZE;
2872 }
2873
2874 #if KASAN_ZALLOC
2875 /*
2876 * Expand the zone allocation size to include the redzones.
2877 *
2878 * For page-multiple zones add a full guard page because they
2879 * likely require alignment.
2880 */
2881 vm_size_t redzone_tmp;
2882 if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
2883 redzone_tmp = 0;
2884 } else if ((size & PAGE_MASK) == 0) {
2885 if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
2886 panic("zone_create: zone %s can't provide more than PAGE_SIZE"
2887 "alignment", name);
2888 }
2889 redzone_tmp = PAGE_SIZE;
2890 } else if (flags & ZC_ALIGNMENT_REQUIRED) {
2891 redzone_tmp = 0;
2892 } else {
2893 redzone_tmp = KASAN_GUARD_SIZE;
2894 }
2895 size += redzone_tmp * 2;
2896 if (redzone) {
2897 *redzone = redzone_tmp;
2898 }
2899 #endif
2900 return size;
2901 }
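
/*
 * Sketch of the rounding used above (illustration only): adding
 * (alignment - 1) and masking with -alignment rounds a size up to the next
 * multiple of that alignment. With 8-byte pointers, a hypothetical 13-byte
 * element becomes 16 bytes before the minimum-size check is applied.
 */
__attribute__((unused))
static vm_size_t
round_to_pointer_demo(vm_size_t elem_size)
{
	vm_size_t rounded = (elem_size + sizeof(vm_offset_t) - 1) &
	    -sizeof(vm_offset_t);

	assert(rounded >= elem_size);
	assert(rounded % sizeof(vm_offset_t) == 0);
	return rounded;         /* e.g. round_to_pointer_demo(13) == 16 on LP64 */
}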
2902
2903 /*
2904 * Returns the allocation chunk size that has the least fragmentation
2905 */
2906 static vm_size_t
2907 zone_get_min_alloc_granule(
2908 vm_size_t elem_size,
2909 zone_create_flags_t flags)
2910 {
2911 vm_size_t alloc_granule = PAGE_SIZE;
2912 if (flags & ZC_PERCPU) {
2913 alloc_granule = PAGE_SIZE * zpercpu_count();
2914 if (PAGE_SIZE % elem_size > 256) {
2915 panic("zone_create: per-cpu zone has too much fragmentation");
2916 }
2917 } else if ((elem_size & PAGE_MASK) == 0) {
2918 /* zero fragmentation by definition */
2919 alloc_granule = elem_size;
2920 } else if (alloc_granule % elem_size == 0) {
2921 /* zero fragmentation by definition */
2922 } else {
2923 vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
2924 vm_size_t alloc_tmp = PAGE_SIZE;
2925 while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
2926 vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
2927 if (frag_tmp < frag) {
2928 frag = frag_tmp;
2929 alloc_granule = alloc_tmp;
2930 }
2931 }
2932 }
2933 return alloc_granule;
2934 }
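
/*
 * Worked example for the search above (illustration only, assuming 4K pages
 * and a hypothetical 1200-byte element): a 1-page chunk wastes 496 bytes
 * (12%), 2 pages waste 992 (12%), 3 pages waste 288 (2%) and 5 pages waste
 * only 80 bytes, so the loop scans every chunk size up to
 * ZONE_MAX_ALLOC_SIZE and keeps the one with the smallest integer waste
 * percentage. The helper below just exposes that percentage.
 */
__attribute__((unused))
static vm_size_t
chunk_waste_percent_demo(vm_size_t chunk_size, vm_size_t elem_size)
{
	/*
	 * integer waste percentage, exactly as computed by the loop above
	 * (elem_size is assumed to be non-zero), e.g.:
	 *
	 *	chunk_waste_percent_demo(4096, 1200)  == 12
	 *	chunk_waste_percent_demo(12288, 1200) == 2
	 *	chunk_waste_percent_demo(20480, 1200) == 0
	 */
	return (chunk_size % elem_size) * 100 / chunk_size;
}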
2935
2936 vm_size_t
2937 zone_get_foreign_alloc_size(
2938 const char *name __unused,
2939 vm_size_t elem_size,
2940 zone_create_flags_t flags,
2941 uint16_t min_pages)
2942 {
2943 vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
2944 NULL);
2945 vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
2946 flags);
2947 vm_size_t min_size = min_pages * PAGE_SIZE;
2948 /*
2949 * Round up min_size to a multiple of alloc_granule
2950 */
2951 return ((min_size + alloc_granule - 1) / alloc_granule)
2952 * alloc_granule;
2953 }
2954
2955 zone_t
2956 zone_create_ext(
2957 const char *name,
2958 vm_size_t size,
2959 zone_create_flags_t flags,
2960 zone_id_t desired_zid,
2961 void (^extra_setup)(zone_t))
2962 {
2963 vm_size_t alloc;
2964 vm_size_t redzone;
2965 zone_t z;
2966
2967 if (size > ZONE_MAX_ALLOC_SIZE) {
2968 panic("zone_create: element size too large: %zd", (size_t)size);
2969 }
2970
2971 size = zone_elem_adjust_size(name, size, flags, &redzone);
2972 /*
2973 * Allocate the zone slot, return early if we found an older match.
2974 */
2975 z = zone_create_find(name, size, flags, desired_zid);
2976 if (__improbable(z->z_self)) {
2977 /* We found a zone to reuse */
2978 return z;
2979 }
2980
2981 /*
2982 * Initialize the zone properly.
2983 */
2984
2985 /*
2986 * If the kernel is post lockdown, copy the zone name passed in.
2987 * Else simply maintain a pointer to the name string as it can only
2988 * be a core XNU zone (no unloadable kext exists before lockdown).
2989 */
2990 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2991 size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2992 char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
2993 strlcpy(buf, name, nsz);
2994 z->z_name = buf;
2995 } else {
2996 z->z_name = name;
2997 }
2998 /*
2999 * If zone_init() hasn't run yet, the permanent zones do not exist.
3000 * We can limp along without properly initialized stats for a while,
3001 * zone_init() will rebuild the missing stats when it runs.
3002 */
3003 if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
3004 z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
3005 }
3006
3007 alloc = zone_get_min_alloc_granule(size, flags);
3008
3009 if (flags & ZC_KALLOC_HEAP) {
3010 size_t rem = (alloc % size) / (alloc / size);
3011
3012 /*
3013 * Try to grow the element size and spread elements more if the remaining
3014 * space is large enough.
3015 */
3016 size += rem & ~(KALLOC_MINALIGN - 1);
3017 }
3018
3019 z->pcpu_elem_size = z->z_elem_size = (uint16_t)size;
3020 z->alloc_pages = (uint16_t)atop(alloc);
3021 #if KASAN_ZALLOC
3022 z->kasan_redzone = redzone;
3023 if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
3024 z->kasan_fakestacks = true;
3025 }
3026 #endif
3027
3028 /*
3029 * Handle KPI flags
3030 */
3031 #if __LP64__
3032 if (flags & ZC_SEQUESTER) {
3033 z->va_sequester = true;
3034 }
3035 #endif
3036 /* ZC_CACHING applied after all configuration is done */
3037
3038 if (flags & ZC_PERCPU) {
3039 /*
3040 * ZC_CACHING is disallowed because it uses per-cpu zones for its
3041 * implementation and it would be circular. These allocations are
3042 * also quite expensive, so caching feels dangerous memory-wise too.
3043 *
3044 * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
3045 * pointer-sized allocations which poisoning doesn't support.
3046 */
3047 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING);
3048 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
3049 z->percpu = true;
3050 z->gzalloc_exempt = true;
3051 z->zfree_clear_mem = true;
3052 z->pcpu_elem_size *= zpercpu_count();
3053 }
3054 if (flags & ZC_ZFREE_CLEARMEM) {
3055 z->zfree_clear_mem = true;
3056 }
3057 if (flags & ZC_NOGC) {
3058 z->collectable = false;
3059 }
3060 if (flags & ZC_NOENCRYPT) {
3061 z->noencrypt = true;
3062 }
3063 if (flags & ZC_ALIGNMENT_REQUIRED) {
3064 z->alignment_required = true;
3065 }
3066 if (flags & ZC_NOGZALLOC) {
3067 z->gzalloc_exempt = true;
3068 }
3069 if (flags & ZC_NOCALLOUT) {
3070 z->no_callout = true;
3071 }
3072 if (flags & ZC_DESTRUCTIBLE) {
3073 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING);
3074 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
3075 z->destructible = true;
3076 }
3077
3078 /*
3079 * Handle Internal flags
3080 */
3081 if (flags & ZC_ALLOW_FOREIGN) {
3082 z->allows_foreign = true;
3083 }
3084 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3085 (flags & ZC_DATA_BUFFERS)) {
3086 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3087 }
3088 if (flags & ZC_KASAN_NOQUARANTINE) {
3089 z->kasan_noquarantine = true;
3090 }
3091 /* ZC_KASAN_NOREDZONE already handled */
3092
3093 /*
3094 * Then if there's extra tuning, do it
3095 */
3096 if (extra_setup) {
3097 extra_setup(z);
3098 }
3099
3100 /*
3101 * Configure debugging features
3102 */
3103 #if CONFIG_GZALLOC
3104 gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
3105 #endif
3106 #if ZONE_ENABLE_LOGGING
3107 if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
3108 /*
3109 * Check for and set up zone leak detection if requested via boot-args.
3110 * might set z->zone_logging
3111 */
3112 zone_setup_logging(z);
3113 }
3114 #endif /* ZONE_ENABLE_LOGGING */
3115 #if VM_MAX_TAG_ZONES
3116 if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
3117 static int tag_zone_index;
3118 vm_offset_t esize = zone_elem_size(z);
3119 z->tags = true;
3120 z->tags_inline = (((page_size + esize - 1) / esize) <=
3121 (sizeof(uint32_t) / sizeof(uint16_t)));
3122 z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
3123 assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
3124 }
3125 #endif
3126
3127 /*
3128 * Finally, fixup properties based on security policies, boot-args, ...
3129 */
3130 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3131 z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
3132 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3133 }
3134 #if __LP64__
3135 if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
3136 (flags & ZC_NOSEQUESTER) == 0 &&
3137 z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) {
3138 z->va_sequester = true;
3139 }
3140 #endif
3141 /*
3142 * Always clear zone elements smaller than a cacheline,
3143 * because it's pretty close to free.
3144 */
3145 if (size <= zp_min_size) {
3146 z->zfree_clear_mem = true;
3147 }
3148 if (zp_factor != 0 && !z->zfree_clear_mem) {
3149 z->zp_count = zone_poison_count_init(z);
3150 }
3151
3152 #if CONFIG_ZCACHE
3153 if ((flags & ZC_NOCACHING) == 0) {
3154 /*
3155 * Append kalloc heap name to zone name (if zone is used by kalloc)
3156 */
3157 char temp_zone_name[MAX_ZONE_NAME] = "";
3158 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3159
3160 /* Check if boot-arg specified it should have a cache */
3161 if (track_this_zone(temp_zone_name, cache_zone_name)) {
3162 flags |= ZC_CACHING;
3163 } else if (zcc_kalloc && z->kalloc_heap) {
3164 flags |= ZC_CACHING;
3165 }
3166 }
3167 if ((flags & ZC_CACHING) &&
3168 !z->tags && !z->zone_logging && !z->gzalloc_tracked) {
3169 zcache_init(z);
3170 }
3171 #endif /* CONFIG_ZCACHE */
3172
3173 lock_zone(z);
3174 z->z_self = z;
3175 unlock_zone(z);
3176
3177 return z;
3178 }
3179
3180 __startup_func
3181 void
3182 zone_create_startup(struct zone_create_startup_spec *spec)
3183 {
3184 *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
3185 spec->z_flags, spec->z_zid, spec->z_setup);
3186 }
3187
3188 /*
3189 * The first 4 fields of a zone_view and a zone alias, so that the zone_or_view_t
3190 * union works. Trust but verify.
3191 */
3192 #define zalloc_check_zov_alias(f1, f2) \
3193 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
3194 zalloc_check_zov_alias(z_self, zv_zone);
3195 zalloc_check_zov_alias(z_stats, zv_stats);
3196 zalloc_check_zov_alias(z_name, zv_name);
3197 zalloc_check_zov_alias(z_views, zv_next);
3198 #undef zalloc_check_zov_alias
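
/*
 * Generic shape of the check above, with hypothetical types (illustration
 * only): when two structures are meant to be read through a common union,
 * a static_assert on offsetof() turns accidental layout drift into a
 * compile-time error instead of a silent type-punning bug.
 */
struct zov_demo_a { void *first; unsigned long second; };
struct zov_demo_b { void *head;  unsigned long count; };
static_assert(offsetof(struct zov_demo_a, second) ==
    offsetof(struct zov_demo_b, count));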
3199
3200 __startup_func
3201 void
3202 zone_view_startup_init(struct zone_view_startup_spec *spec)
3203 {
3204 struct kalloc_heap *heap = NULL;
3205 zone_view_t zv = spec->zv_view;
3206 zone_t z;
3207
3208 switch (spec->zv_heapid) {
3209 case KHEAP_ID_DEFAULT:
3210 heap = KHEAP_DEFAULT;
3211 break;
3212 case KHEAP_ID_DATA_BUFFERS:
3213 heap = KHEAP_DATA_BUFFERS;
3214 break;
3215 case KHEAP_ID_KEXT:
3216 heap = KHEAP_KEXT;
3217 break;
3218 default:
3219 heap = NULL;
3220 }
3221
3222 if (heap) {
3223 z = kalloc_heap_zone_for_size(heap, spec->zv_size);
3224 assert(z);
3225 } else {
3226 z = spec->zv_zone;
3227 assert(spec->zv_size <= zone_elem_size(z));
3228 }
3229
3230 zv->zv_zone = z;
3231 zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
3232 zv->zv_next = z->z_views;
3233 if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
3234 /*
3235 * count the raw view for zones not in a heap,
3236 * kalloc_heap_init() already counts it for its members.
3237 */
3238 zone_view_count += 2;
3239 } else {
3240 zone_view_count += 1;
3241 }
3242 z->z_views = zv;
3243 }
3244
3245 zone_t
3246 zone_create(
3247 const char *name,
3248 vm_size_t size,
3249 zone_create_flags_t flags)
3250 {
3251 return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
3252 }
3253
3254 zone_t
3255 zinit(
3256 vm_size_t size, /* the size of an element */
3257 vm_size_t max, /* maximum memory to use */
3258 vm_size_t alloc __unused, /* allocation size */
3259 const char *name) /* a name for the zone */
3260 {
3261 zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
3262 zone_set_max(z, max);
3263 return z;
3264 }
3265
3266 void
3267 zdestroy(zone_t z)
3268 {
3269 unsigned int zindex = zone_index(z);
3270
3271 lock_zone(z);
3272
3273 if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) {
3274 panic("zdestroy: Zone %s%s isn't destructible",
3275 zone_heap_name(z), z->z_name);
3276 }
3277
3278 if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv ||
3279 z->async_pending || z->waiting) {
3280 panic("zdestroy: Zone %s%s in an invalid state for destruction",
3281 zone_heap_name(z), z->z_name);
3282 }
3283
3284 #if !KASAN_ZALLOC
3285 /*
3286 * Unset the valid bit. We'll hit an assert failure on further operations
3287 * on this zone, until zinit() is called again.
3288 *
3289 * Leave the zone valid for KASan as we will see zfree's on quarantined free
3290 * elements even after the zone is destroyed.
3291 */
3292 z->z_self = NULL;
3293 #endif
3294 z->destroyed = true;
3295 unlock_zone(z);
3296
3297 /* Dump all the free elements */
3298 zone_drop_free_elements(z);
3299
3300 #if CONFIG_GZALLOC
3301 if (__improbable(z->gzalloc_tracked)) {
3302 /* If the zone is gzalloc managed dump all the elements in the free cache */
3303 gzalloc_empty_free_cache(z);
3304 }
3305 #endif
3306
3307 lock_zone(z);
3308
3309 while (!zone_pva_is_null(z->pages_sequester)) {
3310 struct zone_page_metadata *page_meta;
3311 vm_offset_t free_addr;
3312
3313 page_meta = zone_sequestered_page_get(z, &free_addr);
3314 unlock_zone(z);
3315 kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages));
3316 lock_zone(z);
3317 }
3318
3319 #if !KASAN_ZALLOC
3320 /* Assert that all counts are zero */
3321 if (z->countavail || z->countfree || zone_size_wired(z) ||
3322 z->allfree_page_count || z->sequester_page_count) {
3323 panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
3324 zone_heap_name(z), z->z_name);
3325 }
3326
3327 /* consistency check: make sure everything is indeed empty */
3328 assert(zone_pva_is_null(z->pages_any_free_foreign));
3329 assert(zone_pva_is_null(z->pages_all_used_foreign));
3330 assert(zone_pva_is_null(z->pages_all_free));
3331 assert(zone_pva_is_null(z->pages_intermediate));
3332 assert(zone_pva_is_null(z->pages_all_used));
3333 assert(zone_pva_is_null(z->pages_sequester));
3334 #endif
3335
3336 unlock_zone(z);
3337
3338 simple_lock(&all_zones_lock, &zone_locks_grp);
3339
3340 assert(!bitmap_test(zone_destroyed_bitmap, zindex));
3341 /* Mark the zone as empty in the bitmap */
3342 bitmap_set(zone_destroyed_bitmap, zindex);
3343 num_zones_in_use--;
3344 assert(num_zones_in_use > 0);
3345
3346 simple_unlock(&all_zones_lock);
3347 }
3348
3349 #pragma mark zone (re)fill, jetsam
3350
3351 /*
3352 * Dealing with zone allocations from the mach VM code.
3353 *
3354 * The implementation of the mach VM itself uses the zone allocator
3355 * for things like the vm_map_entry data structure. In order to prevent
3356 * an infinite recursion problem when adding more pages to a zone, zalloc
3357 * uses a replenish thread to refill the VM layer's zones before they have
3358 * too few remaining free entries. The reserved remaining free entries
3359 * guarantee that the VM routines can get entries from already mapped pages.
3360 *
3361 * In order for that to work, the number of allocations in the nested
3362 * case has to be bounded. There are currently 2 replenish zones, and
3363 * if each needs 1 element of each zone to add a new page to itself, that
3364 * gives us a minimum reserve of 2 elements.
3365 *
3366 * There is also a deadlock issue with the zone garbage collection thread,
3367 * or any thread that is trying to free zone pages. While holding
3368 * the kernel's map lock they may need to allocate new VM map entries, hence
3369 * we need enough reserve to allow them to get past the point of holding the
3370 * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
3371 * until the replenish threads can finish. Since there's only 1 GC thread at a time,
3372 * that adds a minimum of 1 to the reserve size.
3373 *
3374 * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (from ARM)
3375 * as the refill size on all platforms.
3376 *
3377 * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2,
3378 * zalloc_ext() will wake the replenish thread. The replenish thread runs
3379 * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
3380 * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4
3381 * elements left. Below that point only the replenish threads themselves and the GC
3382 * thread may continue to use from the reserve.
3383 */
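
/*
 * Simplified sketch of the reserve policy described above (illustration
 * only; the real logic lives in zalloc_ext() and zone_replenish_thread(),
 * and thread privilege is reduced to a bool here): decide whether a caller
 * may take from the reserve and whether the replenish thread needs a wakeup,
 * given the zone's free count and its refill target.
 */
__attribute__((unused))
static bool
replenish_policy_demo(uint32_t countfree, uint32_t refill_target,
    bool zone_priv_thread, bool *wake_replenish)
{
	*wake_replenish = (countfree < refill_target / 2);

	if (countfree > refill_target / 4) {
		return true;    /* any thread may still allocate */
	}
	/* below a quarter of the target, only replenish/GC threads may dip in */
	return zone_priv_thread;
}
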
3384 static unsigned zone_replenish_loops;
3385 static unsigned zone_replenish_wakeups;
3386 static unsigned zone_replenish_wakeups_initiated;
3387 static unsigned zone_replenish_throttle_count;
3388
3389 #define ZONE_REPLENISH_TARGET (16 * 1024)
3390 static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */
3391 static unsigned zone_replenish_max_threads = 0;
3392
3393 LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock");
3394 LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp);
3395
3396 __abortlike
3397 static void
3398 zone_replenish_panic(zone_t zone, kern_return_t kr)
3399 {
3400 panic_include_zprint = TRUE;
3401 #if CONFIG_ZLEAKS
3402 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3403 panic_include_ztrace = TRUE;
3404 }
3405 #endif /* CONFIG_ZLEAKS */
3406 if (kr == KERN_NO_SPACE) {
3407 zone_t zone_largest = zone_find_largest();
3408 panic("zalloc: zone map exhausted while allocating from zone %s%s, "
3409 "likely due to memory leak in zone %s%s "
3410 "(%lu total bytes, %d elements allocated)",
3411 zone_heap_name(zone), zone->z_name,
3412 zone_heap_name(zone_largest), zone_largest->z_name,
3413 (unsigned long)zone_size_wired(zone_largest),
3414 zone_count_allocated(zone_largest));
3415 }
3416 panic("zalloc: %s%s (%d elements) retry fail %d",
3417 zone_heap_name(zone), zone->z_name,
3418 zone_count_allocated(zone), kr);
3419 }
3420
3421 static void
3422 zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously)
3423 {
3424 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3425 vm_offset_t space, alloc_size;
3426 uint32_t retry = 0;
3427 kern_return_t kr;
3428
3429 if (z->noencrypt) {
3430 kmaflags |= KMA_NOENCRYPT;
3431 }
3432 if (flags & Z_NOPAGEWAIT) {
3433 kmaflags |= KMA_NOPAGEWAIT;
3434 }
3435 if (z->permanent) {
3436 kmaflags |= KMA_PERMANENT;
3437 }
3438
3439 for (;;) {
3440 struct zone_page_metadata *page_meta = NULL;
3441
3442 /*
3443 * Try to allocate our regular chunk of pages,
3444 * unless the system is under massive pressure
3445 * and we're looking for more than 2 pages.
3446 */
3447 if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) {
3448 alloc_size = round_page(zone_elem_size(z));
3449 } else {
3450 alloc_size = ptoa(z->alloc_pages);
3451 page_meta = zone_sequestered_page_get(z, &space);
3452 }
3453
3454 unlock_zone(z);
3455
3456 #if CONFIG_ZLEAKS
3457 /*
3458 * Do the zone leak activation here because zleak_activate()
3459 * may block, and can't be done on the way out.
3460 */
3461 if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) {
3462 if (!(zleak_state & ZLEAK_STATE_ACTIVE) &&
3463 zone_submaps_approx_size() >= zleak_global_tracking_threshold) {
3464 kr = zleak_activate();
3465 if (kr != KERN_SUCCESS) {
3466 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3467 }
3468 }
3469 }
3470 #endif /* CONFIG_ZLEAKS */
3471
3472 /*
3473 * Trigger jetsams via the vm_pageout_garbage_collect thread if
3474 * we're running out of zone memory
3475 */
3476 if (is_zone_map_nearing_exhaustion()) {
3477 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3478 }
3479
3480 if (page_meta) {
3481 kr = zone_sequestered_page_populate(z, page_meta, space,
3482 alloc_size, kmaflags);
3483 } else {
3484 if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) {
3485 kmaflags |= KMA_KHEAP;
3486 }
3487 kr = kernel_memory_allocate(submap_for_zone(z),
3488 &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3489 }
3490
3491 #if !__LP64__
3492 if (kr == KERN_NO_SPACE && z->allows_foreign) {
3493 /*
3494 * For zones allowing foreign pages, fall back to the kernel map
3495 */
3496 kr = kernel_memory_allocate(kernel_map, &space,
3497 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3498 }
3499 #endif
3500
3501 if (kr == KERN_SUCCESS) {
3502 break;
3503 }
3504
3505 if (flags & Z_NOPAGEWAIT) {
3506 lock_zone(z);
3507 return;
3508 }
3509
3510 if (asynchronously) {
3511 assert_wait_timeout(&z->prio_refill_count,
3512 THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
3513 thread_block(THREAD_CONTINUE_NULL);
3514 } else if (++retry == 3) {
3515 zone_replenish_panic(z, kr);
3516 }
3517
3518 lock_zone(z);
3519 }
3520
3521 zcram_and_lock(z, space, alloc_size);
3522
3523 #if CONFIG_ZLEAKS
3524 if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
3525 if (!z->zleak_on &&
3526 zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
3527 z->zleak_on = true;
3528 }
3529 }
3530 #endif /* CONFIG_ZLEAKS */
3531 }
3532
3533 /*
3534 * High priority VM privileged thread used to asynchronously refill a given zone.
3535 * These are needed for data structures used by the lower level VM itself. The
3536 * replenish thread maintains a reserve of elements, so that the VM will never
3537 * block in the zone allocator.
3538 */
3539 __dead2
3540 static void
3541 zone_replenish_thread(void *_z, wait_result_t __unused wr)
3542 {
3543 zone_t z = _z;
3544
3545 current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV);
3546
3547 for (;;) {
3548 lock_zone(z);
3549 assert(z->z_self == z);
3550 assert(z->zone_replenishing);
3551 assert(z->prio_refill_count != 0);
3552
3553 while (z->countfree < z->prio_refill_count) {
3554 assert(!z->expanding_no_vm_priv);
3555 assert(!z->expanding_vm_priv);
3556
3557 zone_replenish_locked(z, Z_WAITOK, true);
3558
3559 assert(z->z_self == z);
3560 zone_replenish_loops++;
3561 }
3562
3563 /* Wakeup any potentially throttled allocations. */
3564 thread_wakeup(z);
3565
3566 assert_wait(&z->prio_refill_count, THREAD_UNINT);
3567
3568 /*
3569 * We finished refilling the zone, so decrement the active count
3570 * and wake up any waiting GC threads.
3571 */
3572 lck_spin_lock(&zone_replenish_lock);
3573 assert(zone_replenish_active > 0);
3574 if (--zone_replenish_active == 0) {
3575 thread_wakeup((event_t)&zone_replenish_active);
3576 }
3577 lck_spin_unlock(&zone_replenish_lock);
3578
3579 z->zone_replenishing = false;
3580 unlock_zone(z);
3581
3582 thread_block(THREAD_CONTINUE_NULL);
3583 zone_replenish_wakeups++;
3584 }
3585 }
3586
3587 void
3588 zone_prio_refill_configure(zone_t z)
3589 {
3590 thread_t th;
3591 kern_return_t tres;
3592
3593 lock_zone(z);
3594 assert(!z->prio_refill_count && !z->destructible);
3595 z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
3596 z->zone_replenishing = true;
3597 unlock_zone(z);
3598
3599 lck_spin_lock(&zone_replenish_lock);
3600 ++zone_replenish_max_threads;
3601 ++zone_replenish_active;
3602 lck_spin_unlock(&zone_replenish_lock);
3603 OSMemoryBarrier();
3604
3605 tres = kernel_thread_start_priority(zone_replenish_thread, z,
3606 MAXPRI_KERNEL, &th);
3607 if (tres != KERN_SUCCESS) {
3608 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
3609 }
3610
3611 thread_deallocate(th);
3612 }
3613
3614 static void
3615 zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta,
3616 vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer)
3617 {
3618 const vm_size_t elem_size = zone_elem_size(zone);
3619 vm_offset_t left, right, head, base;
3620 vm_offset_t element;
3621
3622 left = ZONE_PAGE_FIRST_OFFSET(kind);
3623 right = size - ((size - left) % elem_size);
3624 head = 0;
3625 base = zone_meta_to_addr(meta, kind);
3626
3627 while (left < right) {
3628 if (zone_leaks_scan_enable || __improbable(zone->tags) ||
3629 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
3630 element = base + left;
3631 left += elem_size;
3632 } else {
3633 right -= elem_size;
3634 element = base + right;
3635 }
3636
3637 vm_offset_t *primary = (vm_offset_t *)element;
3638 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
3639
3640 *primary = *backup = head ^ zp_nopoison_cookie;
3641 head = element;
3642 }
3643
3644 meta->zm_freelist_offs = (uint16_t)(head - base);
3645 }
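
/*
 * A minimal, self-contained sketch of the XOR-cookie freelist encoding that
 * zone_randomize_freelist(), zfree_direct_locked() and zalloc_direct_locked()
 * rely on.  The demo_* names and cookie values below are hypothetical and are
 * only meant to illustrate the scheme; the block is guarded out so it is
 * never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>
#include <stdbool.h>

static const uintptr_t demo_cookie_nopoison = 0x5a5a5a5a5a5a5a5aUL;
static const uintptr_t demo_cookie_poisoned = 0xa5a5a5a5a5a5a5a5UL;

/* Write both the primary and the backup next-pointer of a free element. */
static void
demo_link(uintptr_t *primary, uintptr_t *backup, uintptr_t next, bool poisoned)
{
	*primary = next ^ demo_cookie_nopoison;
	*backup  = next ^ (poisoned ? demo_cookie_poisoned : demo_cookie_nopoison);
}

/* Recover the next pointer and report whether the element was poisoned. */
static uintptr_t
demo_unlink(const uintptr_t *primary, const uintptr_t *backup, bool *poisoned)
{
	uintptr_t next = *primary ^ demo_cookie_nopoison;

	*poisoned = false;
	if (*primary != *backup) {
		if (next != (*backup ^ demo_cookie_poisoned)) {
			return 0;           /* neither cookie matches: corruption */
		}
		*poisoned = true;           /* backup carried the poisoned cookie */
	}
	return next;
}
#endif /* illustration only */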
3646
3647 /*
3648 * Cram the given memory into the specified zone. Update the zone page count accordingly.
3649 */
3650 static void
3651 zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size)
3652 {
3653 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
3654 struct zone_page_metadata *meta;
3655 zone_addr_kind_t kind;
3656 uint32_t pg_count = (uint32_t)atop(size);
3657 uint32_t zindex = zone_index(zone);
3658 uint32_t free_count;
3659 uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
3660
3661 /* Basic sanity checks */
3662 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
3663 assert((newmem & PAGE_MASK) == 0);
3664 assert((size & PAGE_MASK) == 0);
3665
3666 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START,
3667 zindex, size);
3668
3669 kind = zone_addr_kind(newmem, size);
3670 #if DEBUG || DEVELOPMENT
3671 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
3672 kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone,
3673 zone_heap_name(zone), zone->z_name, (uintptr_t)newmem,
3674 kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size);
3675 }
3676 #endif /* DEBUG || DEVELOPMENT */
3677
3678 /*
3679 * Initialize the metadata for all pages. We don't need the zone lock
3680 * here because we are not manipulating any zone related state yet.
3681 *
3682 * This includes randomizing the freelists as the metadata isn't
3683 * published yet.
3684 */
3685
3686 if (kind == ZONE_ADDR_NATIVE) {
3687 /*
3688 * We're being called by zfill,
3689 * zone_replenish_thread or vm_page_more_fictitious,
3690 *
3691 * each of which only allocates either a single page or
3692 * `alloc_pages` worth at a time.
3693 */
3694 assert(pg_count <= zone->alloc_pages);
3695
3696 /*
3697 * Make sure the range of metadata entries we're about to init
3698 * has proper physical backing, then initialize them.
3699 */
3700 meta = zone_meta_from_addr(newmem, kind);
3701 zone_meta_populate(meta, meta + pg_count);
3702
3703 if (zone->permanent) {
3704 empty_freelist_offs = 0;
3705 }
3706
3707 meta[0] = (struct zone_page_metadata){
3708 .zm_index = zindex,
3709 .zm_page_count = pg_count,
3710 .zm_percpu = zone->percpu,
3711 .zm_freelist_offs = empty_freelist_offs,
3712 };
3713
3714 for (uint32_t i = 1; i < pg_count; i++) {
3715 meta[i] = (struct zone_page_metadata){
3716 .zm_index = zindex,
3717 .zm_page_count = i,
3718 .zm_percpu = zone->percpu,
3719 .zm_secondary_page = true,
3720 .zm_freelist_offs = empty_freelist_offs,
3721 };
3722 }
3723
3724 if (!zone->permanent) {
3725 zone_randomize_freelist(zone, meta,
3726 zone->percpu ? PAGE_SIZE : size, kind, entropy_buffer);
3727 }
3728 } else {
3729 if (!zone->allows_foreign || !from_foreign_range(newmem, size)) {
3730 panic("zcram_and_lock: foreign memory [%lx] being crammed is "
3731 "outside of foreign range", (uintptr_t)newmem);
3732 }
3733
3734 /*
3735 * We cannot support elements larger than page size for foreign
3736 * memory because we put metadata on the page itself for each
3737 * page of foreign memory.
3738 *
3739 * We need to do this in order to be able to reach the metadata
3740 * when any element is freed.
3741 */
3742 assert(!zone->percpu && !zone->permanent);
3743 assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata));
3744
3745 bzero((void *)newmem, size);
3746
3747 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3748 meta = (struct zone_page_metadata *)(newmem + offs);
3749 *meta = (struct zone_page_metadata){
3750 .zm_index = zindex,
3751 .zm_page_count = 1,
3752 .zm_freelist_offs = empty_freelist_offs,
3753 };
3754 meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE;
3755 zone_randomize_freelist(zone, meta, PAGE_SIZE, kind,
3756 entropy_buffer);
3757 }
3758 }
3759
3760 #if VM_MAX_TAG_ZONES
3761 if (__improbable(zone->tags)) {
3762 assert(kind == ZONE_ADDR_NATIVE && !zone->percpu);
3763 ztMemoryAdd(zone, newmem, size);
3764 }
3765 #endif /* VM_MAX_TAG_ZONES */
3766
3767 /*
3768 * Insert the initialized pages / metadatas into the right lists.
3769 */
3770
3771 lock_zone(zone);
3772 assert(zone->z_self == zone);
3773
3774 zone->page_count += pg_count;
3775 if (zone->page_count_hwm < zone->page_count) {
3776 zone->page_count_hwm = zone->page_count;
3777 }
3778 os_atomic_add(&zones_phys_page_count, pg_count, relaxed);
3779
3780 if (kind == ZONE_ADDR_NATIVE) {
3781 os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed);
3782 if (zone->permanent) {
3783 zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind);
3784 } else {
3785 zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind);
3786 zone->allfree_page_count += meta->zm_page_count;
3787 }
3788 free_count = zone_elem_count(zone, size, kind);
3789 zone->countfree += free_count;
3790 zone->countavail += free_count;
3791 } else {
3792 free_count = zone_elem_count(zone, PAGE_SIZE, kind);
3793 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3794 meta = (struct zone_page_metadata *)(newmem + offs);
3795 zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind);
3796 zone->countfree += free_count;
3797 zone->countavail += free_count;
3798 }
3799 }
3800
3801 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex);
3802 }
3803
3804 void
3805 zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
3806 {
3807 zcram_and_lock(zone, newmem, size);
3808 unlock_zone(zone);
3809 }
3810
3811 /*
3812 * Fill a zone with enough memory to contain at least nelem elements.
3813 * Return the number of elements actually put into the zone, which may
3814 * be more than the caller asked for since the memory allocation is
3815 * rounded up to the next zone allocation size.
3816 */
3817 int
3818 zfill(
3819 zone_t zone,
3820 int nelem)
3821 {
3822 kern_return_t kr;
3823 vm_offset_t memory;
3824
3825 vm_size_t alloc_size = ptoa(zone->alloc_pages);
3826 vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE);
3827 vm_size_t nalloc = 0, goal = MAX(0, nelem);
3828 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3829
3830 if (zone->noencrypt) {
3831 kmaflags |= KMA_NOENCRYPT;
3832 }
3833
3834 assert(!zone->allows_foreign && !zone->permanent);
3835
3836 /*
3837 * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
3838 * running out of zone memory
3839 */
3840 if (is_zone_map_nearing_exhaustion()) {
3841 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3842 }
3843
3844 if (zone->va_sequester) {
3845 lock_zone(zone);
3846
3847 do {
3848 struct zone_page_metadata *page_meta;
3849 page_meta = zone_sequestered_page_get(zone, &memory);
3850 if (NULL == page_meta) {
3851 break;
3852 }
3853 unlock_zone(zone);
3854
3855 kr = zone_sequestered_page_populate(zone, page_meta,
3856 memory, alloc_size, kmaflags);
3857 if (KERN_SUCCESS != kr) {
3858 goto out_nolock;
3859 }
3860
3861 zcram_and_lock(zone, memory, alloc_size);
3862 nalloc += nalloc_inc;
3863 } while (nalloc < goal);
3864
3865 unlock_zone(zone);
3866 }
3867
3868 out_nolock:
3869 while (nalloc < goal) {
3870 kr = kernel_memory_allocate(submap_for_zone(zone), &memory,
3871 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3872 if (kr != KERN_SUCCESS) {
3873 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
3874 __func__, (unsigned long)(nalloc * alloc_size));
3875 break;
3876 }
3877
3878 zcram(zone, memory, alloc_size);
3879 nalloc += nalloc_inc;
3880 }
3881
3882 return (int)nalloc;
3883 }
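
/*
 * Hedged sketch of zfill()'s rounding behavior: memory arrives in chunks of
 * zone->alloc_pages pages, each holding `per_chunk` elements, so the count
 * actually added rounds `nelem` up to a multiple of `per_chunk` (assuming no
 * allocation failure and ignoring the VA-sequestered path).  demo_zfill_count()
 * is a hypothetical stand-in, never compiled.
 */
#if 0 /* illustration only */
static int
demo_zfill_count(int nelem, int per_chunk)
{
	int nalloc = 0;

	while (nalloc < nelem) {
		nalloc += per_chunk;    /* one zcram() of alloc_size bytes */
	}
	return nalloc;                  /* e.g. nelem = 100, per_chunk = 32 -> 128 */
}
#endif /* illustration only */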
3884
3885 /*
3886 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3887 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3888 */
3889 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3890
3891 /*
3892 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3893 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3894 */
3895 TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
3896 ZONE_MAP_JETSAM_LIMIT_DEFAULT);
3897
3898 void
3899 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3900 {
3901 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3902 *current_size = ptoa_64(phys_pages);
3903 *capacity = zone_phys_mapped_max;
3904 }
3905
3906 void
3907 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3908 {
3909 zone_t largest_zone = zone_find_largest();
3910
3911 /*
3912 * Append kalloc heap name to zone name (if zone is used by kalloc)
3913 */
3914 snprintf(zone_name, zone_name_len, "%s%s",
3915 zone_heap_name(largest_zone), largest_zone->z_name);
3916
3917 *zone_size = zone_size_wired(largest_zone);
3918 }
3919
3920 boolean_t
3921 is_zone_map_nearing_exhaustion(void)
3922 {
3923 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3924 return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
3925 }
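
/*
 * Worked sketch of the check above: with zone_map_jetsam_limit at its default
 * of 95, a 2GiB physical budget starts triggering jetsams once more than
 * roughly 1.9GiB of zone pages are mapped.  demo_nearing_exhaustion() is a
 * hypothetical stand-in using the same integer arithmetic, never compiled.
 */
#if 0 /* illustration only */
#include <stdbool.h>
#include <stdint.h>

static bool
demo_nearing_exhaustion(uint64_t mapped_bytes, uint64_t capacity_bytes,
    unsigned int limit_pct)
{
	return mapped_bytes > (capacity_bytes * limit_pct) / 100;
}
/* demo_nearing_exhaustion(2040ULL << 20, 2048ULL << 20, 95) is true */
#endif /* illustration only */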
3926
3927
3928 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3929
3930 /*
3931 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3932 * to walk through the jetsam priority bands and kill processes.
3933 */
3934 static void
3935 kill_process_in_largest_zone(void)
3936 {
3937 pid_t pid = -1;
3938 zone_t largest_zone = zone_find_largest();
3939
3940 printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
3941 ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max),
3942 ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)),
3943 (uint64_t)zone_submaps_approx_size(),
3944 (uint64_t)zone_range_size(&zone_info.zi_map_range),
3945 zone_map_jetsam_limit);
3946 printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
3947 largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
3948
3949 /*
3950 * We want to make sure we don't call this function from userspace,
3951 * or we could end up trying to synchronously kill the process
3952 * whose context we're in, causing the system to hang.
3953 */
3954 assert(current_task() == kernel_task);
3955
3956 /*
3957 * If vm_object_zone is the largest, check to see if the number of
3958 * elements in vm_map_entry_zone is comparable.
3959 *
3960 * If so, consider vm_map_entry_zone as the largest. This lets us target
3961 * a specific process to jetsam to quickly recover from the zone map
3962 * bloat.
3963 */
3964 if (largest_zone == vm_object_zone) {
3965 unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
3966 unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
3967 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3968 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3969 largest_zone = vm_map_entry_zone;
3970 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
3971 (uintptr_t)zone_size_wired(largest_zone));
3972 }
3973 }
3974
3975 /* TODO: Extend this to check for the largest process in other zones as well. */
3976 if (largest_zone == vm_map_entry_zone) {
3977 pid = find_largest_process_vm_map_entries();
3978 } else {
3979 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
3980 "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
3981 largest_zone->z_name);
3982 }
3983 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3984 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3985 }
3986 }
3987
3988 #pragma mark zalloc module init
3989
3990 /*
3991 * Initialize the "zone of zones" which uses fixed memory allocated
3992 * earlier in memory initialization. zone_bootstrap is called
3993 * before zone_init.
3994 */
3995 __startup_func
3996 void
3997 zone_bootstrap(void)
3998 {
3999 /* Validate struct zone_page_metadata expectations */
4000 if ((1U << ZONE_PAGECOUNT_BITS) <
4001 atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) {
4002 panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
4003 }
4004
4005 /* Validate struct zone_packed_virtual_address expectations */
4006 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
4007 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
4008 panic("zone_pva_t can't pack a kernel page address in 31 bits");
4009 }
4010
4011 zpercpu_early_count = ml_early_cpu_max_number() + 1;
4012
4013 /* Set up zone element poisoning */
4014 zp_bootstrap();
4015
4016 random_bool_init(&zone_bool_gen);
4017
4018 /*
4019 * The KASAN quarantine for kalloc doesn't understand heaps
4020 * and trips the heap confusion panics. At the end of the day,
4021 * all these security measures do double duty with KASAN.
4022 *
4023 * On 32bit kernels, these protections are just too expensive.
4024 */
4025 #if !defined(__LP64__) || KASAN_ZALLOC
4026 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
4027 zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
4028 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
4029 #endif
4030
4031 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
4032
4033 #if CONFIG_ZCACHE
4034 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
4035 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
4036 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
4037 }
4038 #endif /* CONFIG_ZCACHE */
4039 }
4040
4041 #if __LP64__
4042 #if CONFIG_EMBEDDED
4043 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024)
4044 #else
4045 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024)
4046 #endif
4047 #endif /* __LP64__ */
4048
4049 #define SINGLE_GUARD 16384
4050 #define MULTI_GUARD (3 * SINGLE_GUARD)
4051
4052 #if __LP64__
4053 static inline vm_offset_t
4054 zone_restricted_va_max(void)
4055 {
4056 vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
4057 vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
4058
4059 return trunc_page(MIN(compressor_max, vm_page_max));
4060 }
4061 #endif
4062
4063 __startup_func
4064 static void
4065 zone_tunables_fixup(void)
4066 {
4067 if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
4068 zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
4069 }
4070 }
4071 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
4072
4073 __startup_func
4074 static vm_size_t
4075 zone_phys_size_max(void)
4076 {
4077 mach_vm_size_t zsize;
4078 vm_size_t zsizearg;
4079
4080 if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
4081 zsize = zsizearg * (1024ULL * 1024);
4082 } else {
4083 zsize = sane_size >> 2; /* Set target zone size as 1/4 of physical memory */
4084 #if defined(__LP64__)
4085 zsize += zsize >> 1;
4086 #endif /* __LP64__ */
4087 }
4088
4089 if (zsize < CONFIG_ZONE_MAP_MIN) {
4090 zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */
4091 }
4092 if (zsize > sane_size >> 1) {
4093 zsize = sane_size >> 1; /* Clamp to half of RAM max */
4094 }
4095 if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
4096 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
4097 vm_size_t orig_zsize = zsize;
4098 zsize = ZONE_MAP_MAX;
4099 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
4100 (uintptr_t)orig_zsize, (uintptr_t)zsize);
4101 }
4102
4103 assert((vm_size_t) zsize == zsize);
4104 return (vm_size_t)trunc_page(zsize);
4105 }
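
/*
 * Worked example of the default sizing above (no "zsize" boot-arg), assuming
 * an LP64 device with 8GiB of usable RAM: 8GiB / 4 = 2GiB, plus half of that
 * again gives a 3GiB target, which already sits inside the
 * [CONFIG_ZONE_MAP_MIN, RAM / 2] clamp.  demo_default_zsize() is a
 * hypothetical stand-in, never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>

static uint64_t
demo_default_zsize(uint64_t sane_size, uint64_t zmin, uint64_t zmax)
{
	uint64_t zsize = sane_size >> 2;        /* 1/4 of physical memory   */

	zsize += zsize >> 1;                    /* LP64: grow to 3/8 of RAM */
	if (zsize < zmin) {
		zsize = zmin;
	}
	if (zsize > sane_size >> 1) {
		zsize = sane_size >> 1;         /* never more than half of RAM */
	}
	return zsize < zmax ? zsize : zmax;     /* platform maximum clip    */
}
#endif /* illustration only */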
4106
4107 __startup_func
4108 static struct zone_map_range
4109 zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
4110 {
4111 struct zone_map_range r;
4112 kern_return_t kr;
4113
4114 if (guard) {
4115 vm_map_offset_t addr = *submap_min;
4116 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4117
4118 vmk_flags.vmkf_permanent = TRUE;
4119 kr = vm_map_enter(kernel_map, &addr, size, 0,
4120 VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
4121 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
4122 *submap_min = (vm_offset_t)addr;
4123 } else {
4124 kr = kernel_memory_allocate(kernel_map, submap_min, size,
4125 0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
4126 }
4127 if (kr != KERN_SUCCESS) {
4128 panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
4129 (uintptr_t)*submap_min, (size_t)size, kr);
4130 }
4131
4132 r.min_address = *submap_min;
4133 *submap_min += size;
4134 r.max_address = *submap_min;
4135
4136 return r;
4137 }
4138
4139 __startup_func
4140 static void
4141 zone_submap_init(
4142 vm_offset_t *submap_min,
4143 unsigned idx,
4144 uint64_t zone_sub_map_numer,
4145 uint64_t *remaining_denom,
4146 vm_offset_t *remaining_size,
4147 vm_size_t guard_size)
4148 {
4149 vm_offset_t submap_start, submap_end;
4150 vm_size_t submap_size;
4151 vm_map_t submap;
4152 kern_return_t kr;
4153
4154 submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
4155 *remaining_denom);
4156 submap_start = *submap_min;
4157 submap_end = submap_start + submap_size;
4158
4159 #if defined(__LP64__)
4160 if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
4161 vm_offset_t restricted_va_max = zone_restricted_va_max();
4162 if (submap_end > restricted_va_max) {
4163 #if DEBUG || DEVELOPMENT
4164 printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
4165 (size_t)(restricted_va_max - submap_start) >> 20,
4166 (size_t)submap_size >> 20);
4167 #endif /* DEBUG || DEVELOPMENT */
4168 guard_size += submap_end - restricted_va_max;
4169 *remaining_size -= submap_end - restricted_va_max;
4170 submap_end = restricted_va_max;
4171 submap_size = restricted_va_max - submap_start;
4172 }
4173
4174 vm_packing_verify_range("vm_compressor",
4175 submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
4176 vm_packing_verify_range("vm_page",
4177 submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
4178 }
4179 #endif /* defined(__LP64__) */
4180
4181 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4182 vmk_flags.vmkf_permanent = TRUE;
4183 kr = kmem_suballoc(kernel_map, submap_min, submap_size,
4184 FALSE, VM_FLAGS_FIXED, vmk_flags,
4185 VM_KERN_MEMORY_ZONE, &submap);
4186 if (kr != KERN_SUCCESS) {
4187 panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
4188 idx, (void *)submap_start, (void *)submap_end, kr);
4189 }
4190
4191 #if DEBUG || DEVELOPMENT
4192 printf("zone_init: submap[%d] %p:%p (%zuM)\n",
4193 idx, (void *)submap_start, (void *)submap_end,
4194 (size_t)submap_size >> 20);
4195 #endif /* DEBUG || DEVELOPMENT */
4196
4197 zone_submaps[idx] = submap;
4198 *submap_min = submap_end;
4199 *remaining_size -= submap_size;
4200 *remaining_denom -= zone_sub_map_numer;
4201
4202 zone_init_allocate_va(submap_min, guard_size, true);
4203 }
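
/*
 * Minimal sketch of the proportional VA split performed above for each submap
 * in turn: every step takes numer/denom of the VA that is still unassigned,
 * then removes its share from both the size and the denominator, so weights
 * of 20/40/40 carve roughly 20%, 40% and 40% of the original range (before
 * guard pages and page truncation).  demo_carve() is a hypothetical stand-in,
 * never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>

static uint64_t
demo_carve(uint64_t *remaining_size, uint64_t *remaining_denom, uint64_t numer)
{
	uint64_t share = numer * *remaining_size / *remaining_denom;

	*remaining_size -= share;
	*remaining_denom -= numer;
	return share;
}
/*
 * With remaining_size = 100 and weights {20, 40, 40}:
 * 20 (20/100 of 100), then 40 (40/80 of 80), then 40 (40/40 of 40).
 */
#endif /* illustration only */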
4204
4205 /* Global initialization of Zone Allocator.
4206 * Runs after zone_bootstrap.
4207 */
4208 __startup_func
4209 static void
4210 zone_init(void)
4211 {
4212 vm_size_t zone_meta_size;
4213 vm_size_t zone_map_size;
4214 vm_size_t remaining_size;
4215 vm_offset_t submap_min = 0;
4216
4217 if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
4218 zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
4219 } else {
4220 zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
4221 }
4222 zone_phys_mapped_max = zone_phys_size_max();
4223
4224 #if __LP64__
4225 zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
4226 #else
4227 zone_map_size = zone_phys_mapped_max;
4228 #endif
4229 zone_meta_size = round_page(atop(zone_map_size) *
4230 sizeof(struct zone_page_metadata));
4231
4232 /*
4233 * Zone "map" setup:
4234 *
4235 * [ VA_RESTRICTED ] <-- LP64 only
4236 * [ SINGLE_GUARD ] <-- LP64 only
4237 * [ meta ]
4238 * [ SINGLE_GUARD ]
4239 * [ map<i> ] \ for each extra map
4240 * [ MULTI_GUARD ] /
4241 */
4242 remaining_size = zone_map_size;
4243 #if defined(__LP64__)
4244 remaining_size -= SINGLE_GUARD;
4245 #endif
4246 remaining_size -= zone_meta_size + SINGLE_GUARD;
4247 remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
4248 Z_SUBMAP_IDX_GENERAL_MAP + 1);
4249
4250 #if VM_MAX_TAG_ZONES
4251 if (zone_tagging_on) {
4252 zone_tagging_init(zone_map_size);
4253 }
4254 #endif
4255
4256 uint64_t remaining_denom = 0;
4257 uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
4258 #ifdef __LP64__
4259 [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
4260 #endif /* defined(__LP64__) */
4261 [Z_SUBMAP_IDX_GENERAL_MAP] = 40,
4262 [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP] = 40,
4263 };
4264
4265 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
4266 #if DEBUG || DEVELOPMENT
4267 char submap_name[MAX_SUBMAP_NAME];
4268 snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
4269 PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
4270 #endif
4271 remaining_denom += zone_sub_map_numer[idx];
4272 }
4273
4274 /*
4275 * And now allocate the various pieces of VA and submaps.
4276 *
4277 * Make a first allocation of contiguous VA that we'll deallocate,
4278 * then carve out memory in that range again linearly.
4279 * The kernel is still single threaded at this stage.
4280 */
4281
4282 struct zone_map_range *map_range = &zone_info.zi_map_range;
4283
4284 *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
4285 submap_min = map_range->min_address;
4286 kmem_free(kernel_map, submap_min, zone_map_size);
4287
4288 #if defined(__LP64__)
4289 /*
4290 * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
4291 * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
4292 */
4293 zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
4294 zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
4295 &remaining_size, SINGLE_GUARD);
4296 #endif /* defined(__LP64__) */
4297
4298 /*
4299 * Allocate metadata array
4300 */
4301 zone_info.zi_meta_range =
4302 zone_init_allocate_va(&submap_min, zone_meta_size, true);
4303 zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);
4304
4305 zone_info.zi_array_base =
4306 (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
4307 zone_pva_from_addr(map_range->min_address).packed_address;
4308
4309 /*
4310 * Allocate other submaps
4311 */
4312 for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
4313 zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
4314 &remaining_denom, &remaining_size, MULTI_GUARD);
4315 }
4316
4317 vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
4318 zone_info.zi_general_range.min_address = vm_map_min(general_map);
4319 zone_info.zi_general_range.max_address = vm_map_max(general_map);
4320
4321 assert(submap_min == map_range->max_address);
4322
4323 #if CONFIG_GZALLOC
4324 gzalloc_init(zone_map_size);
4325 #endif
4326
4327 zone_create_flags_t kma_flags = ZC_NOCACHING |
4328 ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
4329 ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
4330
4331 (void)zone_create_ext("vm.permanent", 1, kma_flags,
4332 ZONE_ID_PERMANENT, ^(zone_t z){
4333 z->permanent = true;
4334 z->z_elem_size = 1;
4335 z->pcpu_elem_size = 1;
4336 #if defined(__LP64__)
4337 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4338 #endif
4339 });
4340 (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
4341 ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
4342 z->permanent = true;
4343 z->z_elem_size = 1;
4344 z->pcpu_elem_size = zpercpu_count();
4345 #if defined(__LP64__)
4346 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4347 #endif
4348 });
4349
4350 /*
4351 * Now fix the zones that are missing their zone stats;
4352 * we don't really know if zfree()s happened, so our stats
4353 * are slightly off for early boot. ¯\_(ツ)_/¯
4354 */
4355 zone_index_foreach(idx) {
4356 zone_t tz = &zone_array[idx];
4357
4358 if (tz->z_self) {
4359 zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
4360
4361 zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
4362 (tz->countavail - tz->countfree) *
4363 zone_elem_size(tz);
4364 assert(tz->z_stats == NULL);
4365 tz->z_stats = zs;
4366 #if ZONE_ENABLE_LOGGING
4367 if (tz->zone_logging && !tz->zlog_btlog) {
4368 zone_enable_logging(tz);
4369 }
4370 #endif
4371 }
4372 }
4373
4374 #if CONFIG_ZLEAKS
4375 /*
4376 * Initialize the zone leak monitor
4377 */
4378 zleak_init(zone_map_size);
4379 #endif /* CONFIG_ZLEAKS */
4380
4381 #if VM_MAX_TAG_ZONES
4382 if (zone_tagging_on) {
4383 vm_allocation_zones_init();
4384 }
4385 #endif
4386 }
4387 STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
4388
4389 __startup_func
4390 static void
4391 zone_set_foreign_range(
4392 vm_offset_t range_min,
4393 vm_offset_t range_max)
4394 {
4395 zone_info.zi_foreign_range.min_address = range_min;
4396 zone_info.zi_foreign_range.max_address = range_max;
4397 }
4398
4399 __startup_func
4400 vm_offset_t
4401 zone_foreign_mem_init(vm_size_t size)
4402 {
4403 vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);
4404 zone_set_foreign_range(mem, mem + size);
4405 return mem;
4406 }
4407
4408 #pragma mark zalloc
4409
4410 #if KASAN_ZALLOC
4411 /*
4412 * Called from zfree() to add the element being freed to the KASan quarantine.
4413 *
4414 * Returns true if the newly-freed element made it into the quarantine without
4415 * displacing another, false otherwise. In the latter case, addrp points to the
4416 * address of the displaced element, which will be freed by the zone.
4417 */
4418 static bool
4419 kasan_quarantine_freed_element(
4420 zone_t *zonep, /* the zone the element is being freed to */
4421 void **addrp) /* address of the element being freed */
4422 {
4423 zone_t zone = *zonep;
4424 void *addr = *addrp;
4425
4426 /*
4427 * Resize back to the real allocation size and hand off to the KASan
4428 * quarantine. `addr` may then point to a different allocation, if the
4429 * current element replaced another in the quarantine. The zone then
4430 * takes ownership of the swapped out free element.
4431 */
4432 vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
4433 vm_size_t sz = usersz;
4434
4435 if (addr && zone->kasan_redzone) {
4436 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
4437 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
4438 assert(sz == zone_elem_size(zone));
4439 }
4440 if (addr && !zone->kasan_noquarantine) {
4441 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
4442 if (!addr) {
4443 return TRUE;
4444 }
4445 }
4446 if (addr && zone->kasan_noquarantine) {
4447 kasan_unpoison(addr, zone_elem_size(zone));
4448 }
4449 *addrp = addr;
4450 return FALSE;
4451 }
4452
4453 #endif /* KASAN_ZALLOC */
4454
4455 static inline bool
4456 zone_needs_async_refill(zone_t zone)
4457 {
4458 if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
4459 return false;
4460 }
4461
4462 return zone->expandable || zone->page_count < zone->page_count_max;
4463 }
4464
4465 __attribute__((noinline))
4466 static void
4467 zone_refill_synchronously_locked(
4468 zone_t zone,
4469 zalloc_flags_t flags)
4470 {
4471 thread_t thr = current_thread();
4472 bool set_expanding_vm_priv = false;
4473 zone_pva_t orig = zone->pages_intermediate;
4474
4475 while ((flags & Z_NOWAIT) == 0 && (zone->permanent
4476 ? zone_pva_is_equal(zone->pages_intermediate, orig)
4477 : zone->countfree == 0)) {
4478 /*
4479 * zone is empty, try to expand it
4480 *
4481 * Note that we now allow up to 2 threads (1 vm_privileged and
4482 * 1 non-vm_privileged) to expand the zone concurrently...
4483 *
4484 * this is necessary to avoid stalling vm_privileged threads
4485 * running critical code necessary to continue
4486 * compressing/swapping pages (i.e. making new free pages) from
4487 * stalling behind non-vm_privileged threads waiting to acquire
4488 * free pages when the vm_page_free_count is below the
4489 * vm_page_free_reserved limit.
4490 */
4491 if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
4492 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
4493 /*
4494 * This is a non-vm_privileged thread and a non-vm_privileged or
4495 * a vm_privileged thread is already expanding the zone...
4496 * OR
4497 * this is a vm_privileged thread and a vm_privileged thread is
4498 * already expanding the zone...
4499 *
4500 * In either case wait for a thread to finish, then try again.
4501 */
4502 zone->waiting = true;
4503 assert_wait(zone, THREAD_UNINT);
4504 unlock_zone(zone);
4505 thread_block(THREAD_CONTINUE_NULL);
4506 lock_zone(zone);
4507 continue;
4508 }
4509
4510 if (zone->page_count >= zone->page_count_max) {
4511 if (zone->exhaustible) {
4512 break;
4513 }
4514 if (zone->expandable) {
4515 /*
4516 * If we're expandable, just don't go through this again.
4517 */
4518 zone->page_count_max = ~0u;
4519 } else {
4520 unlock_zone(zone);
4521
4522 panic_include_zprint = true;
4523 #if CONFIG_ZLEAKS
4524 if (zleak_state & ZLEAK_STATE_ACTIVE) {
4525 panic_include_ztrace = true;
4526 }
4527 #endif /* CONFIG_ZLEAKS */
4528 panic("zalloc: zone \"%s\" empty.", zone->z_name);
4529 }
4530 }
4531
4532 /*
4533 * It is possible that a BG thread is refilling/expanding the zone
4534 * and gets pre-empted during that operation. That blocks all other
4535 * threads from making progress leading to a watchdog timeout. To
4536 * avoid that, boost the thread priority using the rwlock boost
4537 */
4538 set_thread_rwlock_boost();
4539
4540 if ((thr->options & TH_OPT_VMPRIV)) {
4541 zone->expanding_vm_priv = true;
4542 set_expanding_vm_priv = true;
4543 } else {
4544 zone->expanding_no_vm_priv = true;
4545 }
4546
4547 zone_replenish_locked(zone, flags, false);
4548
4549 if (set_expanding_vm_priv == true) {
4550 zone->expanding_vm_priv = false;
4551 } else {
4552 zone->expanding_no_vm_priv = false;
4553 }
4554
4555 if (zone->waiting) {
4556 zone->waiting = false;
4557 thread_wakeup(zone);
4558 }
4559 clear_thread_rwlock_boost();
4560
4561 if (zone->countfree == 0) {
4562 assert(flags & Z_NOPAGEWAIT);
4563 break;
4564 }
4565 }
4566
4567 if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
4568 zone_needs_async_refill(zone) && !vm_pool_low()) {
4569 zone->async_pending = true;
4570 unlock_zone(zone);
4571 thread_call_enter(&call_async_alloc);
4572 lock_zone(zone);
4573 assert(zone->z_self == zone);
4574 }
4575 }
4576
4577 __attribute__((noinline))
4578 static void
4579 zone_refill_asynchronously_locked(zone_t zone)
4580 {
4581 uint32_t min_free = zone->prio_refill_count / 2;
4582 uint32_t resv_free = zone->prio_refill_count / 4;
4583 thread_t thr = current_thread();
4584
4585 /*
4586 * Nothing to do if there are plenty of elements.
4587 */
4588 while (zone->countfree <= min_free) {
4589 /*
4590 * Wakeup the replenish thread if not running.
4591 */
4592 if (!zone->zone_replenishing) {
4593 lck_spin_lock(&zone_replenish_lock);
4594 assert(zone_replenish_active < zone_replenish_max_threads);
4595 ++zone_replenish_active;
4596 lck_spin_unlock(&zone_replenish_lock);
4597 zone->zone_replenishing = true;
4598 zone_replenish_wakeups_initiated++;
4599 thread_wakeup(&zone->prio_refill_count);
4600 }
4601
4602 /*
4603 * We'll let VM_PRIV threads continue to allocate until the
4604 * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
4605 * may continue.
4606 *
4607 * TH_OPT_ZONE_PRIV threads are the GC thread and the replenish threads themselves.
4608 * Replenish threads *need* to use the reserve. GC threads need to
4609 * get through the current allocation, but then will wait at a higher
4610 * level after they've dropped any locks which would deadlock the
4611 * replenish thread.
4612 */
4613 if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
4614 (thr->options & TH_OPT_ZONE_PRIV)) {
4615 break;
4616 }
4617
4618 /*
4619 * Wait for the replenish threads to add more elements for us to allocate from.
4620 */
4621 zone_replenish_throttle_count++;
4622 unlock_zone(zone);
4623 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
4624 thread_block(THREAD_CONTINUE_NULL);
4625 lock_zone(zone);
4626
4627 assert(zone->z_self == zone);
4628 }
4629
4630 /*
4631 * If we're here because of zone_gc(), we didn't wait for
4632 * zone_replenish_thread to finish. So we need to ensure that we
4633 * will successfully grab an element. This only applies to zones
4634 * that have a replenish thread configured.
4635 *
4636 * The value of (refill_level / 2) in the previous bit of code should have
4637 * given us headroom even though this thread didn't wait.
4638 */
4639 if (thr->options & TH_OPT_ZONE_PRIV) {
4640 assert(zone->countfree != 0);
4641 }
4642 }
4643
4644 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
4645 __attribute__((noinline))
4646 static void
4647 zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
4648 {
4649 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
4650 unsigned int numsaved = 0;
4651
4652 #if ZONE_ENABLE_LOGGING
4653 if (DO_LOGGING(zone)) {
4654 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4655 btlog_add_entry(zone->zlog_btlog, (void *)addr,
4656 ZOP_ALLOC, (void **)zbt, numsaved);
4657 }
4658 #endif
4659
4660 #if CONFIG_ZLEAKS
4661 /*
4662 * Zone leak detection: capture a backtrace every zleak_sample_factor
4663 * allocations in this zone.
4664 */
4665 if (__improbable(zone->zleak_on)) {
4666 if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
4667 /* Avoid backtracing twice if zone logging is on */
4668 if (numsaved == 0) {
4669 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4670 }
4671 /* Sampling can fail if another sample is happening at the same time in a different zone. */
4672 if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
4673 /* If it failed, roll back the counter so we sample the next allocation instead. */
4674 zone->zleak_capture = zleak_sample_factor;
4675 }
4676 }
4677 }
4678
4679 if (__improbable(zone_leaks_scan_enable &&
4680 !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
4681 unsigned int count, idx;
4682 /* Fill element, from tail, with backtrace in reverse order */
4683 if (numsaved == 0) {
4684 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4685 }
4686 count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
4687 if (count >= numsaved) {
4688 count = numsaved - 1;
4689 }
4690 for (idx = 0; idx < count; idx++) {
4691 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
4692 }
4693 }
4694 #endif /* CONFIG_ZLEAKS */
4695 }
4696
4697 static inline bool
4698 zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
4699 {
4700 #if ZONE_ENABLE_LOGGING
4701 if (DO_LOGGING(zone)) {
4702 return true;
4703 }
4704 #endif
4705 #if CONFIG_ZLEAKS
4706 /*
4707 * Zone leak detection: capture a backtrace every zleak_sample_factor
4708 * allocations in this zone.
4709 */
4710 if (zone->zleak_on) {
4711 return true;
4712 }
4713 if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
4714 return true;
4715 }
4716 #endif /* CONFIG_ZLEAKS */
4717 return false;
4718 }
4719 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
4720 #if ZONE_ENABLE_LOGGING
4721
4722 __attribute__((noinline))
4723 static void
4724 zfree_log_trace(zone_t zone, vm_offset_t addr)
4725 {
4726 /*
4727 * See if we're doing logging on this zone.
4728 *
4729 * There are two styles of logging used depending on
4730 * whether we're trying to catch a leak or corruption.
4731 */
4732 if (__improbable(DO_LOGGING(zone))) {
4733 if (corruption_debug_flag) {
4734 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4735 unsigned int numsaved;
4736 /*
4737 * We're logging to catch a corruption.
4738 *
4739 * Add a record of this zfree operation to log.
4740 */
4741 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4742 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
4743 (void **)zbt, numsaved);
4744 } else {
4745 /*
4746 * We're logging to catch a leak.
4747 *
4748 * Remove any record we might have for this element
4749 * since it's being freed. Note that we may not find it
4750 * if the buffer overflowed and that's OK.
4751 *
4752 * Since the log is of a limited size, old records get
4753 * overwritten if there are more zallocs than zfrees.
4754 */
4755 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
4756 }
4757 }
4758 }
4759 #endif /* ZONE_ENABLE_LOGGING */
4760
4761 /*
4762 * Removes an element from the zone's free list; the caller must ensure the zone has free elements.
4763 * Verifies that the next-pointer and backup next-pointer are intact,
4764 * and verifies that a poisoned element hasn't been modified.
4765 */
4766 vm_offset_t
4767 zalloc_direct_locked(
4768 zone_t zone,
4769 zalloc_flags_t flags __unused,
4770 vm_size_t waste __unused)
4771 {
4772 struct zone_page_metadata *page_meta;
4773 zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
4774 vm_offset_t element, page, validate_bit = 0;
4775
4776 /* if zone is empty, bail */
4777 if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
4778 kind = ZONE_ADDR_FOREIGN;
4779 page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
4780 page = (vm_offset_t)page_meta;
4781 } else if (!zone_pva_is_null(zone->pages_intermediate)) {
4782 page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
4783 page = zone_pva_to_addr(zone->pages_intermediate);
4784 } else if (!zone_pva_is_null(zone->pages_all_free)) {
4785 page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
4786 page = zone_pva_to_addr(zone->pages_all_free);
4787 if (os_sub_overflow(zone->allfree_page_count,
4788 page_meta->zm_page_count, &zone->allfree_page_count)) {
4789 zone_accounting_panic(zone, "allfree_page_count wrap-around");
4790 }
4791 } else {
4792 zone_accounting_panic(zone, "countfree corruption");
4793 }
4794
4795 if (!zone_has_index(zone, page_meta->zm_index)) {
4796 zone_page_metadata_index_confusion_panic(zone, page, page_meta);
4797 }
4798
4799 element = zone_page_meta_get_freelist(zone, page_meta, page);
4800
4801 vm_offset_t *primary = (vm_offset_t *) element;
4802 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
4803
4804 /*
4805 * since the primary next pointer is xor'ed with zp_nopoison_cookie
4806 * for obfuscation, retrieve the original value back
4807 */
4808 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
4809 vm_offset_t next_element_primary = *primary;
4810 vm_offset_t next_element_backup = *backup;
4811
4812 /*
4813 * backup_ptr_mismatch_panic will determine what next_element
4814 * should have been, and print it appropriately
4815 */
4816 if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
4817 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4818 }
4819
4820 /* Check the backup pointer for the regular cookie */
4821 if (__improbable(next_element_primary != next_element_backup)) {
4822 /* Check for the poisoned cookie instead */
4823 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
4824 /* Neither cookie is valid, corruption has occurred */
4825 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4826 }
4827
4828 /*
4829 * Element was marked as poisoned, so check its integrity before using it.
4830 */
4831 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4832 } else if (zone->zfree_clear_mem) {
4833 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4834 }
4835
4836 /* Remove this element from the free list */
4837 zone_page_meta_set_freelist(page_meta, page, next_element);
4838
4839 if (kind == ZONE_ADDR_FOREIGN) {
4840 if (next_element == 0) {
4841 /* last foreign element allocated on page, move to all_used_foreign */
4842 zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
4843 }
4844 } else if (next_element == 0) {
4845 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
4846 } else if (page_meta->zm_alloc_count == 0) {
4847 /* remove from free, move to intermediate */
4848 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
4849 }
4850
4851 if (os_add_overflow(page_meta->zm_alloc_count, 1,
4852 &page_meta->zm_alloc_count)) {
4853 /*
4854 * This will not catch a lot of errors; the proper check
4855 * would be against the number of elements this run should
4856 * have, which is expensive to count.
4857 *
4858 * But zm_alloc_count is a 16 bit number which an attacker
4859 * could theoretically find valuable to wrap around,
4860 * so catch this.
4861 */
4862 zone_page_meta_accounting_panic(zone, page_meta,
4863 "zm_alloc_count overflow");
4864 }
4865 if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
4866 zone_accounting_panic(zone, "countfree wrap-around");
4867 }
4868
4869 #if VM_MAX_TAG_ZONES
4870 if (__improbable(zone->tags)) {
4871 vm_tag_t tag = zalloc_flags_get_tag(flags);
4872 // set the tag with b0 clear so the block remains inuse
4873 ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
4874 vm_tag_update_zone_size(tag, zone->tag_zone_index,
4875 zone_elem_size(zone), waste);
4876 }
4877 #endif /* VM_MAX_TAG_ZONES */
4878 #if KASAN_ZALLOC
4879 if (zone->percpu) {
4880 zpercpu_foreach_cpu(i) {
4881 kasan_poison_range(element + ptoa(i),
4882 zone_elem_size(zone), ASAN_VALID);
4883 }
4884 } else {
4885 kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
4886 }
4887 #endif
4888
4889 return element | validate_bit;
4890 }
4891
4892 /*
4893 * zalloc returns an element from the specified zone.
4894 */
4895 void *
4896 zalloc_ext(
4897 zone_t zone,
4898 zone_stats_t zstats,
4899 zalloc_flags_t flags,
4900 vm_size_t waste)
4901 {
4902 vm_offset_t addr = 0;
4903 vm_size_t elem_size = zone_elem_size(zone);
4904
4905 /*
4906 * KASan uses zalloc() for fakestack, which can be called anywhere.
4907 * However, we make sure these calls can never block.
4908 */
4909 assert(zone->kasan_fakestacks ||
4910 ml_get_interrupts_enabled() ||
4911 ml_is_quiescing() ||
4912 debug_mode_active() ||
4913 startup_phase < STARTUP_SUB_EARLY_BOOT);
4914
4915 /*
4916 * Make sure Z_NOFAIL was not obviously misused
4917 */
4918 if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
4919 assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
4920 }
4921
4922 #if CONFIG_ZCACHE
4923 /*
4924 * Note: if zone caching is on, gzalloc and tags aren't used
4925 * so we can always check this first
4926 */
4927 if (zone_caching_enabled(zone)) {
4928 addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
4929 if (__probable(addr)) {
4930 goto allocated_from_cache;
4931 }
4932 }
4933 #endif /* CONFIG_ZCACHE */
4934
4935 #if CONFIG_GZALLOC
4936 if (__improbable(zone->gzalloc_tracked)) {
4937 addr = gzalloc_alloc(zone, zstats, flags);
4938 goto allocated_from_gzalloc;
4939 }
4940 #endif /* CONFIG_GZALLOC */
4941 #if VM_MAX_TAG_ZONES
4942 if (__improbable(zone->tags)) {
4943 vm_tag_t tag = zalloc_flags_get_tag(flags);
4944 if (tag == VM_KERN_MEMORY_NONE) {
4945 /*
4946 * zone views into heaps can lead to a site-less call
4947 * and we fall back to KALLOC as a tag for those.
4948 */
4949 tag = VM_KERN_MEMORY_KALLOC;
4950 flags |= Z_VM_TAG(tag);
4951 }
4952 vm_tag_will_update_zone(tag, zone->tag_zone_index);
4953 }
4954 #endif /* VM_MAX_TAG_ZONES */
4955
4956 lock_zone(zone);
4957 assert(zone->z_self == zone);
4958
4959 /*
4960 * Check if we need another thread to replenish the zone or
4961 * if we have to wait for a replenish thread to finish.
4962 * This is used for elements, like vm_map_entry, which are
4963 * needed themselves to implement zalloc().
4964 */
4965 if (__improbable(zone->prio_refill_count &&
4966 zone->countfree <= zone->prio_refill_count / 2)) {
4967 zone_refill_asynchronously_locked(zone);
4968 } else if (__improbable(zone->countfree == 0)) {
4969 zone_refill_synchronously_locked(zone, flags);
4970 if (__improbable(zone->countfree == 0)) {
4971 unlock_zone(zone);
4972 if (__improbable(flags & Z_NOFAIL)) {
4973 zone_nofail_panic(zone);
4974 }
4975 goto out_nomem;
4976 }
4977 }
4978
4979 addr = zalloc_direct_locked(zone, flags, waste);
4980 if (__probable(zstats != NULL)) {
4981 /*
4982 * The few vm zones used before zone_init() runs do not have
4983 * per-cpu stats yet
4984 */
4985 int cpu = cpu_number();
4986 zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
4987 #if ZALLOC_DETAILED_STATS
4988 if (waste) {
4989 zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
4990 }
4991 #endif /* ZALLOC_DETAILED_STATS */
4992 }
4993
4994 unlock_zone(zone);
4995
4996 #if ZALLOC_ENABLE_POISONING
4997 bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
4998 #endif
4999 addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
5000 zone_clear_freelist_pointers(zone, addr);
5001 #if ZALLOC_ENABLE_POISONING
5002 /*
5003 * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
5004 * so we will check the first word even if we just
5005 * cleared it.
5006 */
5007 zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
5008 validate);
5009 #endif /* ZALLOC_ENABLE_POISONING */
5010
5011 allocated_from_cache:
5012 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
5013 if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
5014 zalloc_log_or_trace_leaks(zone, addr);
5015 }
5016 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
5017
5018 #if CONFIG_GZALLOC
5019 allocated_from_gzalloc:
5020 #endif
5021 #if KASAN_ZALLOC
5022 if (zone->kasan_redzone) {
5023 addr = kasan_alloc(addr, elem_size,
5024 elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
5025 elem_size -= 2 * zone->kasan_redzone;
5026 }
5027 /*
5028 * Initialize buffer with unique pattern only if memory
5029 * wasn't expected to be zeroed.
5030 */
5031 if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
5032 kasan_leak_init(addr, elem_size);
5033 }
5034 #endif /* KASAN_ZALLOC */
5035 if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
5036 bzero((void *)addr, elem_size);
5037 }
5038
5039 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
5040
5041 out_nomem:
5042 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5043 return (void *)addr;
5044 }
5045
5046 void *
5047 zalloc(union zone_or_view zov)
5048 {
5049 return zalloc_flags(zov, Z_WAITOK);
5050 }
5051
5052 void *
5053 zalloc_noblock(union zone_or_view zov)
5054 {
5055 return zalloc_flags(zov, Z_NOWAIT);
5056 }
5057
5058 void *
5059 zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
5060 {
5061 zone_t zone = zov.zov_view->zv_zone;
5062 zone_stats_t zstats = zov.zov_view->zv_stats;
5063 assert(!zone->percpu);
5064 return zalloc_ext(zone, zstats, flags, 0);
5065 }
5066
5067 void *
5068 zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
5069 {
5070 zone_t zone = zov.zov_view->zv_zone;
5071 zone_stats_t zstats = zov.zov_view->zv_stats;
5072 assert(zone->percpu);
5073 return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
5074 }
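
/*
 * Hedged usage sketch of the allocation entry points above, assuming the
 * zone_create()/zalloc_flags()/zfree() declarations from kern/zalloc.h at
 * this revision.  "demo_widget" and its zone are hypothetical; this only
 * illustrates the calling convention and is never compiled.
 */
#if 0 /* illustration only */
struct demo_widget {
	uint64_t dw_id;
	void    *dw_payload;
};

static zone_t demo_widget_zone;

static void
demo_widget_zone_init(void)
{
	demo_widget_zone = zone_create("demo.widget",
	    sizeof(struct demo_widget), ZC_ZFREE_CLEARMEM);
}

static struct demo_widget *
demo_widget_alloc(void)
{
	/* Z_WAITOK may block; Z_ZERO returns zeroed memory */
	return zalloc_flags(demo_widget_zone, Z_WAITOK | Z_ZERO);
}

static void
demo_widget_free(struct demo_widget *dw)
{
	zfree(demo_widget_zone, dw);
}
#endif /* illustration only */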
5075
5076 static void *
5077 _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
5078 {
5079 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5080 struct zone_page_metadata *page_meta;
5081 vm_offset_t offs, addr;
5082 zone_pva_t pva;
5083
5084 assert(ml_get_interrupts_enabled() ||
5085 ml_is_quiescing() ||
5086 debug_mode_active() ||
5087 startup_phase < STARTUP_SUB_EARLY_BOOT);
5088
5089 size = (size + mask) & ~mask;
5090 assert(size <= PAGE_SIZE);
5091
5092 lock_zone(zone);
5093 assert(zone->z_self == zone);
5094
5095 for (;;) {
5096 pva = zone->pages_intermediate;
5097 while (!zone_pva_is_null(pva)) {
5098 page_meta = zone_pva_to_meta(pva, kind);
5099 if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
5100 goto found;
5101 }
5102 pva = page_meta->zm_page_next;
5103 }
5104
5105 zone_refill_synchronously_locked(zone, Z_WAITOK);
5106 }
5107
5108 found:
5109 offs = (page_meta->zm_freelist_offs + mask) & ~mask;
5110 page_meta->zm_freelist_offs = offs + size;
5111 page_meta->zm_alloc_count += size;
5112 zone->countfree -= size;
5113 if (__probable(zone->z_stats)) {
5114 zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
5115 }
5116
5117 if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
5118 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
5119 }
5120
5121 unlock_zone(zone);
5122
5123 addr = offs + zone_pva_to_addr(pva);
5124
5125 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5126 return (void *)addr;
5127 }
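
/*
 * Small sketch of the alignment step used by _zalloc_permanent() above: the
 * caller's mask is (alignment - 1), so rounding the current freelist offset
 * up to the next aligned boundary is a single add-and-mask.  demo_align_up()
 * is hypothetical and never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>

static uint64_t
demo_align_up(uint64_t offs, uint64_t mask)
{
	return (offs + mask) & ~mask;   /* e.g. demo_align_up(19, 15) == 32 */
}
#endif /* illustration only */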
5128
5129 static void *
5130 _zalloc_permanent_large(size_t size, vm_offset_t mask)
5131 {
5132 kern_return_t kr;
5133 vm_offset_t addr;
5134
5135 kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
5136 KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
5137 VM_KERN_MEMORY_KALLOC);
5138 if (kr != 0) {
5139 panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
5140 size, kr);
5141 }
5142 return (void *)addr;
5143 }
5144
5145 void *
5146 zalloc_permanent(vm_size_t size, vm_offset_t mask)
5147 {
5148 if (size <= PAGE_SIZE) {
5149 zone_t zone = &zone_array[ZONE_ID_PERMANENT];
5150 return _zalloc_permanent(zone, size, mask);
5151 }
5152 return _zalloc_permanent_large(size, mask);
5153 }
5154
5155 void *
5156 zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
5157 {
5158 zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
5159 return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
5160 }
5161
5162 void
5163 zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
5164 {
5165 zone_index_foreach(i) {
5166 zone_t z = &zone_array[i];
5167
5168 if (z->no_callout) {
5169 /* async_pending will never be set */
5170 continue;
5171 }
5172
5173 lock_zone(z);
5174 if (z->z_self && z->async_pending) {
5175 z->async_pending = false;
5176 zone_refill_synchronously_locked(z, Z_WAITOK);
5177 }
5178 unlock_zone(z);
5179 }
5180 }
5181
5182 /*
5183 * Adds the element to the head of the zone's free list
5184 * Keeps a backup next-pointer at the end of the element
5185 */
5186 void
5187 zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
5188 {
5189 struct zone_page_metadata *page_meta;
5190 vm_offset_t page, old_head;
5191 zone_addr_kind_t kind;
5192 vm_size_t elem_size = zone_elem_size(zone);
5193
5194 vm_offset_t *primary = (vm_offset_t *) element;
5195 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
5196
5197 page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
5198 old_head = zone_page_meta_get_freelist(zone, page_meta, page);
5199
5200 if (__improbable(old_head == element)) {
5201 panic("zfree: double free of %p to zone %s%s\n",
5202 (void *) element, zone_heap_name(zone), zone->z_name);
5203 }
5204
5205 #if ZALLOC_ENABLE_POISONING
5206 if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
5207 assert(zone->percpu);
5208 poison = false;
5209 }
5210 #else
5211 poison = false;
5212 #endif
5213
5214 /*
5215 * Always write a redundant next pointer.
5216 * So that it is more difficult to forge, xor it with a random cookie.
5217 * A poisoned element is indicated by using zp_poisoned_cookie
5218 * instead of zp_nopoison_cookie.
5219 */
5220
5221 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
5222
5223 /*
5224 * Insert this element at the head of the free list. We also xor the
5225 * primary pointer with the zp_nopoison_cookie to make sure a free
5226 * element does not provide the location of the next free element directly.
5227 */
5228 *primary = old_head ^ zp_nopoison_cookie;
5229
5230 #if VM_MAX_TAG_ZONES
5231 if (__improbable(zone->tags)) {
5232 vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
5233 // set the tag with b0 clear so the block remains inuse
5234 ZTAG(zone, element)[0] = 0xFFFE;
5235 vm_tag_update_zone_size(tag, zone->tag_zone_index,
5236 -((int64_t)elem_size), 0);
5237 }
5238 #endif /* VM_MAX_TAG_ZONES */
5239
5240 zone_page_meta_set_freelist(page_meta, page, element);
5241 if (os_sub_overflow(page_meta->zm_alloc_count, 1,
5242 &page_meta->zm_alloc_count)) {
5243 zone_page_meta_accounting_panic(zone, page_meta,
5244 "alloc_count wrap-around");
5245 }
5246 zone->countfree++;
5247
5248 if (kind == ZONE_ADDR_FOREIGN) {
5249 if (old_head == 0) {
5250 /* first foreign element freed on page, move from all_used_foreign */
5251 zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
5252 }
5253 } else if (page_meta->zm_alloc_count == 0) {
5254 /* whether the page was on the intermediate or all_used queue, move it to free */
5255 zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
5256 zone->allfree_page_count += page_meta->zm_page_count;
5257 } else if (old_head == 0) {
5258 /* first free element on page, move from all_used */
5259 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
5260 }
5261
5262 #if KASAN_ZALLOC
5263 if (zone->percpu) {
5264 zpercpu_foreach_cpu(i) {
5265 kasan_poison_range(element + ptoa(i), elem_size,
5266 ASAN_HEAP_FREED);
5267 }
5268 } else {
5269 kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
5270 }
5271 #endif
5272 }
5273
5274 void
5275 zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
5276 {
5277 vm_offset_t elem = (vm_offset_t)addr;
5278 vm_size_t elem_size = zone_elem_size(zone);
5279 bool poison = false;
5280
5281 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
5282 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
5283
5284 #if KASAN_ZALLOC
5285 if (kasan_quarantine_freed_element(&zone, &addr)) {
5286 return;
5287 }
5288 /*
5289 * kasan_quarantine_freed_element() might return a different
5290 * {zone, addr} than the one being freed for kalloc heaps.
5291 *
5292 * Make sure we reload everything.
5293 */
5294 elem = (vm_offset_t)addr;
5295 elem_size = zone_elem_size(zone);
5296 #endif
5297
5298 #if CONFIG_ZLEAKS
5299 /*
5300 * Zone leak detection: un-track the allocation
5301 */
5302 if (__improbable(zone->zleak_on)) {
5303 zleak_free(elem, elem_size);
5304 }
5305 #endif /* CONFIG_ZLEAKS */
5306
5307 #if CONFIG_ZCACHE
5308 /*
5309 * Note: if zone caching is on, gzalloc and tags aren't used
5310 * so we can always check this first
5311 */
5312 if (zone_caching_enabled(zone)) {
5313 return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
5314 }
5315 #endif /* CONFIG_ZCACHE */
5316
5317 #if CONFIG_GZALLOC
5318 if (__improbable(zone->gzalloc_tracked)) {
5319 return gzalloc_free(zone, zstats, addr);
5320 }
5321 #endif /* CONFIG_GZALLOC */
5322
5323 #if ZONE_ENABLE_LOGGING
5324 if (__improbable(DO_LOGGING(zone))) {
5325 zfree_log_trace(zone, elem);
5326 }
5327 #endif /* ZONE_ENABLE_LOGGING */
5328
5329 if (zone->zfree_clear_mem) {
5330 poison = zfree_clear(zone, elem, elem_size);
5331 }
5332
5333 lock_zone(zone);
5334 assert(zone->z_self == zone);
5335
5336 if (!poison) {
5337 poison = zfree_poison_element(zone, &zone->zp_count, elem);
5338 }
5339
5340 if (__probable(zstats != NULL)) {
5341 /*
5342 * The few vm zones used before zone_init() runs do not have
5343 * per-cpu stats yet
5344 */
5345 zpercpu_get(zstats)->zs_mem_freed += elem_size;
5346 }
5347
5348 zfree_direct_locked(zone, elem, poison);
5349
5350 unlock_zone(zone);
5351 }
5352
5353 void
5354 (zfree)(union zone_or_view zov, void *addr)
5355 {
5356 zone_t zone = zov.zov_view->zv_zone;
5357 zone_stats_t zstats = zov.zov_view->zv_stats;
5358 assert(!zone->percpu);
5359 zfree_ext(zone, zstats, addr);
5360 }
5361
5362 void
5363 zfree_percpu(union zone_or_view zov, void *addr)
5364 {
5365 zone_t zone = zov.zov_view->zv_zone;
5366 zone_stats_t zstats = zov.zov_view->zv_stats;
5367 assert(zone->percpu);
5368 zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
5369 }
5370
5371 #pragma mark vm integration, MIG routines
5372
5373 /*
5374 * Drops (i.e. frees) the pages on a zone's all-free page queue.
5375 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
5376 */
5377 static void
5378 zone_drop_free_elements(zone_t z)
5379 {
5380 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5381 unsigned int total_freed_pages = 0;
5382 struct zone_page_metadata *page_meta, *seq_meta;
5383 vm_address_t page_addr;
5384 vm_size_t size_to_free;
5385 vm_size_t free_count;
5386 uint32_t page_count;
5387
5388 current_thread()->options |= TH_OPT_ZONE_PRIV;
5389 lock_zone(z);
5390
5391 while (!zone_pva_is_null(z->pages_all_free)) {
5392 /*
5393 * If any replenishment threads are running, defer to them,
5394 * so that we don't deplete reserved zones.
5395 *
5396 * The timing of the check isn't super important, as there are
5397 * enough reserves to allow freeing an extra page_meta.
5398 *
5399 * Hence, we can check without grabbing the lock every time
5400 * through the loop. We do need the lock however to avoid
5401 * missing a wakeup when we decide to block.
5402 */
5403 if (zone_replenish_active > 0) {
5404 lck_spin_lock(&zone_replenish_lock);
5405 if (zone_replenish_active > 0) {
5406 assert_wait(&zone_replenish_active, THREAD_UNINT);
5407 lck_spin_unlock(&zone_replenish_lock);
5408 unlock_zone(z);
5409 thread_block(THREAD_CONTINUE_NULL);
5410 lock_zone(z);
5411 continue;
5412 }
5413 lck_spin_unlock(&zone_replenish_lock);
5414 }
5415
5416 page_meta = zone_pva_to_meta(z->pages_all_free, kind);
5417 page_count = page_meta->zm_page_count;
5418 free_count = zone_elem_count(z, ptoa(page_count), kind);
5419
5420 /*
5421 * Don't drain zones with async refill to below the refill
5422 * threshold, as they need some reserve to function properly.
5423 */
5424 if (!z->destroyed && z->prio_refill_count &&
5425 (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
5426 break;
5427 }
5428
5429 zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);
5430
5431 if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
5432 zone_accounting_panic(z, "countfree wrap-around");
5433 }
5434 if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
5435 zone_accounting_panic(z, "countavail wrap-around");
5436 }
5437 if (os_sub_overflow(z->allfree_page_count, page_count,
5438 &z->allfree_page_count)) {
5439 zone_accounting_panic(z, "allfree_page_count wrap-around");
5440 }
5441 if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
5442 zone_accounting_panic(z, "page_count wrap-around");
5443 }
5444
5445 os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
5446 os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
5447
5448 bzero(page_meta, sizeof(*page_meta) * page_count);
5449 seq_meta = page_meta;
5450 page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */
5451
5452 unlock_zone(z);
5453
5454 /* Free the pages this metadata describes and account for them */
5455 total_freed_pages += page_count;
5456 size_to_free = ptoa(page_count);
5457 #if KASAN_ZALLOC
5458 kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
5459 #endif
5460 #if VM_MAX_TAG_ZONES
5461 if (z->tags) {
5462 ztMemoryRemove(z, page_addr, size_to_free);
5463 }
5464 #endif /* VM_MAX_TAG_ZONES */
5465
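/*
 * If the zone sequesters VA and this run of pages is exactly one
 * allocation chunk, only the physical pages are released and the now
 * empty virtual range is kept, to be re-queued on pages_sequester once
 * the zone lock is retaken. Otherwise the whole mapping is freed and
 * seq_meta is cleared so no metadata gets re-queued.
 */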
5466 if (z->va_sequester && z->alloc_pages == page_count) {
5467 kernel_memory_depopulate(submap_for_zone(z), page_addr,
5468 size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
5469 } else {
5470 kmem_free(submap_for_zone(z), page_addr, size_to_free);
5471 seq_meta = NULL;
5472 }
5473 thread_yield_to_preemption();
5474
5475 lock_zone(z);
5476
5477 if (seq_meta) {
5478 zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
5479 z->sequester_page_count += page_count;
5480 }
5481 }
5482 if (z->destroyed) {
5483 assert(zone_pva_is_null(z->pages_all_free));
5484 assert(z->allfree_page_count == 0);
5485 }
5486 unlock_zone(z);
5487 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
5488
5489 #if DEBUG || DEVELOPMENT
5490 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5491 kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
5492 zone_heap_name(z), z->z_name,
5493 (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
5494 total_freed_pages);
5495 }
5496 #endif /* DEBUG || DEVELOPMENT */
5497 }
5498
5499 /*
5500 * Zone garbage collection
5501 *
5502 * zone_gc() walks the free elements of every zone marked collectable,
5503 * looking for reclaimable pages. It is called by consider_zone_gc()
5504 * when the system begins to run out of memory.
5505 *
5506 * We should ensure that zone_gc() never blocks.
5507 */
5508 void
5509 zone_gc(boolean_t consider_jetsams)
5510 {
5511 if (consider_jetsams) {
5512 kill_process_in_largest_zone();
5513 /*
5514 * If we do end up jetsamming something, we need to do a zone_gc so that
5515 * we can reclaim free zone elements and update the zone map size.
5516 * Fall through.
5517 */
5518 }
5519
5520 lck_mtx_lock(&zone_gc_lock);
5521
5522 #if DEBUG || DEVELOPMENT
5523 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5524 kprintf("zone_gc() starting...\n");
5525 }
5526 #endif /* DEBUG || DEVELOPMENT */
5527
5528 zone_index_foreach(i) {
5529 zone_t z = &zone_array[i];
5530
5531 if (!z->collectable) {
5532 continue;
5533 }
5534 #if CONFIG_ZCACHE
5535 if (zone_caching_enabled(z)) {
5536 zcache_drain_depot(z);
5537 }
5538 #endif /* CONFIG_ZCACHE */
5539 if (zone_pva_is_null(z->pages_all_free)) {
5540 continue;
5541 }
5542
5543 zone_drop_free_elements(z);
5544 }
5545
5546 lck_mtx_unlock(&zone_gc_lock);
5547 }
5548
5549 /*
5550 * consider_zone_gc:
5551 *
5552 * Called by the pageout daemon when the system needs more free pages.
5553 */
5554
5555 void
5556 consider_zone_gc(boolean_t consider_jetsams)
5557 {
5558 /*
5559 * One-time reclaim of kernel_map resources we allocated in
5560 * early boot.
5561 *
5562 * Use atomic exchange in case multiple threads race into here.
5563 */
5564 vm_offset_t deallocate_kaddr;
5565 if (kmapoff_kaddr != 0 &&
5566 (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
5567 vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
5568 }
5569
5570 zone_gc(consider_jetsams);
5571 }
5572
5573 /*
5574 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
5575 * requesting zone information.
5576 * Frees unused pages towards the end of the region, and zeroes out unused
5577 * space on the last page.
5578 */
5579 static vm_map_copy_t
5580 create_vm_map_copy(
5581 vm_offset_t start_addr,
5582 vm_size_t total_size,
5583 vm_size_t used_size)
5584 {
5585 kern_return_t kr;
5586 vm_offset_t end_addr;
5587 vm_size_t free_size;
5588 vm_map_copy_t copy;
5589
5590 if (used_size != total_size) {
5591 end_addr = start_addr + used_size;
5592 free_size = total_size - (round_page(end_addr) - start_addr);
5593
5594 if (free_size >= PAGE_SIZE) {
5595 kmem_free(ipc_kernel_map,
5596 round_page(end_addr), free_size);
5597 }
5598 bzero((char *) end_addr, round_page(end_addr) - end_addr);
5599 }
5600
5601 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
5602 (vm_map_size_t)used_size, TRUE, &copy);
5603 assert(kr == KERN_SUCCESS);
5604
5605 return copy;
5606 }
5607
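/*
 * get_zone_info: snapshot a zone under its lock and fill in the MIG
 * name and/or info structure for it. Returns FALSE if the zone is not
 * (or is no longer) initialized.
 */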
5608 static boolean_t
5609 get_zone_info(
5610 zone_t z,
5611 mach_zone_name_t *zn,
5612 mach_zone_info_t *zi)
5613 {
5614 struct zone zcopy;
5615
5616 assert(z != ZONE_NULL);
5617 lock_zone(z);
5618 if (!z->z_self) {
5619 unlock_zone(z);
5620 return FALSE;
5621 }
5622 zcopy = *z;
5623 unlock_zone(z);
5624
5625 if (zn != NULL) {
5626 /*
5627 * Append kalloc heap name to zone name (if zone is used by kalloc)
5628 */
5629 char temp_zone_name[MAX_ZONE_NAME] = "";
5630 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5631 zone_heap_name(z), z->z_name);
5632
5633 /* assuming here the name data is static */
5634 (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
5635 strlen(temp_zone_name) + 1);
5636 }
5637
5638 if (zi != NULL) {
5639 *zi = (mach_zone_info_t) {
5640 .mzi_count = zone_count_allocated(&zcopy),
5641 .mzi_cur_size = ptoa_64(zcopy.page_count),
5642 // max_size for zprint is now high-watermark of pages used
5643 .mzi_max_size = ptoa_64(zcopy.page_count_hwm),
5644 .mzi_elem_size = zcopy.pcpu_elem_size,
5645 .mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
5646 .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
5647 };
5648 zpercpu_foreach(zs, zcopy.z_stats) {
5649 zi->mzi_sum_size += zs->zs_mem_allocated;
5650 }
5651 if (zcopy.collectable) {
5652 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
5653 ptoa_64(zcopy.allfree_page_count));
5654 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
5655 }
5656 }
5657
5658 return TRUE;
5659 }
5660
5661 kern_return_t
5662 task_zone_info(
5663 __unused task_t task,
5664 __unused mach_zone_name_array_t *namesp,
5665 __unused mach_msg_type_number_t *namesCntp,
5666 __unused task_zone_info_array_t *infop,
5667 __unused mach_msg_type_number_t *infoCntp)
5668 {
5669 return KERN_FAILURE;
5670 }
5671
5672 kern_return_t
5673 mach_zone_info(
5674 host_priv_t host,
5675 mach_zone_name_array_t *namesp,
5676 mach_msg_type_number_t *namesCntp,
5677 mach_zone_info_array_t *infop,
5678 mach_msg_type_number_t *infoCntp)
5679 {
5680 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
5681 }
5682
5683
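/*
 * mach_memory_info: copy out one mach_zone_name_t / mach_zone_info_t pair
 * per initialized zone and, when the caller asks for it, a vm_page_diagnose()
 * report, all returned as out-of-line vm_map_copy_t descriptors built from
 * pageable ipc_kernel_map buffers.
 */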
5684 kern_return_t
5685 mach_memory_info(
5686 host_priv_t host,
5687 mach_zone_name_array_t *namesp,
5688 mach_msg_type_number_t *namesCntp,
5689 mach_zone_info_array_t *infop,
5690 mach_msg_type_number_t *infoCntp,
5691 mach_memory_info_array_t *memoryInfop,
5692 mach_msg_type_number_t *memoryInfoCntp)
5693 {
5694 mach_zone_name_t *names;
5695 vm_offset_t names_addr;
5696 vm_size_t names_size;
5697
5698 mach_zone_info_t *info;
5699 vm_offset_t info_addr;
5700 vm_size_t info_size;
5701
5702 mach_memory_info_t *memory_info;
5703 vm_offset_t memory_info_addr;
5704 vm_size_t memory_info_size;
5705 vm_size_t memory_info_vmsize;
5706 unsigned int num_info;
5707
5708 unsigned int max_zones, used_zones, i;
5709 mach_zone_name_t *zn;
5710 mach_zone_info_t *zi;
5711 kern_return_t kr;
5712
5713 uint64_t zones_collectable_bytes = 0;
5714
5715 if (host == HOST_NULL) {
5716 return KERN_INVALID_HOST;
5717 }
5718 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5719 if (!PE_i_can_has_debugger(NULL)) {
5720 return KERN_INVALID_HOST;
5721 }
5722 #endif
5723
5724 /*
5725 * We assume that zones aren't freed once allocated.
5726 * We won't pick up any zones that are allocated later.
5727 */
5728
5729 max_zones = os_atomic_load(&num_zones, relaxed);
5730
5731 names_size = round_page(max_zones * sizeof *names);
5732 kr = kmem_alloc_pageable(ipc_kernel_map,
5733 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5734 if (kr != KERN_SUCCESS) {
5735 return kr;
5736 }
5737 names = (mach_zone_name_t *) names_addr;
5738
5739 info_size = round_page(max_zones * sizeof *info);
5740 kr = kmem_alloc_pageable(ipc_kernel_map,
5741 &info_addr, info_size, VM_KERN_MEMORY_IPC);
5742 if (kr != KERN_SUCCESS) {
5743 kmem_free(ipc_kernel_map,
5744 names_addr, names_size);
5745 return kr;
5746 }
5747 info = (mach_zone_info_t *) info_addr;
5748
5749 zn = &names[0];
5750 zi = &info[0];
5751
5752 used_zones = max_zones;
5753 for (i = 0; i < max_zones; i++) {
5754 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
5755 used_zones--;
5756 continue;
5757 }
5758 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
5759 zn++;
5760 zi++;
5761 }
5762
5763 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
5764 *namesCntp = used_zones;
5765
5766 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
5767 *infoCntp = used_zones;
5768
5769 num_info = 0;
5770 memory_info_addr = 0;
5771
5772 if (memoryInfop && memoryInfoCntp) {
5773 vm_map_copy_t copy;
5774 num_info = vm_page_diagnose_estimate();
5775 memory_info_size = num_info * sizeof(*memory_info);
5776 memory_info_vmsize = round_page(memory_info_size);
5777 kr = kmem_alloc_pageable(ipc_kernel_map,
5778 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
5779 if (kr != KERN_SUCCESS) {
5780 return kr;
5781 }
5782
5783 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
5784 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
5785 assert(kr == KERN_SUCCESS);
5786
5787 memory_info = (mach_memory_info_t *) memory_info_addr;
5788 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
5789
5790 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
5791 assert(kr == KERN_SUCCESS);
5792
5793 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
5794 (vm_map_size_t)memory_info_size, TRUE, &copy);
5795 assert(kr == KERN_SUCCESS);
5796
5797 *memoryInfop = (mach_memory_info_t *) copy;
5798 *memoryInfoCntp = num_info;
5799 }
5800
5801 return KERN_SUCCESS;
5802 }
5803
5804 kern_return_t
5805 mach_zone_info_for_zone(
5806 host_priv_t host,
5807 mach_zone_name_t name,
5808 mach_zone_info_t *infop)
5809 {
5810 zone_t zone_ptr;
5811
5812 if (host == HOST_NULL) {
5813 return KERN_INVALID_HOST;
5814 }
5815 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5816 if (!PE_i_can_has_debugger(NULL)) {
5817 return KERN_INVALID_HOST;
5818 }
5819 #endif
5820
5821 if (infop == NULL) {
5822 return KERN_INVALID_ARGUMENT;
5823 }
5824
5825 zone_ptr = ZONE_NULL;
5826 zone_index_foreach(i) {
5827 zone_t z = &(zone_array[i]);
5828 assert(z != ZONE_NULL);
5829
5830 /*
5831 * Append kalloc heap name to zone name (if zone is used by kalloc)
5832 */
5833 char temp_zone_name[MAX_ZONE_NAME] = "";
5834 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5835 zone_heap_name(z), z->z_name);
5836
5837 /* Find the requested zone by name */
5838 if (track_this_zone(temp_zone_name, name.mzn_name)) {
5839 zone_ptr = z;
5840 break;
5841 }
5842 }
5843
5844 /* No zones found with the requested zone name */
5845 if (zone_ptr == ZONE_NULL) {
5846 return KERN_INVALID_ARGUMENT;
5847 }
5848
5849 if (get_zone_info(zone_ptr, NULL, infop)) {
5850 return KERN_SUCCESS;
5851 }
5852 return KERN_FAILURE;
5853 }
5854
5855 kern_return_t
5856 mach_zone_info_for_largest_zone(
5857 host_priv_t host,
5858 mach_zone_name_t *namep,
5859 mach_zone_info_t *infop)
5860 {
5861 if (host == HOST_NULL) {
5862 return KERN_INVALID_HOST;
5863 }
5864 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5865 if (!PE_i_can_has_debugger(NULL)) {
5866 return KERN_INVALID_HOST;
5867 }
5868 #endif
5869
5870 if (namep == NULL || infop == NULL) {
5871 return KERN_INVALID_ARGUMENT;
5872 }
5873
5874 if (get_zone_info(zone_find_largest(), namep, infop)) {
5875 return KERN_SUCCESS;
5876 }
5877 return KERN_FAILURE;
5878 }
5879
5880 uint64_t
5881 get_zones_collectable_bytes(void)
5882 {
5883 uint64_t zones_collectable_bytes = 0;
5884 mach_zone_info_t zi;
5885
5886 zone_index_foreach(i) {
5887 if (get_zone_info(&zone_array[i], NULL, &zi)) {
5888 zones_collectable_bytes +=
5889 GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
5890 }
5891 }
5892
5893 return zones_collectable_bytes;
5894 }
5895
5896 kern_return_t
5897 mach_zone_get_zlog_zones(
5898 host_priv_t host,
5899 mach_zone_name_array_t *namesp,
5900 mach_msg_type_number_t *namesCntp)
5901 {
5902 #if ZONE_ENABLE_LOGGING
5903 unsigned int max_zones, logged_zones, i;
5904 kern_return_t kr;
5905 zone_t zone_ptr;
5906 mach_zone_name_t *names;
5907 vm_offset_t names_addr;
5908 vm_size_t names_size;
5909
5910 if (host == HOST_NULL) {
5911 return KERN_INVALID_HOST;
5912 }
5913
5914 if (namesp == NULL || namesCntp == NULL) {
5915 return KERN_INVALID_ARGUMENT;
5916 }
5917
5918 max_zones = os_atomic_load(&num_zones, relaxed);
5919
5920 names_size = round_page(max_zones * sizeof *names);
5921 kr = kmem_alloc_pageable(ipc_kernel_map,
5922 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5923 if (kr != KERN_SUCCESS) {
5924 return kr;
5925 }
5926 names = (mach_zone_name_t *) names_addr;
5927
5928 zone_ptr = ZONE_NULL;
5929 logged_zones = 0;
5930 for (i = 0; i < max_zones; i++) {
5931 zone_t z = &(zone_array[i]);
5932 assert(z != ZONE_NULL);
5933
5934 /* Copy out the zone name if zone logging is enabled */
5935 if (z->zlog_btlog) {
5936 get_zone_info(z, &names[logged_zones], NULL);
5937 logged_zones++;
5938 }
5939 }
5940
5941 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
5942 *namesCntp = logged_zones;
5943
5944 return KERN_SUCCESS;
5945
5946 #else /* ZONE_ENABLE_LOGGING */
5947 #pragma unused(host, namesp, namesCntp)
5948 return KERN_FAILURE;
5949 #endif /* ZONE_ENABLE_LOGGING */
5950 }
5951
5952 kern_return_t
5953 mach_zone_get_btlog_records(
5954 host_priv_t host,
5955 mach_zone_name_t name,
5956 zone_btrecord_array_t *recsp,
5957 mach_msg_type_number_t *recsCntp)
5958 {
5959 #if DEBUG || DEVELOPMENT
5960 unsigned int numrecs = 0;
5961 zone_btrecord_t *recs;
5962 kern_return_t kr;
5963 zone_t zone_ptr;
5964 vm_offset_t recs_addr;
5965 vm_size_t recs_size;
5966
5967 if (host == HOST_NULL) {
5968 return KERN_INVALID_HOST;
5969 }
5970
5971 if (recsp == NULL || recsCntp == NULL) {
5972 return KERN_INVALID_ARGUMENT;
5973 }
5974
5975 zone_ptr = ZONE_NULL;
5976 zone_index_foreach(i) {
5977 zone_t z = &zone_array[i];
5978
5979 /*
5980 * Append kalloc heap name to zone name (if zone is used by kalloc)
5981 */
5982 char temp_zone_name[MAX_ZONE_NAME] = "";
5983 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5984 zone_heap_name(z), z->z_name);
5985
5986 /* Find the requested zone by name */
5987 if (track_this_zone(temp_zone_name, name.mzn_name)) {
5988 zone_ptr = z;
5989 break;
5990 }
5991 }
5992
5993 /* No zones found with the requested zone name */
5994 if (zone_ptr == ZONE_NULL) {
5995 return KERN_INVALID_ARGUMENT;
5996 }
5997
5998 /* Logging not turned on for the requested zone */
5999 if (!DO_LOGGING(zone_ptr)) {
6000 return KERN_FAILURE;
6001 }
6002
6003 /* Allocate memory for btlog records */
6004 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
6005 recs_size = round_page(numrecs * sizeof *recs);
6006
6007 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
6008 if (kr != KERN_SUCCESS) {
6009 return kr;
6010 }
6011
6012 /*
6013 * We will call get_btlog_records() below which populates this region while holding a spinlock
6014 * (the btlog lock). So these pages need to be wired.
6015 */
6016 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
6017 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
6018 assert(kr == KERN_SUCCESS);
6019
6020 recs = (zone_btrecord_t *)recs_addr;
6021 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
6022
6023 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
6024 assert(kr == KERN_SUCCESS);
6025
6026 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
6027 *recsCntp = numrecs;
6028
6029 return KERN_SUCCESS;
6030
6031 #else /* DEBUG || DEVELOPMENT */
6032 #pragma unused(host, name, recsp, recsCntp)
6033 return KERN_FAILURE;
6034 #endif /* DEBUG || DEVELOPMENT */
6035 }
6036
6037
6038 #if DEBUG || DEVELOPMENT
6039
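/*
 * mach_memory_info_check: sanity-check that the wired memory attributed by
 * vm_page_diagnose(), together with the zone totals, accounts for the
 * top-level wired page count; the shortfall is printed.
 */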
6040 kern_return_t
6041 mach_memory_info_check(void)
6042 {
6043 mach_memory_info_t * memory_info;
6044 mach_memory_info_t * info;
6045 unsigned int num_info;
6046 vm_offset_t memory_info_addr;
6047 kern_return_t kr;
6048 size_t memory_info_size, memory_info_vmsize;
6049 uint64_t top_wired, zonestotal, total;
6050
6051 num_info = vm_page_diagnose_estimate();
6052 memory_info_size = num_info * sizeof(*memory_info);
6053 memory_info_vmsize = round_page(memory_info_size);
6054 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
6055 assert(kr == KERN_SUCCESS);
6056
6057 memory_info = (mach_memory_info_t *) memory_info_addr;
6058 vm_page_diagnose(memory_info, num_info, 0);
6059
6060 top_wired = total = zonestotal = 0;
6061 zone_index_foreach(idx) {
6062 zonestotal += zone_size_wired(&zone_array[idx]);
6063 }
6064
6065 for (uint32_t idx = 0; idx < num_info; idx++) {
6066 info = &memory_info[idx];
6067 if (!info->size) {
6068 continue;
6069 }
6070 if (VM_KERN_COUNT_WIRED == info->site) {
6071 top_wired = info->size;
6072 }
6073 if (VM_KERN_SITE_HIDE & info->flags) {
6074 continue;
6075 }
6076 if (!(VM_KERN_SITE_WIRED & info->flags)) {
6077 continue;
6078 }
6079 total += info->size;
6080 }
6081 total += zonestotal;
6082
6083 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
6084 total, top_wired, zonestotal, top_wired - total);
6085
6086 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
6087
6088 return kr;
6089 }
6090
6091 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
6092
6093 #endif /* DEBUG || DEVELOPMENT */
6094
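/*
 * mach_zone_force_gc: MIG entry point that triggers a zone GC pass on
 * DEBUG/DEVELOPMENT kernels (after giving the buffer cache a chance to
 * release its elements); it is a no-op on RELEASE kernels.
 */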
6095 kern_return_t
6096 mach_zone_force_gc(
6097 host_t host)
6098 {
6099 if (host == HOST_NULL) {
6100 return KERN_INVALID_HOST;
6101 }
6102
6103 #if DEBUG || DEVELOPMENT
6104 /* Callout to buffer cache GC to drop elements in the apfs zones */
6105 if (consider_buffer_cache_collect != NULL) {
6106 (void)(*consider_buffer_cache_collect)(0);
6107 }
6108 consider_zone_gc(FALSE);
6109 #endif /* DEBUG || DEVELOPMENT */
6110 return KERN_SUCCESS;
6111 }
6112
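/*
 * zone_find_largest: return the zone with the largest wired footprint,
 * as measured by zone_size_wired().
 */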
6113 zone_t
6114 zone_find_largest(void)
6115 {
6116 uint32_t largest_idx = 0;
6117 vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
6118
6119 zone_index_foreach(i) {
6120 vm_offset_t size = zone_size_wired(&zone_array[i]);
6121 if (size > largest_size) {
6122 largest_idx = i;
6123 largest_size = size;
6124 }
6125 }
6126
6127 return &zone_array[largest_idx];
6128 }
6129
6130 #pragma mark - tests
6131 #if DEBUG || DEVELOPMENT
6132
6133 /*
6134 * Used for the sysctl kern.run_zone_test, which is not thread-safe. Ensure only
6135 * one thread goes through at a time; otherwise we can end up with multiple test
6136 * zones (if a second zinit() comes through before zdestroy()), which could lead
6137 * us to run out of zones.
6138 */
6139 SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
6140 static boolean_t zone_test_running = FALSE;
6141 static zone_t test_zone_ptr = NULL;
6142
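/*
 * zone_copy_allocations: walk one page queue of a zone, mark every element
 * found on each chunk's freelist in the scratch bitmap, then copy out the
 * addresses of the remaining (allocated) elements. Returns the new end of
 * the output array.
 */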
6143 static uintptr_t *
6144 zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
6145 zone_pva_t page_index, zone_addr_kind_t kind)
6146 {
6147 vm_offset_t free, first, end, page;
6148 struct zone_page_metadata *meta;
6149
6150 while (!zone_pva_is_null(page_index)) {
6151 page = zone_pva_to_addr(page_index);
6152 meta = zone_pva_to_meta(page_index, kind);
6153 end = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
6154 first = page + ZONE_PAGE_FIRST_OFFSET(kind);
6155
6156 bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));
6157
6158 // construct bitmap of all freed elements
6159 free = zone_page_meta_get_freelist(z, meta, page);
6160 while (free) {
6161 bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));
6162
6163 // next free element
6164 free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
6165 }
6166
6167 for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
6168 if (!bitmap_test(bits, i)) {
6169 *elems++ = INSTANCE_PUT(first);
6170 }
6171 }
6172
6173 page_index = meta->zm_page_next;
6174 }
6175 return elems;
6176 }
6177
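/*
 * zone_leaks: snapshot every allocated element of the named zone, let
 * zone_leaks_scan() flag the ones that are still referenced, and report the
 * rest through `proc', using btlog backtraces when zone logging is enabled
 * and whatever backtrace bytes remain inside the element otherwise.
 */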
6178 kern_return_t
6179 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
6180 {
6181 uintptr_t zbt[MAX_ZTRACE_DEPTH];
6182 zone_t zone = NULL;
6183 uintptr_t * array;
6184 uintptr_t * next;
6185 uintptr_t element, bt;
6186 uint32_t idx, count, found;
6187 uint32_t btidx, btcount, nobtcount, btfound;
6188 uint32_t elemSize;
6189 uint64_t maxElems;
6190 kern_return_t kr;
6191 bitmap_t *bits;
6192
6193 zone_index_foreach(i) {
6194 if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
6195 zone = &zone_array[i];
6196 break;
6197 }
6198 }
6199 if (zone == NULL) {
6200 return KERN_INVALID_NAME;
6201 }
6202
6203 elemSize = zone_elem_size(zone);
6204 maxElems = (zone->countavail + 1) & ~1ul;
6205
6206 if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
6207 !zone_leaks_scan_enable) {
6208 return KERN_INVALID_CAPABILITY;
6209 }
6210
6211 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
6212 maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
6213 VM_KERN_MEMORY_DIAG);
6214 if (KERN_SUCCESS != kr) {
6215 return kr;
6216 }
6217
6218 /* maxElems is a multiple of 2, so the bitmap that follows is always aligned */
6219 bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);
6220
6221 lock_zone(zone);
6222
6223 next = array;
6224 next = zone_copy_allocations(zone, next, bits,
6225 zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
6226 next = zone_copy_allocations(zone, next, bits,
6227 zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
6228 next = zone_copy_allocations(zone, next, bits,
6229 zone->pages_intermediate, ZONE_ADDR_NATIVE);
6230 next = zone_copy_allocations(zone, next, bits,
6231 zone->pages_all_used, ZONE_ADDR_NATIVE);
6232 count = (uint32_t)(next - array);
6233
6234 unlock_zone(zone);
6235
6236 zone_leaks_scan(array, count, zone_elem_size(zone), &found);
6237 assert(found <= count);
6238
6239 for (idx = 0; idx < count; idx++) {
6240 element = array[idx];
6241 if (kInstanceFlagReferenced & element) {
6242 continue;
6243 }
6244 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6245 }
6246
6247 #if ZONE_ENABLE_LOGGING
6248 if (zone->zlog_btlog && !corruption_debug_flag) {
6249 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
6250 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
6251 }
6252 #endif /* ZONE_ENABLE_LOGGING */
6253
6254 for (nobtcount = idx = 0; idx < count; idx++) {
6255 element = array[idx];
6256 if (!element) {
6257 continue;
6258 }
6259 if (kInstanceFlagReferenced & element) {
6260 continue;
6261 }
6262 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6263
6264 // see if we can find any backtrace left in the element
6265 btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t));
6266 if (btcount >= MAX_ZTRACE_DEPTH) {
6267 btcount = MAX_ZTRACE_DEPTH - 1;
6268 }
6269 for (btfound = btidx = 0; btidx < btcount; btidx++) {
6270 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
6271 if (!VM_KERNEL_IS_SLID(bt)) {
6272 break;
6273 }
6274 zbt[btfound++] = bt;
6275 }
6276 if (btfound) {
6277 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
6278 } else {
6279 nobtcount++;
6280 }
6281 }
6282 if (nobtcount) {
6283 // fake backtrace when we found nothing
6284 zbt[0] = (uintptr_t) &zalloc;
6285 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
6286 }
6287
6288 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
6289
6290 return KERN_SUCCESS;
6291 }
6292
6293 boolean_t
6294 run_zone_test(void)
6295 {
6296 unsigned int i = 0, max_iter = 5;
6297 void * test_ptr;
6298 zone_t test_zone;
6299
6300 simple_lock(&zone_test_lock, &zone_locks_grp);
6301 if (!zone_test_running) {
6302 zone_test_running = TRUE;
6303 } else {
6304 simple_unlock(&zone_test_lock);
6305 printf("run_zone_test: Test already running.\n");
6306 return FALSE;
6307 }
6308 simple_unlock(&zone_test_lock);
6309
6310 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
6311
6312 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
6313 do {
6314 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
6315 if (test_zone == NULL) {
6316 printf("run_zone_test: zinit() failed\n");
6317 return FALSE;
6318 }
6319
6320 #if KASAN_ZALLOC
6321 if (test_zone_ptr == NULL && test_zone->countfree != 0) {
6322 #else
6323 if (test_zone->countfree != 0) {
6324 #endif
6325 printf("run_zone_test: free count is not zero\n");
6326 return FALSE;
6327 }
6328
6329 if (test_zone_ptr == NULL) {
6330 /* Stash the zone pointer returned on the first zinit() */
6331 printf("run_zone_test: zone created for the first time\n");
6332 test_zone_ptr = test_zone;
6333 } else if (test_zone != test_zone_ptr) {
6334 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
6335 return FALSE;
6336 }
6337
6338 test_ptr = zalloc(test_zone);
6339 if (test_ptr == NULL) {
6340 printf("run_zone_test: zalloc() failed\n");
6341 return FALSE;
6342 }
6343 zfree(test_zone, test_ptr);
6344
6345 zdestroy(test_zone);
6346 i++;
6347
6348 printf("run_zone_test: Iteration %d successful\n", i);
6349 } while (i < max_iter);
6350
6351 /* test Z_VA_SEQUESTER */
6352 if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
6353 int idx, num_allocs = 8;
6354 vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
6355 void *allocs[num_allocs];
6356 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
6357 vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);
6358
6359 test_zone = zone_create("test_zone_sysctl", elem_size,
6360 ZC_DESTRUCTIBLE | ZC_SEQUESTER);
6361 if (test_zone == NULL) {
6362 printf("run_zone_test: zinit() failed\n");
6363 return FALSE;
6364 }
6365
6366 for (idx = 0; idx < num_allocs; idx++) {
6367 allocs[idx] = zalloc(test_zone);
6368 assert(NULL != allocs[idx]);
6369 printf("alloc[%d] %p\n", idx, allocs[idx]);
6370 }
6371 for (idx = 0; idx < num_allocs; idx++) {
6372 zfree(test_zone, allocs[idx]);
6373 }
6374 assert(!zone_pva_is_null(test_zone->pages_all_free));
6375
6376 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6377 vm_page_wire_count, vm_page_free_count,
6378 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6379 zone_gc(FALSE);
6380 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6381 vm_page_wire_count, vm_page_free_count,
6382 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6383 unsigned int allva = 0;
6384 zone_index_foreach(zidx) {
6385 zone_t z = &zone_array[zidx];
6386 lock_zone(z);
6387 allva += z->page_count;
6388 if (!z->sequester_page_count) {
6389 unlock_zone(z);
6390 continue;
6391 }
6392 unsigned count = 0;
6393 uint64_t size;
6394 zone_pva_t pg = z->pages_sequester;
6395 struct zone_page_metadata *page_meta;
6396 while (pg.packed_address) {
6397 page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
6398 count += z->alloc_pages;
6399 pg = page_meta->zm_page_next;
6400 }
6401 assert(count == z->sequester_page_count);
6402 size = zone_size_wired(z);
6403 if (!size) {
6404 size = 1;
6405 }
6406 printf("%s%s: seq %d, res %d, %qd %%\n",
6407 zone_heap_name(z), z->z_name, z->sequester_page_count,
6408 z->page_count, zone_size_allocated(z) * 100ULL / size);
6409 unlock_zone(z);
6410 }
6411
6412 printf("total va: %d\n", allva);
6413
6414 assert(zone_pva_is_null(test_zone->pages_all_free));
6415 assert(!zone_pva_is_null(test_zone->pages_sequester));
6416 assert(2 == test_zone->sequester_page_count);
6417 for (idx = 0; idx < num_allocs; idx++) {
6418 assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
6419 }
6420 for (idx = 0; idx < num_allocs; idx++) {
6421 allocs[idx] = zalloc(test_zone);
6422 assert(allocs[idx]);
6423 printf("alloc[%d] %p\n", idx, allocs[idx]);
6424 }
6425 assert(zone_pva_is_null(test_zone->pages_sequester));
6426 assert(0 == test_zone->sequester_page_count);
6427 for (idx = 0; idx < num_allocs; idx++) {
6428 zfree(test_zone, allocs[idx]);
6429 }
6430 zdestroy(test_zone);
6431 } else {
6432 printf("run_zone_test: skipping sequester test (not enabled)\n");
6433 }
6434
6435 printf("run_zone_test: Test passed\n");
6436
6437 simple_lock(&zone_test_lock, &zone_locks_grp);
6438 zone_test_running = FALSE;
6439 simple_unlock(&zone_test_lock);
6440
6441 return TRUE;
6442 }
6443
6444 /*
6445 * Routines to test that zone garbage collection and zone replenish threads
6446 * running at the same time don't cause problems.
6447 */
6448
6449 void
6450 zone_gc_replenish_test(void)
6451 {
6452 zone_gc(FALSE);
6453 }
6454
6455
6456 void
6457 zone_alloc_replenish_test(void)
6458 {
6459 zone_t z = NULL;
6460 struct data { struct data *next; } *node, *list = NULL;
6461
6462 /*
6463 * Find a zone that has a replenish thread
6464 */
6465 zone_index_foreach(i) {
6466 zone_t candidate = &zone_array[i];
6467 if (candidate->prio_refill_count &&
6468 zone_elem_size(candidate) >= sizeof(struct data)) {
6469 z = candidate;
6470 break;
6471 }
6472 }
6473 if (z == NULL) {
6474 printf("Couldn't find a replenish zone\n");
6475 return;
6476 }
6477
6478 for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
6479 node = zalloc(z);
6480 node->next = list;
6481 list = node;
6482 }
6483
6484 /*
6485 * release the memory we allocated
6486 */
6487 while (list != NULL) {
6488 node = list;
6489 list = list->next;
6490 zfree(z, node);
6491 }
6492 }
6493
6494 #endif /* DEBUG || DEVELOPMENT */