1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/startup.h>
78 #include <kern/kern_types.h>
79 #include <kern/assert.h>
80 #include <kern/backtrace.h>
81 #include <kern/host.h>
82 #include <kern/macro_help.h>
83 #include <kern/sched.h>
84 #include <kern/locks.h>
85 #include <kern/sched_prim.h>
86 #include <kern/misc_protos.h>
87 #include <kern/thread_call.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/kalloc.h>
90
91 #include <prng/random.h>
92
93 #include <vm/pmap.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
98
99 #include <pexpert/pexpert.h>
100
101 #include <machine/machparam.h>
102 #include <machine/machine_routines.h> /* ml_cpu_get_info */
103
104 #include <os/atomic.h>
105
106 #include <libkern/OSDebug.h>
107 #include <libkern/OSAtomic.h>
108 #include <libkern/section_keywords.h>
109 #include <sys/kdebug.h>
110
111 #include <san/kasan.h>
112
113 #if KASAN_ZALLOC
114 #define ZONE_ENABLE_LOGGING 0
115 #elif DEBUG || DEVELOPMENT
116 #define ZONE_ENABLE_LOGGING 1
117 #else
118 #define ZONE_ENABLE_LOGGING 0
119 #endif
120
121 extern void vm_pageout_garbage_collect(int collect);
122
123 /* Returns pid of the task with the largest number of VM map entries. */
124 extern pid_t find_largest_process_vm_map_entries(void);
125
126 /*
127 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
128 * For any other pid we try to kill that process synchronously.
129 */
130 extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
131
132 extern zone_t vm_map_entry_zone;
133 extern zone_t vm_object_zone;
134 extern vm_offset_t kmapoff_kaddr;
135 extern unsigned int kmapoff_pgcnt;
136 extern unsigned int stack_total;
137 extern unsigned long long stack_allocs;
138
139 /*
140 * The max # of elements in a chunk should fit into
141 * zone_page_metadata.free_count (uint16_t).
142 *
143 * Update this if the type of free_count changes.
144 */
145 #define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX)
146
147 #define ZONE_PAGECOUNT_BITS 14
148
149 /* Zone elements must fit both a next pointer and a backup pointer */
150 #define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t))
151 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
152
153 /* per-cpu zones are special because of counters */
154 #define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
155
156 struct zone_map_range {
157 vm_offset_t min_address;
158 vm_offset_t max_address;
159 };
160
161 struct zone_page_metadata {
162 /* The index of the zone this metadata page belongs to */
163 zone_id_t zm_index;
164
165 /*
166 * zm_secondary_page == 0: number of pages in this run
167 * zm_secondary_page == 1: offset to the chunk start
168 */
169 uint16_t zm_page_count : ZONE_PAGECOUNT_BITS;
170
171 /* zm_percpu: chunk backs a per-cpu allocation; zm_secondary_page: non-first page of a chunk run */
172 uint16_t zm_percpu : 1;
173 uint16_t zm_secondary_page : 1;
174
175 /*
176 * The start of the freelist can be maintained as a 16-bit
177 * offset instead of a pointer because the free elements would
178 * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
179 * of the allocation chunk.
180 *
181 * Offset from start of the allocation chunk to free element
182 * list head.
183 */
184 uint16_t zm_freelist_offs;
185
186 /*
187 * zm_secondary_page == 0: number of allocated elements in the chunk
188 * zm_secondary_page == 1: unused
189 *
190 * PAGE_METADATA_EMPTY_FREELIST in zm_freelist_offs indicates an empty freelist
191 */
192 uint16_t zm_alloc_count;
193 #define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX
194
195 zone_pva_t zm_page_next;
196 zone_pva_t zm_page_prev;
197
198 /*
199 * This is only for the sake of debuggers
200 */
201 #define ZONE_FOREIGN_COOKIE 0x123456789abcdef
202 uint64_t zm_foreign_cookie[];
203 };
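/*
 * Illustrative sketch (not part of the build): how zm_freelist_offs encodes
 * the freelist head. The helper below is hypothetical; the real logic lives
 * in zone_page_meta_get_freelist() / zone_page_meta_set_freelist() later in
 * this file.
 */
#if 0 /* illustration only */
static vm_offset_t
example_freelist_head(struct zone_page_metadata *meta, vm_offset_t chunk_start)
{
	/* UINT16_MAX is the "empty" sentinel, anything else is a byte offset */
	if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
		return 0;
	}
	return chunk_start + meta->zm_freelist_offs;
}
#endif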
204
205
206 /* Align elements that use the zone page list to 32 byte boundaries. */
207 #define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
208
209 static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
210
211 static __security_const_late struct {
212 struct zone_map_range zi_map_range;
213 struct zone_map_range zi_general_range;
214 struct zone_map_range zi_meta_range;
215 struct zone_map_range zi_foreign_range;
216
217 /*
218 * The metadata lives within the zi_meta_range address range.
219 *
220 * The correct formula to find a metadata index is:
221 * absolute_page_index - page_index(zi_meta_range.min_address)
222 *
223 * And then this index is used to dereference zi_meta_range.min_address
224 * as a `struct zone_page_metadata` array.
225 *
226 * To avoid doing that subtraction in the various fast-paths,
227 * zi_array_base is pre-offset by `page_index(zi_meta_range.min_address)`
228 * so that the math never has to be redone.
229 */
230 struct zone_page_metadata *zi_array_base;
231 } zone_info;
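/*
 * Illustrative sketch (not part of the build): because zi_array_base is
 * pre-offset by page_index(zi_meta_range.min_address), resolving the
 * metadata for a native zone address is a single array indexing operation.
 * The helper below is hypothetical; the real code is zone_pva_to_meta() and
 * zone_meta_from_addr() further down.
 */
#if 0 /* illustration only */
static struct zone_page_metadata *
example_native_meta_for_addr(vm_offset_t addr)
{
	/* absolute page index of addr, no subtraction needed on the fast path */
	return &zone_info.zi_array_base[(uint32_t)atop(addr)];
}
#endif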
232
233 /*
234 * The zone_locks_grp allows for collecting lock statistics.
235 * All locks are associated to this group in zinit.
236 * Look at tools/lockstat for debugging lock contention.
237 */
238 LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
239 LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
240
241 /*
242 * Exclude more than one concurrent garbage collection
243 */
244 LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
245 LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
246
247 boolean_t panic_include_zprint = FALSE;
248 mach_memory_info_t *panic_kext_memory_info = NULL;
249 vm_size_t panic_kext_memory_size = 0;
250
251 /*
252 * Protects zone_array, num_zones, num_zones_in_use, and
253 * zone_destroyed_bitmap
254 */
255 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
256 static unsigned int num_zones_in_use;
257 unsigned int _Atomic num_zones;
258 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
259
260 #if KASAN_ZALLOC
261 #define MAX_ZONES 566
262 #else /* !KASAN_ZALLOC */
263 #define MAX_ZONES 402
264 #endif/* !KASAN_ZALLOC */
265 struct zone zone_array[MAX_ZONES];
266
267 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
268 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
269
270 /* Used to keep track of destroyed slots in the zone_array */
271 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
272
273 /* number of pages used by all zones */
274 static long _Atomic zones_phys_page_count;
275
276 /* number of zone mapped pages used by all zones */
277 static long _Atomic zones_phys_page_mapped_count;
278
279 /*
280 * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not
281 * to break third-party kexts that haven't yet been recompiled
282 * to use the new iokit macros.
283 */
284 #if XNU_TARGET_OS_OSX && __x86_64__
285 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0
286 #else
287 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \
288 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE
289 #endif
290
291 #define ZSECURITY_DEFAULT ( \
292 ZSECURITY_OPTIONS_SEQUESTER | \
293 ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \
294 ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \
295 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \
296 0)
297 TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
298
299 #if VM_MAX_TAG_ZONES
300 /* enable tags for zones that ask for it */
301 TUNABLE(bool, zone_tagging_on, "-zt", false);
302 #endif /* VM_MAX_TAG_ZONES */
303
304 #if DEBUG || DEVELOPMENT
305 TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
306 __options_decl(zalloc_debug_t, uint32_t, {
307 ZALLOC_DEBUG_ZONEGC = 0x00000001,
308 ZALLOC_DEBUG_ZCRAM = 0x00000002,
309 });
310
311 TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0);
312 #endif /* DEBUG || DEVELOPMENT */
313 #if CONFIG_ZLEAKS
314 /* Making pointer scanning leaks detection possible for all zones */
315 TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
316 #else
317 #define zone_leaks_scan_enable false
318 #endif
319
320 /*
321 * Async allocation of zones
322 * This mechanism allows for bootstrapping an empty zone which is setup with
323 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
324 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
325 * This will prime the zone for the next use.
326 *
327 * Currently the thread_callout function (zalloc_async) will loop through all zones
328 * looking for any zone with async_pending set and do the work for it.
329 *
330 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
331 * then zalloc_noblock to an empty zone may succeed.
332 */
333 static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1);
334 static thread_call_data_t call_async_alloc;
335 static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
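/*
 * Illustrative sketch (not part of the build) of the priming pattern
 * described above: zalloc_async() effectively performs a blocking
 * allocation followed by an immediate free, so that the zone owns at least
 * one free element the next time zalloc_noblock() is called. This is a
 * simplified sketch, not the real zalloc_async() found later in this file.
 */
#if 0 /* illustration only */
static void
example_prime_zone(zone_t z)
{
	void *elt = zalloc(z);          /* may block while fresh pages are grabbed */
	if (elt != NULL) {
		zfree(z, elt);          /* lands on the freelist: the zone is primed */
	}
}
#endif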
336
337 /*
338 * Zone Corruption Debugging
339 *
340 * We use four techniques to detect modification of a zone element
341 * after it's been freed.
342 *
343 * (1) Check the freelist next pointer for sanity.
344 * (2) Store a backup of the next pointer at the end of the element,
345 * and compare it to the primary next pointer when the element is allocated
346 * to detect corruption of the freelist due to use-after-free bugs.
347 * The backup pointer is also XORed with a per-boot random cookie.
348 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
349 * and check for that value when the element is being reused to make sure
350 * no part of the element has been modified while it was on the freelist.
351 * This will also help catch read-after-frees, as code will now dereference
352 * 0xdeadbeef instead of a valid but freed pointer.
353 * (4) If the zfree_clear_mem flag is set clear the element on free and
354 * assert that it is still clear when alloc-ed.
355 *
356 * (1) and (2) occur for every allocation and free to a zone.
357 * This is done to make it slightly more difficult for an attacker to
358 * manipulate the freelist to behave in a specific way.
359 *
360 * Poisoning (3) occurs periodically for every N frees (counted per-zone).
361 * If -zp is passed as a boot arg, poisoning occurs for every free.
362 *
363 * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
364 * flag on creation or if the element size is less than one cacheline.
365 *
366 * Performance slowdown is inversely proportional to the frequency of poisoning,
367 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
368 * and higher. You can expect to find a 100% reproducible bug in an average of
369 * N tries, with a standard deviation of about N, but you will want to set
370 * "-zp" to always poison every free if you are attempting to reproduce
371 * a known bug.
372 *
373 * For a more heavyweight, but finer-grained method of detecting misuse
374 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
375 *
376 * Zone Corruption Logging
377 *
378 * You can also track where corruptions come from by using the boot-arguments
379 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
380 * in this document for more implementation and usage information.
381 *
382 * Zone Leak Detection
383 *
384 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
385 * found later in this file via the showtopztrace and showz* macros in kgmacros,
386 * or use zlog without the -zc argument.
387 *
388 */
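/*
 * Illustrative sketch (not part of the build) of check (2) above: the last
 * pointer-sized word of a free element stores (next ^ cookie), so on
 * allocation the decoded backup must agree with the primary next pointer.
 * The helper below is hypothetical; the real checks live in the freelist
 * code and in backup_ptr_mismatch_panic() later in this file.
 */
#if 0 /* illustration only */
static void
example_check_backup(zone_t z, vm_offset_t elem, bool was_poisoned)
{
	vm_offset_t primary = *(vm_offset_t *)elem;
	vm_offset_t backup  = *get_backup_ptr(zone_elem_size(z), (vm_offset_t *)elem);
	uintptr_t   cookie  = was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie;

	if ((backup ^ cookie) != primary) {
		/* freelist corruption: the real code panics here */
	}
}
#endif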
389
390 #define ZP_DEFAULT_SAMPLING_FACTOR 16
391 #define ZP_DEFAULT_SCALE_FACTOR 4
392
393 /*
394 * set by zp-factor=N boot arg
395 *
396 * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by
397 * passing the -no-zp boot-arg.
398 *
399 * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
400 * set by passing the -zp boot-arg.
401 */
402 static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR);
403
404 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
405 static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR);
406
407 /* initialized to a per-boot random value in zp_bootstrap */
408 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie;
409 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie;
410 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size;
411 static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max;
412
413 static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
414 static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx;
415
416 static struct bool_gen zone_bool_gen;
417 static zone_t zone_find_largest(void);
418 static void zone_drop_free_elements(zone_t z);
419
420 #define submap_for_zone(z) zone_submaps[(z)->submap_idx]
421 #define MAX_SUBMAP_NAME 16
422
423 /* Globals for random boolean generator for elements in free list */
424 #define MAX_ENTROPY_PER_ZCRAM 4
425
426 #if CONFIG_ZCACHE
427 /*
428 * Specifies a single zone to enable CPU caching for.
429 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
430 */
431 static char cache_zone_name[MAX_ZONE_NAME];
432 static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false);
433
434 __header_always_inline bool
435 zone_caching_enabled(zone_t z)
436 {
437 return z->zcache.zcc_depot != NULL;
438 }
439 #else
440 __header_always_inline bool
441 zone_caching_enabled(zone_t z __unused)
442 {
443 return false;
444 }
445 #endif /* CONFIG_ZCACHE */
446
447 #pragma mark Zone metadata
448
449 __enum_closed_decl(zone_addr_kind_t, bool, {
450 ZONE_ADDR_NATIVE,
451 ZONE_ADDR_FOREIGN,
452 });
453
454 static inline zone_id_t
455 zone_index(zone_t z)
456 {
457 return (zone_id_t)(z - zone_array);
458 }
459
460 static inline bool
461 zone_has_index(zone_t z, zone_id_t zid)
462 {
463 return zone_array + zid == z;
464 }
465
466 static inline vm_size_t
467 zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind)
468 {
469 if (kind == ZONE_ADDR_NATIVE) {
470 if (zone->percpu) {
471 return PAGE_SIZE / zone_elem_size(zone);
472 }
473 return alloc_size / zone_elem_size(zone);
474 } else {
475 assert(alloc_size == PAGE_SIZE);
476 return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone);
477 }
478 }
479
480 __abortlike
481 static void
482 zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
483 const char *kind)
484 {
485 panic("zone metadata corruption: %s (meta %p, zone %s%s)",
486 kind, meta, zone_heap_name(zone), zone->z_name);
487 }
488
489 __abortlike
490 static void
491 zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
492 {
493 panic("zone element pointer validation failed (addr: %p, zone %s%s)",
494 (void *)addr, zone_heap_name(zone), zone->z_name);
495 }
496
497 __abortlike
498 static void
499 zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
500 struct zone_page_metadata *meta)
501 {
502 panic("%p not in the expected zone %s%s (%d != %d)",
503 (void *)addr, zone_heap_name(zone), zone->z_name,
504 meta->zm_index, zone_index(zone));
505 }
506
507 __abortlike
508 static void
509 zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue)
510 {
511 panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
512 queue->packed_address, queue, zone_heap_name(zone),
513 zone->z_name);
514 }
515
516 __abortlike
517 static void
518 zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
519 {
520 panic("metadata list corruption through element %p detected in zone %s%s",
521 meta, zone_heap_name(zone), zone->z_name);
522 }
523
524 __abortlike
525 static void
526 zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue)
527 {
528 panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
529 queue->packed_address, queue, zone_heap_name(zone), zone->z_name);
530 }
531
532 __abortlike
533 static void
534 zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr)
535 {
536 panic("manipulating foreign address %p in a native-only zone %s%s",
537 (void *)addr, zone_heap_name(zone), zone->z_name);
538 }
539
540 __abortlike __unused
541 static void
542 zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
543 {
544 panic("addr %p being freed to foreign zone %s%s not from foreign range",
545 (void *)addr, zone_heap_name(zone), zone->z_name);
546 }
547
548 __abortlike
549 static void
550 zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
551 const char *kind)
552 {
553 panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
554 zone_heap_name(zone), zone->z_name, meta);
555 }
556
557 __abortlike
558 static void
559 zone_accounting_panic(zone_t zone, const char *kind)
560 {
561 panic("accounting mismatch (%s) for zone %s%s", kind,
562 zone_heap_name(zone), zone->z_name);
563 }
564
565 __abortlike
566 static void
567 zone_nofail_panic(zone_t zone)
568 {
569 panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
570 zone_heap_name(zone), zone->z_name);
571 }
572
573 #if __arm64__
574 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
575 #define zone_range_load(r, rmin, rmax) \
576 asm("ldp %[rmin], %[rmax], [%[range]]" \
577 : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
578 : [range] "r"(r))
579 #else
580 #define zone_range_load(r, rmin, rmax) \
581 ({ rmin = (r)->min_address; rmax = (r)->max_address; })
582 #endif
583
584 __header_always_inline bool
585 zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size)
586 {
587 vm_offset_t rmin, rmax;
588
589 /*
590 * The `&` is not a typo: we really expect the check to pass,
591 * so encourage the compiler to eagerly load and test without branches
592 */
593 zone_range_load(r, rmin, rmax);
594 return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
595 }
596
597 __header_always_inline vm_size_t
598 zone_range_size(const struct zone_map_range *r)
599 {
600 vm_offset_t rmin, rmax;
601
602 zone_range_load(r, rmin, rmax);
603 return rmax - rmin;
604 }
605
606 #define from_zone_map(addr, size) \
607 zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
608
609 #define from_general_submap(addr, size) \
610 zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
611
612 #define from_foreign_range(addr, size) \
613 zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
614
615 #define from_native_meta_map(addr) \
616 zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
617 sizeof(struct zone_page_metadata))
618
619 #define zone_addr_kind(addr, size) \
620 (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
621
622 __header_always_inline bool
623 zone_pva_is_null(zone_pva_t page)
624 {
625 return page.packed_address == 0;
626 }
627
628 __header_always_inline bool
629 zone_pva_is_queue(zone_pva_t page)
630 {
631 // actual kernel pages have the top bit set
632 return (int32_t)page.packed_address > 0;
633 }
634
635 __header_always_inline bool
636 zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
637 {
638 return pva1.packed_address == pva2.packed_address;
639 }
640
641 __header_always_inline void
642 zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
643 struct zone_page_metadata *meta)
644 {
645 zone_pva_t *queue_head = &((zone_pva_t *)zone_array)[queue.packed_address];
646
647 if (!zone_pva_is_equal(*queue_head, oldv)) {
648 zone_page_metadata_list_corruption(z, meta);
649 }
650 *queue_head = meta->zm_page_next;
651 }
652
653 __header_always_inline zone_pva_t
654 zone_queue_encode(zone_pva_t *headp)
655 {
656 return (zone_pva_t){ (uint32_t)(headp - (zone_pva_t *)zone_array) };
657 }
658
659 __header_always_inline zone_pva_t
660 zone_pva_from_addr(vm_address_t addr)
661 {
662 // cannot use atop() because we want to maintain the sign bit
663 return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
664 }
665
666 __header_always_inline vm_address_t
667 zone_pva_to_addr(zone_pva_t page)
668 {
669 // cause sign extension so that we end up with the right address
670 return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
671 }
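/*
 * Illustrative sketch (not part of the build): a zone_pva_t packs a page
 * address into 32 bits by keeping the signed page number. Kernel pointers
 * have the top bit set, so the sign extension in zone_pva_to_addr() restores
 * the original page address, while small positive values stay available to
 * encode queue heads inside zone_array.
 */
#if 0 /* illustration only */
static void
example_pva_roundtrip(vm_address_t kernel_addr)
{
	zone_pva_t pva = zone_pva_from_addr(kernel_addr);

	/* negative packed value <=> a real kernel page, round-trips exactly */
	assert(!zone_pva_is_queue(pva));
	assert(zone_pva_to_addr(pva) == trunc_page(kernel_addr));
}
#endif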
672
673 __header_always_inline struct zone_page_metadata *
674 zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind)
675 {
676 if (kind == ZONE_ADDR_NATIVE) {
677 return &zone_info.zi_array_base[page.packed_address];
678 } else {
679 return (struct zone_page_metadata *)zone_pva_to_addr(page);
680 }
681 }
682
683 __header_always_inline zone_pva_t
684 zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind)
685 {
686 if (kind == ZONE_ADDR_NATIVE) {
687 uint32_t index = (uint32_t)(meta - zone_info.zi_array_base);
688 return (zone_pva_t){ index };
689 } else {
690 return zone_pva_from_addr((vm_address_t)meta);
691 }
692 }
693
694 __header_always_inline struct zone_page_metadata *
695 zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind)
696 {
697 if (kind == ZONE_ADDR_NATIVE) {
698 return zone_pva_to_meta(zone_pva_from_addr(addr), kind);
699 } else {
700 return (struct zone_page_metadata *)trunc_page(addr);
701 }
702 }
703
704 #define zone_native_meta_from_addr(addr) \
705 zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
706
707 __header_always_inline vm_offset_t
708 zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind)
709 {
710 if (kind == ZONE_ADDR_NATIVE) {
711 return ptoa((int)(meta - zone_info.zi_array_base));
712 } else {
713 return (vm_offset_t)meta;
714 }
715 }
716
717 __header_always_inline void
718 zone_meta_queue_push(zone_t z, zone_pva_t *headp,
719 struct zone_page_metadata *meta, zone_addr_kind_t kind)
720 {
721 zone_pva_t head = *headp;
722 zone_pva_t queue_pva = zone_queue_encode(headp);
723 struct zone_page_metadata *tmp;
724
725 meta->zm_page_next = head;
726 if (!zone_pva_is_null(head)) {
727 tmp = zone_pva_to_meta(head, kind);
728 if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
729 zone_page_metadata_list_corruption(z, meta);
730 }
731 tmp->zm_page_prev = zone_pva_from_meta(meta, kind);
732 }
733 meta->zm_page_prev = queue_pva;
734 *headp = zone_pva_from_meta(meta, kind);
735 }
736
737 __header_always_inline struct zone_page_metadata *
738 zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
739 vm_offset_t *page_addrp)
740 {
741 zone_pva_t head = *headp;
742 struct zone_page_metadata *meta = zone_pva_to_meta(head, kind);
743 vm_offset_t page_addr = zone_pva_to_addr(head);
744 struct zone_page_metadata *tmp;
745
746 if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) {
747 zone_page_metadata_native_queue_corruption(z, headp);
748 }
749 if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) {
750 zone_page_metadata_foreign_queue_corruption(z, headp);
751 }
752
753 if (!zone_pva_is_null(meta->zm_page_next)) {
754 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
755 if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
756 zone_page_metadata_list_corruption(z, meta);
757 }
758 tmp->zm_page_prev = meta->zm_page_prev;
759 }
760 *headp = meta->zm_page_next;
761
762 *page_addrp = page_addr;
763 return meta;
764 }
765
766 __header_always_inline void
767 zone_meta_requeue(zone_t z, zone_pva_t *headp,
768 struct zone_page_metadata *meta, zone_addr_kind_t kind)
769 {
770 zone_pva_t meta_pva = zone_pva_from_meta(meta, kind);
771 struct zone_page_metadata *tmp;
772
773 if (!zone_pva_is_null(meta->zm_page_next)) {
774 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
775 if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
776 zone_page_metadata_list_corruption(z, meta);
777 }
778 tmp->zm_page_prev = meta->zm_page_prev;
779 }
780 if (zone_pva_is_queue(meta->zm_page_prev)) {
781 zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
782 } else {
783 tmp = zone_pva_to_meta(meta->zm_page_prev, kind);
784 if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
785 zone_page_metadata_list_corruption(z, meta);
786 }
787 tmp->zm_page_next = meta->zm_page_next;
788 }
789
790 zone_meta_queue_push(z, headp, meta, kind);
791 }
792
793 /*
794 * Routine to populate a page backing metadata in the zone_metadata_region.
795 * Must be called without the zone lock held as it might potentially block.
796 */
797 static void
798 zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to)
799 {
800 vm_offset_t page_addr = trunc_page(from);
801
802 for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
803 #if !KASAN_ZALLOC
804 /*
805 * This can race with another thread doing a populate on the same metadata
806 * page, where we see an updated pmap but unmapped KASan shadow, causing a
807 * fault in the shadow when we first access the metadata page. Avoid this
808 * by always synchronizing on the zone_metadata_region lock with KASan.
809 */
810 if (pmap_find_phys(kernel_pmap, page_addr)) {
811 continue;
812 }
813 #endif
814
815 for (;;) {
816 kern_return_t ret = KERN_SUCCESS;
817
818 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
819 lck_mtx_lock(&zone_metadata_region_lck);
820 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
821 ret = kernel_memory_populate(kernel_map, page_addr,
822 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
823 VM_KERN_MEMORY_OSFMK);
824 }
825 lck_mtx_unlock(&zone_metadata_region_lck);
826
827 if (ret == KERN_SUCCESS) {
828 break;
829 }
830
831 /*
832 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
833 * to bad system deadlocks, so if the allocation failed,
834 * we need to do the VM_PAGE_WAIT() outside of the lock.
835 */
836 VM_PAGE_WAIT();
837 }
838 }
839 }
840
841 static inline bool
842 zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr,
843 vm_offset_t page, zone_addr_kind_t kind)
844 {
845 vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind);
846 vm_offset_t esize = zone_elem_size(zone);
847
848 if (esize & (esize - 1)) { /* not a power of 2 */
849 return (offs % esize) == 0;
850 } else {
851 return (offs & (esize - 1)) == 0;
852 }
853 }
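/*
 * Worked example (illustration only): for a 64-byte element, the power-of-two
 * path above reduces the modulo to a mask, so an offset is valid iff
 * (offs & 63) == 0; a 48-byte element takes the (offs % 48) == 0 path.
 */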
854
855 __attribute__((always_inline))
856 static struct zone_page_metadata *
857 zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
858 vm_offset_t *pagep, zone_addr_kind_t *kindp)
859 {
860 struct zone_page_metadata *meta;
861 zone_addr_kind_t kind;
862 vm_offset_t page;
863 vm_offset_t esize = zone_elem_size(zone);
864
865 kind = zone_addr_kind(addr, esize);
866 page = trunc_page(addr);
867 meta = zone_meta_from_addr(addr, kind);
868
869 if (kind == ZONE_ADDR_NATIVE) {
870 if (meta->zm_secondary_page) {
871 if (meta->zm_percpu) {
872 zone_invalid_element_addr_panic(zone, addr);
873 }
874 page -= ptoa(meta->zm_page_count);
875 meta -= meta->zm_page_count;
876 }
877 } else if (!zone->allows_foreign) {
878 zone_page_metadata_foreign_confusion_panic(zone, addr);
879 #if __LP64__
880 } else if (!from_foreign_range(addr, esize)) {
881 zone_invalid_foreign_addr_panic(zone, addr);
882 #else
883 } else if (!pmap_kernel_va(addr)) {
884 zone_invalid_element_addr_panic(zone, addr);
885 #endif
886 }
887
888 if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) {
889 zone_invalid_element_addr_panic(zone, addr);
890 }
891
892 if (!zone_has_index(zone, meta->zm_index)) {
893 zone_page_metadata_index_confusion_panic(zone, addr, meta);
894 }
895
896 if (kindp) {
897 *kindp = kind;
898 }
899 if (pagep) {
900 *pagep = page;
901 }
902 return meta;
903 }
904
905 __attribute__((always_inline))
906 void
907 zone_allocated_element_validate(zone_t zone, vm_offset_t addr)
908 {
909 zone_allocated_element_resolve(zone, addr, NULL, NULL);
910 }
911
912 __header_always_inline vm_offset_t
913 zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta,
914 vm_offset_t page)
915 {
916 assert(!meta->zm_secondary_page);
917 if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
918 return 0;
919 }
920
921 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
922 if (meta->zm_freelist_offs + zone_elem_size(zone) > size) {
923 zone_metadata_corruption(zone, meta, "freelist corruption");
924 }
925
926 return page + meta->zm_freelist_offs;
927 }
928
929 __header_always_inline void
930 zone_page_meta_set_freelist(struct zone_page_metadata *meta,
931 vm_offset_t page, vm_offset_t addr)
932 {
933 assert(!meta->zm_secondary_page);
934 if (addr) {
935 meta->zm_freelist_offs = (uint16_t)(addr - page);
936 } else {
937 meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
938 }
939 }
940
941 static bool
942 zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta,
943 vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind)
944 {
945 if (element == 0) {
946 /* ends of the freelist are NULL */
947 return true;
948 }
949 if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) {
950 return false;
951 }
952 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
953 if (element > page + size - zone_elem_size(zone)) {
954 return false;
955 }
956 return true;
957 }
958
959 /* Routine to get the size of a zone-allocated address.
960 * If the address doesn't belong to the zone maps, returns 0.
961 */
962 vm_size_t
963 zone_element_size(void *addr, zone_t *z)
964 {
965 struct zone_page_metadata *meta;
966 struct zone *src_zone;
967
968 if (from_zone_map(addr, sizeof(void *))) {
969 meta = zone_native_meta_from_addr(addr);
970 src_zone = &zone_array[meta->zm_index];
971 if (z) {
972 *z = src_zone;
973 }
974 return zone_elem_size(src_zone);
975 }
976 #if CONFIG_GZALLOC
977 if (__improbable(gzalloc_enabled())) {
978 vm_size_t gzsize;
979 if (gzalloc_element_size(addr, z, &gzsize)) {
980 return gzsize;
981 }
982 }
983 #endif /* CONFIG_GZALLOC */
984
985 return 0;
986 }
987
988 /* This function just formats the reason for the panics by redoing the checks */
989 __abortlike
990 static void
991 zone_require_panic(zone_t zone, void *addr)
992 {
993 uint32_t zindex;
994 zone_t other;
995
996 if (!from_zone_map(addr, zone_elem_size(zone))) {
997 panic("zone_require failed: address not in a zone (addr: %p)", addr);
998 }
999
1000 zindex = zone_native_meta_from_addr(addr)->zm_index;
1001 other = &zone_array[zindex];
1002 if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1003 panic("zone_require failed: invalid zone index %d "
1004 "(addr: %p, expected: %s%s)", zindex,
1005 addr, zone_heap_name(zone), zone->z_name);
1006 } else {
1007 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1008 "(addr: %p, expected: %s%s)",
1009 zindex, zone_heap_name(other), other->z_name,
1010 addr, zone_heap_name(zone), zone->z_name);
1011 }
1012 }
1013
1014 __abortlike
1015 static void
1016 zone_id_require_panic(zone_id_t zid, void *addr)
1017 {
1018 zone_require_panic(&zone_array[zid], addr);
1019 }
1020
1021 /*
1022 * Routines to panic if a pointer is not mapped to an expected zone.
1023 * This can be used as a means of pinning an object to the zone it is expected
1024 * to be a part of. Causes a panic if the address does not belong to any
1025 * specified zone, does not belong to any zone, has been freed and therefore
1026 * unmapped from the zone, or the pointer contains an uninitialized value that
1027 * does not belong to any zone.
1028 *
1029 * Note that this can only work with collectable zones without foreign pages.
1030 */
1031 void
1032 zone_require(zone_t zone, void *addr)
1033 {
1034 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1035 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1036 return;
1037 }
1038 #if CONFIG_GZALLOC
1039 if (__probable(gzalloc_enabled())) {
1040 return;
1041 }
1042 #endif
1043 zone_require_panic(zone, addr);
1044 }
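/*
 * Illustrative usage sketch (not part of the build): a subsystem can pin a
 * pointer it is about to trust to the zone it must have come from. The zone
 * and type names below are hypothetical.
 */
#if 0 /* illustration only */
static void
example_zone_require(zone_t my_object_zone, struct my_object *obj)
{
	/* panics unless obj was allocated from my_object_zone */
	zone_require(my_object_zone, obj);
}
#endif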
1045
1046 void
1047 zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
1048 {
1049 if (__probable(from_general_submap(addr, esize) &&
1050 (zid == zone_native_meta_from_addr(addr)->zm_index))) {
1051 return;
1052 }
1053 #if CONFIG_GZALLOC
1054 if (__probable(gzalloc_enabled())) {
1055 return;
1056 }
1057 #endif
1058 zone_id_require_panic(zid, addr);
1059 }
1060
1061 bool
1062 zone_owns(zone_t zone, void *addr)
1063 {
1064 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1065 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1066 return true;
1067 }
1068 #if CONFIG_GZALLOC
1069 if (__probable(gzalloc_enabled())) {
1070 return true;
1071 }
1072 #endif
1073 return false;
1074 }
1075
1076 #pragma mark ZTAGS
1077 #if VM_MAX_TAG_ZONES
1078
1079 // for zones with tagging enabled:
1080
1081 // calculate a pointer to the tag base entry,
1082 // holding either a uint32_t the first tag offset for a page in the zone map,
1083 // or two uint16_t tags if the page can only hold one or two elements
1084
1085 #define ZTAGBASE(zone, element) \
1086 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
1087
1088 // pointer to the tag for an element
1089 #define ZTAG(zone, element) \
1090 ({ \
1091 vm_tag_t * result; \
1092 if ((zone)->tags_inline) { \
1093 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
1094 if ((page_mask & element) >= zone_elem_size(zone)) result++; \
1095 } else { \
1096 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \
1097 } \
1098 result; \
1099 })
1100
1101
1102 static vm_offset_t zone_tagbase_min;
1103 static vm_offset_t zone_tagbase_max;
1104 static vm_offset_t zone_tagbase_map_size;
1105 static vm_map_t zone_tagbase_map;
1106
1107 static vm_offset_t zone_tags_min;
1108 static vm_offset_t zone_tags_max;
1109 static vm_offset_t zone_tags_map_size;
1110 static vm_map_t zone_tags_map;
1111
1112 // simple heap allocator for allocating the tags for new memory
1113
1114 LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
1115
1116 enum{
1117 ztFreeIndexCount = 8,
1118 ztFreeIndexMax = (ztFreeIndexCount - 1),
1119 ztTagsPerBlock = 4
1120 };
1121
1122 struct ztBlock {
1123 #if __LITTLE_ENDIAN__
1124 uint64_t free:1,
1125 next:21,
1126 prev:21,
1127 size:21;
1128 #else
1129 // ztBlock needs free bit least significant
1130 #error !__LITTLE_ENDIAN__
1131 #endif
1132 };
1133 typedef struct ztBlock ztBlock;
1134
1135 static ztBlock * ztBlocks;
1136 static uint32_t ztBlocksCount;
1137 static uint32_t ztBlocksFree;
1138
1139 static uint32_t
1140 ztLog2up(uint32_t size)
1141 {
1142 if (1 == size) {
1143 size = 0;
1144 } else {
1145 size = 32 - __builtin_clz(size - 1);
1146 }
1147 return size;
1148 }
1149
1150 static uint32_t
1151 ztLog2down(uint32_t size)
1152 {
1153 size = 31 - __builtin_clz(size);
1154 return size;
1155 }
1156
1157 static void
1158 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
1159 {
1160 vm_map_offset_t addr = (vm_map_offset_t) address;
1161 vm_map_offset_t page, end;
1162
1163 page = trunc_page(addr);
1164 end = round_page(addr + size);
1165
1166 for (; page < end; page += page_size) {
1167 if (!pmap_find_phys(kernel_pmap, page)) {
1168 kern_return_t __unused
1169 ret = kernel_memory_populate(map, page, PAGE_SIZE,
1170 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
1171 assert(ret == KERN_SUCCESS);
1172 }
1173 }
1174 }
1175
1176 static boolean_t
1177 ztPresent(const void * address, size_t size)
1178 {
1179 vm_map_offset_t addr = (vm_map_offset_t) address;
1180 vm_map_offset_t page, end;
1181 boolean_t result;
1182
1183 page = trunc_page(addr);
1184 end = round_page(addr + size);
1185 for (result = TRUE; (page < end); page += page_size) {
1186 result = pmap_find_phys(kernel_pmap, page);
1187 if (!result) {
1188 break;
1189 }
1190 }
1191 return result;
1192 }
1193
1194
1195 void __unused
1196 ztDump(boolean_t sanity);
1197 void __unused
1198 ztDump(boolean_t sanity)
1199 {
1200 uint32_t q, cq, p;
1201
1202 for (q = 0; q <= ztFreeIndexMax; q++) {
1203 p = q;
1204 do{
1205 if (sanity) {
1206 cq = ztLog2down(ztBlocks[p].size);
1207 if (cq > ztFreeIndexMax) {
1208 cq = ztFreeIndexMax;
1209 }
1210 if (!ztBlocks[p].free
1211 || ((p != q) && (q != cq))
1212 || (ztBlocks[ztBlocks[p].next].prev != p)
1213 || (ztBlocks[ztBlocks[p].prev].next != p)) {
1214 kprintf("zterror at %d", p);
1215 ztDump(FALSE);
1216 kprintf("zterror at %d", p);
1217 assert(FALSE);
1218 }
1219 continue;
1220 }
1221 kprintf("zt[%03d]%c %d, %d, %d\n",
1222 p, ztBlocks[p].free ? 'F' : 'A',
1223 ztBlocks[p].next, ztBlocks[p].prev,
1224 ztBlocks[p].size);
1225 p = ztBlocks[p].next;
1226 if (p == q) {
1227 break;
1228 }
1229 }while (p != q);
1230 if (!sanity) {
1231 printf("\n");
1232 }
1233 }
1234 if (!sanity) {
1235 printf("-----------------------\n");
1236 }
1237 }
1238
1239
1240
1241 #define ZTBDEQ(idx) \
1242 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
1243 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
1244
1245 static void
1246 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
1247 {
1248 uint32_t q, w, p, size, merge;
1249
1250 assert(count);
1251 ztBlocksFree += count;
1252
1253 // merge with the following free block
1254 merge = (index + count);
1255 if ((merge < ztBlocksCount)
1256 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1257 && ztBlocks[merge].free) {
1258 ZTBDEQ(merge);
1259 count += ztBlocks[merge].size;
1260 }
1261
1262 // merge with the preceding free block
1263 merge = (index - 1);
1264 if ((merge > ztFreeIndexMax)
1265 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1266 && ztBlocks[merge].free) {
1267 size = ztBlocks[merge].size;
1268 count += size;
1269 index -= size;
1270 ZTBDEQ(index);
1271 }
1272
1273 q = ztLog2down(count);
1274 if (q > ztFreeIndexMax) {
1275 q = ztFreeIndexMax;
1276 }
1277 w = q;
1278 // queue in order of size
1279 while (TRUE) {
1280 p = ztBlocks[w].next;
1281 if (p == q) {
1282 break;
1283 }
1284 if (ztBlocks[p].size >= count) {
1285 break;
1286 }
1287 w = p;
1288 }
1289 ztBlocks[p].prev = index;
1290 ztBlocks[w].next = index;
1291
1292 // fault in first
1293 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1294
1295 // mark first & last with free flag and size
1296 ztBlocks[index].free = TRUE;
1297 ztBlocks[index].size = count;
1298 ztBlocks[index].prev = w;
1299 ztBlocks[index].next = p;
1300 if (count > 1) {
1301 index += (count - 1);
1302 // fault in last
1303 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1304 ztBlocks[index].free = TRUE;
1305 ztBlocks[index].size = count;
1306 }
1307 }
1308
1309 static uint32_t
1310 ztAlloc(zone_t zone, uint32_t count)
1311 {
1312 uint32_t q, w, p, leftover;
1313
1314 assert(count);
1315
1316 q = ztLog2up(count);
1317 if (q > ztFreeIndexMax) {
1318 q = ztFreeIndexMax;
1319 }
1320 do{
1321 w = q;
1322 while (TRUE) {
1323 p = ztBlocks[w].next;
1324 if (p == q) {
1325 break;
1326 }
1327 if (ztBlocks[p].size >= count) {
1328 // dequeue, mark both ends allocated
1329 ztBlocks[w].next = ztBlocks[p].next;
1330 ztBlocks[ztBlocks[p].next].prev = w;
1331 ztBlocks[p].free = FALSE;
1332 ztBlocksFree -= ztBlocks[p].size;
1333 if (ztBlocks[p].size > 1) {
1334 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
1335 }
1336
1337 // fault all the allocation
1338 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
1339 // mark last as allocated
1340 if (count > 1) {
1341 ztBlocks[p + count - 1].free = FALSE;
1342 }
1343 // free remainder
1344 leftover = ztBlocks[p].size - count;
1345 if (leftover) {
1346 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
1347 }
1348
1349 return p;
1350 }
1351 w = p;
1352 }
1353 q++;
1354 }while (q <= ztFreeIndexMax);
1355
1356 return -1U;
1357 }
1358
1359 __startup_func
1360 static void
1361 zone_tagging_init(vm_size_t max_zonemap_size)
1362 {
1363 kern_return_t ret;
1364 vm_map_kernel_flags_t vmk_flags;
1365 uint32_t idx;
1366
1367 // allocate submaps VM_KERN_MEMORY_DIAG
1368
1369 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
1370 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1371 vmk_flags.vmkf_permanent = TRUE;
1372 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
1373 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1374 &zone_tagbase_map);
1375
1376 if (ret != KERN_SUCCESS) {
1377 panic("zone_init: kmem_suballoc failed");
1378 }
1379 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
1380
1381 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
1382 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1383 vmk_flags.vmkf_permanent = TRUE;
1384 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
1385 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1386 &zone_tags_map);
1387
1388 if (ret != KERN_SUCCESS) {
1389 panic("zone_init: kmem_suballoc failed");
1390 }
1391 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
1392
1393 ztBlocks = (ztBlock *) zone_tags_min;
1394 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
1395
1396 // initialize the qheads
1397 lck_mtx_lock(&ztLock);
1398
1399 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
1400 for (idx = 0; idx < ztFreeIndexCount; idx++) {
1401 ztBlocks[idx].free = TRUE;
1402 ztBlocks[idx].next = idx;
1403 ztBlocks[idx].prev = idx;
1404 ztBlocks[idx].size = 0;
1405 }
1406 // free remaining space
1407 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
1408
1409 lck_mtx_unlock(&ztLock);
1410 }
1411
1412 static void
1413 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1414 {
1415 uint32_t * tagbase;
1416 uint32_t count, block, blocks, idx;
1417 size_t pages;
1418
1419 pages = atop(size);
1420 tagbase = ZTAGBASE(zone, mem);
1421
1422 lck_mtx_lock(&ztLock);
1423
1424 // fault tagbase
1425 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1426
1427 if (!zone->tags_inline) {
1428 // allocate tags
1429 count = (uint32_t)(size / zone_elem_size(zone));
1430 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1431 block = ztAlloc(zone, blocks);
1432 if (-1U == block) {
1433 ztDump(false);
1434 }
1435 assert(-1U != block);
1436 }
1437
1438 lck_mtx_unlock(&ztLock);
1439
1440 if (!zone->tags_inline) {
1441 // set tag base for each page
1442 block *= ztTagsPerBlock;
1443 for (idx = 0; idx < pages; idx++) {
1444 vm_offset_t esize = zone_elem_size(zone);
1445 tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
1446 }
1447 }
1448 }
1449
1450 static void
1451 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1452 {
1453 uint32_t * tagbase;
1454 uint32_t count, block, blocks, idx;
1455 size_t pages;
1456
1457 // invalidate the tag base entry for each page
1458 pages = atop(size);
1459 tagbase = ZTAGBASE(zone, mem);
1460 block = tagbase[0];
1461 for (idx = 0; idx < pages; idx++) {
1462 tagbase[idx] = 0xFFFFFFFF;
1463 }
1464
1465 lck_mtx_lock(&ztLock);
1466 if (!zone->tags_inline) {
1467 count = (uint32_t)(size / zone_elem_size(zone));
1468 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1469 assert(block != 0xFFFFFFFF);
1470 block /= ztTagsPerBlock;
1471 ztFree(NULL /* zone is unlocked */, block, blocks);
1472 }
1473
1474 lck_mtx_unlock(&ztLock);
1475 }
1476
1477 uint32_t
1478 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1479 {
1480 simple_lock(&all_zones_lock, &zone_locks_grp);
1481
1482 zone_index_foreach(idx) {
1483 zone_t z = &zone_array[idx];
1484 if (!z->tags) {
1485 continue;
1486 }
1487 if (tag_zone_index != z->tag_zone_index) {
1488 continue;
1489 }
1490
1491 *elem_size = zone_elem_size(z);
1492 simple_unlock(&all_zones_lock);
1493 return idx;
1494 }
1495
1496 simple_unlock(&all_zones_lock);
1497
1498 return -1U;
1499 }
1500
1501 #endif /* VM_MAX_TAG_ZONES */
1502 #pragma mark zalloc helpers
1503
1504 const char *
1505 zone_name(zone_t z)
1506 {
1507 return z->z_name;
1508 }
1509
1510 const char *
1511 zone_heap_name(zone_t z)
1512 {
1513 if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
1514 return kalloc_heap_names[z->kalloc_heap];
1515 }
1516 return "invalid";
1517 }
1518
1519 static inline vm_size_t
1520 zone_submaps_approx_size(void)
1521 {
1522 vm_size_t size = 0;
1523
1524 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
1525 size += zone_submaps[idx]->size;
1526 }
1527
1528 return size;
1529 }
1530
1531 bool
1532 zone_maps_owned(vm_address_t addr, vm_size_t size)
1533 {
1534 return from_zone_map(addr, size);
1535 }
1536
1537 void
1538 zone_map_sizes(
1539 vm_map_size_t *psize,
1540 vm_map_size_t *pfree,
1541 vm_map_size_t *plargest_free)
1542 {
1543 vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free);
1544 }
1545
1546 vm_map_t
1547 zone_submap(zone_t zone)
1548 {
1549 return submap_for_zone(zone);
1550 }
1551
1552 unsigned
1553 zpercpu_count(void)
1554 {
1555 return zpercpu_early_count;
1556 }
1557
1558 int
1559 track_this_zone(const char *zonename, const char *logname)
1560 {
1561 unsigned int len;
1562 const char *zc = zonename;
1563 const char *lc = logname;
1564
1565 /*
1566 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1567 */
1568
1569 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1570 /*
1571 * If the current characters don't match, check for a space
1572 * in the zone name and a corresponding period in the log name.
1573 * If that's not there, then the strings don't match.
1574 */
1575
1576 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1577 break;
1578 }
1579
1580 /*
1581 * The strings are equal so far. If we're at the end, then it's a match.
1582 */
1583
1584 if (*zc == '\0') {
1585 return TRUE;
1586 }
1587 }
1588
1589 return FALSE;
1590 }
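/*
 * Illustrative usage sketch (not part of the build): boot-args cannot carry
 * spaces, so a '.' in the zlog= name matches a ' ' in the zone name. The
 * names below are examples only.
 */
#if 0 /* illustration only */
static void
example_track_this_zone(void)
{
	assert(track_this_zone("kalloc.48", "kalloc.48"));     /* exact match */
	assert(track_this_zone("vm objects", "vm.objects"));   /* space vs. period */
}
#endif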
1591
1592 #if DEBUG || DEVELOPMENT
1593
1594 vm_size_t
1595 zone_element_info(void *addr, vm_tag_t * ptag)
1596 {
1597 vm_size_t size = 0;
1598 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1599 struct zone_page_metadata *meta;
1600 struct zone *src_zone;
1601
1602 if (from_zone_map(addr, sizeof(void *))) {
1603 meta = zone_native_meta_from_addr(addr);
1604 src_zone = &zone_array[meta->zm_index];
1605 #if VM_MAX_TAG_ZONES
1606 if (__improbable(src_zone->tags)) {
1607 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1608 }
1609 #endif /* VM_MAX_TAG_ZONES */
1610 size = zone_elem_size(src_zone);
1611 } else {
1612 #if CONFIG_GZALLOC
1613 gzalloc_element_size(addr, NULL, &size);
1614 #endif /* CONFIG_GZALLOC */
1615 }
1616 *ptag = tag;
1617 return size;
1618 }
1619
1620 #endif /* DEBUG || DEVELOPMENT */
1621
1622 /* Someone wrote to freed memory. */
1623 __abortlike
1624 static void
1625 zone_element_was_modified_panic(
1626 zone_t zone,
1627 vm_offset_t element,
1628 vm_offset_t found,
1629 vm_offset_t expected,
1630 vm_offset_t offset)
1631 {
1632 panic("a freed zone element has been modified in zone %s%s: "
1633 "expected %p but found %p, bits changed %p, "
1634 "at offset %d of %d in element %p, cookies %p %p",
1635 zone_heap_name(zone),
1636 zone->z_name,
1637 (void *) expected,
1638 (void *) found,
1639 (void *) (expected ^ found),
1640 (uint32_t) offset,
1641 (uint32_t) zone_elem_size(zone),
1642 (void *) element,
1643 (void *) zp_nopoison_cookie,
1644 (void *) zp_poisoned_cookie);
1645 }
1646
1647 /* The backup pointer is stored in the last pointer-sized location in an element. */
1648 __header_always_inline vm_offset_t *
1649 get_backup_ptr(vm_size_t elem_size, vm_offset_t *element)
1650 {
1651 return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
1652 }
1653
1654 /*
1655 * The primary and backup pointers don't match.
1656 * Determine which one was likely the corrupted pointer, find out what it
1657 * probably should have been, and panic.
1658 */
1659 __abortlike
1660 static void
1661 backup_ptr_mismatch_panic(
1662 zone_t zone,
1663 struct zone_page_metadata *page_meta,
1664 vm_offset_t page,
1665 vm_offset_t element)
1666 {
1667 vm_offset_t primary = *(vm_offset_t *)element;
1668 vm_offset_t backup = *get_backup_ptr(zone_elem_size(zone), &element);
1669 vm_offset_t likely_backup;
1670 vm_offset_t likely_primary;
1671 zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone));
1672
1673 likely_primary = primary ^ zp_nopoison_cookie;
1674 boolean_t sane_backup;
1675 boolean_t sane_primary = zone_page_meta_is_sane_element(zone, page_meta,
1676 page, likely_primary, kind);
1677 boolean_t element_was_poisoned = (backup & 0x1);
1678
1679 #if defined(__LP64__)
1680 /* We can inspect the tag in the upper bits for additional confirmation */
1681 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1682 element_was_poisoned = TRUE;
1683 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1684 element_was_poisoned = FALSE;
1685 }
1686 #endif
1687
1688 if (element_was_poisoned) {
1689 likely_backup = backup ^ zp_poisoned_cookie;
1690 } else {
1691 likely_backup = backup ^ zp_nopoison_cookie;
1692 }
1693 sane_backup = zone_page_meta_is_sane_element(zone, page_meta,
1694 page, likely_backup, kind);
1695
1696 /* The primary is definitely the corrupted one */
1697 if (!sane_primary && sane_backup) {
1698 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1699 }
1700
1701 /* The backup is definitely the corrupted one */
1702 if (sane_primary && !sane_backup) {
1703 zone_element_was_modified_panic(zone, element, backup,
1704 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1705 zone_elem_size(zone) - sizeof(vm_offset_t));
1706 }
1707
1708 /*
1709 * Not sure which is the corrupted one.
1710 * It's less likely that the backup pointer was overwritten with
1711 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1712 * primary pointer has been overwritten with a sane but incorrect address.
1713 */
1714 if (sane_primary && sane_backup) {
1715 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1716 }
1717
1718 /* Neither are sane, so just guess. */
1719 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1720 }
1721
1722 /*
1723 * zone_sequestered_page_get
1724 * z is locked
1725 */
1726 static struct zone_page_metadata *
1727 zone_sequestered_page_get(zone_t z, vm_offset_t *page)
1728 {
1729 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
1730
1731 if (!zone_pva_is_null(z->pages_sequester)) {
1732 if (os_sub_overflow(z->sequester_page_count, z->alloc_pages,
1733 &z->sequester_page_count)) {
1734 zone_accounting_panic(z, "sequester_page_count wrap-around");
1735 }
1736 return zone_meta_queue_pop(z, &z->pages_sequester, kind, page);
1737 }
1738
1739 return NULL;
1740 }
1741
1742 /*
1743 * zone_sequestered_page_populate
1744 * z is unlocked
1745 * page_meta is invalid on failure
1746 */
1747 static kern_return_t
1748 zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta,
1749 vm_offset_t space, vm_size_t alloc_size, int zflags)
1750 {
1751 kern_return_t retval;
1752
1753 assert(alloc_size == ptoa(z->alloc_pages));
1754 retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size,
1755 zflags, VM_KERN_MEMORY_ZONE);
1756 if (retval != KERN_SUCCESS) {
1757 lock_zone(z);
1758 zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE);
1759 z->sequester_page_count += z->alloc_pages;
1760 unlock_zone(z);
1761 }
1762 return retval;
1763 }
1764
1765 #pragma mark Zone poisoning/zeroing
1766
1767 /*
1768 * Initialize zone poisoning
1769 * called from zone_bootstrap before any allocations are made from zalloc
1770 */
1771 __startup_func
1772 static void
1773 zp_bootstrap(void)
1774 {
1775 char temp_buf[16];
1776
1777 /*
1778 * Initialize backup pointer random cookie for poisoned elements
1779 * Try not to call early_random() back to back, it may return
1780 * the same value if mach_absolute_time doesn't have sufficient time
1781 * to tick over between calls. <rdar://problem/11597395>
1782 * (This is only a problem on embedded devices)
1783 */
1784 zp_poisoned_cookie = (uintptr_t) early_random();
1785
1786 /* -zp: enable poisoning for every alloc and free */
1787 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
1788 zp_factor = 1;
1789 }
1790
1791 /* -no-zp: disable poisoning */
1792 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
1793 zp_factor = 0;
1794 printf("Zone poisoning disabled\n");
1795 }
1796
1797 /* Initialize backup pointer random cookie for unpoisoned elements */
1798 zp_nopoison_cookie = (uintptr_t) early_random();
1799
1800 #if MACH_ASSERT
1801 if (zp_poisoned_cookie == zp_nopoison_cookie) {
1802 panic("early_random() is broken: %p and %p are not random\n",
1803 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
1804 }
1805 #endif
1806
1807 /*
1808 * Use the last bit in the backup pointer to hint poisoning state
1809 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
1810 * the low bits are zero.
1811 */
1812 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
1813 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
1814
1815 #if defined(__LP64__)
1816 /*
1817 * Make backup pointers more obvious in GDB for 64 bit
1818 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
1819 * (0xFACADE = 0xFFFFFF ^ 0x053521)
1820 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
1821 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
1822 * by the sanity check, so it's OK for that part of the cookie to be predictable.
1823 *
1824 * TODO: Use #defines, xors, and shifts
1825 */
1826
1827 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
1828 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
1829
1830 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
1831 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
1832 #endif
1833
1834 /*
1835 * Initialize zp_min_size to two cachelines. Elements smaller than this will
1836 * be zeroed.
1837 */
1838 ml_cpu_info_t cpu_info;
1839 ml_cpu_get_info(&cpu_info);
1840 zp_min_size = 2 * cpu_info.cache_line_size;
1841 }
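/*
 * Illustration only: with the masks chosen above, a free element whose next
 * pointer starts with 0xFFFFFF carries a backup word that reads 0xC0FFEE...
 * in a debugger when it was not poisoned and 0xFACADE... when it was, which
 * is the hint backup_ptr_mismatch_panic() uses to guess the poisoning state.
 */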
1842
1843 inline uint32_t
1844 zone_poison_count_init(zone_t zone)
1845 {
1846 return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
1847 (mach_absolute_time() & 0x7);
1848 }
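/*
 * Worked example (with hypothetical tuning values, since the zp_factor and
 * zp_scale defaults are configured elsewhere): for zp_factor = 16,
 * zp_scale = 4 and a 128-byte element, the expression above evaluates as
 *
 *	(16 + (128 >> 4)) ^ (mach_absolute_time() & 0x7)
 *	    = 24 ^ (0..7)
 *
 * i.e. an initial countdown somewhere in [24, 31], so roughly one in every
 * couple dozen frees of such an element ends up poisoned.
 */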
1849
1850 #if ZALLOC_ENABLE_POISONING
1851 static bool
1852 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1853 {
1854 bool poison = false;
1855 uint32_t zp_count_local;
1856
1857 assert(!zone->percpu);
1858 if (zp_factor != 0) {
1859 /*
1860 * Poison the memory of every zp_count-th element before it ends up
1861 * on the freelist to catch use-after-free and use of uninitialized
1862 * memory.
1863 *
1864 * Every element is poisoned when zp_factor is set to 1.
1865 *
1866 */
1867 zp_count_local = os_atomic_load(zp_count, relaxed);
1868 if (__improbable(zp_count_local == 0 || zp_factor == 1)) {
1869 poison = true;
1870
1871 os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed);
1872
1873 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
1874 vm_offset_t *element_cursor = ((vm_offset_t *) elem);
1875 vm_offset_t *end_cursor = (vm_offset_t *)(elem + zone_elem_size(zone));
1876
1877 for (; element_cursor < end_cursor; element_cursor++) {
1878 *element_cursor = ZONE_POISON;
1879 }
1880 } else {
1881 os_atomic_store(zp_count, zp_count_local - 1, relaxed);
1882 /*
1883 * Zero the first zp_min_size bytes of elements that aren't being poisoned.
1884 * The element size is larger than zp_min_size in this path, as elements
1885 * that are smaller are always zeroed.
1886 */
1887 bzero((void *) elem, zp_min_size);
1888 }
1889 }
1890 return poison;
1891 }
1892 #else
1893 static bool
1894 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1895 {
1896 #pragma unused(zone, zp_count, elem)
1897 assert(!zone->percpu);
1898 return false;
1899 }
1900 #endif
1901
1902 __attribute__((always_inline))
1903 static bool
1904 zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
1905 {
1906 assert(zone->zfree_clear_mem);
1907 if (zone->percpu) {
1908 zpercpu_foreach_cpu(i) {
1909 bzero((void *)(addr + ptoa(i)), elem_size);
1910 }
1911 } else {
1912 bzero((void *)addr, elem_size);
1913 }
1914
1915 return true;
1916 }
1917
1918 /*
1919 * Zero the element if the zone has the zfree_clear_mem flag set; otherwise
1920 * poison the element if zp_count hits 0.
1921 */
1922 __attribute__((always_inline))
1923 bool
1924 zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr)
1925 {
1926 vm_size_t elem_size = zone_elem_size(zone);
1927
1928 if (zone->zfree_clear_mem) {
1929 return zfree_clear(zone, addr, elem_size);
1930 }
1931
1932 return zfree_poison_element(zone, zp_count, (vm_offset_t)addr);
1933 }
1934
1935 /*
1936 * Clear out the old next pointer and backup to avoid leaking the zone
1937 * poisoning cookie and so that only values on the freelist have a valid
1938 * cookie.
1939 */
1940 void
1941 zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr)
1942 {
1943 vm_offset_t perm_value = 0;
1944
1945 if (!zone->zfree_clear_mem) {
1946 perm_value = ZONE_POISON;
1947 }
1948
1949 vm_offset_t *primary = (vm_offset_t *) addr;
1950 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
1951
1952 *primary = perm_value;
1953 *backup = perm_value;
1954 }
1955
1956 #if ZALLOC_ENABLE_POISONING
1957 __abortlike
1958 static void
1959 zone_element_not_clear_panic(zone_t zone, void *addr)
1960 {
1961 panic("Zone element %p was modified after free for zone %s%s: "
1962 "Expected element to be cleared", addr, zone_heap_name(zone),
1963 zone->z_name);
1964 }
1965
1966 /*
1967 * Validate that the element was not tampered with while it was in the
1968 * freelist.
1969 */
1970 void
1971 zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate)
1972 {
1973 if (zone->percpu) {
1974 assert(zone->zfree_clear_mem);
1975 zpercpu_foreach_cpu(i) {
1976 if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) {
1977 zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i)));
1978 }
1979 }
1980 } else if (zone->zfree_clear_mem) {
1981 if (memcmp_zero_ptr_aligned((void *)addr, size)) {
1982 zone_element_not_clear_panic(zone, (void *)addr);
1983 }
1984 } else if (__improbable(validate)) {
1985 const vm_offset_t *p = (vm_offset_t *)addr;
1986 const vm_offset_t *end = (vm_offset_t *)(addr + size);
1987
1988 for (; p < end; p++) {
1989 if (*p != ZONE_POISON) {
1990 zone_element_was_modified_panic(zone, addr,
1991 *p, ZONE_POISON, (vm_offset_t)p - addr);
1992 }
1993 }
1994 } else {
1995 /*
1996 * If the element wasn't poisoned or entirely cleared, validate that the
1997 * minimum number of bytes that were cleared on free haven't been corrupted.
1998 * addr is advanced by one pointer size because we have already validated
1999 * and cleared the freelist pointer/zcache canary.
2000 */
2001 if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)),
2002 zp_min_size - sizeof(vm_offset_t))) {
2003 zone_element_not_clear_panic(zone, (void *)addr);
2004 }
2005 }
2006 }
2007 #endif /* ZALLOC_ENABLE_POISONING */
2008
2009 #pragma mark Zone Leak Detection
2010
2011 /*
2012 * Zone leak debugging code
2013 *
2014 * When enabled, this code keeps a log to track allocations to a particular zone that have not
2015 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
2016 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
2017 * off by default.
2018 *
2019 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
2020 * is the name of the zone you wish to log.
2021 *
2022 * This code can only track a small, fixed number of zones at once, so you need to identify which one is leaking first.
2023 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
2024 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
2025 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
2026 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
2027 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
2028 * See the help in the kgmacros for usage info.
2029 *
2030 *
2031 * Zone corruption logging
2032 *
2033 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
2034 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
2035 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
2036 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
2037 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
2038 * corrupted to examine its history. This should lead to the source of the corruption.
2039 */
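/*
 * Illustrative boot-args (a sketch only; the zone names and the record count
 * are made up for the example):
 *
 *	zlog=kalloc.48 zrecs=2000		leak-style logging of one zone
 *	-zc zlog=kalloc.48			corruption-style logging (allocs and frees)
 *	zlog1=kalloc.48 zlog2=vm.objects	log several zones at once (zlog1..zlog<n>)
 */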
2040
2041 /* Returns TRUE if we rolled over the counter at factor */
2042 __header_always_inline bool
2043 sample_counter(volatile uint32_t *count_p, uint32_t factor)
2044 {
2045 uint32_t old_count, new_count = 0;
2046 if (count_p != NULL) {
2047 os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
2048 new_count = old_count + 1;
2049 if (new_count >= factor) {
2050 new_count = 0;
2051 }
2052 });
2053 }
2054
2055 return new_count == 0;
2056 }
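/*
 * Usage sketch (hypothetical caller, not from this file): sample roughly one
 * event out of every `factor` calls:
 *
 *	static uint32_t counter;
 *
 *	if (sample_counter(&counter, zleak_sample_factor)) {
 *		capture a backtrace for this allocation
 *	}
 */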
2057
2058 #if ZONE_ENABLE_LOGGING
2059 /* Log allocations and frees to help debug a zone element corruption */
2060 TUNABLE(bool, corruption_debug_flag, "-zc", false);
2061
2062 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
2063
2064 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
2065 static int num_zones_logged = 0;
2066
2067 /*
2068 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
2069 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
2070 * is the number of stacks suspected of leaking, we don't need many records.
2071 */
2072
2073 #if defined(__LP64__)
2074 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
2075 #else
2076 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
2077 #endif
2078 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
2079
2080 static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
2081
2082 static void
2083 zone_enable_logging(zone_t z)
2084 {
2085 z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
2086 (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2087
2088 if (z->zlog_btlog) {
2089 printf("zone: logging started for zone %s%s\n",
2090 zone_heap_name(z), z->z_name);
2091 } else {
2092 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2093 z->zone_logging = false;
2094 }
2095 }
2096
2097 /**
2098 * @function zone_setup_logging
2099 *
2100 * @abstract
2101 * Optionally sets up a zone for logging.
2102 *
2103 * @discussion
2104 * We recognize the following boot-args:
2105 *
2106 * zlog=<zone_to_log> (or zlog1=... through zlog<n>=... for several zones)
2107 * zrecs=<num_records_in_log>
2108 *
2109 * The zlog args specify the zone name(s) that should be logged,
2110 * and zrecs is used to control the size of the log.
2111 *
2112 * If zrecs is not specified, a default value is used.
2113 */
2114 static void
2115 zone_setup_logging(zone_t z)
2116 {
2117 char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
2118 char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2119 char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */
2120
2121 /*
2122 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2123 *
2124 * This prevents accidentally hogging too much kernel memory
2125 * and making the system unusable.
2126 */
2127 if (log_records > ZRECORDS_MAX) {
2128 log_records = ZRECORDS_MAX;
2129 }
2130
2131 /*
2132 * Append kalloc heap name to zone name (if zone is used by kalloc)
2133 */
2134 snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
2135
2136 /* zlog0 isn't allowed. */
2137 for (int i = 1; i <= max_num_zones_to_log; i++) {
2138 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2139
2140 if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
2141 track_this_zone(zone_name, zlog_val)) {
2142 z->zone_logging = true;
2143 num_zones_logged++;
2144 break;
2145 }
2146 }
2147
2148 /*
2149 * Backwards compatibility with the old boot-arg used to specify single-zone
2150 * logging, i.e. "zlog". This needs to happen after the newer zlog<n> checks
2151 * because the "zlog" prefix would otherwise match all of the zlog<n>
2152 * boot-args.
2153 */
2154 if (!z->zone_logging &&
2155 PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
2156 track_this_zone(zone_name, zlog_val)) {
2157 z->zone_logging = true;
2158 num_zones_logged++;
2159 }
2160
2161
2162 /*
2163 * If we want to log a zone, see if we need to allocate buffer space for
2164 * the log.
2165 *
2166 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
2167 * we have to defer allocation in that case.
2168 *
2169 * zone_init() will finish the job.
2170 *
2171 * If we want to log one of the VM related zones that's set up early on,
2172 * we will skip allocation of the log until zinit is called again later
2173 * on some other zone.
2174 */
2175 if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
2176 zone_enable_logging(z);
2177 }
2178 }
2179
2180 /*
2181 * Each record in the log contains a pointer to the zone element it refers to,
2182 * and a small array to hold the pc's from the stack trace. A
2183 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
2184 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
2185 * If the log fills, old records are replaced as if it were a circular buffer.
2186 */
2187
2188
2189 /*
2190 * Decide if we want to log this zone by doing a string compare between a zone name and the name
2191 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
2192 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
2193 * match a space in the zone name.
2194 */
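/*
 * For example (with a hypothetical zone name): a zone called "VM map entries"
 * can be selected with the boot-arg zlog=VM.map.entries, since spaces can't be
 * passed on the boot command line.
 */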
2195
2196 /*
2197 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
2198 * the buffer for the records has been allocated.
2199 */
2200
2201 #define DO_LOGGING(z) (z->zlog_btlog != NULL)
2202 #else /* !ZONE_ENABLE_LOGGING */
2203 #define DO_LOGGING(z) 0
2204 #endif /* !ZONE_ENABLE_LOGGING */
2205
2206 #if CONFIG_ZLEAKS
2207
2208 /*
2209 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
2210 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
2211 * backtrace. On every free, we examine the table and, if that allocation was being tracked,
2212 * stop tracking it.
2213 *
2214 * We track the allocations in the zallocations hash table, which stores the address that was returned from
2215 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
2216 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
2217 * backtraces - we don't store them more than once.
2218 *
2219 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
2220 * a large amount of virtual space.
2221 */
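/*
 * Worked example of the thresholds computed in zleak_init() below (the map
 * size is hypothetical): with a 1 GB zone map, zleak_global_tracking_threshold
 * is 512 MB, so trace collection starts once the zone submaps reach roughly
 * half the map; zleak_per_zone_tracking_threshold is then 64 MB (1/8th of
 * that), so only zones wired past 64 MB get zleak_on set.
 */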
2222 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
2223 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
2224 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
2225 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
2226 uint32_t zleak_state = 0; /* State of collection, as above */
2227
2228 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
2229 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
2230 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
2231 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
2232
2233 /*
2234 * Counters for allocation statistics.
2235 */
2236
2237 /* Times two active records want to occupy the same spot */
2238 unsigned int z_alloc_collisions = 0;
2239 unsigned int z_trace_collisions = 0;
2240
2241 /* Times a new record lands on a spot previously occupied by a freed allocation */
2242 unsigned int z_alloc_overwrites = 0;
2243 unsigned int z_trace_overwrites = 0;
2244
2245 /* Times a new alloc or trace is put into the hash table */
2246 unsigned int z_alloc_recorded = 0;
2247 unsigned int z_trace_recorded = 0;
2248
2249 /* Times zleak_log returned false due to not being able to acquire the lock */
2250 unsigned int z_total_conflicts = 0;
2251
2252 /*
2253 * Structure for keeping track of an allocation
2254 * An allocation bucket is in use if its element is not NULL
2255 */
2256 struct zallocation {
2257 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
2258 vm_size_t za_size; /* how much memory did this allocation take up? */
2259 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
2260 /* TODO: #if this out */
2261 uint32_t za_hit_count; /* for determining effectiveness of hash function */
2262 };
2263
2264 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
2265 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
2266 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
2267
2268 vm_size_t zleak_max_zonemap_size;
2269
2270 /* Hashmaps of allocations and their corresponding traces */
2271 static struct zallocation* zallocations;
2272 static struct ztrace* ztraces;
2273
2274 /* not static so that panic can see this, see kern/debug.c */
2275 struct ztrace* top_ztrace;
2276
2277 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
2278 LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
2279 LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
2280
2281 /*
2282 * Initializes the zone leak monitor. Called from zone_init()
2283 */
2284 __startup_func
2285 static void
2286 zleak_init(vm_size_t max_zonemap_size)
2287 {
2288 char scratch_buf[16];
2289 boolean_t zleak_enable_flag = FALSE;
2290
2291 zleak_max_zonemap_size = max_zonemap_size;
2292 zleak_global_tracking_threshold = max_zonemap_size / 2;
2293 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
2294
2295 #if CONFIG_EMBEDDED
2296 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
2297 zleak_enable_flag = TRUE;
2298 printf("zone leak detection enabled\n");
2299 } else {
2300 zleak_enable_flag = FALSE;
2301 printf("zone leak detection disabled\n");
2302 }
2303 #else /* CONFIG_EMBEDDED */
2304 /* -zleakoff (flag to disable zone leak monitor) */
2305 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
2306 zleak_enable_flag = FALSE;
2307 printf("zone leak detection disabled\n");
2308 } else {
2309 zleak_enable_flag = TRUE;
2310 printf("zone leak detection enabled\n");
2311 }
2312 #endif /* CONFIG_EMBEDDED */
2313
2314 /* zfactor=XXXX (override how often to sample the zone allocator) */
2315 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
2316 printf("Zone leak factor override: %u\n", zleak_sample_factor);
2317 }
2318
2319 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
2320 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
2321 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
2322 /* uses the 'is power of 2' trick: (0x01000 & 0x00FFF) == 0 */
2323 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
2324 printf("Override isn't a power of two, bad things might happen!\n");
2325 }
2326 }
2327
2328 /* zleak-traces=XXXX (override number of buckets in ztraces) */
2329 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
2330 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
2331 /* uses the 'is power of 2' trick: (0x01000 & 0x00FFF) == 0 */
2332 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
2333 printf("Override isn't a power of two, bad things might happen!\n");
2334 }
2335 }
2336
2337 if (zleak_enable_flag) {
2338 zleak_state = ZLEAK_STATE_ENABLED;
2339 }
2340 }
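/*
 * Illustrative boot-args for the knobs parsed above (the numeric values are
 * examples only; the bucket counts must be powers of two):
 *
 *	-zleakon		enable zleaks on CONFIG_EMBEDDED (off by default there)
 *	-zleakoff		disable zleaks elsewhere (on by default otherwise)
 *	zfactor=500		sample every 500th allocation instead of every 1000th
 *	zleak-allocs=32768	resize the zallocations hash table
 *	zleak-traces=8192	resize the ztraces hash table
 */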
2341
2342 /*
2343 * Support for kern.zleak.active sysctl - a simplified
2344 * version of the zleak_state variable.
2345 */
2346 int
2347 get_zleak_state(void)
2348 {
2349 if (zleak_state & ZLEAK_STATE_FAILED) {
2350 return -1;
2351 }
2352 if (zleak_state & ZLEAK_STATE_ACTIVE) {
2353 return 1;
2354 }
2355 return 0;
2356 }
2357
2358 kern_return_t
2359 zleak_activate(void)
2360 {
2361 kern_return_t retval;
2362 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
2363 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
2364 void *allocations_ptr = NULL;
2365 void *traces_ptr = NULL;
2366
2367 /* Only one thread attempts to activate at a time */
2368 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2369 return KERN_SUCCESS;
2370 }
2371
2372 /* Indicate that we're doing the setup */
2373 lck_spin_lock(&zleak_lock);
2374 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2375 lck_spin_unlock(&zleak_lock);
2376 return KERN_SUCCESS;
2377 }
2378
2379 zleak_state |= ZLEAK_STATE_ACTIVATING;
2380 lck_spin_unlock(&zleak_lock);
2381
2382 /* Allocate and zero tables */
2383 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
2384 if (retval != KERN_SUCCESS) {
2385 goto fail;
2386 }
2387
2388 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
2389 if (retval != KERN_SUCCESS) {
2390 goto fail;
2391 }
2392
2393 bzero(allocations_ptr, z_alloc_size);
2394 bzero(traces_ptr, z_trace_size);
2395
2396 /* Everything's set. Install tables, mark active. */
2397 zallocations = allocations_ptr;
2398 ztraces = traces_ptr;
2399
2400 /*
2401 * Initialize the top_ztrace to the first entry in ztraces,
2402 * so we don't have to check for null in zleak_log
2403 */
2404 top_ztrace = &ztraces[0];
2405
2406 /*
2407 * Note that we do need a barrier between installing
2408 * the tables and setting the active flag, because the zfree()
2409 * path accesses the table without a lock if we're active.
2410 */
2411 lck_spin_lock(&zleak_lock);
2412 zleak_state |= ZLEAK_STATE_ACTIVE;
2413 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2414 lck_spin_unlock(&zleak_lock);
2415
2416 return KERN_SUCCESS;
2417
2418 fail:
2419 /*
2420 * If we fail to allocate memory, don't further tax
2421 * the system by trying again.
2422 */
2423 lck_spin_lock(&zleak_lock);
2424 zleak_state |= ZLEAK_STATE_FAILED;
2425 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2426 lck_spin_unlock(&zleak_lock);
2427
2428 if (allocations_ptr != NULL) {
2429 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
2430 }
2431
2432 if (traces_ptr != NULL) {
2433 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
2434 }
2435
2436 return retval;
2437 }
2438
2439 /*
2440 * TODO: What about allocations that never get deallocated,
2441 * especially ones with unique backtraces? Should we wait to record
2442 * until after boot has completed?
2443 * (How many persistent zallocs are there?)
2444 */
2445
2446 /*
2447 * This function records the allocation in the allocations table,
2448 * and stores the associated backtrace in the traces table
2449 * (or just grows the trace's accounted size if the trace is already recorded).
2450 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
2451 * the old allocation's size is subtracted from its associated trace.
2452 * If the trace slot is occupied by a different backtrace (a hash collision), the function bails.
2453 * A trace's accounted size grows by the amount of memory each recorded allocation consumes.
2454 * The return value indicates whether to try again next time.
2455 */
2456 static boolean_t
2457 zleak_log(uintptr_t* bt,
2458 uintptr_t addr,
2459 uint32_t depth,
2460 vm_size_t allocation_size)
2461 {
2462 /* Quit if there's someone else modifying the hash tables */
2463 if (!lck_spin_try_lock(&zleak_lock)) {
2464 z_total_conflicts++;
2465 return FALSE;
2466 }
2467
2468 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2469
2470 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
2471 struct ztrace* trace = &ztraces[trace_index];
2472
2473 allocation->za_hit_count++;
2474 trace->zt_hit_count++;
2475
2476 /*
2477 * If the allocation bucket we want to be in is occupied, and if the occupier
2478 * has the same trace as us, just bail.
2479 */
2480 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
2481 z_alloc_collisions++;
2482
2483 lck_spin_unlock(&zleak_lock);
2484 return TRUE;
2485 }
2486
2487 /* STEP 1: Store the backtrace in the traces array. */
2488 /* A size of zero indicates that the trace bucket is free. */
2489
2490 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
2491 /*
2492 * Different unique trace with same hash!
2493 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
2494 * and get out of the way for later chances
2495 */
2496 trace->zt_collisions++;
2497 z_trace_collisions++;
2498
2499 lck_spin_unlock(&zleak_lock);
2500 return TRUE;
2501 } else if (trace->zt_size > 0) {
2502 /* Same trace, already added, so increment refcount */
2503 trace->zt_size += allocation_size;
2504 } else {
2505 /* Found an unused trace bucket, record the trace here! */
2506 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
2507 z_trace_overwrites++;
2508 }
2509
2510 z_trace_recorded++;
2511 trace->zt_size = allocation_size;
2512 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2513
2514 trace->zt_depth = depth;
2515 trace->zt_collisions = 0;
2516 }
2517
2518 /* STEP 2: Store the allocation record in the allocations array. */
2519
2520 if (allocation->za_element != (uintptr_t) 0) {
2521 /*
2522 * Straight up replace any allocation record that was there. We don't want to do the work
2523 * to preserve the allocation entries that were there, because we only record a subset of the
2524 * allocations anyways.
2525 */
2526
2527 z_alloc_collisions++;
2528
2529 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2530 /* Knock off old allocation's size, not the new allocation */
2531 associated_trace->zt_size -= allocation->za_size;
2532 } else if (allocation->za_trace_index != 0) {
2533 /* Slot previously used but not currently in use */
2534 z_alloc_overwrites++;
2535 }
2536
2537 allocation->za_element = addr;
2538 allocation->za_trace_index = trace_index;
2539 allocation->za_size = allocation_size;
2540
2541 z_alloc_recorded++;
2542
2543 if (top_ztrace->zt_size < trace->zt_size) {
2544 top_ztrace = trace;
2545 }
2546
2547 lck_spin_unlock(&zleak_lock);
2548 return TRUE;
2549 }
2550
2551 /*
2552 * Free the allocation record and release the stacktrace.
2553 * This should be as fast as possible because it will be called for every free.
2554 */
2555 __attribute__((noinline))
2556 static void
2557 zleak_free(uintptr_t addr,
2558 vm_size_t allocation_size)
2559 {
2560 if (addr == (uintptr_t) 0) {
2561 return;
2562 }
2563
2564 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2565
2566 /* Double-checked locking: check to find out if we're interested, lock, check to make
2567 * sure it hasn't changed, then modify it, and release the lock.
2568 */
2569
2570 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2571 /* if the allocation was the one, grab the lock, check again, then delete it */
2572 lck_spin_lock(&zleak_lock);
2573
2574 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2575 struct ztrace *trace;
2576
2577 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2578 if (allocation->za_size != allocation_size) {
2579 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2580 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2581 }
2582
2583 trace = &ztraces[allocation->za_trace_index];
2584
2585 /* size of 0 indicates trace bucket is unused */
2586 if (trace->zt_size > 0) {
2587 trace->zt_size -= allocation_size;
2588 }
2589
2590 /* A NULL element means the allocation bucket is unused */
2591 allocation->za_element = 0;
2592 }
2593 lck_spin_unlock(&zleak_lock);
2594 }
2595 }
2596
2597 #endif /* CONFIG_ZLEAKS */
2598
2599 /* These functions live outside of CONFIG_ZLEAKS because they are also used in
2600 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
2601 */
2602
2603 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2604 uintptr_t
2605 hash_mix(uintptr_t x)
2606 {
2607 #ifndef __LP64__
2608 x += ~(x << 15);
2609 x ^= (x >> 10);
2610 x += (x << 3);
2611 x ^= (x >> 6);
2612 x += ~(x << 11);
2613 x ^= (x >> 16);
2614 #else
2615 x += ~(x << 32);
2616 x ^= (x >> 22);
2617 x += ~(x << 13);
2618 x ^= (x >> 8);
2619 x += (x << 3);
2620 x ^= (x >> 15);
2621 x += ~(x << 27);
2622 x ^= (x >> 31);
2623 #endif
2624 return x;
2625 }
2626
2627 uint32_t
2628 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2629 {
2630 uintptr_t hash = 0;
2631 uintptr_t mask = max_size - 1;
2632
2633 while (depth) {
2634 hash += bt[--depth];
2635 }
2636
2637 hash = hash_mix(hash) & mask;
2638
2639 assert(hash < max_size);
2640
2641 return (uint32_t) hash;
2642 }
2643
2644 /*
2645 * TODO: Determine how well distributed this is
2646 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes a clean bitmask
2647 */
2648 uint32_t
2649 hashaddr(uintptr_t pt, uint32_t max_size)
2650 {
2651 uintptr_t hash = 0;
2652 uintptr_t mask = max_size - 1;
2653
2654 hash = hash_mix(pt) & mask;
2655
2656 assert(hash < max_size);
2657
2658 return (uint32_t) hash;
2659 }
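/*
 * Worked example: with max_size = 0x10000 the mask is 0xFFFF, so both hash
 * functions reduce the mixed value to a bucket index in [0, 0xFFFF]; this is
 * why the bucket counts above must be powers of two.
 */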
2660
2661 /* End of all leak-detection code */
2662 #pragma mark zone creation, configuration, destruction
2663
2664 static zone_t
2665 zone_init_defaults(zone_id_t zid)
2666 {
2667 zone_t z = &zone_array[zid];
2668
2669 z->page_count_max = ~0u;
2670 z->collectable = true;
2671 z->expandable = true;
2672 z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
2673
2674 simple_lock_init(&z->lock, 0);
2675
2676 return z;
2677 }
2678
2679 static bool
2680 zone_is_initializing(zone_t z)
2681 {
2682 return !z->z_self && !z->destroyed;
2683 }
2684
2685 static void
2686 zone_set_max(zone_t z, vm_size_t max)
2687 {
2688 #if KASAN_ZALLOC
2689 if (z->kasan_redzone) {
2690 /*
2691 * Adjust the max memory for the kasan redzones
2692 */
2693 max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
2694 }
2695 #endif
2696 if (max < z->percpu ? 1 : z->alloc_pages) {
2697 max = z->percpu ? 1 : z->alloc_pages;
2698 } else {
2699 max = atop(round_page(max));
2700 }
2701 z->page_count_max = max;
2702 }
2703
2704 void
2705 zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
2706 {
2707 if (!zone_is_initializing(zone)) {
2708 panic("%s: called after zone_create()", __func__);
2709 }
2710 if (sub_map_idx > zone_last_submap_idx) {
2711 panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
2712 }
2713 zone->submap_idx = sub_map_idx;
2714 }
2715
2716 void
2717 zone_set_noexpand(
2718 zone_t zone,
2719 vm_size_t max)
2720 {
2721 if (!zone_is_initializing(zone)) {
2722 panic("%s: called after zone_create()", __func__);
2723 }
2724 zone->expandable = false;
2725 zone_set_max(zone, max);
2726 }
2727
2728 void
2729 zone_set_exhaustible(
2730 zone_t zone,
2731 vm_size_t max)
2732 {
2733 if (!zone_is_initializing(zone)) {
2734 panic("%s: called after zone_create()", __func__);
2735 }
2736 zone->expandable = false;
2737 zone->exhaustible = true;
2738 zone_set_max(zone, max);
2739 }
2740
2741 /**
2742 * @function zone_create_find
2743 *
2744 * @abstract
2745 * Finds an unused zone for the given name and element size.
2746 *
2747 * @param name the zone name
2748 * @param size the element size (including redzones, ...)
2749 * @param flags the flags passed to @c zone_create*
2750 * @param zid the desired zone ID or ZONE_ID_ANY
2751 *
2752 * @returns a zone to initialize further.
2753 */
2754 static zone_t
2755 zone_create_find(
2756 const char *name,
2757 vm_size_t size,
2758 zone_create_flags_t flags,
2759 zone_id_t zid)
2760 {
2761 zone_id_t nzones;
2762 zone_t z;
2763
2764 simple_lock(&all_zones_lock, &zone_locks_grp);
2765
2766 nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
2767 assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
2768
2769 if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
2770 /*
2771 * The first time around, make sure the reserved zone IDs
2772 * have an initialized lock as zone_index_foreach() will
2773 * enumerate them.
2774 */
2775 while (nzones < ZONE_ID__FIRST_DYNAMIC) {
2776 zone_init_defaults(nzones++);
2777 }
2778
2779 os_atomic_store(&num_zones, nzones, release);
2780 }
2781
2782 if (zid != ZONE_ID_ANY) {
2783 if (zid >= ZONE_ID__FIRST_DYNAMIC) {
2784 panic("zone_create: invalid desired zone ID %d for %s",
2785 zid, name);
2786 }
2787 if (flags & ZC_DESTRUCTIBLE) {
2788 panic("zone_create: ID %d (%s) must be permanent", zid, name);
2789 }
2790 if (zone_array[zid].z_self) {
2791 panic("zone_create: creating zone ID %d (%s) twice", zid, name);
2792 }
2793 z = &zone_array[zid];
2794 } else {
2795 if (flags & ZC_DESTRUCTIBLE) {
2796 /*
2797 * If possible, find a previously zdestroy'ed zone in the
2798 * zone_array that we can reuse.
2799 */
2800 for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
2801 i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
2802 z = &zone_array[i];
2803
2804 /*
2805 * If the zone name and the element size are the
2806 * same, we can just reuse the old zone struct.
2807 */
2808 if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
2809 continue;
2810 }
2811 bitmap_clear(zone_destroyed_bitmap, i);
2812 z->destroyed = false;
2813 z->z_self = z;
2814 zid = (zone_id_t)i;
2815 goto out;
2816 }
2817 }
2818
2819 zid = nzones++;
2820 z = zone_init_defaults(zid);
2821
2822 /*
2823 * The release barrier pairs with the acquire in
2824 * zone_index_foreach() and makes sure that enumeration loops
2825 * always see an initialized zone lock.
2826 */
2827 os_atomic_store(&num_zones, nzones, release);
2828 }
2829
2830 out:
2831 num_zones_in_use++;
2832 simple_unlock(&all_zones_lock);
2833
2834 return z;
2835 }
2836
2837 __abortlike
2838 static void
2839 zone_create_panic(const char *name, const char *f1, const char *f2)
2840 {
2841 panic("zone_create: creating zone %s: flag %s and %s are incompatible",
2842 name, f1, f2);
2843 }
2844 #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
2845 if ((flags) & forbidden_flag) { \
2846 zone_create_panic(name, #current_flag, #forbidden_flag); \
2847 }
2848
2849 /*
2850 * Adjusts the size of the element based on minimum size, alignment
2851 * and kasan redzones
2852 */
2853 static vm_size_t
2854 zone_elem_adjust_size(
2855 const char *name __unused,
2856 vm_size_t elem_size,
2857 zone_create_flags_t flags,
2858 vm_size_t *redzone __unused)
2859 {
2860 vm_size_t size;
2861 /*
2862 * Adjust element size for minimum size and pointer alignment
2863 */
2864 size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
2865 if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
2866 size = ZONE_MIN_ELEM_SIZE;
2867 }
2868
2869 #if KASAN_ZALLOC
2870 /*
2871 * Expand the zone allocation size to include the redzones.
2872 *
2873 * For page-multiple zones add a full guard page because they
2874 * likely require alignment.
2875 */
2876 vm_size_t redzone_tmp;
2877 if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
2878 redzone_tmp = 0;
2879 } else if ((size & PAGE_MASK) == 0) {
2880 if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
2881 panic("zone_create: zone %s can't provide more than PAGE_SIZE"
2882 "alignment", name);
2883 }
2884 redzone_tmp = PAGE_SIZE;
2885 } else if (flags & ZC_ALIGNMENT_REQUIRED) {
2886 redzone_tmp = 0;
2887 } else {
2888 redzone_tmp = KASAN_GUARD_SIZE;
2889 }
2890 size += redzone_tmp * 2;
2891 if (redzone) {
2892 *redzone = redzone_tmp;
2893 }
2894 #endif
2895 return size;
2896 }
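/*
 * Worked example (LP64, so sizeof(vm_offset_t) == 8): a 12-byte element is
 * rounded up to (12 + 7) & ~7 = 16 bytes, and may then be raised further to
 * ZONE_MIN_ELEM_SIZE for non-per-cpu zones before any KASAN redzone padding
 * is added.
 */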
2897
2898 /*
2899 * Returns the allocation chunk size that has the least fragmentation
2900 */
2901 static vm_size_t
2902 zone_get_min_alloc_granule(
2903 vm_size_t elem_size,
2904 zone_create_flags_t flags)
2905 {
2906 vm_size_t alloc_granule = PAGE_SIZE;
2907 if (flags & ZC_PERCPU) {
2908 alloc_granule = PAGE_SIZE * zpercpu_count();
2909 if (PAGE_SIZE % elem_size > 256) {
2910 panic("zone_create: per-cpu zone has too much fragmentation");
2911 }
2912 } else if ((elem_size & PAGE_MASK) == 0) {
2913 /* zero fragmentation by definition */
2914 alloc_granule = elem_size;
2915 } else if (alloc_granule % elem_size == 0) {
2916 /* zero fragmentation by definition */
2917 } else {
2918 vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
2919 vm_size_t alloc_tmp = PAGE_SIZE;
2920 while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
2921 vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
2922 if (frag_tmp < frag) {
2923 frag = frag_tmp;
2924 alloc_granule = alloc_tmp;
2925 }
2926 }
2927 }
2928 return alloc_granule;
2929 }
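/*
 * Worked examples (assuming 4K pages): a 2048-byte element divides PAGE_SIZE
 * evenly, so the granule stays at one page; an 8192-byte element is a page
 * multiple, so the granule becomes the element size itself. For a 1152-byte
 * element the loop compares waste across page multiples: one page wastes
 * 640 bytes (~15%), two pages waste only 128 bytes (~1%), so a larger chunk
 * is preferred, subject to ZONE_MAX_ALLOC_SIZE.
 */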
2930
2931 vm_size_t
2932 zone_get_foreign_alloc_size(
2933 const char *name __unused,
2934 vm_size_t elem_size,
2935 zone_create_flags_t flags,
2936 uint16_t min_pages)
2937 {
2938 vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
2939 NULL);
2940 vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
2941 flags);
2942 vm_size_t min_size = min_pages * PAGE_SIZE;
2943 /*
2944 * Round up min_size to a multiple of alloc_granule
2945 */
2946 return ((min_size + alloc_granule - 1) / alloc_granule)
2947 * alloc_granule;
2948 }
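/*
 * For example (hypothetical inputs): with a 2-page alloc granule and
 * min_pages = 3, min_size is 3 pages and gets rounded up to the next granule
 * multiple, i.e. 4 pages.
 */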
2949
2950 zone_t
2951 zone_create_ext(
2952 const char *name,
2953 vm_size_t size,
2954 zone_create_flags_t flags,
2955 zone_id_t desired_zid,
2956 void (^extra_setup)(zone_t))
2957 {
2958 vm_size_t alloc;
2959 vm_size_t redzone;
2960 zone_t z;
2961
2962 if (size > ZONE_MAX_ALLOC_SIZE) {
2963 panic("zone_create: element size too large: %zd", (size_t)size);
2964 }
2965
2966 size = zone_elem_adjust_size(name, size, flags, &redzone);
2967 /*
2968 * Allocate the zone slot, return early if we found an older match.
2969 */
2970 z = zone_create_find(name, size, flags, desired_zid);
2971 if (__improbable(z->z_self)) {
2972 /* We found a zone to reuse */
2973 return z;
2974 }
2975
2976 /*
2977 * Initialize the zone properly.
2978 */
2979
2980 /*
2981 * If the kernel is post lockdown, copy the zone name passed in.
2982 * Else simply maintain a pointer to the name string as it can only
2983 * be a core XNU zone (no unloadable kext exists before lockdown).
2984 */
2985 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2986 size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2987 char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
2988 strlcpy(buf, name, nsz);
2989 z->z_name = buf;
2990 } else {
2991 z->z_name = name;
2992 }
2993 /*
2994 * If zone_init() hasn't run yet, the permanent zones do not exist.
2995 * We can limp along without properly initialized stats for a while,
2996 * zone_init() will rebuild the missing stats when it runs.
2997 */
2998 if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
2999 z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
3000 }
3001
3002 alloc = zone_get_min_alloc_granule(size, flags);
3003
3004 if (flags & ZC_KALLOC_HEAP) {
3005 size_t rem = (alloc % size) / (alloc / size);
3006
3007 /*
3008 * Try to grow the element size and spread the elements more if the remaining
3009 * space is large enough.
3010 */
3011 size += rem & ~(KALLOC_MINALIGN - 1);
3012 }
3013
3014 z->pcpu_elem_size = z->z_elem_size = (uint16_t)size;
3015 z->alloc_pages = (uint16_t)atop(alloc);
3016 #if KASAN_ZALLOC
3017 z->kasan_redzone = redzone;
3018 if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
3019 z->kasan_fakestacks = true;
3020 }
3021 #endif
3022
3023 /*
3024 * Handle KPI flags
3025 */
3026 #if __LP64__
3027 if (flags & ZC_SEQUESTER) {
3028 z->va_sequester = true;
3029 }
3030 #endif
3031 /* ZC_CACHING applied after all configuration is done */
3032
3033 if (flags & ZC_PERCPU) {
3034 /*
3035 * ZC_CACHING is disallowed because it uses per-cpu zones for its
3036 * implementation and it would be circular. These allocations are
3037 * also quite expensive, so caching feels dangerous memory-wise too.
3038 *
3039 * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
3040 * pointer-sized allocations which poisoning doesn't support.
3041 */
3042 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING);
3043 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
3044 z->percpu = true;
3045 z->gzalloc_exempt = true;
3046 z->zfree_clear_mem = true;
3047 z->pcpu_elem_size *= zpercpu_count();
3048 }
3049 if (flags & ZC_ZFREE_CLEARMEM) {
3050 z->zfree_clear_mem = true;
3051 }
3052 if (flags & ZC_NOGC) {
3053 z->collectable = false;
3054 }
3055 if (flags & ZC_NOENCRYPT) {
3056 z->noencrypt = true;
3057 }
3058 if (flags & ZC_ALIGNMENT_REQUIRED) {
3059 z->alignment_required = true;
3060 }
3061 if (flags & ZC_NOGZALLOC) {
3062 z->gzalloc_exempt = true;
3063 }
3064 if (flags & ZC_NOCALLOUT) {
3065 z->no_callout = true;
3066 }
3067 if (flags & ZC_DESTRUCTIBLE) {
3068 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING);
3069 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
3070 z->destructible = true;
3071 }
3072
3073 /*
3074 * Handle Internal flags
3075 */
3076 if (flags & ZC_ALLOW_FOREIGN) {
3077 z->allows_foreign = true;
3078 }
3079 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3080 (flags & ZC_DATA_BUFFERS)) {
3081 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3082 }
3083 if (flags & ZC_KASAN_NOQUARANTINE) {
3084 z->kasan_noquarantine = true;
3085 }
3086 /* ZC_KASAN_NOREDZONE already handled */
3087
3088 /*
3089 * Then if there's extra tuning, do it
3090 */
3091 if (extra_setup) {
3092 extra_setup(z);
3093 }
3094
3095 /*
3096 * Configure debugging features
3097 */
3098 #if CONFIG_GZALLOC
3099 gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
3100 #endif
3101 #if ZONE_ENABLE_LOGGING
3102 if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
3103 /*
3104 * Check for and set up zone leak detection if requested via boot-args.
3105 * might set z->zone_logging
3106 */
3107 zone_setup_logging(z);
3108 }
3109 #endif /* ZONE_ENABLE_LOGGING */
3110 #if VM_MAX_TAG_ZONES
3111 if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
3112 static int tag_zone_index;
3113 vm_offset_t esize = zone_elem_size(z);
3114 z->tags = true;
3115 z->tags_inline = (((page_size + esize - 1) / esize) <=
3116 (sizeof(uint32_t) / sizeof(uint16_t)));
3117 z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
3118 assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
3119 }
3120 #endif
3121
3122 /*
3123 * Finally, fixup properties based on security policies, boot-args, ...
3124 */
3125 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3126 z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
3127 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3128 }
3129 #if __LP64__
3130 if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
3131 (flags & ZC_NOSEQUESTER) == 0 &&
3132 z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) {
3133 z->va_sequester = true;
3134 }
3135 #endif
3136 /*
3137 * Always clear zone elements smaller than a cacheline,
3138 * because it's pretty close to free.
3139 */
3140 if (size <= zp_min_size) {
3141 z->zfree_clear_mem = true;
3142 }
3143 if (zp_factor != 0 && !z->zfree_clear_mem) {
3144 z->zp_count = zone_poison_count_init(z);
3145 }
3146
3147 #if CONFIG_ZCACHE
3148 if ((flags & ZC_NOCACHING) == 0) {
3149 /*
3150 * Append kalloc heap name to zone name (if zone is used by kalloc)
3151 */
3152 char temp_zone_name[MAX_ZONE_NAME] = "";
3153 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3154
3155 /* Check if boot-arg specified it should have a cache */
3156 if (track_this_zone(temp_zone_name, cache_zone_name)) {
3157 flags |= ZC_CACHING;
3158 } else if (zcc_kalloc && z->kalloc_heap) {
3159 flags |= ZC_CACHING;
3160 }
3161 }
3162 if ((flags & ZC_CACHING) &&
3163 !z->tags && !z->zone_logging && !z->gzalloc_tracked) {
3164 zcache_init(z);
3165 }
3166 #endif /* CONFIG_ZCACHE */
3167
3168 lock_zone(z);
3169 z->z_self = z;
3170 unlock_zone(z);
3171
3172 return z;
3173 }
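/*
 * Illustrative call (a hypothetical client; the zone name, element type and
 * limit are made up):
 *
 *	zone_t example_zone = zone_create_ext("example.objects",
 *	    sizeof(struct example_object), ZC_ZFREE_CLEARMEM, ZONE_ID_ANY,
 *	    ^(zone_t z) {
 *		zone_set_exhaustible(z, 64 * 1024);
 *	});
 *
 * The extra_setup block runs while the zone is still initializing, which is
 * the only time tunables like zone_set_exhaustible() may be applied.
 */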
3174
3175 __startup_func
3176 void
3177 zone_create_startup(struct zone_create_startup_spec *spec)
3178 {
3179 *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
3180 spec->z_flags, spec->z_zid, spec->z_setup);
3181 }
3182
3183 /*
3184 * The first 4 fields of a zone_view and a zone alias, so that the zone_or_view_t
3185 * union works. Trust, but verify.
3186 */
3187 #define zalloc_check_zov_alias(f1, f2) \
3188 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
3189 zalloc_check_zov_alias(z_self, zv_zone);
3190 zalloc_check_zov_alias(z_stats, zv_stats);
3191 zalloc_check_zov_alias(z_name, zv_name);
3192 zalloc_check_zov_alias(z_views, zv_next);
3193 #undef zalloc_check_zov_alias
3194
3195 __startup_func
3196 void
3197 zone_view_startup_init(struct zone_view_startup_spec *spec)
3198 {
3199 struct kalloc_heap *heap = NULL;
3200 zone_view_t zv = spec->zv_view;
3201 zone_t z;
3202
3203 switch (spec->zv_heapid) {
3204 case KHEAP_ID_DEFAULT:
3205 heap = KHEAP_DEFAULT;
3206 break;
3207 case KHEAP_ID_DATA_BUFFERS:
3208 heap = KHEAP_DATA_BUFFERS;
3209 break;
3210 case KHEAP_ID_KEXT:
3211 heap = KHEAP_KEXT;
3212 break;
3213 default:
3214 heap = NULL;
3215 }
3216
3217 if (heap) {
3218 z = kalloc_heap_zone_for_size(heap, spec->zv_size);
3219 assert(z);
3220 } else {
3221 z = spec->zv_zone;
3222 assert(spec->zv_size <= zone_elem_size(z));
3223 }
3224
3225 zv->zv_zone = z;
3226 zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
3227 zv->zv_next = z->z_views;
3228 if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
3229 /*
3230 * count the raw view for zones not in a heap;
3231 * kalloc_heap_init() already counts it for its members.
3232 */
3233 zone_view_count += 2;
3234 } else {
3235 zone_view_count += 1;
3236 }
3237 z->z_views = zv;
3238 }
3239
3240 zone_t
3241 zone_create(
3242 const char *name,
3243 vm_size_t size,
3244 zone_create_flags_t flags)
3245 {
3246 return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
3247 }
3248
3249 zone_t
3250 zinit(
3251 vm_size_t size, /* the size of an element */
3252 vm_size_t max, /* maximum memory to use */
3253 vm_size_t alloc __unused, /* allocation size */
3254 const char *name) /* a name for the zone */
3255 {
3256 zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
3257 zone_set_max(z, max);
3258 return z;
3259 }
3260
3261 void
3262 zdestroy(zone_t z)
3263 {
3264 unsigned int zindex = zone_index(z);
3265
3266 lock_zone(z);
3267
3268 if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) {
3269 panic("zdestroy: Zone %s%s isn't destructible",
3270 zone_heap_name(z), z->z_name);
3271 }
3272
3273 if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv ||
3274 z->async_pending || z->waiting) {
3275 panic("zdestroy: Zone %s%s in an invalid state for destruction",
3276 zone_heap_name(z), z->z_name);
3277 }
3278
3279 #if !KASAN_ZALLOC
3280 /*
3281 * Unset the valid bit. We'll hit an assert failure on further operations
3282 * on this zone, until zinit() is called again.
3283 *
3284 * Leave the zone valid for KASan as we will see zfree's on quarantined free
3285 * elements even after the zone is destroyed.
3286 */
3287 z->z_self = NULL;
3288 #endif
3289 z->destroyed = true;
3290 unlock_zone(z);
3291
3292 /* Dump all the free elements */
3293 zone_drop_free_elements(z);
3294
3295 #if CONFIG_GZALLOC
3296 if (__improbable(z->gzalloc_tracked)) {
3297 /* If the zone is gzalloc managed dump all the elements in the free cache */
3298 gzalloc_empty_free_cache(z);
3299 }
3300 #endif
3301
3302 lock_zone(z);
3303
3304 while (!zone_pva_is_null(z->pages_sequester)) {
3305 struct zone_page_metadata *page_meta;
3306 vm_offset_t free_addr;
3307
3308 page_meta = zone_sequestered_page_get(z, &free_addr);
3309 unlock_zone(z);
3310 kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages));
3311 lock_zone(z);
3312 }
3313
3314 #if !KASAN_ZALLOC
3315 /* Assert that all counts are zero */
3316 if (z->countavail || z->countfree || zone_size_wired(z) ||
3317 z->allfree_page_count || z->sequester_page_count) {
3318 panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
3319 zone_heap_name(z), z->z_name);
3320 }
3321
3322 /* consistency check: make sure everything is indeed empty */
3323 assert(zone_pva_is_null(z->pages_any_free_foreign));
3324 assert(zone_pva_is_null(z->pages_all_used_foreign));
3325 assert(zone_pva_is_null(z->pages_all_free));
3326 assert(zone_pva_is_null(z->pages_intermediate));
3327 assert(zone_pva_is_null(z->pages_all_used));
3328 assert(zone_pva_is_null(z->pages_sequester));
3329 #endif
3330
3331 unlock_zone(z);
3332
3333 simple_lock(&all_zones_lock, &zone_locks_grp);
3334
3335 assert(!bitmap_test(zone_destroyed_bitmap, zindex));
3336 /* Mark the zone as empty in the bitmap */
3337 bitmap_set(zone_destroyed_bitmap, zindex);
3338 num_zones_in_use--;
3339 assert(num_zones_in_use > 0);
3340
3341 simple_unlock(&all_zones_lock);
3342 }
3343
3344 #pragma mark zone (re)fill, jetsam
3345
3346 /*
3347 * Dealing with zone allocations from the mach VM code.
3348 *
3349 * The implementation of the mach VM itself uses the zone allocator
3350 * for things like the vm_map_entry data structure. In order to prevent
3351 * an infinite recursion problem when adding more pages to a zone, zalloc
3352 * uses a replenish thread to refill the VM layer's zones before they have
3353 * too few remaining free entries. The reserved remaining free entries
3354 * guarantee that the VM routines can get entries from already mapped pages.
3355 *
3356 * In order for that to work, the number of allocations in the nested
3357 * case has to be bounded. There are currently 2 replenish zones, and
3358 * if each needs 1 element of each zone to add a new page to itself, that
3359 * gives us a minimum reserve of 2 elements.
3360 *
3361 * There is also a deadlock issue with the zone garbage collection thread,
3362 * or any thread that is trying to free zone pages. While holding
3363 * the kernel's map lock they may need to allocate new VM map entries, hence
3364 * we need enough reserve to allow them to get past the point of holding the
3365 * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
3366 * until the replenish threads can finish. Since there's only 1 GC thread at a time,
3367 * that adds a minimum of 1 to the reserve size.
3368 *
3369 * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (from ARM)
3370 * as the refill size on all platforms.
3371 *
3372 * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2,
3373 * zalloc_ext() will wake the replenish thread. The replenish thread runs
3374 * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
3375 * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4
3376 * elements left. Below that point only the replenish threads themselves and the GC
3377 * thread may continue to use from the reserve.
3378 */
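/*
 * Worked example (the element size is hypothetical): for a replenished zone
 * with 128-byte elements, prio_refill_count is ZONE_REPLENISH_TARGET / 128 =
 * 128 elements. Allocations wake the replenish thread once the free count
 * falls to about half of that (64), and only the replenish and GC threads may
 * keep drawing from the reserve below a quarter (32).
 */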
3379 static unsigned zone_replenish_loops;
3380 static unsigned zone_replenish_wakeups;
3381 static unsigned zone_replenish_wakeups_initiated;
3382 static unsigned zone_replenish_throttle_count;
3383
3384 #define ZONE_REPLENISH_TARGET (16 * 1024)
3385 static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */
3386 static unsigned zone_replenish_max_threads = 0;
3387
3388 LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock");
3389 LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp);
3390
3391 __abortlike
3392 static void
3393 zone_replenish_panic(zone_t zone, kern_return_t kr)
3394 {
3395 panic_include_zprint = TRUE;
3396 #if CONFIG_ZLEAKS
3397 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3398 panic_include_ztrace = TRUE;
3399 }
3400 #endif /* CONFIG_ZLEAKS */
3401 if (kr == KERN_NO_SPACE) {
3402 zone_t zone_largest = zone_find_largest();
3403 panic("zalloc: zone map exhausted while allocating from zone %s%s, "
3404 "likely due to memory leak in zone %s%s "
3405 "(%lu total bytes, %d elements allocated)",
3406 zone_heap_name(zone), zone->z_name,
3407 zone_heap_name(zone_largest), zone_largest->z_name,
3408 (unsigned long)zone_size_wired(zone_largest),
3409 zone_count_allocated(zone_largest));
3410 }
3411 panic("zalloc: %s%s (%d elements) retry fail %d",
3412 zone_heap_name(zone), zone->z_name,
3413 zone_count_allocated(zone), kr);
3414 }
3415
3416 static void
3417 zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously)
3418 {
3419 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3420 vm_offset_t space, alloc_size;
3421 uint32_t retry = 0;
3422 kern_return_t kr;
3423
3424 if (z->noencrypt) {
3425 kmaflags |= KMA_NOENCRYPT;
3426 }
3427 if (flags & Z_NOPAGEWAIT) {
3428 kmaflags |= KMA_NOPAGEWAIT;
3429 }
3430 if (z->permanent) {
3431 kmaflags |= KMA_PERMANENT;
3432 }
3433
3434 for (;;) {
3435 struct zone_page_metadata *page_meta = NULL;
3436
3437 /*
3438 * Try to allocate our regular chunk of pages,
3439 * unless the system is under massive pressure
3440 * and we're looking for more than 2 pages.
3441 */
3442 if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) {
3443 alloc_size = round_page(zone_elem_size(z));
3444 } else {
3445 alloc_size = ptoa(z->alloc_pages);
3446 page_meta = zone_sequestered_page_get(z, &space);
3447 }
3448
3449 unlock_zone(z);
3450
3451 #if CONFIG_ZLEAKS
3452 /*
3453 * Do the zone leak activation here because zleak_activate()
3454 * may block, and can't be done on the way out.
3455 */
3456 if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) {
3457 if (!(zleak_state & ZLEAK_STATE_ACTIVE) &&
3458 zone_submaps_approx_size() >= zleak_global_tracking_threshold) {
3459 kr = zleak_activate();
3460 if (kr != KERN_SUCCESS) {
3461 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3462 }
3463 }
3464 }
3465 #endif /* CONFIG_ZLEAKS */
3466
3467 /*
3468 * Trigger jetsams via the vm_pageout_garbage_collect thread if
3469 * we're running out of zone memory
3470 */
3471 if (is_zone_map_nearing_exhaustion()) {
3472 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3473 }
3474
3475 if (page_meta) {
3476 kr = zone_sequestered_page_populate(z, page_meta, space,
3477 alloc_size, kmaflags);
3478 } else {
3479 if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) {
3480 kmaflags |= KMA_KHEAP;
3481 }
3482 kr = kernel_memory_allocate(submap_for_zone(z),
3483 &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3484 }
3485
3486 #if !__LP64__
3487 if (kr == KERN_NO_SPACE && z->allows_foreign) {
3488 /*
3489 * For zones allowing foreign pages, fall back to the kernel map
3490 */
3491 kr = kernel_memory_allocate(kernel_map, &space,
3492 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3493 }
3494 #endif
3495
3496 if (kr == KERN_SUCCESS) {
3497 break;
3498 }
3499
3500 if (flags & Z_NOPAGEWAIT) {
3501 lock_zone(z);
3502 return;
3503 }
3504
3505 if (asynchronously) {
3506 assert_wait_timeout(&z->prio_refill_count,
3507 THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
3508 thread_block(THREAD_CONTINUE_NULL);
3509 } else if (++retry == 3) {
3510 zone_replenish_panic(z, kr);
3511 }
3512
3513 lock_zone(z);
3514 }
3515
3516 zcram_and_lock(z, space, alloc_size);
3517
3518 #if CONFIG_ZLEAKS
3519 if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
3520 if (!z->zleak_on &&
3521 zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
3522 z->zleak_on = true;
3523 }
3524 }
3525 #endif /* CONFIG_ZLEAKS */
3526 }
3527
3528 /*
3529 * High priority VM privileged thread used to asynchronously refill a given zone.
3530 * These are needed for data structures used by the lower level VM itself. The
3531 * replenish thread maintains a reserve of elements, so that the VM will never
3532 * block in the zone allocator.
3533 */
3534 __dead2
3535 static void
3536 zone_replenish_thread(void *_z, wait_result_t __unused wr)
3537 {
3538 zone_t z = _z;
3539
3540 current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV);
3541
3542 for (;;) {
3543 lock_zone(z);
3544 assert(z->z_self == z);
3545 assert(z->zone_replenishing);
3546 assert(z->prio_refill_count != 0);
3547
3548 while (z->countfree < z->prio_refill_count) {
3549 assert(!z->expanding_no_vm_priv);
3550 assert(!z->expanding_vm_priv);
3551
3552 zone_replenish_locked(z, Z_WAITOK, true);
3553
3554 assert(z->z_self == z);
3555 zone_replenish_loops++;
3556 }
3557
3558 /* Wakeup any potentially throttled allocations. */
3559 thread_wakeup(z);
3560
3561 assert_wait(&z->prio_refill_count, THREAD_UNINT);
3562
3563 /*
3564 * We finished refilling the zone, so decrement the active count
3565 * and wake up any waiting GC threads.
3566 */
3567 lck_spin_lock(&zone_replenish_lock);
3568 assert(zone_replenish_active > 0);
3569 if (--zone_replenish_active == 0) {
3570 thread_wakeup((event_t)&zone_replenish_active);
3571 }
3572 lck_spin_unlock(&zone_replenish_lock);
3573
3574 z->zone_replenishing = false;
3575 unlock_zone(z);
3576
3577 thread_block(THREAD_CONTINUE_NULL);
3578 zone_replenish_wakeups++;
3579 }
3580 }
3581
3582 void
3583 zone_prio_refill_configure(zone_t z)
3584 {
3585 thread_t th;
3586 kern_return_t tres;
3587
3588 lock_zone(z);
3589 assert(!z->prio_refill_count && !z->destructible);
3590 z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
3591 z->zone_replenishing = true;
3592 unlock_zone(z);
3593
3594 lck_spin_lock(&zone_replenish_lock);
3595 ++zone_replenish_max_threads;
3596 ++zone_replenish_active;
3597 lck_spin_unlock(&zone_replenish_lock);
3598 OSMemoryBarrier();
3599
3600 tres = kernel_thread_start_priority(zone_replenish_thread, z,
3601 MAXPRI_KERNEL, &th);
3602 if (tres != KERN_SUCCESS) {
3603 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
3604 }
3605
3606 thread_deallocate(th);
3607 }
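
/*
 * Illustrative sketch (not part of the build): how a VM-internal zone
 * might opt into asynchronous priority refill. The zone name and element
 * type below are hypothetical; real users are zones such as the VM map
 * entry zone, whose elements are needed to implement zalloc() itself.
 */
#if 0 /* example only */
struct vm_critical_elem {                       /* hypothetical element */
	uint64_t payload[8];
};

static zone_t vm_critical_zone;

static void
vm_critical_zone_startup(void)
{
	/* Create the zone, then give it a dedicated replenish thread. */
	vm_critical_zone = zone_create("vm.critical.example",
	    sizeof(struct vm_critical_elem), ZC_NOENCRYPT);
	zone_prio_refill_configure(vm_critical_zone);
}
#endif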
3608
3609 static void
3610 zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta,
3611 vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer)
3612 {
3613 const vm_size_t elem_size = zone_elem_size(zone);
3614 vm_offset_t left, right, head, base;
3615 vm_offset_t element;
3616
3617 left = ZONE_PAGE_FIRST_OFFSET(kind);
3618 right = size - ((size - left) % elem_size);
3619 head = 0;
3620 base = zone_meta_to_addr(meta, kind);
3621
3622 while (left < right) {
3623 if (zone_leaks_scan_enable || __improbable(zone->tags) ||
3624 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
3625 element = base + left;
3626 left += elem_size;
3627 } else {
3628 right -= elem_size;
3629 element = base + right;
3630 }
3631
3632 vm_offset_t *primary = (vm_offset_t *)element;
3633 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
3634
3635 *primary = *backup = head ^ zp_nopoison_cookie;
3636 head = element;
3637 }
3638
3639 meta->zm_freelist_offs = (uint16_t)(head - base);
3640 }
3641
3642 /*
3643 * Cram the given memory into the specified zone. Update the zone page count accordingly.
3644 */
3645 static void
3646 zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size)
3647 {
3648 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
3649 struct zone_page_metadata *meta;
3650 zone_addr_kind_t kind;
3651 uint32_t pg_count = (uint32_t)atop(size);
3652 uint32_t zindex = zone_index(zone);
3653 uint32_t free_count;
3654 uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
3655
3656 /* Basic sanity checks */
3657 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
3658 assert((newmem & PAGE_MASK) == 0);
3659 assert((size & PAGE_MASK) == 0);
3660
3661 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START,
3662 zindex, size);
3663
3664 kind = zone_addr_kind(newmem, size);
3665 #if DEBUG || DEVELOPMENT
3666 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
3667 kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone,
3668 zone_heap_name(zone), zone->z_name, (uintptr_t)newmem,
3669 kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size);
3670 }
3671 #endif /* DEBUG || DEVELOPMENT */
3672
3673 /*
3674 * Initialize the metadata for all pages. We don't need the zone lock
3675 * here because we are not manipulating any zone-related state yet.
3676 *
3677 * This includes randomizing the freelists as the metadata isn't
3678 * published yet.
3679 */
3680
3681 if (kind == ZONE_ADDR_NATIVE) {
3682 /*
3683 * We're being called by zfill,
3684 * zone_replenish_thread or vm_page_more_fictitious,
3685 *
3686 * each of which allocates either a single page or `alloc_pages`
3687 * pages at a time.
3688 */
3689 assert(pg_count <= zone->alloc_pages);
3690
3691 /*
3692 * Make sure the range of metadata entries we're about to init
3693 * have proper physical backing, then initialize them.
3694 */
3695 meta = zone_meta_from_addr(newmem, kind);
3696 zone_meta_populate(meta, meta + pg_count);
3697
3698 if (zone->permanent) {
3699 empty_freelist_offs = 0;
3700 }
3701
3702 meta[0] = (struct zone_page_metadata){
3703 .zm_index = zindex,
3704 .zm_page_count = pg_count,
3705 .zm_percpu = zone->percpu,
3706 .zm_freelist_offs = empty_freelist_offs,
3707 };
3708
3709 for (uint32_t i = 1; i < pg_count; i++) {
3710 meta[i] = (struct zone_page_metadata){
3711 .zm_index = zindex,
3712 .zm_page_count = i,
3713 .zm_percpu = zone->percpu,
3714 .zm_secondary_page = true,
3715 .zm_freelist_offs = empty_freelist_offs,
3716 };
3717 }
3718
3719 if (!zone->permanent) {
3720 zone_randomize_freelist(zone, meta,
3721 zone->percpu ? PAGE_SIZE : size, kind, entropy_buffer);
3722 }
3723 } else {
3724 if (!zone->allows_foreign || !from_foreign_range(newmem, size)) {
3725 panic("zcram_and_lock: foreign memory [%lx] being crammed is "
3726 "outside of foreign range", (uintptr_t)newmem);
3727 }
3728
3729 /*
3730 * We cannot support elements larger than page size for foreign
3731 * memory because we put metadata on the page itself for each
3732 * page of foreign memory.
3733 *
3734 * We need to do this in order to be able to reach the metadata
3735 * when any element is freed.
3736 */
3737 assert(!zone->percpu && !zone->permanent);
3738 assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata));
3739
3740 bzero((void *)newmem, size);
3741
3742 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3743 meta = (struct zone_page_metadata *)(newmem + offs);
3744 *meta = (struct zone_page_metadata){
3745 .zm_index = zindex,
3746 .zm_page_count = 1,
3747 .zm_freelist_offs = empty_freelist_offs,
3748 };
3749 meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE;
3750 zone_randomize_freelist(zone, meta, PAGE_SIZE, kind,
3751 entropy_buffer);
3752 }
3753 }
3754
3755 #if VM_MAX_TAG_ZONES
3756 if (__improbable(zone->tags)) {
3757 assert(kind == ZONE_ADDR_NATIVE && !zone->percpu);
3758 ztMemoryAdd(zone, newmem, size);
3759 }
3760 #endif /* VM_MAX_TAG_ZONES */
3761
3762 /*
3763 * Insert the initialized pages / metadatas into the right lists.
3764 */
3765
3766 lock_zone(zone);
3767 assert(zone->z_self == zone);
3768
3769 zone->page_count += pg_count;
3770 if (zone->page_count_hwm < zone->page_count) {
3771 zone->page_count_hwm = zone->page_count;
3772 }
3773 os_atomic_add(&zones_phys_page_count, pg_count, relaxed);
3774
3775 if (kind == ZONE_ADDR_NATIVE) {
3776 os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed);
3777 if (zone->permanent) {
3778 zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind);
3779 } else {
3780 zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind);
3781 zone->allfree_page_count += meta->zm_page_count;
3782 }
3783 free_count = zone_elem_count(zone, size, kind);
3784 zone->countfree += free_count;
3785 zone->countavail += free_count;
3786 } else {
3787 free_count = zone_elem_count(zone, PAGE_SIZE, kind);
3788 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3789 meta = (struct zone_page_metadata *)(newmem + offs);
3790 zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind);
3791 zone->countfree += free_count;
3792 zone->countavail += free_count;
3793 }
3794 }
3795
3796 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex);
3797 }
3798
3799 void
3800 zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
3801 {
3802 zcram_and_lock(zone, newmem, size);
3803 unlock_zone(zone);
3804 }
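
/*
 * Illustrative sketch (not part of the build): zfill() below demonstrates
 * the native path; this shows the foreign path, where early-boot code
 * crams page-aligned memory carved out of the range registered by
 * zone_foreign_mem_init() into a zone that allows foreign memory.
 * The zone and the base address are hypothetical.
 */
#if 0 /* example only */
static void
example_seed_foreign_zone(zone_t z, vm_offset_t foreign_base)
{
	/* Must be page aligned and lie inside the registered foreign range. */
	zcram(z, foreign_base, PAGE_SIZE);
}
#endif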
3805
3806 /*
3807 * Fill a zone with enough memory to contain at least nelem elements.
3808 * Return the number of elements actually put into the zone, which may
3809 * be more than the caller asked for since the memory allocation is
3810 * rounded up to the next zone allocation size.
3811 */
3812 int
3813 zfill(
3814 zone_t zone,
3815 int nelem)
3816 {
3817 kern_return_t kr;
3818 vm_offset_t memory;
3819
3820 vm_size_t alloc_size = ptoa(zone->alloc_pages);
3821 vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE);
3822 vm_size_t nalloc = 0, goal = MAX(0, nelem);
3823 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3824
3825 if (zone->noencrypt) {
3826 kmaflags |= KMA_NOENCRYPT;
3827 }
3828
3829 assert(!zone->allows_foreign && !zone->permanent);
3830
3831 /*
3832 * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
3833 * running out of zone memory
3834 */
3835 if (is_zone_map_nearing_exhaustion()) {
3836 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3837 }
3838
3839 if (zone->va_sequester) {
3840 lock_zone(zone);
3841
3842 do {
3843 struct zone_page_metadata *page_meta;
3844 page_meta = zone_sequestered_page_get(zone, &memory);
3845 if (NULL == page_meta) {
3846 break;
3847 }
3848 unlock_zone(zone);
3849
3850 kr = zone_sequestered_page_populate(zone, page_meta,
3851 memory, alloc_size, kmaflags);
3852 if (KERN_SUCCESS != kr) {
3853 goto out_nolock;
3854 }
3855
3856 zcram_and_lock(zone, memory, alloc_size);
3857 nalloc += nalloc_inc;
3858 } while (nalloc < goal);
3859
3860 unlock_zone(zone);
3861 }
3862
3863 out_nolock:
3864 while (nalloc < goal) {
3865 kr = kernel_memory_allocate(submap_for_zone(zone), &memory,
3866 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3867 if (kr != KERN_SUCCESS) {
3868 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
3869 __func__, (unsigned long)alloc_size);
3870 break;
3871 }
3872
3873 zcram(zone, memory, alloc_size);
3874 nalloc += nalloc_inc;
3875 }
3876
3877 return (int)nalloc;
3878 }
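
/*
 * Illustrative sketch (not part of the build): pre-filling a zone at
 * subsystem initialization so that early allocations never have to call
 * into the VM. The zone and count are hypothetical; note that zfill()
 * rounds the request up to whole allocation chunks and returns the
 * number of elements actually added.
 */
#if 0 /* example only */
static void
example_prefill(zone_t z)
{
	int added = zfill(z, 128);

	if (added < 128) {
		printf("example: only prefilled %d of 128 elements\n", added);
	}
}
#endif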
3879
3880 /*
3881 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3882 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3883 */
3884 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3885
3886 /*
3887 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3888 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3889 */
3890 TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
3891 ZONE_MAP_JETSAM_LIMIT_DEFAULT);
3892
3893 void
3894 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3895 {
3896 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3897 *current_size = ptoa_64(phys_pages);
3898 *capacity = zone_phys_mapped_max;
3899 }
3900
3901 void
3902 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3903 {
3904 zone_t largest_zone = zone_find_largest();
3905
3906 /*
3907 * Append kalloc heap name to zone name (if zone is used by kalloc)
3908 */
3909 snprintf(zone_name, zone_name_len, "%s%s",
3910 zone_heap_name(largest_zone), largest_zone->z_name);
3911
3912 *zone_size = zone_size_wired(largest_zone);
3913 }
3914
3915 boolean_t
3916 is_zone_map_nearing_exhaustion(void)
3917 {
3918 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3919 return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
3920 }
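
/*
 * Worked example (not part of the build) of the check above, with made-up
 * numbers: with a 2 GiB physical budget and the default 95% limit,
 * jetsams are triggered once more than roughly 1.9 GiB of zone memory is
 * mapped.
 */
#if 0 /* example only */
static bool
example_exhaustion_check(void)
{
	uint64_t budget_bytes  = 2ULL << 30;     /* zone_phys_mapped_max  */
	uint64_t mapped_bytes  = 1950ULL << 20;  /* ~1.90 GiB mapped      */
	unsigned limit_percent = 95;             /* zone_map_jetsam_limit */

	/* Same comparison as is_zone_map_nearing_exhaustion(): true here. */
	return mapped_bytes > (budget_bytes * limit_percent) / 100;
}
#endif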
3921
3922
3923 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3924
3925 /*
3926 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3927 * to walk through the jetsam priority bands and kill processes.
3928 */
3929 static void
3930 kill_process_in_largest_zone(void)
3931 {
3932 pid_t pid = -1;
3933 zone_t largest_zone = zone_find_largest();
3934
3935 printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
3936 ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max),
3937 ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)),
3938 (uint64_t)zone_submaps_approx_size(),
3939 (uint64_t)zone_range_size(&zone_info.zi_map_range),
3940 zone_map_jetsam_limit);
3941 printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
3942 largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
3943
3944 /*
3945 * We want to make sure we don't call this function from userspace;
3946 * otherwise we could end up trying to synchronously kill the process
3947 * whose context we're in, causing the system to hang.
3948 */
3949 assert(current_task() == kernel_task);
3950
3951 /*
3952 * If vm_object_zone is the largest, check to see if the number of
3953 * elements in vm_map_entry_zone is comparable.
3954 *
3955 * If so, consider vm_map_entry_zone as the largest. This lets us target
3956 * a specific process to jetsam to quickly recover from the zone map
3957 * bloat.
3958 */
3959 if (largest_zone == vm_object_zone) {
3960 unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
3961 unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
3962 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3963 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3964 largest_zone = vm_map_entry_zone;
3965 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
3966 (uintptr_t)zone_size_wired(largest_zone));
3967 }
3968 }
3969
3970 /* TODO: Extend this to check for the largest process in other zones as well. */
3971 if (largest_zone == vm_map_entry_zone) {
3972 pid = find_largest_process_vm_map_entries();
3973 } else {
3974 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
3975 "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
3976 largest_zone->z_name);
3977 }
3978 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3979 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3980 }
3981 }
3982
3983 #pragma mark zalloc module init
3984
3985 /*
3986 * Initialize the "zone of zones" which uses fixed memory allocated
3987 * earlier in memory initialization. zone_bootstrap is called
3988 * before zone_init.
3989 */
3990 __startup_func
3991 void
3992 zone_bootstrap(void)
3993 {
3994 /* Validate struct zone_page_metadata expectations */
3995 if ((1U << ZONE_PAGECOUNT_BITS) <
3996 atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) {
3997 panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
3998 }
3999
4000 /* Validate struct zone_packed_virtual_address expectations */
4001 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
4002 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
4003 panic("zone_pva_t can't pack a kernel page address in 31 bits");
4004 }
4005
4006 zpercpu_early_count = ml_early_cpu_max_number() + 1;
4007
4008 /* Set up zone element poisoning */
4009 zp_bootstrap();
4010
4011 random_bool_init(&zone_bool_gen);
4012
4013 /*
4014 * The KASAN quarantine for kalloc doesn't understand heaps
4015 * and trips the heap-confusion panics. At the end of the day,
4016 * these security measures duplicate work KASAN already does.
4017 *
4018 * On 32-bit kernels, these protections are simply too expensive.
4019 */
4020 #if !defined(__LP64__) || KASAN_ZALLOC
4021 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
4022 zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
4023 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
4024 #endif
4025
4026 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
4027
4028 #if CONFIG_ZCACHE
4029 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
4030 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
4031 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
4032 }
4033 #endif /* CONFIG_ZCACHE */
4034 }
4035
4036 #if __LP64__
4037 #if CONFIG_EMBEDDED
4038 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024)
4039 #else
4040 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024)
4041 #endif
4042 #endif /* __LP64__ */
4043
4044 #define SINGLE_GUARD 16384
4045 #define MULTI_GUARD (3 * SINGLE_GUARD)
4046
4047 #if __LP64__
4048 static inline vm_offset_t
4049 zone_restricted_va_max(void)
4050 {
4051 vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
4052 vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
4053
4054 return trunc_page(MIN(compressor_max, vm_page_max));
4055 }
4056 #endif
4057
4058 __startup_func
4059 static void
4060 zone_tunables_fixup(void)
4061 {
4062 if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
4063 zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
4064 }
4065 }
4066 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
4067
4068 __startup_func
4069 static vm_size_t
4070 zone_phys_size_max(void)
4071 {
4072 mach_vm_size_t zsize;
4073 vm_size_t zsizearg;
4074
4075 if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
4076 zsize = zsizearg * (1024ULL * 1024);
4077 } else {
4078 zsize = sane_size >> 2; /* Set target zone size as 1/4 of physical memory */
4079 #if defined(__LP64__)
4080 zsize += zsize >> 1;
4081 #endif /* __LP64__ */
4082 }
4083
4084 if (zsize < CONFIG_ZONE_MAP_MIN) {
4085 zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */
4086 }
4087 if (zsize > sane_size >> 1) {
4088 zsize = sane_size >> 1; /* Clamp to half of RAM max */
4089 }
4090 if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
4091 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
4092 vm_size_t orig_zsize = zsize;
4093 zsize = ZONE_MAP_MAX;
4094 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
4095 (uintptr_t)orig_zsize, (uintptr_t)zsize);
4096 }
4097
4098 assert((vm_size_t) zsize == zsize);
4099 return (vm_size_t)trunc_page(zsize);
4100 }
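
/*
 * Worked example (not part of the build) of the sizing policy above, with
 * made-up numbers and the CONFIG_ZONE_MAP_MIN / ZONE_MAP_MAX clamps
 * omitted: on an LP64 machine with 8 GiB of usable memory and no "zsize"
 * boot-arg, the target is 1/4 of RAM plus half of that again, i.e.
 * 2 GiB + 1 GiB = 3 GiB, which is below the half-of-RAM cap of 4 GiB.
 */
#if 0 /* example only */
static uint64_t
example_zone_phys_budget(uint64_t ram_bytes)
{
	uint64_t zsize = ram_bytes >> 2;        /* 1/4 of physical memory */

	zsize += zsize >> 1;                    /* +50% on LP64 */
	if (zsize > ram_bytes >> 1) {
		zsize = ram_bytes >> 1;         /* never more than half of RAM */
	}
	return zsize;
}
#endif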
4101
4102 __startup_func
4103 static struct zone_map_range
4104 zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
4105 {
4106 struct zone_map_range r;
4107 kern_return_t kr;
4108
4109 if (guard) {
4110 vm_map_offset_t addr = *submap_min;
4111 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4112
4113 vmk_flags.vmkf_permanent = TRUE;
4114 kr = vm_map_enter(kernel_map, &addr, size, 0,
4115 VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
4116 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
4117 *submap_min = (vm_offset_t)addr;
4118 } else {
4119 kr = kernel_memory_allocate(kernel_map, submap_min, size,
4120 0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
4121 }
4122 if (kr != KERN_SUCCESS) {
4123 panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
4124 (uintptr_t)*submap_min, (size_t)size, kr);
4125 }
4126
4127 r.min_address = *submap_min;
4128 *submap_min += size;
4129 r.max_address = *submap_min;
4130
4131 return r;
4132 }
4133
4134 __startup_func
4135 static void
4136 zone_submap_init(
4137 vm_offset_t *submap_min,
4138 unsigned idx,
4139 uint64_t zone_sub_map_numer,
4140 uint64_t *remaining_denom,
4141 vm_offset_t *remaining_size,
4142 vm_size_t guard_size)
4143 {
4144 vm_offset_t submap_start, submap_end;
4145 vm_size_t submap_size;
4146 vm_map_t submap;
4147 kern_return_t kr;
4148
4149 submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
4150 *remaining_denom);
4151 submap_start = *submap_min;
4152 submap_end = submap_start + submap_size;
4153
4154 #if defined(__LP64__)
4155 if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
4156 vm_offset_t restricted_va_max = zone_restricted_va_max();
4157 if (submap_end > restricted_va_max) {
4158 #if DEBUG || DEVELOPMENT
4159 printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
4160 (size_t)(restricted_va_max - submap_start) >> 20,
4161 (size_t)submap_size >> 20);
4162 #endif /* DEBUG || DEVELOPMENT */
4163 guard_size += submap_end - restricted_va_max;
4164 *remaining_size -= submap_end - restricted_va_max;
4165 submap_end = restricted_va_max;
4166 submap_size = restricted_va_max - submap_start;
4167 }
4168
4169 vm_packing_verify_range("vm_compressor",
4170 submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
4171 vm_packing_verify_range("vm_page",
4172 submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
4173 }
4174 #endif /* defined(__LP64__) */
4175
4176 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4177 vmk_flags.vmkf_permanent = TRUE;
4178 kr = kmem_suballoc(kernel_map, submap_min, submap_size,
4179 FALSE, VM_FLAGS_FIXED, vmk_flags,
4180 VM_KERN_MEMORY_ZONE, &submap);
4181 if (kr != KERN_SUCCESS) {
4182 panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
4183 idx, (void *)submap_start, (void *)submap_end, kr);
4184 }
4185
4186 #if DEBUG || DEVELOPMENT
4187 printf("zone_init: submap[%d] %p:%p (%zuM)\n",
4188 idx, (void *)submap_start, (void *)submap_end,
4189 (size_t)submap_size >> 20);
4190 #endif /* DEBUG || DEVELOPMENT */
4191
4192 zone_submaps[idx] = submap;
4193 *submap_min = submap_end;
4194 *remaining_size -= submap_size;
4195 *remaining_denom -= zone_sub_map_numer;
4196
4197 zone_init_allocate_va(submap_min, guard_size, true);
4198 }
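
/*
 * Worked example (not part of the build) of the proportional carving done
 * above, using the default 20/40/40 weights and a 128 GiB virtual budget:
 * the VA-restricted submap gets 20/100 of the budget (25.6 GiB), the
 * general submap then gets 40/80 of what remains (51.2 GiB), and the
 * bag-of-bytes submap gets the rest (51.2 GiB). Each step re-derives its
 * share from the remaining size and the remaining denominator.
 */
#if 0 /* example only */
static void
example_submap_split(void)
{
	uint64_t remaining_size  = 128ULL << 30;
	uint64_t remaining_denom = 20 + 40 + 40;
	const uint64_t weights[] = { 20, 40, 40 };

	for (unsigned i = 0; i < 3; i++) {
		uint64_t share = weights[i] * remaining_size / remaining_denom;

		remaining_size  -= share;
		remaining_denom -= weights[i];
	}
}
#endif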
4199
4200 /*
4201 * Global initialization of the zone allocator. Runs after zone_bootstrap.
4202 */
4203 __startup_func
4204 static void
4205 zone_init(void)
4206 {
4207 vm_size_t zone_meta_size;
4208 vm_size_t zone_map_size;
4209 vm_size_t remaining_size;
4210 vm_offset_t submap_min = 0;
4211
4212 if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
4213 zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
4214 } else {
4215 zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
4216 }
4217 zone_phys_mapped_max = zone_phys_size_max();
4218
4219 #if __LP64__
4220 zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
4221 #else
4222 zone_map_size = zone_phys_mapped_max;
4223 #endif
4224 zone_meta_size = round_page(atop(zone_map_size) *
4225 sizeof(struct zone_page_metadata));
4226
4227 /*
4228 * Zone "map" setup:
4229 *
4230 * [ VA_RESTRICTED ] <-- LP64 only
4231 * [ SINGLE_GUARD ] <-- LP64 only
4232 * [ meta ]
4233 * [ SINGLE_GUARD ]
4234 * [ map<i> ] \ for each extra map
4235 * [ MULTI_GUARD ] /
4236 */
4237 remaining_size = zone_map_size;
4238 #if defined(__LP64__)
4239 remaining_size -= SINGLE_GUARD;
4240 #endif
4241 remaining_size -= zone_meta_size + SINGLE_GUARD;
4242 remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
4243 Z_SUBMAP_IDX_GENERAL_MAP + 1);
4244
4245 #if VM_MAX_TAG_ZONES
4246 if (zone_tagging_on) {
4247 zone_tagging_init(zone_map_size);
4248 }
4249 #endif
4250
4251 uint64_t remaining_denom = 0;
4252 uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
4253 #ifdef __LP64__
4254 [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
4255 #endif /* defined(__LP64__) */
4256 [Z_SUBMAP_IDX_GENERAL_MAP] = 40,
4257 [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP] = 40,
4258 };
4259
4260 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
4261 #if DEBUG || DEVELOPMENT
4262 char submap_name[MAX_SUBMAP_NAME];
4263 snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
4264 PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
4265 #endif
4266 remaining_denom += zone_sub_map_numer[idx];
4267 }
4268
4269 /*
4270 * And now allocate the various pieces of VA and submaps.
4271 *
4272 * Make a first allocation of contiguous VA that we'll deallocate,
4273 * then carve out memory in that range again linearly.
4274 * The kernel is still single-threaded at this stage.
4275 */
4276
4277 struct zone_map_range *map_range = &zone_info.zi_map_range;
4278
4279 *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
4280 submap_min = map_range->min_address;
4281 kmem_free(kernel_map, submap_min, zone_map_size);
4282
4283 #if defined(__LP64__)
4284 /*
4285 * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
4286 * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
4287 */
4288 zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
4289 zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
4290 &remaining_size, SINGLE_GUARD);
4291 #endif /* defined(__LP64__) */
4292
4293 /*
4294 * Allocate metadata array
4295 */
4296 zone_info.zi_meta_range =
4297 zone_init_allocate_va(&submap_min, zone_meta_size, true);
4298 zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);
4299
4300 zone_info.zi_array_base =
4301 (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
4302 zone_pva_from_addr(map_range->min_address).packed_address;
4303
4304 /*
4305 * Allocate other submaps
4306 */
4307 for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
4308 zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
4309 &remaining_denom, &remaining_size, MULTI_GUARD);
4310 }
4311
4312 vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
4313 zone_info.zi_general_range.min_address = vm_map_min(general_map);
4314 zone_info.zi_general_range.max_address = vm_map_max(general_map);
4315
4316 assert(submap_min == map_range->max_address);
4317
4318 #if CONFIG_GZALLOC
4319 gzalloc_init(zone_map_size);
4320 #endif
4321
4322 zone_create_flags_t kma_flags = ZC_NOCACHING |
4323 ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
4324 ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
4325
4326 (void)zone_create_ext("vm.permanent", 1, kma_flags,
4327 ZONE_ID_PERMANENT, ^(zone_t z){
4328 z->permanent = true;
4329 z->z_elem_size = 1;
4330 z->pcpu_elem_size = 1;
4331 #if defined(__LP64__)
4332 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4333 #endif
4334 });
4335 (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
4336 ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
4337 z->permanent = true;
4338 z->z_elem_size = 1;
4339 z->pcpu_elem_size = zpercpu_count();
4340 #if defined(__LP64__)
4341 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4342 #endif
4343 });
4344
4345 /*
4346 * Now fix the zones that are missing their zone stats.
4347 * We don't really know if zfree()s happened, so our stats
4348 * are slightly off for early boot. ¯\_(ツ)_/¯
4349 */
4350 zone_index_foreach(idx) {
4351 zone_t tz = &zone_array[idx];
4352
4353 if (tz->z_self) {
4354 zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
4355
4356 zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
4357 (tz->countavail - tz->countfree) *
4358 zone_elem_size(tz);
4359 assert(tz->z_stats == NULL);
4360 tz->z_stats = zs;
4361 #if ZONE_ENABLE_LOGGING
4362 if (tz->zone_logging && !tz->zlog_btlog) {
4363 zone_enable_logging(tz);
4364 }
4365 #endif
4366 }
4367 }
4368
4369 #if CONFIG_ZLEAKS
4370 /*
4371 * Initialize the zone leak monitor
4372 */
4373 zleak_init(zone_map_size);
4374 #endif /* CONFIG_ZLEAKS */
4375
4376 #if VM_MAX_TAG_ZONES
4377 if (zone_tagging_on) {
4378 vm_allocation_zones_init();
4379 }
4380 #endif
4381 }
4382 STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
4383
4384 __startup_func
4385 static void
4386 zone_set_foreign_range(
4387 vm_offset_t range_min,
4388 vm_offset_t range_max)
4389 {
4390 zone_info.zi_foreign_range.min_address = range_min;
4391 zone_info.zi_foreign_range.max_address = range_max;
4392 }
4393
4394 __startup_func
4395 vm_offset_t
4396 zone_foreign_mem_init(vm_size_t size)
4397 {
4398 vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);
4399 zone_set_foreign_range(mem, mem + size);
4400 return mem;
4401 }
4402
4403 #pragma mark zalloc
4404
4405 #if KASAN_ZALLOC
4406 /*
4407 * Called from zfree() to add the element being freed to the KASan quarantine.
4408 *
4409 * Returns true if the newly-freed element made it into the quarantine without
4410 * displacing another, false otherwise. In the latter case, addrp points to the
4411 * address of the displaced element, which will be freed by the zone.
4412 */
4413 static bool
4414 kasan_quarantine_freed_element(
4415 zone_t *zonep, /* the zone the element is being freed to */
4416 void **addrp) /* address of the element being freed */
4417 {
4418 zone_t zone = *zonep;
4419 void *addr = *addrp;
4420
4421 /*
4422 * Resize back to the real allocation size and hand off to the KASan
4423 * quarantine. `addr` may then point to a different allocation, if the
4424 * current element replaced another in the quarantine. The zone then
4425 * takes ownership of the swapped out free element.
4426 */
4427 vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
4428 vm_size_t sz = usersz;
4429
4430 if (addr && zone->kasan_redzone) {
4431 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
4432 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
4433 assert(sz == zone_elem_size(zone));
4434 }
4435 if (addr && !zone->kasan_noquarantine) {
4436 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
4437 if (!addr) {
4438 return TRUE;
4439 }
4440 }
4441 if (addr && zone->kasan_noquarantine) {
4442 kasan_unpoison(addr, zone_elem_size(zone));
4443 }
4444 *addrp = addr;
4445 return FALSE;
4446 }
4447
4448 #endif /* KASAN_ZALLOC */
4449
4450 static inline bool
4451 zone_needs_async_refill(zone_t zone)
4452 {
4453 if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
4454 return false;
4455 }
4456
4457 return zone->expandable || zone->page_count < zone->page_count_max;
4458 }
4459
4460 __attribute__((noinline))
4461 static void
4462 zone_refill_synchronously_locked(
4463 zone_t zone,
4464 zalloc_flags_t flags)
4465 {
4466 thread_t thr = current_thread();
4467 bool set_expanding_vm_priv = false;
4468 zone_pva_t orig = zone->pages_intermediate;
4469
4470 while ((flags & Z_NOWAIT) == 0 && (zone->permanent
4471 ? zone_pva_is_equal(zone->pages_intermediate, orig)
4472 : zone->countfree == 0)) {
4473 /*
4474 * zone is empty, try to expand it
4475 *
4476 * Note that we now allow up to 2 threads (1 vm_privileged and
4477 * 1 non-vm_privileged) to expand the zone concurrently...
4478 *
4479 * This is necessary to keep vm_privileged threads
4480 * running critical code needed to continue
4481 * compressing/swapping pages (i.e. making new free pages) from
4482 * stalling behind non-vm_privileged threads waiting to acquire
4483 * free pages when the vm_page_free_count is below the
4484 * vm_page_free_reserved limit.
4485 */
4486 if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
4487 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
4488 /*
4489 * This is a non-vm_privileged thread and a non-vm_privileged or
4490 * a vm_privileged thread is already expanding the zone...
4491 * OR
4492 * this is a vm_privileged thread and a vm_privileged thread is
4493 * already expanding the zone...
4494 *
4495 * In either case wait for a thread to finish, then try again.
4496 */
4497 zone->waiting = true;
4498 assert_wait(zone, THREAD_UNINT);
4499 unlock_zone(zone);
4500 thread_block(THREAD_CONTINUE_NULL);
4501 lock_zone(zone);
4502 continue;
4503 }
4504
4505 if (zone->page_count >= zone->page_count_max) {
4506 if (zone->exhaustible) {
4507 break;
4508 }
4509 if (zone->expandable) {
4510 /*
4511 * If we're expandable, just don't go through this again.
4512 */
4513 zone->page_count_max = ~0u;
4514 } else {
4515 unlock_zone(zone);
4516
4517 panic_include_zprint = true;
4518 #if CONFIG_ZLEAKS
4519 if (zleak_state & ZLEAK_STATE_ACTIVE) {
4520 panic_include_ztrace = true;
4521 }
4522 #endif /* CONFIG_ZLEAKS */
4523 panic("zalloc: zone \"%s\" empty.", zone->z_name);
4524 }
4525 }
4526
4527 /*
4528 * It is possible that a BG thread is refilling/expanding the zone
4529 * and gets pre-empted during that operation. That blocks all other
4530 * threads from making progress, leading to a watchdog timeout. To
4531 * avoid that, boost the thread priority using the rwlock boost.
4532 */
4533 set_thread_rwlock_boost();
4534
4535 if ((thr->options & TH_OPT_VMPRIV)) {
4536 zone->expanding_vm_priv = true;
4537 set_expanding_vm_priv = true;
4538 } else {
4539 zone->expanding_no_vm_priv = true;
4540 }
4541
4542 zone_replenish_locked(zone, flags, false);
4543
4544 if (set_expanding_vm_priv == true) {
4545 zone->expanding_vm_priv = false;
4546 } else {
4547 zone->expanding_no_vm_priv = false;
4548 }
4549
4550 if (zone->waiting) {
4551 zone->waiting = false;
4552 thread_wakeup(zone);
4553 }
4554 clear_thread_rwlock_boost();
4555
4556 if (zone->countfree == 0) {
4557 assert(flags & Z_NOPAGEWAIT);
4558 break;
4559 }
4560 }
4561
4562 if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
4563 zone_needs_async_refill(zone) && !vm_pool_low()) {
4564 zone->async_pending = true;
4565 unlock_zone(zone);
4566 thread_call_enter(&call_async_alloc);
4567 lock_zone(zone);
4568 assert(zone->z_self == zone);
4569 }
4570 }
4571
4572 __attribute__((noinline))
4573 static void
4574 zone_refill_asynchronously_locked(zone_t zone)
4575 {
4576 uint32_t min_free = zone->prio_refill_count / 2;
4577 uint32_t resv_free = zone->prio_refill_count / 4;
4578 thread_t thr = current_thread();
4579
4580 /*
4581 * Nothing to do if there are plenty of elements.
4582 */
4583 while (zone->countfree <= min_free) {
4584 /*
4585 * Wakeup the replenish thread if not running.
4586 */
4587 if (!zone->zone_replenishing) {
4588 lck_spin_lock(&zone_replenish_lock);
4589 assert(zone_replenish_active < zone_replenish_max_threads);
4590 ++zone_replenish_active;
4591 lck_spin_unlock(&zone_replenish_lock);
4592 zone->zone_replenishing = true;
4593 zone_replenish_wakeups_initiated++;
4594 thread_wakeup(&zone->prio_refill_count);
4595 }
4596
4597 /*
4598 * We'll let VM_PRIV threads continue to allocate until the
4599 * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
4600 * may continue.
4601 *
4602 * TH_OPT_ZONE_PRIV threads are the GC thread and the replenish threads themselves.
4603 * Replenish threads *need* to use the reserve. GC threads need to
4604 * get through the current allocation, but then will wait at a higher
4605 * level after they've dropped any locks which would deadlock the
4606 * replenish thread.
4607 */
4608 if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
4609 (thr->options & TH_OPT_ZONE_PRIV)) {
4610 break;
4611 }
4612
4613 /*
4614 * Wait for the replenish threads to add more elements for us to allocate from.
4615 */
4616 zone_replenish_throttle_count++;
4617 unlock_zone(zone);
4618 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
4619 thread_block(THREAD_CONTINUE_NULL);
4620 lock_zone(zone);
4621
4622 assert(zone->z_self == zone);
4623 }
4624
4625 /*
4626 * If we're here because of zone_gc(), we didn't wait for
4627 * zone_replenish_thread to finish. So we need to ensure that
4628 * we will successfully grab an element.
4629 *
4630 * This path only applies to zones that have a replenish thread configured;
4631 * the value of (refill_level / 2) in the previous bit of code should have
4632 * given us headroom even though this thread didn't wait.
4633 */
4634 if (thr->options & TH_OPT_ZONE_PRIV) {
4635 assert(zone->countfree != 0);
4636 }
4637 }
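
/*
 * Worked example (not part of the build) of the thresholds above, with a
 * made-up reserve: if prio_refill_count is 400 elements, the replenish
 * thread is woken once countfree drops to 200 (min_free = 400 / 2),
 * VM_PRIV threads keep allocating down to 100 elements (resv_free =
 * 400 / 4), and below that only TH_OPT_ZONE_PRIV threads (the GC and
 * replenish threads themselves) may dip into the reserve while everyone
 * else throttles on the 1 ms wait above.
 */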
4638
4639 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
4640 __attribute__((noinline))
4641 static void
4642 zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
4643 {
4644 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
4645 unsigned int numsaved = 0;
4646
4647 #if ZONE_ENABLE_LOGGING
4648 if (DO_LOGGING(zone)) {
4649 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4650 __builtin_frame_address(0), NULL);
4651 btlog_add_entry(zone->zlog_btlog, (void *)addr,
4652 ZOP_ALLOC, (void **)zbt, numsaved);
4653 }
4654 #endif
4655
4656 #if CONFIG_ZLEAKS
4657 /*
4658 * Zone leak detection: capture a backtrace every zleak_sample_factor
4659 * allocations in this zone.
4660 */
4661 if (__improbable(zone->zleak_on)) {
4662 if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
4663 /* Avoid backtracing twice if zone logging is on */
4664 if (numsaved == 0) {
4665 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4666 __builtin_frame_address(1), NULL);
4667 }
4668 /* Sampling can fail if another sample is happening at the same time in a different zone. */
4669 if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
4670 /* If it failed, roll back the counter so we sample the next allocation instead. */
4671 zone->zleak_capture = zleak_sample_factor;
4672 }
4673 }
4674 }
4675
4676 if (__improbable(zone_leaks_scan_enable &&
4677 !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
4678 unsigned int count, idx;
4679 /* Fill element, from tail, with backtrace in reverse order */
4680 if (numsaved == 0) {
4681 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4682 __builtin_frame_address(1), NULL);
4683 }
4684 count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
4685 if (count >= numsaved) {
4686 count = numsaved - 1;
4687 }
4688 for (idx = 0; idx < count; idx++) {
4689 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
4690 }
4691 }
4692 #endif /* CONFIG_ZLEAKS */
4693 }
4694
4695 static inline bool
4696 zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
4697 {
4698 #if ZONE_ENABLE_LOGGING
4699 if (DO_LOGGING(zone)) {
4700 return true;
4701 }
4702 #endif
4703 #if CONFIG_ZLEAKS
4704 /*
4705 * Zone leak detection: capture a backtrace every zleak_sample_factor
4706 * allocations in this zone.
4707 */
4708 if (zone->zleak_on) {
4709 return true;
4710 }
4711 if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
4712 return true;
4713 }
4714 #endif /* CONFIG_ZLEAKS */
4715 return false;
4716 }
4717 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
4718 #if ZONE_ENABLE_LOGGING
4719
4720 __attribute__((noinline))
4721 static void
4722 zfree_log_trace(zone_t zone, vm_offset_t addr)
4723 {
4724 /*
4725 * See if we're doing logging on this zone.
4726 *
4727 * There are two styles of logging used depending on
4728 * whether we're trying to catch a leak or corruption.
4729 */
4730 if (__improbable(DO_LOGGING(zone))) {
4731 if (corruption_debug_flag) {
4732 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4733 unsigned int numsaved;
4734 /*
4735 * We're logging to catch a corruption.
4736 *
4737 * Add a record of this zfree operation to log.
4738 */
4739 numsaved = backtrace_frame(zbt, MAX_ZTRACE_DEPTH,
4740 __builtin_frame_address(1), NULL);
4741 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
4742 (void **)zbt, numsaved);
4743 } else {
4744 /*
4745 * We're logging to catch a leak.
4746 *
4747 * Remove any record we might have for this element
4748 * since it's being freed. Note that we may not find it
4749 * if the buffer overflowed and that's OK.
4750 *
4751 * Since the log is of a limited size, old records get
4752 * overwritten if there are more zallocs than zfrees.
4753 */
4754 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
4755 }
4756 }
4757 }
4758 #endif /* ZONE_ENABLE_LOGGING */
4759
4760 /*
4761 * Removes an element from the zone's free list; the caller must ensure the zone has free elements.
4762 * Verifies that the next-pointer and backup next-pointer are intact,
4763 * and verifies that a poisoned element hasn't been modified.
4764 */
4765 vm_offset_t
4766 zalloc_direct_locked(
4767 zone_t zone,
4768 zalloc_flags_t flags __unused,
4769 vm_size_t waste __unused)
4770 {
4771 struct zone_page_metadata *page_meta;
4772 zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
4773 vm_offset_t element, page, validate_bit = 0;
4774
4775 /* pick the page queue to allocate from; the zone must not be empty */
4776 if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
4777 kind = ZONE_ADDR_FOREIGN;
4778 page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
4779 page = (vm_offset_t)page_meta;
4780 } else if (!zone_pva_is_null(zone->pages_intermediate)) {
4781 page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
4782 page = zone_pva_to_addr(zone->pages_intermediate);
4783 } else if (!zone_pva_is_null(zone->pages_all_free)) {
4784 page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
4785 page = zone_pva_to_addr(zone->pages_all_free);
4786 if (os_sub_overflow(zone->allfree_page_count,
4787 page_meta->zm_page_count, &zone->allfree_page_count)) {
4788 zone_accounting_panic(zone, "allfree_page_count wrap-around");
4789 }
4790 } else {
4791 zone_accounting_panic(zone, "countfree corruption");
4792 }
4793
4794 if (!zone_has_index(zone, page_meta->zm_index)) {
4795 zone_page_metadata_index_confusion_panic(zone, page, page_meta);
4796 }
4797
4798 element = zone_page_meta_get_freelist(zone, page_meta, page);
4799
4800 vm_offset_t *primary = (vm_offset_t *) element;
4801 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
4802
4803 /*
4804 * Since the primary next pointer is xor'ed with zp_nopoison_cookie
4805 * for obfuscation, recover the original value.
4806 */
4807 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
4808 vm_offset_t next_element_primary = *primary;
4809 vm_offset_t next_element_backup = *backup;
4810
4811 /*
4812 * backup_ptr_mismatch_panic will determine what next_element
4813 * should have been, and print it appropriately
4814 */
4815 if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
4816 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4817 }
4818
4819 /* Check the backup pointer for the regular cookie */
4820 if (__improbable(next_element_primary != next_element_backup)) {
4821 /* Check for the poisoned cookie instead */
4822 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
4823 /* Neither cookie is valid, corruption has occurred */
4824 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4825 }
4826
4827 /*
4828 * Element was marked as poisoned, so check its integrity before using it.
4829 */
4830 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4831 } else if (zone->zfree_clear_mem) {
4832 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4833 }
4834
4835 /* Remove this element from the free list */
4836 zone_page_meta_set_freelist(page_meta, page, next_element);
4837
4838 if (kind == ZONE_ADDR_FOREIGN) {
4839 if (next_element == 0) {
4840 /* last foreign element allocated on page, move to all_used_foreign */
4841 zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
4842 }
4843 } else if (next_element == 0) {
4844 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
4845 } else if (page_meta->zm_alloc_count == 0) {
4846 /* remove from free, move to intermediate */
4847 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
4848 }
4849
4850 if (os_add_overflow(page_meta->zm_alloc_count, 1,
4851 &page_meta->zm_alloc_count)) {
4852 /*
4853 * This will not catch a lot of errors; the proper check
4854 * would be against the number of elements this run should
4855 * have, which is expensive to count.
4856 *
4857 * But zm_alloc_count is a 16-bit counter that could
4858 * theoretically be valuable to wrap around,
4859 * so catch this.
4860 */
4861 zone_page_meta_accounting_panic(zone, page_meta,
4862 "zm_alloc_count overflow");
4863 }
4864 if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
4865 zone_accounting_panic(zone, "countfree wrap-around");
4866 }
4867
4868 #if VM_MAX_TAG_ZONES
4869 if (__improbable(zone->tags)) {
4870 vm_tag_t tag = zalloc_flags_get_tag(flags);
4871 // set the tag with b0 clear so the block remains in use
4872 ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
4873 vm_tag_update_zone_size(tag, zone->tag_zone_index,
4874 zone_elem_size(zone), waste);
4875 }
4876 #endif /* VM_MAX_TAG_ZONES */
4877 #if KASAN_ZALLOC
4878 if (zone->percpu) {
4879 zpercpu_foreach_cpu(i) {
4880 kasan_poison_range(element + ptoa(i),
4881 zone_elem_size(zone), ASAN_VALID);
4882 }
4883 } else {
4884 kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
4885 }
4886 #endif
4887
4888 return element | validate_bit;
4889 }
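
/*
 * Illustrative sketch (not part of the build) of the freelist-pointer
 * checks performed above: a free element stores the next pointer twice,
 * once at its start (xor'ed with zp_nopoison_cookie) and once at its end
 * (xor'ed with zp_nopoison_cookie, or with zp_poisoned_cookie when the
 * element body was poisoned). The helper below is a simplified
 * restatement of that validation, not a drop-in replacement.
 */
#if 0 /* example only */
static bool
example_check_free_element(const vm_offset_t *primary,
    const vm_offset_t *backup, vm_offset_t *out_next, bool *out_poisoned)
{
	vm_offset_t next = *primary ^ zp_nopoison_cookie;

	if (*primary == *backup) {
		/* Backup carries the plain cookie: element was not poisoned. */
		*out_poisoned = false;
	} else if (next == (*backup ^ zp_poisoned_cookie)) {
		/* Backup carries the poisoned cookie: validate the body. */
		*out_poisoned = true;
	} else {
		/* Neither cookie fits: the freelist was corrupted. */
		return false;
	}
	*out_next = next;
	return true;
}
#endif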
4890
4891 /*
4892 * zalloc returns an element from the specified zone.
4893 *
4894 * The function is noinline when zlog can be used so that the backtracing can
4895 * reliably skip the zalloc_ext() and zalloc_log_or_trace_leaks()
4896 * boring frames.
4897 */
4898 #if ZONE_ENABLE_LOGGING
4899 __attribute__((noinline))
4900 #endif
4901 void *
4902 zalloc_ext(
4903 zone_t zone,
4904 zone_stats_t zstats,
4905 zalloc_flags_t flags,
4906 vm_size_t waste)
4907 {
4908 vm_offset_t addr = 0;
4909 vm_size_t elem_size = zone_elem_size(zone);
4910
4911 /*
4912 * KASan uses zalloc() for fakestack, which can be called anywhere.
4913 * However, we make sure these calls can never block.
4914 */
4915 assert(zone->kasan_fakestacks ||
4916 ml_get_interrupts_enabled() ||
4917 ml_is_quiescing() ||
4918 debug_mode_active() ||
4919 startup_phase < STARTUP_SUB_EARLY_BOOT);
4920
4921 /*
4922 * Make sure Z_NOFAIL was not obviously misused
4923 */
4924 if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
4925 assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
4926 }
4927
4928 #if CONFIG_ZCACHE
4929 /*
4930 * Note: if zone caching is on, gzalloc and tags aren't used
4931 * so we can always check this first
4932 */
4933 if (zone_caching_enabled(zone)) {
4934 addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
4935 if (__probable(addr)) {
4936 goto allocated_from_cache;
4937 }
4938 }
4939 #endif /* CONFIG_ZCACHE */
4940
4941 #if CONFIG_GZALLOC
4942 if (__improbable(zone->gzalloc_tracked)) {
4943 addr = gzalloc_alloc(zone, zstats, flags);
4944 goto allocated_from_gzalloc;
4945 }
4946 #endif /* CONFIG_GZALLOC */
4947 #if VM_MAX_TAG_ZONES
4948 if (__improbable(zone->tags)) {
4949 vm_tag_t tag = zalloc_flags_get_tag(flags);
4950 if (tag == VM_KERN_MEMORY_NONE) {
4951 /*
4952 * zone views into heaps can lead to a site-less call
4953 * and we fall back to KALLOC as a tag for those.
4954 */
4955 tag = VM_KERN_MEMORY_KALLOC;
4956 flags |= Z_VM_TAG(tag);
4957 }
4958 vm_tag_will_update_zone(tag, zone->tag_zone_index);
4959 }
4960 #endif /* VM_MAX_TAG_ZONES */
4961
4962 lock_zone(zone);
4963 assert(zone->z_self == zone);
4964
4965 /*
4966 * Check if we need another thread to replenish the zone or
4967 * if we have to wait for a replenish thread to finish.
4968 * This is used for elements, like vm_map_entry, which are
4969 * needed themselves to implement zalloc().
4970 */
4971 if (__improbable(zone->prio_refill_count &&
4972 zone->countfree <= zone->prio_refill_count / 2)) {
4973 zone_refill_asynchronously_locked(zone);
4974 } else if (__improbable(zone->countfree == 0)) {
4975 zone_refill_synchronously_locked(zone, flags);
4976 if (__improbable(zone->countfree == 0)) {
4977 unlock_zone(zone);
4978 if (__improbable(flags & Z_NOFAIL)) {
4979 zone_nofail_panic(zone);
4980 }
4981 goto out_nomem;
4982 }
4983 }
4984
4985 addr = zalloc_direct_locked(zone, flags, waste);
4986 if (__probable(zstats != NULL)) {
4987 /*
4988 * The few vm zones used before zone_init() runs do not have
4989 * per-cpu stats yet
4990 */
4991 int cpu = cpu_number();
4992 zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
4993 #if ZALLOC_DETAILED_STATS
4994 if (waste) {
4995 zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
4996 }
4997 #endif /* ZALLOC_DETAILED_STATS */
4998 }
4999
5000 unlock_zone(zone);
5001
5002 #if ZALLOC_ENABLE_POISONING
5003 bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
5004 #endif
5005 addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
5006 zone_clear_freelist_pointers(zone, addr);
5007 #if ZALLOC_ENABLE_POISONING
5008 /*
5009 * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
5010 * so we will check the first word even if we just
5011 * cleared it.
5012 */
5013 zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
5014 validate);
5015 #endif /* ZALLOC_ENABLE_POISONING */
5016
5017 allocated_from_cache:
5018 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
5019 if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
5020 zalloc_log_or_trace_leaks(zone, addr);
5021 }
5022 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
5023
5024 #if CONFIG_GZALLOC
5025 allocated_from_gzalloc:
5026 #endif
5027 #if KASAN_ZALLOC
5028 if (zone->kasan_redzone) {
5029 addr = kasan_alloc(addr, elem_size,
5030 elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
5031 elem_size -= 2 * zone->kasan_redzone;
5032 }
5033 /*
5034 * Initialize buffer with unique pattern only if memory
5035 * wasn't expected to be zeroed.
5036 */
5037 if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
5038 kasan_leak_init(addr, elem_size);
5039 }
5040 #endif /* KASAN_ZALLOC */
5041 if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
5042 bzero((void *)addr, elem_size);
5043 }
5044
5045 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
5046
5047 out_nomem:
5048 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5049 return (void *)addr;
5050 }
5051
5052 void *
5053 zalloc(union zone_or_view zov)
5054 {
5055 return zalloc_flags(zov, Z_WAITOK);
5056 }
5057
5058 void *
5059 zalloc_noblock(union zone_or_view zov)
5060 {
5061 return zalloc_flags(zov, Z_NOWAIT);
5062 }
5063
5064 void *
5065 zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
5066 {
5067 zone_t zone = zov.zov_view->zv_zone;
5068 zone_stats_t zstats = zov.zov_view->zv_stats;
5069 assert(!zone->percpu);
5070 return zalloc_ext(zone, zstats, flags, 0);
5071 }
5072
5073 void *
5074 zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
5075 {
5076 zone_t zone = zov.zov_view->zv_zone;
5077 zone_stats_t zstats = zov.zov_view->zv_stats;
5078 assert(zone->percpu);
5079 return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
5080 }
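
/*
 * Illustrative sketch (not part of the build) of the typical allocation
 * pattern used by kernel subsystems: create a zone once at startup, then
 * allocate and free fixed-size elements from it. The element type and
 * zone name are hypothetical.
 */
#if 0 /* example only */
struct example_record {                 /* hypothetical element */
	uint64_t id;
	uint64_t flags;
};

static zone_t example_zone;

static void
example_zone_setup(void)
{
	example_zone = zone_create("example.record",
	    sizeof(struct example_record), ZC_ZFREE_CLEARMEM);
}

static struct example_record *
example_record_alloc(void)
{
	/* Z_WAITOK may block; Z_ZERO hands back zeroed memory. */
	return zalloc_flags(example_zone, Z_WAITOK | Z_ZERO);
}

static void
example_record_free(struct example_record *r)
{
	zfree(example_zone, r);
}
#endif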
5081
5082 static void *
5083 _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
5084 {
5085 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5086 struct zone_page_metadata *page_meta;
5087 vm_offset_t offs, addr;
5088 zone_pva_t pva;
5089
5090 assert(ml_get_interrupts_enabled() ||
5091 ml_is_quiescing() ||
5092 debug_mode_active() ||
5093 startup_phase < STARTUP_SUB_EARLY_BOOT);
5094
5095 size = (size + mask) & ~mask;
5096 assert(size <= PAGE_SIZE);
5097
5098 lock_zone(zone);
5099 assert(zone->z_self == zone);
5100
5101 for (;;) {
5102 pva = zone->pages_intermediate;
5103 while (!zone_pva_is_null(pva)) {
5104 page_meta = zone_pva_to_meta(pva, kind);
5105 if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
5106 goto found;
5107 }
5108 pva = page_meta->zm_page_next;
5109 }
5110
5111 zone_refill_synchronously_locked(zone, Z_WAITOK);
5112 }
5113
5114 found:
5115 offs = (page_meta->zm_freelist_offs + mask) & ~mask;
5116 page_meta->zm_freelist_offs = offs + size;
5117 page_meta->zm_alloc_count += size;
5118 zone->countfree -= size;
5119 if (__probable(zone->z_stats)) {
5120 zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
5121 }
5122
5123 if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
5124 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
5125 }
5126
5127 unlock_zone(zone);
5128
5129 addr = offs + zone_pva_to_addr(pva);
5130
5131 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5132 return (void *)addr;
5133 }
5134
5135 static void *
5136 _zalloc_permanent_large(size_t size, vm_offset_t mask)
5137 {
5138 kern_return_t kr;
5139 vm_offset_t addr;
5140
5141 kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
5142 KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
5143 VM_KERN_MEMORY_KALLOC);
5144 if (kr != 0) {
5145 panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
5146 size, kr);
5147 }
5148 return (void *)addr;
5149 }
5150
5151 void *
5152 zalloc_permanent(vm_size_t size, vm_offset_t mask)
5153 {
5154 if (size <= PAGE_SIZE) {
5155 zone_t zone = &zone_array[ZONE_ID_PERMANENT];
5156 return _zalloc_permanent(zone, size, mask);
5157 }
5158 return _zalloc_permanent_large(size, mask);
5159 }
5160
5161 void *
5162 zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
5163 {
5164 zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
5165 return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
5166 }
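
/*
 * Illustrative sketch (not part of the build): the permanent allocator is
 * for allocations that live for the lifetime of the system and are never
 * zfree()d, such as tables sized once at boot. The table type is
 * hypothetical; the mask argument is "alignment - 1" (8-byte alignment
 * here), as consumed by _zalloc_permanent() above.
 */
#if 0 /* example only */
struct example_table_entry {            /* hypothetical */
	uint64_t key;
	uint64_t value;
};

static struct example_table_entry *example_table;

static void
example_table_init(uint32_t nentries)
{
	/* Memory comes back zeroed and can never be freed. */
	example_table = zalloc_permanent(
	    nentries * sizeof(struct example_table_entry),
	    sizeof(uint64_t) - 1);
}
#endif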
5167
5168 void
5169 zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
5170 {
5171 zone_index_foreach(i) {
5172 zone_t z = &zone_array[i];
5173
5174 if (z->no_callout) {
5175 /* async_pending will never be set */
5176 continue;
5177 }
5178
5179 lock_zone(z);
5180 if (z->z_self && z->async_pending) {
5181 z->async_pending = false;
5182 zone_refill_synchronously_locked(z, Z_WAITOK);
5183 }
5184 unlock_zone(z);
5185 }
5186 }
5187
5188 /*
5189 * Adds the element to the head of the zone's free list.
5190 * Keeps a backup next-pointer at the end of the element.
5191 */
5192 void
5193 zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
5194 {
5195 struct zone_page_metadata *page_meta;
5196 vm_offset_t page, old_head;
5197 zone_addr_kind_t kind;
5198 vm_size_t elem_size = zone_elem_size(zone);
5199
5200 vm_offset_t *primary = (vm_offset_t *) element;
5201 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
5202
5203 page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
5204 old_head = zone_page_meta_get_freelist(zone, page_meta, page);
5205
5206 if (__improbable(old_head == element)) {
5207 panic("zfree: double free of %p to zone %s%s\n",
5208 (void *) element, zone_heap_name(zone), zone->z_name);
5209 }
5210
5211 #if ZALLOC_ENABLE_POISONING
5212 if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
5213 assert(zone->percpu);
5214 poison = false;
5215 }
5216 #else
5217 poison = false;
5218 #endif
5219
5220 /*
5221 * Always write a redundant next pointer.
5222 * So that it is more difficult to forge, xor it with a random cookie.
5223 * A poisoned element is indicated by using zp_poisoned_cookie
5224 * instead of zp_nopoison_cookie.
5225 */
5226
5227 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
5228
5229 /*
5230 * Insert this element at the head of the free list. We also xor the
5231 * primary pointer with the zp_nopoison_cookie to make sure a free
5232 * element does not provide the location of the next free element directly.
5233 */
5234 *primary = old_head ^ zp_nopoison_cookie;
5235
5236 #if VM_MAX_TAG_ZONES
5237 if (__improbable(zone->tags)) {
5238 vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
5239 // set the tag with b0 clear so the block remains in use
5240 ZTAG(zone, element)[0] = 0xFFFE;
5241 vm_tag_update_zone_size(tag, zone->tag_zone_index,
5242 -((int64_t)elem_size), 0);
5243 }
5244 #endif /* VM_MAX_TAG_ZONES */
5245
5246 zone_page_meta_set_freelist(page_meta, page, element);
5247 if (os_sub_overflow(page_meta->zm_alloc_count, 1,
5248 &page_meta->zm_alloc_count)) {
5249 zone_page_meta_accounting_panic(zone, page_meta,
5250 "alloc_count wrap-around");
5251 }
5252 zone->countfree++;
5253
5254 if (kind == ZONE_ADDR_FOREIGN) {
5255 if (old_head == 0) {
5256 /* first foreign element freed on page, move from all_used_foreign */
5257 zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
5258 }
5259 } else if (page_meta->zm_alloc_count == 0) {
5260 /* whether the page was on the intermediate or all_used queue, move it to free */
5261 zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
5262 zone->allfree_page_count += page_meta->zm_page_count;
5263 } else if (old_head == 0) {
5264 /* first free element on page, move from all_used */
5265 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
5266 }
5267
5268 #if KASAN_ZALLOC
5269 if (zone->percpu) {
5270 zpercpu_foreach_cpu(i) {
5271 kasan_poison_range(element + ptoa(i), elem_size,
5272 ASAN_HEAP_FREED);
5273 }
5274 } else {
5275 kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
5276 }
5277 #endif
5278 }
5279
5280 /*
5281 * The function is noinline when zlog can be used so that the backtracing can
5282 * reliably skip the zfree_ext() and zfree_log_trace()
5283 * boring frames.
5284 */
5285 #if ZONE_ENABLE_LOGGING
5286 __attribute__((noinline))
5287 #endif
5288 void
5289 zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
5290 {
5291 vm_offset_t elem = (vm_offset_t)addr;
5292 vm_size_t elem_size = zone_elem_size(zone);
5293 bool poison = false;
5294
5295 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
5296 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
5297
5298 #if KASAN_ZALLOC
5299 if (kasan_quarantine_freed_element(&zone, &addr)) {
5300 return;
5301 }
5302 /*
5303 * kasan_quarantine_freed_element() might return a different
5304 * {zone, addr} than the one being freed for kalloc heaps.
5305 *
5306 * Make sure we reload everything.
5307 */
5308 elem = (vm_offset_t)addr;
5309 elem_size = zone_elem_size(zone);
5310 #endif
5311
5312 #if CONFIG_ZLEAKS
5313 /*
5314 * Zone leak detection: un-track the allocation
5315 */
5316 if (__improbable(zone->zleak_on)) {
5317 zleak_free(elem, elem_size);
5318 }
5319 #endif /* CONFIG_ZLEAKS */
5320
5321 #if CONFIG_ZCACHE
5322 /*
5323 * Note: if zone caching is on, gzalloc and tags aren't used
5324 * so we can always check this first
5325 */
5326 if (zone_caching_enabled(zone)) {
5327 return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
5328 }
5329 #endif /* CONFIG_ZCACHE */
5330
5331 #if CONFIG_GZALLOC
5332 if (__improbable(zone->gzalloc_tracked)) {
5333 return gzalloc_free(zone, zstats, addr);
5334 }
5335 #endif /* CONFIG_GZALLOC */
5336
5337 #if ZONE_ENABLE_LOGGING
5338 if (__improbable(DO_LOGGING(zone))) {
5339 zfree_log_trace(zone, elem);
5340 }
5341 #endif /* ZONE_ENABLE_LOGGING */
5342
5343 if (zone->zfree_clear_mem) {
5344 poison = zfree_clear(zone, elem, elem_size);
5345 }
5346
5347 lock_zone(zone);
5348 assert(zone->z_self == zone);
5349
5350 if (!poison) {
5351 poison = zfree_poison_element(zone, &zone->zp_count, elem);
5352 }
5353
5354 if (__probable(zstats != NULL)) {
5355 /*
5356 * The few vm zones used before zone_init() runs do not have
5357 * per-cpu stats yet
5358 */
5359 zpercpu_get(zstats)->zs_mem_freed += elem_size;
5360 }
5361
5362 zfree_direct_locked(zone, elem, poison);
5363
5364 unlock_zone(zone);
5365 }
5366
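/*
* External entry point for non per-CPU zones. The parentheses around the
* name below keep a function-like zfree() macro in the headers, if one is
* defined, from expanding at the definition.
*
* A minimal caller sketch, using a hypothetical "widget_zone" created via
* zone_create() (run_zone_test() below shows a live example):
*
*	widget_zone = zone_create("widgets", sizeof(struct widget), ZC_NONE);
*	struct widget *w = zalloc(widget_zone);
*	...
*	zfree(widget_zone, w);
*/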
5367 void
5368 (zfree)(union zone_or_view zov, void *addr)
5369 {
5370 zone_t zone = zov.zov_view->zv_zone;
5371 zone_stats_t zstats = zov.zov_view->zv_stats;
5372 assert(!zone->percpu);
5373 zfree_ext(zone, zstats, addr);
5374 }
5375
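/*
* Per-CPU variant: the pointer handed back by the per-CPU allocator is
* mangled, so it is demangled here before going through zfree_ext().
* A sketch, assuming a hypothetical per-CPU zone view "pcpu_counter_zone"
* and the zalloc_percpu()/zpercpu_foreach() helpers from the zalloc and
* percpu headers:
*
*	uint64_t total = 0;
*	uint64_t *ctr = zalloc_percpu(pcpu_counter_zone, Z_WAITOK | Z_ZERO);
*	...
*	zpercpu_foreach(cpu_ctr, ctr) {
*		total += *cpu_ctr;
*	}
*	zfree_percpu(pcpu_counter_zone, ctr);
*/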
5376 void
5377 zfree_percpu(union zone_or_view zov, void *addr)
5378 {
5379 zone_t zone = zov.zov_view->zv_zone;
5380 zone_stats_t zstats = zov.zov_view->zv_stats;
5381 assert(zone->percpu);
5382 zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
5383 }
5384
5385 #pragma mark vm integration, MIG routines
5386
5387 /*
5388 * Drops (i.e. frees) the elements in the all free pages queue of a zone.
5389 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
5390 */
5391 static void
5392 zone_drop_free_elements(zone_t z)
5393 {
5394 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5395 unsigned int total_freed_pages = 0;
5396 struct zone_page_metadata *page_meta, *seq_meta;
5397 vm_address_t page_addr;
5398 vm_size_t size_to_free;
5399 vm_size_t free_count;
5400 uint32_t page_count;
5401
5402 current_thread()->options |= TH_OPT_ZONE_PRIV;
5403 lock_zone(z);
5404
5405 while (!zone_pva_is_null(z->pages_all_free)) {
5406 /*
5407 * If any replenishment threads are running, defer to them,
5408 * so that we don't deplete reserved zones.
5409 *
5410 * The timing of the check isn't super important, as there are
5411 * enough reserves to allow freeing an extra page_meta.
5412 *
5413 * Hence, we can check without grabbing the lock every time
5414 * through the loop. We do need the lock however to avoid
5415 * missing a wakeup when we decide to block.
5416 */
5417 if (zone_replenish_active > 0) {
5418 lck_spin_lock(&zone_replenish_lock);
5419 if (zone_replenish_active > 0) {
5420 assert_wait(&zone_replenish_active, THREAD_UNINT);
5421 lck_spin_unlock(&zone_replenish_lock);
5422 unlock_zone(z);
5423 thread_block(THREAD_CONTINUE_NULL);
5424 lock_zone(z);
5425 continue;
5426 }
5427 lck_spin_unlock(&zone_replenish_lock);
5428 }
5429
5430 page_meta = zone_pva_to_meta(z->pages_all_free, kind);
5431 page_count = page_meta->zm_page_count;
5432 free_count = zone_elem_count(z, ptoa(page_count), kind);
5433
5434 /*
5435 * Don't drain zones with async refill to below the refill
5436 * threshold, as they need some reserve to function properly.
5437 */
5438 if (!z->destroyed && z->prio_refill_count &&
5439 (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
5440 break;
5441 }
5442
5443 zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);
5444
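/*
* Pull this chunk's elements and pages out of the zone accounting;
* an underflow here means the counters were already inconsistent.
*/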
5445 if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
5446 zone_accounting_panic(z, "countfree wrap-around");
5447 }
5448 if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
5449 zone_accounting_panic(z, "countavail wrap-around");
5450 }
5451 if (os_sub_overflow(z->allfree_page_count, page_count,
5452 &z->allfree_page_count)) {
5453 zone_accounting_panic(z, "allfree_page_count wrap-around");
5454 }
5455 if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
5456 zone_accounting_panic(z, "page_count wrap-around");
5457 }
5458
5459 os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
5460 os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
5461
5462 bzero(page_meta, sizeof(*page_meta) * page_count);
5463 seq_meta = page_meta;
5464 page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */
5465
5466 unlock_zone(z);
5467
5468 /* Free this chunk's pages and account for them */
5469 total_freed_pages += page_count;
5470 size_to_free = ptoa(page_count);
5471 #if KASAN_ZALLOC
5472 kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
5473 #endif
5474 #if VM_MAX_TAG_ZONES
5475 if (z->tags) {
5476 ztMemoryRemove(z, page_addr, size_to_free);
5477 }
5478 #endif /* VM_MAX_TAG_ZONES */
5479
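/*
* With VA sequestering, chunks spanning a full allocation only have their
* physical pages depopulated and keep their virtual range: the metadata is
* pushed back on pages_sequester below so the zone can reuse the VA later.
* Otherwise both the pages and the mapping are released with kmem_free().
*/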
5480 if (z->va_sequester && z->alloc_pages == page_count) {
5481 kernel_memory_depopulate(submap_for_zone(z), page_addr,
5482 size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
5483 } else {
5484 kmem_free(submap_for_zone(z), page_addr, size_to_free);
5485 seq_meta = NULL;
5486 }
5487 thread_yield_to_preemption();
5488
5489 lock_zone(z);
5490
5491 if (seq_meta) {
5492 zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
5493 z->sequester_page_count += page_count;
5494 }
5495 }
5496 if (z->destroyed) {
5497 assert(zone_pva_is_null(z->pages_all_free));
5498 assert(z->allfree_page_count == 0);
5499 }
5500 unlock_zone(z);
5501 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
5502
5503 #if DEBUG || DEVELOPMENT
5504 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5505 kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
5506 zone_heap_name(z), z->z_name,
5507 (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
5508 total_freed_pages);
5509 }
5510 #endif /* DEBUG || DEVELOPMENT */
5511 }
5512
5513 /* Zone garbage collection
5514 *
5515 * zone_gc will walk through all the free elements in all the
5516 * zones that are marked collectable looking for reclaimable
5517 * pages. zone_gc is called by consider_zone_gc when the system
5518 * begins to run out of memory.
5519 *
5520 * We should ensure that zone_gc never blocks.
5521 */
5522 void
5523 zone_gc(boolean_t consider_jetsams)
5524 {
5525 if (consider_jetsams) {
5526 kill_process_in_largest_zone();
5527 /*
5528 * If we do end up jetsamming something, we need to do a zone_gc so that
5529 * we can reclaim free zone elements and update the zone map size.
5530 * Fall through.
5531 */
5532 }
5533
5534 lck_mtx_lock(&zone_gc_lock);
5535
5536 #if DEBUG || DEVELOPMENT
5537 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5538 kprintf("zone_gc() starting...\n");
5539 }
5540 #endif /* DEBUG || DEVELOPMENT */
5541
5542 zone_index_foreach(i) {
5543 zone_t z = &zone_array[i];
5544
5545 if (!z->collectable) {
5546 continue;
5547 }
5548 #if CONFIG_ZCACHE
5549 if (zone_caching_enabled(z)) {
5550 zcache_drain_depot(z);
5551 }
5552 #endif /* CONFIG_ZCACHE */
5553 if (zone_pva_is_null(z->pages_all_free)) {
5554 continue;
5555 }
5556
5557 zone_drop_free_elements(z);
5558 }
5559
5560 lck_mtx_unlock(&zone_gc_lock);
5561 }
5562
5563 /*
5564 * consider_zone_gc:
5565 *
5566 * Called by the pageout daemon when the system needs more free pages.
5567 */
5568
5569 void
5570 consider_zone_gc(boolean_t consider_jetsams)
5571 {
5572 /*
5573 * One-time reclaim of kernel_map resources we allocated in
5574 * early boot.
5575 *
5576 * Use atomic exchange in case multiple threads race into here.
5577 */
5578 vm_offset_t deallocate_kaddr;
5579 if (kmapoff_kaddr != 0 &&
5580 (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
5581 vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
5582 }
5583
5584 zone_gc(consider_jetsams);
5585 }
5586
5587 /*
5588 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
5589 * requesting zone information.
5590 * Frees unused pages towards the end of the region, and zeroes out unused
5591 * space on the last page.
5592 */
5593 static vm_map_copy_t
5594 create_vm_map_copy(
5595 vm_offset_t start_addr,
5596 vm_size_t total_size,
5597 vm_size_t used_size)
5598 {
5599 kern_return_t kr;
5600 vm_offset_t end_addr;
5601 vm_size_t free_size;
5602 vm_map_copy_t copy;
5603
5604 if (used_size != total_size) {
5605 end_addr = start_addr + used_size;
5606 free_size = total_size - (round_page(end_addr) - start_addr);
5607
5608 if (free_size >= PAGE_SIZE) {
5609 kmem_free(ipc_kernel_map,
5610 round_page(end_addr), free_size);
5611 }
5612 bzero((char *) end_addr, round_page(end_addr) - end_addr);
5613 }
5614
5615 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
5616 (vm_map_size_t)used_size, TRUE, &copy);
5617 assert(kr == KERN_SUCCESS);
5618
5619 return copy;
5620 }
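/*
* Typical caller pattern, sketched with placeholder names (addr,
* alloc_size, used_size, outp); see mach_memory_info() below for the
* real thing:
*
*	kmem_alloc_pageable(ipc_kernel_map, &addr, alloc_size, VM_KERN_MEMORY_IPC);
*	... fill the first used_size bytes at addr ...
*	*outp = (some_type_t *)create_vm_map_copy(addr, alloc_size, used_size);
*
* The vm_map_copy_t is handed back out-of-line through the MIG reply,
* which is expected to consume it.
*/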
5621
5622 static boolean_t
5623 get_zone_info(
5624 zone_t z,
5625 mach_zone_name_t *zn,
5626 mach_zone_info_t *zi)
5627 {
5628 struct zone zcopy;
5629
5630 assert(z != ZONE_NULL);
5631 lock_zone(z);
5632 if (!z->z_self) {
5633 unlock_zone(z);
5634 return FALSE;
5635 }
5636 zcopy = *z;
5637 unlock_zone(z);
5638
5639 if (zn != NULL) {
5640 /*
5641 * Append kalloc heap name to zone name (if zone is used by kalloc)
5642 */
5643 char temp_zone_name[MAX_ZONE_NAME] = "";
5644 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5645 zone_heap_name(z), z->z_name);
5646
5647 /* assuming here the name data is static */
5648 (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
5649 strlen(temp_zone_name) + 1);
5650 }
5651
5652 if (zi != NULL) {
5653 *zi = (mach_zone_info_t) {
5654 .mzi_count = zone_count_allocated(&zcopy),
5655 .mzi_cur_size = ptoa_64(zcopy.page_count),
5656 // max_size for zprint is now high-watermark of pages used
5657 .mzi_max_size = ptoa_64(zcopy.page_count_hwm),
5658 .mzi_elem_size = zcopy.pcpu_elem_size,
5659 .mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
5660 .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
5661 };
5662 zpercpu_foreach(zs, zcopy.z_stats) {
5663 zi->mzi_sum_size += zs->zs_mem_allocated;
5664 }
5665 if (zcopy.collectable) {
5666 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
5667 ptoa_64(zcopy.allfree_page_count));
5668 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
5669 }
5670 }
5671
5672 return TRUE;
5673 }
5674
5675 kern_return_t
5676 task_zone_info(
5677 __unused task_t task,
5678 __unused mach_zone_name_array_t *namesp,
5679 __unused mach_msg_type_number_t *namesCntp,
5680 __unused task_zone_info_array_t *infop,
5681 __unused mach_msg_type_number_t *infoCntp)
5682 {
5683 return KERN_FAILURE;
5684 }
5685
5686 kern_return_t
5687 mach_zone_info(
5688 host_priv_t host,
5689 mach_zone_name_array_t *namesp,
5690 mach_msg_type_number_t *namesCntp,
5691 mach_zone_info_array_t *infop,
5692 mach_msg_type_number_t *infoCntp)
5693 {
5694 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
5695 }
5696
5698 kern_return_t
5699 mach_memory_info(
5700 host_priv_t host,
5701 mach_zone_name_array_t *namesp,
5702 mach_msg_type_number_t *namesCntp,
5703 mach_zone_info_array_t *infop,
5704 mach_msg_type_number_t *infoCntp,
5705 mach_memory_info_array_t *memoryInfop,
5706 mach_msg_type_number_t *memoryInfoCntp)
5707 {
5708 mach_zone_name_t *names;
5709 vm_offset_t names_addr;
5710 vm_size_t names_size;
5711
5712 mach_zone_info_t *info;
5713 vm_offset_t info_addr;
5714 vm_size_t info_size;
5715
5716 mach_memory_info_t *memory_info;
5717 vm_offset_t memory_info_addr;
5718 vm_size_t memory_info_size;
5719 vm_size_t memory_info_vmsize;
5720 unsigned int num_info;
5721
5722 unsigned int max_zones, used_zones, i;
5723 mach_zone_name_t *zn;
5724 mach_zone_info_t *zi;
5725 kern_return_t kr;
5726
5727 uint64_t zones_collectable_bytes = 0;
5728
5729 if (host == HOST_NULL) {
5730 return KERN_INVALID_HOST;
5731 }
5732 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5733 if (!PE_i_can_has_debugger(NULL)) {
5734 return KERN_INVALID_HOST;
5735 }
5736 #endif
5737
5738 /*
5739 * We assume that zones aren't freed once allocated.
5740 * We won't pick up any zones that are allocated later.
5741 */
5742
5743 max_zones = os_atomic_load(&num_zones, relaxed);
5744
5745 names_size = round_page(max_zones * sizeof *names);
5746 kr = kmem_alloc_pageable(ipc_kernel_map,
5747 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5748 if (kr != KERN_SUCCESS) {
5749 return kr;
5750 }
5751 names = (mach_zone_name_t *) names_addr;
5752
5753 info_size = round_page(max_zones * sizeof *info);
5754 kr = kmem_alloc_pageable(ipc_kernel_map,
5755 &info_addr, info_size, VM_KERN_MEMORY_IPC);
5756 if (kr != KERN_SUCCESS) {
5757 kmem_free(ipc_kernel_map,
5758 names_addr, names_size);
5759 return kr;
5760 }
5761 info = (mach_zone_info_t *) info_addr;
5762
5763 zn = &names[0];
5764 zi = &info[0];
5765
5766 used_zones = max_zones;
5767 for (i = 0; i < max_zones; i++) {
5768 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
5769 used_zones--;
5770 continue;
5771 }
5772 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
5773 zn++;
5774 zi++;
5775 }
5776
5777 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
5778 *namesCntp = used_zones;
5779
5780 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
5781 *infoCntp = used_zones;
5782
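/*
* The per-site wired memory breakdown from vm_page_diagnose() is only
* produced when the caller supplied buffers for it, i.e. for
* mach_memory_info() proper; mach_zone_info() passes NULL and skips it.
*/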
5783 num_info = 0;
5784 memory_info_addr = 0;
5785
5786 if (memoryInfop && memoryInfoCntp) {
5787 vm_map_copy_t copy;
5788 num_info = vm_page_diagnose_estimate();
5789 memory_info_size = num_info * sizeof(*memory_info);
5790 memory_info_vmsize = round_page(memory_info_size);
5791 kr = kmem_alloc_pageable(ipc_kernel_map,
5792 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
5793 if (kr != KERN_SUCCESS) {
5794 return kr;
5795 }
5796
5797 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
5798 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
5799 assert(kr == KERN_SUCCESS);
5800
5801 memory_info = (mach_memory_info_t *) memory_info_addr;
5802 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
5803
5804 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
5805 assert(kr == KERN_SUCCESS);
5806
5807 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
5808 (vm_map_size_t)memory_info_size, TRUE, &copy);
5809 assert(kr == KERN_SUCCESS);
5810
5811 *memoryInfop = (mach_memory_info_t *) copy;
5812 *memoryInfoCntp = num_info;
5813 }
5814
5815 return KERN_SUCCESS;
5816 }
5817
5818 kern_return_t
5819 mach_zone_info_for_zone(
5820 host_priv_t host,
5821 mach_zone_name_t name,
5822 mach_zone_info_t *infop)
5823 {
5824 zone_t zone_ptr;
5825
5826 if (host == HOST_NULL) {
5827 return KERN_INVALID_HOST;
5828 }
5829 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5830 if (!PE_i_can_has_debugger(NULL)) {
5831 return KERN_INVALID_HOST;
5832 }
5833 #endif
5834
5835 if (infop == NULL) {
5836 return KERN_INVALID_ARGUMENT;
5837 }
5838
5839 zone_ptr = ZONE_NULL;
5840 zone_index_foreach(i) {
5841 zone_t z = &(zone_array[i]);
5842 assert(z != ZONE_NULL);
5843
5844 /*
5845 * Append kalloc heap name to zone name (if zone is used by kalloc)
5846 */
5847 char temp_zone_name[MAX_ZONE_NAME] = "";
5848 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5849 zone_heap_name(z), z->z_name);
5850
5851 /* Find the requested zone by name */
5852 if (track_this_zone(temp_zone_name, name.mzn_name)) {
5853 zone_ptr = z;
5854 break;
5855 }
5856 }
5857
5858 /* No zones found with the requested zone name */
5859 if (zone_ptr == ZONE_NULL) {
5860 return KERN_INVALID_ARGUMENT;
5861 }
5862
5863 if (get_zone_info(zone_ptr, NULL, infop)) {
5864 return KERN_SUCCESS;
5865 }
5866 return KERN_FAILURE;
5867 }
5868
5869 kern_return_t
5870 mach_zone_info_for_largest_zone(
5871 host_priv_t host,
5872 mach_zone_name_t *namep,
5873 mach_zone_info_t *infop)
5874 {
5875 if (host == HOST_NULL) {
5876 return KERN_INVALID_HOST;
5877 }
5878 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5879 if (!PE_i_can_has_debugger(NULL)) {
5880 return KERN_INVALID_HOST;
5881 }
5882 #endif
5883
5884 if (namep == NULL || infop == NULL) {
5885 return KERN_INVALID_ARGUMENT;
5886 }
5887
5888 if (get_zone_info(zone_find_largest(), namep, infop)) {
5889 return KERN_SUCCESS;
5890 }
5891 return KERN_FAILURE;
5892 }
5893
5894 uint64_t
5895 get_zones_collectable_bytes(void)
5896 {
5897 uint64_t zones_collectable_bytes = 0;
5898 mach_zone_info_t zi;
5899
5900 zone_index_foreach(i) {
5901 if (get_zone_info(&zone_array[i], NULL, &zi)) {
5902 zones_collectable_bytes +=
5903 GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
5904 }
5905 }
5906
5907 return zones_collectable_bytes;
5908 }
5909
5910 kern_return_t
5911 mach_zone_get_zlog_zones(
5912 host_priv_t host,
5913 mach_zone_name_array_t *namesp,
5914 mach_msg_type_number_t *namesCntp)
5915 {
5916 #if ZONE_ENABLE_LOGGING
5917 unsigned int max_zones, logged_zones, i;
5918 kern_return_t kr;
5919 zone_t zone_ptr;
5920 mach_zone_name_t *names;
5921 vm_offset_t names_addr;
5922 vm_size_t names_size;
5923
5924 if (host == HOST_NULL) {
5925 return KERN_INVALID_HOST;
5926 }
5927
5928 if (namesp == NULL || namesCntp == NULL) {
5929 return KERN_INVALID_ARGUMENT;
5930 }
5931
5932 max_zones = os_atomic_load(&num_zones, relaxed);
5933
5934 names_size = round_page(max_zones * sizeof *names);
5935 kr = kmem_alloc_pageable(ipc_kernel_map,
5936 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5937 if (kr != KERN_SUCCESS) {
5938 return kr;
5939 }
5940 names = (mach_zone_name_t *) names_addr;
5941
5942 zone_ptr = ZONE_NULL;
5943 logged_zones = 0;
5944 for (i = 0; i < max_zones; i++) {
5945 zone_t z = &(zone_array[i]);
5946 assert(z != ZONE_NULL);
5947
5948 /* Copy out the zone name if zone logging is enabled */
5949 if (z->zlog_btlog) {
5950 get_zone_info(z, &names[logged_zones], NULL);
5951 logged_zones++;
5952 }
5953 }
5954
5955 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
5956 *namesCntp = logged_zones;
5957
5958 return KERN_SUCCESS;
5959
5960 #else /* ZONE_ENABLE_LOGGING */
5961 #pragma unused(host, namesp, namesCntp)
5962 return KERN_FAILURE;
5963 #endif /* ZONE_ENABLE_LOGGING */
5964 }
5965
5966 kern_return_t
5967 mach_zone_get_btlog_records(
5968 host_priv_t host,
5969 mach_zone_name_t name,
5970 zone_btrecord_array_t *recsp,
5971 mach_msg_type_number_t *recsCntp)
5972 {
5973 #if DEBUG || DEVELOPMENT
5974 unsigned int numrecs = 0;
5975 zone_btrecord_t *recs;
5976 kern_return_t kr;
5977 zone_t zone_ptr;
5978 vm_offset_t recs_addr;
5979 vm_size_t recs_size;
5980
5981 if (host == HOST_NULL) {
5982 return KERN_INVALID_HOST;
5983 }
5984
5985 if (recsp == NULL || recsCntp == NULL) {
5986 return KERN_INVALID_ARGUMENT;
5987 }
5988
5989 zone_ptr = ZONE_NULL;
5990 zone_index_foreach(i) {
5991 zone_t z = &zone_array[i];
5992
5993 /*
5994 * Append kalloc heap name to zone name (if zone is used by kalloc)
5995 */
5996 char temp_zone_name[MAX_ZONE_NAME] = "";
5997 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5998 zone_heap_name(z), z->z_name);
5999
6000 /* Find the requested zone by name */
6001 if (track_this_zone(temp_zone_name, name.mzn_name)) {
6002 zone_ptr = z;
6003 break;
6004 }
6005 }
6006
6007 /* No zones found with the requested zone name */
6008 if (zone_ptr == ZONE_NULL) {
6009 return KERN_INVALID_ARGUMENT;
6010 }
6011
6012 /* Logging not turned on for the requested zone */
6013 if (!DO_LOGGING(zone_ptr)) {
6014 return KERN_FAILURE;
6015 }
6016
6017 /* Allocate memory for btlog records */
6018 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
6019 recs_size = round_page(numrecs * sizeof *recs);
6020
6021 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
6022 if (kr != KERN_SUCCESS) {
6023 return kr;
6024 }
6025
6026 /*
6027 * get_btlog_records() below populates this region while holding a spinlock
6028 * (the btlog lock), so these pages need to be wired.
6029 */
6030 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
6031 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
6032 assert(kr == KERN_SUCCESS);
6033
6034 recs = (zone_btrecord_t *)recs_addr;
6035 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
6036
6037 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
6038 assert(kr == KERN_SUCCESS);
6039
6040 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
6041 *recsCntp = numrecs;
6042
6043 return KERN_SUCCESS;
6044
6045 #else /* DEBUG || DEVELOPMENT */
6046 #pragma unused(host, name, recsp, recsCntp)
6047 return KERN_FAILURE;
6048 #endif /* DEBUG || DEVELOPMENT */
6049 }
6050
6051
6052 #if DEBUG || DEVELOPMENT
6053
6054 kern_return_t
6055 mach_memory_info_check(void)
6056 {
6057 mach_memory_info_t * memory_info;
6058 mach_memory_info_t * info;
6059 unsigned int num_info;
6060 vm_offset_t memory_info_addr;
6061 kern_return_t kr;
6062 size_t memory_info_size, memory_info_vmsize;
6063 uint64_t top_wired, zonestotal, total;
6064
6065 num_info = vm_page_diagnose_estimate();
6066 memory_info_size = num_info * sizeof(*memory_info);
6067 memory_info_vmsize = round_page(memory_info_size);
6068 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
6069 assert(kr == KERN_SUCCESS);
6070
6071 memory_info = (mach_memory_info_t *) memory_info_addr;
6072 vm_page_diagnose(memory_info, num_info, 0);
6073
6074 top_wired = total = zonestotal = 0;
6075 zone_index_foreach(idx) {
6076 zonestotal += zone_size_wired(&zone_array[idx]);
6077 }
6078
6079 for (uint32_t idx = 0; idx < num_info; idx++) {
6080 info = &memory_info[idx];
6081 if (!info->size) {
6082 continue;
6083 }
6084 if (VM_KERN_COUNT_WIRED == info->site) {
6085 top_wired = info->size;
6086 }
6087 if (VM_KERN_SITE_HIDE & info->flags) {
6088 continue;
6089 }
6090 if (!(VM_KERN_SITE_WIRED & info->flags)) {
6091 continue;
6092 }
6093 total += info->size;
6094 }
6095 total += zonestotal;
6096
6097 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
6098 total, top_wired, zonestotal, top_wired - total);
6099
6100 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
6101
6102 return kr;
6103 }
6104
6105 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
6106
6107 #endif /* DEBUG || DEVELOPMENT */
6108
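/*
* MIG routine: force a zone garbage collection. Only DEBUG and
* DEVELOPMENT kernels actually invoke the buffer cache collect callout
* and consider_zone_gc(); on other configurations the call is accepted
* but does nothing.
*/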
6109 kern_return_t
6110 mach_zone_force_gc(
6111 host_t host)
6112 {
6113 if (host == HOST_NULL) {
6114 return KERN_INVALID_HOST;
6115 }
6116
6117 #if DEBUG || DEVELOPMENT
6118 /* Callout to buffer cache GC to drop elements in the apfs zones */
6119 if (consider_buffer_cache_collect != NULL) {
6120 (void)(*consider_buffer_cache_collect)(0);
6121 }
6122 consider_zone_gc(FALSE);
6123 #endif /* DEBUG || DEVELOPMENT */
6124 return KERN_SUCCESS;
6125 }
6126
6127 zone_t
6128 zone_find_largest(void)
6129 {
6130 uint32_t largest_idx = 0;
6131 vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
6132
6133 zone_index_foreach(i) {
6134 vm_offset_t size = zone_size_wired(&zone_array[i]);
6135 if (size > largest_size) {
6136 largest_idx = i;
6137 largest_size = size;
6138 }
6139 }
6140
6141 return &zone_array[largest_idx];
6142 }
6143
6144 #pragma mark - tests
6145 #if DEBUG || DEVELOPMENT
6146
6147 /*
6148 * Used for the sysctl kern.run_zone_test, which is not thread-safe. Ensure only
6149 * one thread goes through at a time; otherwise we can end up with multiple test
6150 * zones (if a second zinit() comes through before zdestroy()), which could lead
6151 * us to run out of zones.
6152 */
6153 SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
6154 static boolean_t zone_test_running = FALSE;
6155 static zone_t test_zone_ptr = NULL;
6156
6157 static uintptr_t *
6158 zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
6159 zone_pva_t page_index, zone_addr_kind_t kind)
6160 {
6161 vm_offset_t free, first, end, page;
6162 struct zone_page_metadata *meta;
6163
6164 while (!zone_pva_is_null(page_index)) {
6165 page = zone_pva_to_addr(page_index);
6166 meta = zone_pva_to_meta(page_index, kind);
6167 end = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
6168 first = page + ZONE_PAGE_FIRST_OFFSET(kind);
6169
6170 bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));
6171
6172 // construct bitmap of all freed elements
6173 free = zone_page_meta_get_freelist(z, meta, page);
6174 while (free) {
6175 bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));
6176
6177 // next free element
6178 free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
6179 }
6180
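// elements whose bit is still clear are live allocations; record them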
6181 for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
6182 if (!bitmap_test(bits, i)) {
6183 *elems++ = INSTANCE_PUT(first);
6184 }
6185 }
6186
6187 page_index = meta->zm_page_next;
6188 }
6189 return elems;
6190 }
6191
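/*
* Best-effort accounting of outstanding allocations in the named zone:
* snapshot every allocated element, run a conservative reference scan
* over them (zone_leaks_scan), then report allocation sites through
* "proc", using the zone's btlog when one is attached and falling back
* to whatever backtrace words are still left inside the elements;
* anything without a recoverable backtrace is lumped under a single
* synthetic &zalloc frame.
*/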
6192 kern_return_t
6193 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
6194 {
6195 uintptr_t zbt[MAX_ZTRACE_DEPTH];
6196 zone_t zone = NULL;
6197 uintptr_t * array;
6198 uintptr_t * next;
6199 uintptr_t element, bt;
6200 uint32_t idx, count, found;
6201 uint32_t btidx, btcount, nobtcount, btfound;
6202 uint32_t elemSize;
6203 uint64_t maxElems;
6204 kern_return_t kr;
6205 bitmap_t *bits;
6206
6207 zone_index_foreach(i) {
6208 if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
6209 zone = &zone_array[i];
6210 break;
6211 }
6212 }
6213 if (zone == NULL) {
6214 return KERN_INVALID_NAME;
6215 }
6216
6217 elemSize = zone_elem_size(zone);
6218 maxElems = (zone->countavail + 1) & ~1ul;
6219
6220 if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
6221 !zone_leaks_scan_enable) {
6222 return KERN_INVALID_CAPABILITY;
6223 }
6224
6225 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
6226 maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
6227 VM_KERN_MEMORY_DIAG);
6228 if (KERN_SUCCESS != kr) {
6229 return kr;
6230 }
6231
6232 /* maxElems is a multiple of 2 so we're always aligned */
6233 bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);
6234
6235 lock_zone(zone);
6236
6237 next = array;
6238 next = zone_copy_allocations(zone, next, bits,
6239 zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
6240 next = zone_copy_allocations(zone, next, bits,
6241 zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
6242 next = zone_copy_allocations(zone, next, bits,
6243 zone->pages_intermediate, ZONE_ADDR_NATIVE);
6244 next = zone_copy_allocations(zone, next, bits,
6245 zone->pages_all_used, ZONE_ADDR_NATIVE);
6246 count = (uint32_t)(next - array);
6247
6248 unlock_zone(zone);
6249
6250 zone_leaks_scan(array, count, zone_elem_size(zone), &found);
6251 assert(found <= count);
6252
6253 for (idx = 0; idx < count; idx++) {
6254 element = array[idx];
6255 if (kInstanceFlagReferenced & element) {
6256 continue;
6257 }
6258 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6259 }
6260
6261 #if ZONE_ENABLE_LOGGING
6262 if (zone->zlog_btlog && !corruption_debug_flag) {
6263 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
6264 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
6265 }
6266 #endif /* ZONE_ENABLE_LOGGING */
6267
6268 for (nobtcount = idx = 0; idx < count; idx++) {
6269 element = array[idx];
6270 if (!element) {
6271 continue;
6272 }
6273 if (kInstanceFlagReferenced & element) {
6274 continue;
6275 }
6276 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6277
6278 // see if we can find any backtrace left in the element
6279 btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t));
6280 if (btcount >= MAX_ZTRACE_DEPTH) {
6281 btcount = MAX_ZTRACE_DEPTH - 1;
6282 }
6283 for (btfound = btidx = 0; btidx < btcount; btidx++) {
6284 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
6285 if (!VM_KERNEL_IS_SLID(bt)) {
6286 break;
6287 }
6288 zbt[btfound++] = bt;
6289 }
6290 if (btfound) {
6291 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
6292 } else {
6293 nobtcount++;
6294 }
6295 }
6296 if (nobtcount) {
6297 // fake backtrace when we found nothing
6298 zbt[0] = (uintptr_t) &zalloc;
6299 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
6300 }
6301
6302 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
6303
6304 return KERN_SUCCESS;
6305 }
6306
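/*
* Backs the kern.run_zone_test sysctl: repeatedly zinit()s and
* zdestroy()s a throwaway zone (expecting the same zone to be handed
* back each time), exercises zalloc()/zfree() on it, and, when
* ZSECURITY_OPTIONS_SEQUESTER is enabled, checks that a fully freed
* zone ends up depopulated with its VA parked on pages_sequester and
* is repopulated by the next allocations.
*/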
6307 boolean_t
6308 run_zone_test(void)
6309 {
6310 unsigned int i = 0, max_iter = 5;
6311 void * test_ptr;
6312 zone_t test_zone;
6313
6314 simple_lock(&zone_test_lock, &zone_locks_grp);
6315 if (!zone_test_running) {
6316 zone_test_running = TRUE;
6317 } else {
6318 simple_unlock(&zone_test_lock);
6319 printf("run_zone_test: Test already running.\n");
6320 return FALSE;
6321 }
6322 simple_unlock(&zone_test_lock);
6323
6324 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
6325
6326 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
6327 do {
6328 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
6329 if (test_zone == NULL) {
6330 printf("run_zone_test: zinit() failed\n");
6331 return FALSE;
6332 }
6333
6334 #if KASAN_ZALLOC
6335 if (test_zone_ptr == NULL && test_zone->countfree != 0) {
6336 #else
6337 if (test_zone->countfree != 0) {
6338 #endif
6339 printf("run_zone_test: free count is not zero\n");
6340 return FALSE;
6341 }
6342
6343 if (test_zone_ptr == NULL) {
6344 /* Stash the zone pointer returned on the first zinit */
6345 printf("run_zone_test: zone created for the first time\n");
6346 test_zone_ptr = test_zone;
6347 } else if (test_zone != test_zone_ptr) {
6348 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
6349 return FALSE;
6350 }
6351
6352 test_ptr = zalloc(test_zone);
6353 if (test_ptr == NULL) {
6354 printf("run_zone_test: zalloc() failed\n");
6355 return FALSE;
6356 }
6357 zfree(test_zone, test_ptr);
6358
6359 zdestroy(test_zone);
6360 i++;
6361
6362 printf("run_zone_test: Iteration %d successful\n", i);
6363 } while (i < max_iter);
6364
6365 /* test Z_VA_SEQUESTER */
6366 if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
6367 int idx, num_allocs = 8;
6368 vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
6369 void *allocs[num_allocs];
6370 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
6371 vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);
6372
6373 test_zone = zone_create("test_zone_sysctl", elem_size,
6374 ZC_DESTRUCTIBLE | ZC_SEQUESTER);
6375 if (test_zone == NULL) {
6376 printf("run_zone_test: zinit() failed\n");
6377 return FALSE;
6378 }
6379
6380 for (idx = 0; idx < num_allocs; idx++) {
6381 allocs[idx] = zalloc(test_zone);
6382 assert(NULL != allocs[idx]);
6383 printf("alloc[%d] %p\n", idx, allocs[idx]);
6384 }
6385 for (idx = 0; idx < num_allocs; idx++) {
6386 zfree(test_zone, allocs[idx]);
6387 }
6388 assert(!zone_pva_is_null(test_zone->pages_all_free));
6389
6390 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6391 vm_page_wire_count, vm_page_free_count,
6392 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6393 zone_gc(FALSE);
6394 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6395 vm_page_wire_count, vm_page_free_count,
6396 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6397 unsigned int allva = 0;
6398 zone_index_foreach(zidx) {
6399 zone_t z = &zone_array[zidx];
6400 lock_zone(z);
6401 allva += z->page_count;
6402 if (!z->sequester_page_count) {
6403 unlock_zone(z);
6404 continue;
6405 }
6406 unsigned count = 0;
6407 uint64_t size;
6408 zone_pva_t pg = z->pages_sequester;
6409 struct zone_page_metadata *page_meta;
6410 while (pg.packed_address) {
6411 page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
6412 count += z->alloc_pages;
6413 pg = page_meta->zm_page_next;
6414 }
6415 assert(count == z->sequester_page_count);
6416 size = zone_size_wired(z);
6417 if (!size) {
6418 size = 1;
6419 }
6420 printf("%s%s: seq %d, res %d, %qd %%\n",
6421 zone_heap_name(z), z->z_name, z->sequester_page_count,
6422 z->page_count, zone_size_allocated(z) * 100ULL / size);
6423 unlock_zone(z);
6424 }
6425
6426 printf("total va: %d\n", allva);
6427
6428 assert(zone_pva_is_null(test_zone->pages_all_free));
6429 assert(!zone_pva_is_null(test_zone->pages_sequester));
6430 assert(2 == test_zone->sequester_page_count);
6431 for (idx = 0; idx < num_allocs; idx++) {
6432 assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
6433 }
6434 for (idx = 0; idx < num_allocs; idx++) {
6435 allocs[idx] = zalloc(test_zone);
6436 assert(allocs[idx]);
6437 printf("alloc[%d] %p\n", idx, allocs[idx]);
6438 }
6439 assert(zone_pva_is_null(test_zone->pages_sequester));
6440 assert(0 == test_zone->sequester_page_count);
6441 for (idx = 0; idx < num_allocs; idx++) {
6442 zfree(test_zone, allocs[idx]);
6443 }
6444 zdestroy(test_zone);
6445 } else {
6446 printf("run_zone_test: skipping sequester test (not enabled)\n");
6447 }
6448
6449 printf("run_zone_test: Test passed\n");
6450
6451 simple_lock(&zone_test_lock, &zone_locks_grp);
6452 zone_test_running = FALSE;
6453 simple_unlock(&zone_test_lock);
6454
6455 return TRUE;
6456 }
6457
6458 /*
6459 * Routines to test that zone garbage collection and zone replenish threads
6460 * running at the same time don't cause problems.
6461 */
6462
6463 void
6464 zone_gc_replenish_test(void)
6465 {
6466 zone_gc(FALSE);
6467 }
6468
6469
6470 void
6471 zone_alloc_replenish_test(void)
6472 {
6473 zone_t z = NULL;
6474 struct data { struct data *next; } *node, *list = NULL;
6475
6476 /*
6477 * Find a zone that has a replenish thread
6478 */
6479 zone_index_foreach(i) {
6480 z = &zone_array[i];
6481 if (z->prio_refill_count &&
6482 zone_elem_size(z) >= sizeof(struct data)) {
6483 z = &zone_array[i];
6484 break;
6485 }
6486 }
6487 if (z == NULL) {
6488 printf("Couldn't find a replenish zone\n");
6489 return;
6490 }
6491
6492 for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
6493 node = zalloc(z);
6494 node->next = list;
6495 list = node;
6496 }
6497
6498 /*
6499 * release the memory we allocated
6500 */
6501 while (list != NULL) {
6502 node = list;
6503 list = list->next;
6504 zfree(z, node);
6505 }
6506 }
6507
6508 #endif /* DEBUG || DEVELOPMENT */