1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/startup.h>
78 #include <kern/kern_types.h>
79 #include <kern/assert.h>
80 #include <kern/backtrace.h>
81 #include <kern/host.h>
82 #include <kern/macro_help.h>
83 #include <kern/sched.h>
84 #include <kern/locks.h>
85 #include <kern/sched_prim.h>
86 #include <kern/misc_protos.h>
87 #include <kern/thread_call.h>
88 #include <kern/zalloc_internal.h>
89 #include <kern/kalloc.h>
90
91 #include <prng/random.h>
92
93 #include <vm/pmap.h>
94 #include <vm/vm_map.h>
95 #include <vm/vm_kern.h>
96 #include <vm/vm_page.h>
97 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
98
99 #include <pexpert/pexpert.h>
100
101 #include <machine/machparam.h>
102 #include <machine/machine_routines.h> /* ml_cpu_get_info */
103
104 #include <os/atomic.h>
105
106 #include <libkern/OSDebug.h>
107 #include <libkern/OSAtomic.h>
108 #include <libkern/section_keywords.h>
109 #include <sys/kdebug.h>
110
111 #include <san/kasan.h>
112
113 #if KASAN_ZALLOC
114 #define ZONE_ENABLE_LOGGING 0
115 #elif DEBUG || DEVELOPMENT
116 #define ZONE_ENABLE_LOGGING 1
117 #else
118 #define ZONE_ENABLE_LOGGING 0
119 #endif
120
121 extern void vm_pageout_garbage_collect(int collect);
122
123 /* Returns pid of the task with the largest number of VM map entries. */
124 extern pid_t find_largest_process_vm_map_entries(void);
125
126 /*
127 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
128 * For any other pid we try to kill that process synchronously.
129 */
130 extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
131
132 extern zone_t vm_map_entry_zone;
133 extern zone_t vm_object_zone;
134 extern vm_offset_t kmapoff_kaddr;
135 extern unsigned int kmapoff_pgcnt;
136 extern unsigned int stack_total;
137 extern unsigned long long stack_allocs;
138
139 /*
140 * The max # of elements in a chunk should fit into
141 * zone_page_metadata.zm_alloc_count (uint16_t).
142 *
143 * Update this if the type of zm_alloc_count changes.
144 */
145 #define ZONE_CHUNK_MAXELEMENTS (UINT16_MAX)
146
147 #define ZONE_PAGECOUNT_BITS 14
148
149 /* Zone elements must fit both a next pointer and a backup pointer */
150 #define ZONE_MIN_ELEM_SIZE (2 * sizeof(vm_offset_t))
151 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
152
153 /* per-cpu zones are special because of counters */
154 #define ZONE_MIN_PCPU_ELEM_SIZE (1 * sizeof(vm_offset_t))
155
156 struct zone_map_range {
157 vm_offset_t min_address;
158 vm_offset_t max_address;
159 };
160
161 struct zone_page_metadata {
162 /* The index of the zone this metadata page belongs to */
163 zone_id_t zm_index;
164
165 /*
166 * zm_secondary_page == 0: number of pages in this run
167 * zm_secondary_page == 1: offset to the chunk start
168 */
169 uint16_t zm_page_count : ZONE_PAGECOUNT_BITS;
170
171 /* Whether this page is part of a chunk run */
172 uint16_t zm_percpu : 1;
173 uint16_t zm_secondary_page : 1;
174
175 /*
176 * The start of the freelist can be maintained as a 16-bit
177 * offset instead of a pointer because the free elements would
178 * be at max ZONE_MAX_ALLOC_SIZE bytes away from the start
179 * of the allocation chunk.
180 *
181 * Offset from start of the allocation chunk to free element
182 * list head.
183 */
184 uint16_t zm_freelist_offs;
185
186 /*
187 * zm_secondary_page == 0: number of allocated elements in the chunk
188 * zm_secondary_page == 1: unused
189 *
190 * PAGE_METADATA_EMPTY_FREELIST indicates an empty freelist
191 */
192 uint16_t zm_alloc_count;
193 #define PAGE_METADATA_EMPTY_FREELIST UINT16_MAX
194
195 zone_pva_t zm_page_next;
196 zone_pva_t zm_page_prev;
197
198 /*
199 * This is only for the sake of debuggers
200 */
201 #define ZONE_FOREIGN_COOKIE 0x123456789abcdef
202 uint64_t zm_foreign_cookie[];
203 };
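/*
 * Illustrative sketch of how zm_freelist_offs relates to element addresses;
 * it mirrors zone_page_meta_get_freelist() further down, assuming `meta` is
 * the primary (non-secondary) metadata of a native chunk:
 *
 *     vm_offset_t chunk = zone_pva_to_addr(zone_pva_from_meta(meta, ZONE_ADDR_NATIVE));
 *     vm_offset_t head  = (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST)
 *         ? 0 : chunk + meta->zm_freelist_offs;
 *
 * Because an allocation chunk is at most ZONE_MAX_ALLOC_SIZE (32KB) bytes,
 * the offset always fits in a uint16_t.
 */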
204
205
206 /* Align elements that use the zone page list to 32 byte boundaries. */
207 #define ZONE_PAGE_FIRST_OFFSET(kind) ((kind) == ZONE_ADDR_NATIVE ? 0 : 32)
208
209 static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
210
211 static __security_const_late struct {
212 struct zone_map_range zi_map_range;
213 struct zone_map_range zi_general_range;
214 struct zone_map_range zi_meta_range;
215 struct zone_map_range zi_foreign_range;
216
217 /*
218 * The metadata lives within the zi_meta_range address range.
219 *
220 * The correct formula to find a metadata index is:
221 * absolute_page_index - page_index(zi_meta_range.min_address)
222 *
223 * And then this index is used to dereference zi_meta_range.min_address
224 * as a `struct zone_page_metadata` array.
225 *
226 * To avoid doing that subtraction in the various fast paths,
227 * zi_array_base is pre-offset by `page_index(zi_meta_range.min_address)`
228 * so that the math doesn't have to be redone on every lookup.
229 */
230 struct zone_page_metadata *zi_array_base;
231 } zone_info;
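/*
 * A minimal sketch of the lookup this biasing enables, assuming `addr` is a
 * native zone-map address (this is effectively what zone_pva_from_addr() and
 * zone_pva_to_meta() below compute):
 *
 *     // unbiased form: subtract the base page index on every lookup
 *     //   idx  = page_index(addr) - page_index(zi_meta_range.min_address);
 *     //   meta = (struct zone_page_metadata *)zi_meta_range.min_address + idx;
 *     // biased form: one shift, no subtraction
 *     struct zone_page_metadata *meta =
 *         &zone_info.zi_array_base[(uint32_t)((intptr_t)addr >> PAGE_SHIFT)];
 */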
232
233 /*
234 * The zone_locks_grp allows for collecting lock statistics.
235 * All locks are associated with this group in zinit.
236 * Look at tools/lockstat for debugging lock contention.
237 */
238 LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
239 LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
240
241 /*
242 * Exclude more than one concurrent garbage collection
243 */
244 LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
245 LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
246
247 boolean_t panic_include_zprint = FALSE;
248 mach_memory_info_t *panic_kext_memory_info = NULL;
249 vm_size_t panic_kext_memory_size = 0;
250
251 /*
252 * Protects zone_array, num_zones, num_zones_in_use, and
253 * zone_destroyed_bitmap
254 */
255 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
256 static unsigned int num_zones_in_use;
257 unsigned int _Atomic num_zones;
258 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
259
260 #if KASAN_ZALLOC
261 #define MAX_ZONES 566
262 #else /* !KASAN_ZALLOC */
263 #define MAX_ZONES 402
264 #endif/* !KASAN_ZALLOC */
265 struct zone zone_array[MAX_ZONES];
266
267 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
268 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
269
270 /* Used to keep track of destroyed slots in the zone_array */
271 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
272
273 /* number of pages used by all zones */
274 static long _Atomic zones_phys_page_count;
275
276 /* number of zone mapped pages used by all zones */
277 static long _Atomic zones_phys_page_mapped_count;
278
279 #if CONFIG_ZALLOC_SEQUESTER
280 #define ZSECURITY_OPTIONS_SEQUESTER_DEFAULT ZSECURITY_OPTIONS_SEQUESTER
281 #else
282 #define ZSECURITY_OPTIONS_SEQUESTER_DEFAULT 0
283 #endif
284 /*
285 * Turn ZSECURITY_OPTIONS_STRICT_IOKIT_FREE off on x86 so as not to
286 * break third party kexts that haven't yet been recompiled
287 * to use the new iokit macros.
288 */
289 #if XNU_TARGET_OS_OSX && __x86_64__
290 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT 0
291 #else
292 #define ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT \
293 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE
294 #endif
295
296 #define ZSECURITY_DEFAULT ( \
297 ZSECURITY_OPTIONS_SEQUESTER_DEFAULT | \
298 ZSECURITY_OPTIONS_SUBMAP_USER_DATA | \
299 ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC | \
300 ZSECURITY_OPTIONS_STRICT_IOKIT_FREE_DEFAULT | \
301 0)
302 TUNABLE(zone_security_options_t, zsecurity_options, "zs", ZSECURITY_DEFAULT);
303
304 #if VM_MAX_TAG_ZONES
305 /* enable tags for zones that ask for it */
306 TUNABLE(bool, zone_tagging_on, "-zt", false);
307 #endif /* VM_MAX_TAG_ZONES */
308
309 #if DEBUG || DEVELOPMENT
310 TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
311 __options_decl(zalloc_debug_t, uint32_t, {
312 ZALLOC_DEBUG_ZONEGC = 0x00000001,
313 ZALLOC_DEBUG_ZCRAM = 0x00000002,
314 });
315
316 TUNABLE(zalloc_debug_t, zalloc_debug, "zalloc_debug", 0);
317 #endif /* DEBUG || DEVELOPMENT */
318 #if CONFIG_ZLEAKS
319 /* Makes pointer-scanning leak detection possible for all zones */
320 TUNABLE(bool, zone_leaks_scan_enable, "-zl", false);
321 #else
322 #define zone_leaks_scan_enable false
323 #endif
324
325 /*
326 * Async allocation of zones
327 * This mechanism allows for bootstrapping an empty zone which is set up with
328 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
329 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
330 * This will prime the zone for the next use.
331 *
332 * Currently the thread_call callout (zalloc_async) will loop through all zones
333 * looking for any zone with async_pending set and do the work for it.
334 *
335 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
336 * then zalloc_noblock to an empty zone may succeed.
337 */
338 static void zalloc_async(thread_call_param_t p0, thread_call_param_t p1);
339 static thread_call_data_t call_async_alloc;
340 static void zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size);
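/*
 * Rough sketch of the sequence described above (hypothetical caller; not an
 * exact trace of the implementation):
 *
 *     elt = zalloc_noblock(z);   // empty zone: returns NULL, marks the zone
 *                                // async_pending and arms call_async_alloc
 *     ...
 *     // later, from the thread_call, zalloc_async() finds the pending zone
 *     // and primes it:
 *     //     tmp = zalloc(z);    // may block to grow the zone
 *     //     zfree(z, tmp);
 *     // so a subsequent zalloc_noblock(z) can succeed.
 */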
341
342 /*
343 * Zone Corruption Debugging
344 *
345 * We use four techniques to detect modification of a zone element
346 * after it's been freed.
347 *
348 * (1) Check the freelist next pointer for sanity.
349 * (2) Store a backup of the next pointer at the end of the element,
350 * and compare it to the primary next pointer when the element is allocated
351 * to detect corruption of the freelist due to use-after-free bugs.
352 * The backup pointer is also XORed with a per-boot random cookie.
353 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
354 * and check for that value when the element is being reused to make sure
355 * no part of the element has been modified while it was on the freelist.
356 * This will also help catch read-after-frees, as code will now dereference
357 * 0xdeadbeef instead of a valid but freed pointer.
358 * (4) If the zfree_clear_mem flag is set clear the element on free and
359 * assert that it is still clear when alloc-ed.
360 *
361 * (1) and (2) occur for every allocation and free to a zone.
362 * This is done to make it slightly more difficult for an attacker to
363 * manipulate the freelist to behave in a specific way.
364 *
365 * Poisoning (3) occurs periodically for every N frees (counted per-zone).
366 * If -zp is passed as a boot arg, poisoning occurs for every free.
367 *
368 * Zeroing (4) is done for those zones that pass the ZC_ZFREE_CLEARMEM
369 * flag on creation or if the element size is less than one cacheline.
370 *
371 * Performance slowdown is inversely proportional to the poisoning period N,
372 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
373 * and higher. You can expect to find a 100% reproducible bug in an average of
374 * N tries, with a standard deviation of about N, but you will want to set
375 * "-zp" to always poison every free if you are attempting to reproduce
376 * a known bug.
377 *
378 * For a more heavyweight, but finer-grained method of detecting misuse
379 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
380 *
381 * Zone Corruption Logging
382 *
383 * You can also track where corruptions come from by using the boot-arguments
384 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
385 * in this file for more implementation and usage information.
386 *
387 * Zone Leak Detection
388 *
389 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
390 * found later in this file via the showtopztrace and showz* macros in kgmacros,
391 * or use zlog without the -zc argument.
392 *
393 */
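/*
 * Conceptual sketch of checks (1) and (2) above; the real checks live in the
 * allocation path further down, and `z`, `meta`, `page` and `elem` are
 * hypothetical locals describing a free element:
 *
 *     vm_offset_t next   = *(vm_offset_t *)elem;
 *     vm_offset_t backup = *get_backup_ptr(zone_elem_size(z), (vm_offset_t *)elem);
 *
 *     // unpoisoned element: backup == next ^ zp_nopoison_cookie
 *     // poisoned element:   backup == next ^ zp_poisoned_cookie (low bit set)
 *     if (next != (backup ^ zp_nopoison_cookie) &&
 *         next != (backup ^ zp_poisoned_cookie)) {
 *             backup_ptr_mismatch_panic(z, meta, page, elem);
 *     }
 */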
394
395 #define ZP_DEFAULT_SAMPLING_FACTOR 16
396 #define ZP_DEFAULT_SCALE_FACTOR 4
397
398 /*
399 * set by zp-factor=N boot arg
400 *
401 * A zp_factor of 0 indicates zone poisoning is disabled and can also be set by
402 * passing the -no-zp boot-arg.
403 *
404 * A zp_factor of 1 indicates zone poisoning is on for all elements and can be
405 * set by passing the -zp boot-arg.
406 */
407 static TUNABLE(uint32_t, zp_factor, "zp-factor", ZP_DEFAULT_SAMPLING_FACTOR);
408
409 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
410 static TUNABLE(uint32_t, zp_scale, "zp-scale", ZP_DEFAULT_SCALE_FACTOR);
411
412 /* initialized to a per-boot random value in zp_bootstrap */
413 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_poisoned_cookie;
414 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_nopoison_cookie;
415 static SECURITY_READ_ONLY_LATE(uintptr_t) zp_min_size;
416 static SECURITY_READ_ONLY_LATE(uint64_t) zone_phys_mapped_max;
417
418 static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
419 static SECURITY_READ_ONLY_LATE(uint32_t) zone_last_submap_idx;
420
421 static struct bool_gen zone_bool_gen;
422 static zone_t zone_find_largest(void);
423 static void zone_drop_free_elements(zone_t z);
424
425 #define submap_for_zone(z) zone_submaps[(z)->submap_idx]
426 #define MAX_SUBMAP_NAME 16
427
428 /* Globals for random boolean generator for elements in free list */
429 #define MAX_ENTROPY_PER_ZCRAM 4
430
431 #if CONFIG_ZCACHE
432 /*
433 * Specifies a single zone to enable CPU caching for.
434 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
435 */
436 static char cache_zone_name[MAX_ZONE_NAME];
437 static TUNABLE(bool, zcc_kalloc, "zcc_kalloc", false);
438
439 __header_always_inline bool
440 zone_caching_enabled(zone_t z)
441 {
442 return z->zcache.zcc_depot != NULL;
443 }
444 #else
445 __header_always_inline bool
446 zone_caching_enabled(zone_t z __unused)
447 {
448 return false;
449 }
450 #endif /* CONFIG_ZCACHE */
451
452 #pragma mark Zone metadata
453
454 __enum_closed_decl(zone_addr_kind_t, bool, {
455 ZONE_ADDR_NATIVE,
456 ZONE_ADDR_FOREIGN,
457 });
458
459 static inline zone_id_t
460 zone_index(zone_t z)
461 {
462 return (zone_id_t)(z - zone_array);
463 }
464
465 static inline bool
466 zone_has_index(zone_t z, zone_id_t zid)
467 {
468 return zone_array + zid == z;
469 }
470
471 static inline vm_size_t
472 zone_elem_count(zone_t zone, vm_size_t alloc_size, zone_addr_kind_t kind)
473 {
474 if (kind == ZONE_ADDR_NATIVE) {
475 if (zone->percpu) {
476 return PAGE_SIZE / zone_elem_size(zone);
477 }
478 return alloc_size / zone_elem_size(zone);
479 } else {
480 assert(alloc_size == PAGE_SIZE);
481 return (PAGE_SIZE - ZONE_PAGE_FIRST_OFFSET(kind)) / zone_elem_size(zone);
482 }
483 }
484
485 __abortlike
486 static void
487 zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
488 const char *kind)
489 {
490 panic("zone metadata corruption: %s (meta %p, zone %s%s)",
491 kind, meta, zone_heap_name(zone), zone->z_name);
492 }
493
494 __abortlike
495 static void
496 zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
497 {
498 panic("zone element pointer validation failed (addr: %p, zone %s%s)",
499 (void *)addr, zone_heap_name(zone), zone->z_name);
500 }
501
502 __abortlike
503 static void
504 zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
505 struct zone_page_metadata *meta)
506 {
507 panic("%p not in the expected zone %s%s (%d != %d)",
508 (void *)addr, zone_heap_name(zone), zone->z_name,
509 meta->zm_index, zone_index(zone));
510 }
511
512 __abortlike
513 static void
514 zone_page_metadata_native_queue_corruption(zone_t zone, zone_pva_t *queue)
515 {
516 panic("foreign metadata index %d enqueued in native head %p from zone %s%s",
517 queue->packed_address, queue, zone_heap_name(zone),
518 zone->z_name);
519 }
520
521 __abortlike
522 static void
523 zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
524 {
525 panic("metadata list corruption through element %p detected in zone %s%s",
526 meta, zone_heap_name(zone), zone->z_name);
527 }
528
529 __abortlike
530 static void
531 zone_page_metadata_foreign_queue_corruption(zone_t zone, zone_pva_t *queue)
532 {
533 panic("native metadata index %d enqueued in foreign head %p from zone %s%s",
534 queue->packed_address, queue, zone_heap_name(zone), zone->z_name);
535 }
536
537 __abortlike
538 static void
539 zone_page_metadata_foreign_confusion_panic(zone_t zone, vm_offset_t addr)
540 {
541 panic("manipulating foreign address %p in a native-only zone %s%s",
542 (void *)addr, zone_heap_name(zone), zone->z_name);
543 }
544
545 __abortlike __unused
546 static void
547 zone_invalid_foreign_addr_panic(zone_t zone, vm_offset_t addr)
548 {
549 panic("addr %p being freed to foreign zone %s%s not from foreign range",
550 (void *)addr, zone_heap_name(zone), zone->z_name);
551 }
552
553 __abortlike
554 static void
555 zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
556 const char *kind)
557 {
558 panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
559 zone_heap_name(zone), zone->z_name, meta);
560 }
561
562 __abortlike
563 static void
564 zone_accounting_panic(zone_t zone, const char *kind)
565 {
566 panic("accounting mismatch (%s) for zone %s%s", kind,
567 zone_heap_name(zone), zone->z_name);
568 }
569
570 __abortlike
571 static void
572 zone_nofail_panic(zone_t zone)
573 {
574 panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
575 zone_heap_name(zone), zone->z_name);
576 }
577
578 #if __arm64__
579 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
580 #define zone_range_load(r, rmin, rmax) \
581 asm("ldp %[rmin], %[rmax], [%[range]]" \
582 : [rmin] "=r"(rmin), [rmax] "=r"(rmax) \
583 : [range] "r"(r))
584 #else
585 #define zone_range_load(r, rmin, rmax) \
586 ({ rmin = (r)->min_address; rmax = (r)->max_address; })
587 #endif
588
589 __header_always_inline bool
590 zone_range_contains(const struct zone_map_range *r, vm_offset_t addr, vm_offset_t size)
591 {
592 vm_offset_t rmin, rmax;
593
594 /*
595 * The `&` is not a typo: we really expect the check to pass,
596 * so encourage the compiler to eagerly load and test without branches
597 */
598 zone_range_load(r, rmin, rmax);
599 return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
600 }
601
602 __header_always_inline vm_size_t
603 zone_range_size(const struct zone_map_range *r)
604 {
605 vm_offset_t rmin, rmax;
606
607 zone_range_load(r, rmin, rmax);
608 return rmax - rmin;
609 }
610
611 #define from_zone_map(addr, size) \
612 zone_range_contains(&zone_info.zi_map_range, (vm_offset_t)(addr), size)
613
614 #define from_general_submap(addr, size) \
615 zone_range_contains(&zone_info.zi_general_range, (vm_offset_t)(addr), size)
616
617 #define from_foreign_range(addr, size) \
618 zone_range_contains(&zone_info.zi_foreign_range, (vm_offset_t)(addr), size)
619
620 #define from_native_meta_map(addr) \
621 zone_range_contains(&zone_info.zi_meta_range, (vm_offset_t)(addr), \
622 sizeof(struct zone_page_metadata))
623
624 #define zone_addr_kind(addr, size) \
625 (from_zone_map(addr, size) ? ZONE_ADDR_NATIVE : ZONE_ADDR_FOREIGN)
626
627 __header_always_inline bool
628 zone_pva_is_null(zone_pva_t page)
629 {
630 return page.packed_address == 0;
631 }
632
633 __header_always_inline bool
634 zone_pva_is_queue(zone_pva_t page)
635 {
636 // actual kernel pages have the top bit set
637 return (int32_t)page.packed_address > 0;
638 }
639
640 __header_always_inline bool
641 zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
642 {
643 return pva1.packed_address == pva2.packed_address;
644 }
645
646 __header_always_inline void
647 zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
648 struct zone_page_metadata *meta)
649 {
650 zone_pva_t *queue_head = &((zone_pva_t *)zone_array)[queue.packed_address];
651
652 if (!zone_pva_is_equal(*queue_head, oldv)) {
653 zone_page_metadata_list_corruption(z, meta);
654 }
655 *queue_head = meta->zm_page_next;
656 }
657
658 __header_always_inline zone_pva_t
659 zone_queue_encode(zone_pva_t *headp)
660 {
661 return (zone_pva_t){ (uint32_t)(headp - (zone_pva_t *)zone_array) };
662 }
663
664 __header_always_inline zone_pva_t
665 zone_pva_from_addr(vm_address_t addr)
666 {
667 // cannot use atop() because we want to maintain the sign bit
668 return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
669 }
670
671 __header_always_inline vm_address_t
672 zone_pva_to_addr(zone_pva_t page)
673 {
674 // cause sign extension so that we end up with the right address
675 return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
676 }
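/*
 * Worked example of the packed encoding, assuming `addr` is a kernel virtual
 * address inside the zone map:
 *
 *     zone_pva_t pva = zone_pva_from_addr(addr); // addr >> PAGE_SHIFT, sign kept
 *     assert(zone_pva_to_addr(pva) == trunc_page(addr));
 *     assert(!zone_pva_is_queue(pva));           // kernel VAs sign-extend negative
 *
 * Queue heads, by contrast, are small positive indices into zone_array (see
 * zone_queue_encode() above), so the sign bit alone tells the two apart.
 */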
677
678 __header_always_inline struct zone_page_metadata *
679 zone_pva_to_meta(zone_pva_t page, zone_addr_kind_t kind)
680 {
681 if (kind == ZONE_ADDR_NATIVE) {
682 return &zone_info.zi_array_base[page.packed_address];
683 } else {
684 return (struct zone_page_metadata *)zone_pva_to_addr(page);
685 }
686 }
687
688 __header_always_inline zone_pva_t
689 zone_pva_from_meta(struct zone_page_metadata *meta, zone_addr_kind_t kind)
690 {
691 if (kind == ZONE_ADDR_NATIVE) {
692 uint32_t index = (uint32_t)(meta - zone_info.zi_array_base);
693 return (zone_pva_t){ index };
694 } else {
695 return zone_pva_from_addr((vm_address_t)meta);
696 }
697 }
698
699 __header_always_inline struct zone_page_metadata *
700 zone_meta_from_addr(vm_offset_t addr, zone_addr_kind_t kind)
701 {
702 if (kind == ZONE_ADDR_NATIVE) {
703 return zone_pva_to_meta(zone_pva_from_addr(addr), kind);
704 } else {
705 return (struct zone_page_metadata *)trunc_page(addr);
706 }
707 }
708
709 #define zone_native_meta_from_addr(addr) \
710 zone_meta_from_addr((vm_offset_t)(addr), ZONE_ADDR_NATIVE)
711
712 __header_always_inline vm_offset_t
713 zone_meta_to_addr(struct zone_page_metadata *meta, zone_addr_kind_t kind)
714 {
715 if (kind == ZONE_ADDR_NATIVE) {
716 return ptoa((int)(meta - zone_info.zi_array_base));
717 } else {
718 return (vm_offset_t)meta;
719 }
720 }
721
722 __header_always_inline void
723 zone_meta_queue_push(zone_t z, zone_pva_t *headp,
724 struct zone_page_metadata *meta, zone_addr_kind_t kind)
725 {
726 zone_pva_t head = *headp;
727 zone_pva_t queue_pva = zone_queue_encode(headp);
728 struct zone_page_metadata *tmp;
729
730 meta->zm_page_next = head;
731 if (!zone_pva_is_null(head)) {
732 tmp = zone_pva_to_meta(head, kind);
733 if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
734 zone_page_metadata_list_corruption(z, meta);
735 }
736 tmp->zm_page_prev = zone_pva_from_meta(meta, kind);
737 }
738 meta->zm_page_prev = queue_pva;
739 *headp = zone_pva_from_meta(meta, kind);
740 }
741
742 __header_always_inline struct zone_page_metadata *
743 zone_meta_queue_pop(zone_t z, zone_pva_t *headp, zone_addr_kind_t kind,
744 vm_offset_t *page_addrp)
745 {
746 zone_pva_t head = *headp;
747 struct zone_page_metadata *meta = zone_pva_to_meta(head, kind);
748 vm_offset_t page_addr = zone_pva_to_addr(head);
749 struct zone_page_metadata *tmp;
750
751 if (kind == ZONE_ADDR_NATIVE && !from_native_meta_map(meta)) {
752 zone_page_metadata_native_queue_corruption(z, headp);
753 }
754 if (kind == ZONE_ADDR_FOREIGN && from_zone_map(meta, sizeof(*meta))) {
755 zone_page_metadata_foreign_queue_corruption(z, headp);
756 }
757
758 if (!zone_pva_is_null(meta->zm_page_next)) {
759 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
760 if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
761 zone_page_metadata_list_corruption(z, meta);
762 }
763 tmp->zm_page_prev = meta->zm_page_prev;
764 }
765 *headp = meta->zm_page_next;
766
767 *page_addrp = page_addr;
768 return meta;
769 }
770
771 __header_always_inline void
772 zone_meta_requeue(zone_t z, zone_pva_t *headp,
773 struct zone_page_metadata *meta, zone_addr_kind_t kind)
774 {
775 zone_pva_t meta_pva = zone_pva_from_meta(meta, kind);
776 struct zone_page_metadata *tmp;
777
778 if (!zone_pva_is_null(meta->zm_page_next)) {
779 tmp = zone_pva_to_meta(meta->zm_page_next, kind);
780 if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
781 zone_page_metadata_list_corruption(z, meta);
782 }
783 tmp->zm_page_prev = meta->zm_page_prev;
784 }
785 if (zone_pva_is_queue(meta->zm_page_prev)) {
786 zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
787 } else {
788 tmp = zone_pva_to_meta(meta->zm_page_prev, kind);
789 if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
790 zone_page_metadata_list_corruption(z, meta);
791 }
792 tmp->zm_page_next = meta->zm_page_next;
793 }
794
795 zone_meta_queue_push(z, headp, meta, kind);
796 }
797
798 /*
799 * Routine to populate a page backing metadata in the zone_metadata_region.
800 * Must be called without the zone lock held as it might potentially block.
801 */
802 static void
803 zone_meta_populate(struct zone_page_metadata *from, struct zone_page_metadata *to)
804 {
805 vm_offset_t page_addr = trunc_page(from);
806
807 for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
808 #if !KASAN_ZALLOC
809 /*
810 * This can race with another thread doing a populate on the same metadata
811 * page, where we see an updated pmap but unmapped KASan shadow, causing a
812 * fault in the shadow when we first access the metadata page. Avoid this
813 * by always synchronizing on the zone_metadata_region lock with KASan.
814 */
815 if (pmap_find_phys(kernel_pmap, page_addr)) {
816 continue;
817 }
818 #endif
819
820 for (;;) {
821 kern_return_t ret = KERN_SUCCESS;
822
823 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
824 lck_mtx_lock(&zone_metadata_region_lck);
825 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
826 ret = kernel_memory_populate(kernel_map, page_addr,
827 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
828 VM_KERN_MEMORY_OSFMK);
829 }
830 lck_mtx_unlock(&zone_metadata_region_lck);
831
832 if (ret == KERN_SUCCESS) {
833 break;
834 }
835
836 /*
837 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
838 * to bad system deadlocks, so if the allocation failed,
839 * we need to do the VM_PAGE_WAIT() outside of the lock.
840 */
841 VM_PAGE_WAIT();
842 }
843 }
844 }
845
846 static inline bool
847 zone_allocated_element_offset_is_valid(zone_t zone, vm_offset_t addr,
848 vm_offset_t page, zone_addr_kind_t kind)
849 {
850 vm_offset_t offs = addr - page - ZONE_PAGE_FIRST_OFFSET(kind);
851 vm_offset_t esize = zone_elem_size(zone);
852
853 if (esize & (esize - 1)) { /* not a power of 2 */
854 return (offs % esize) == 0;
855 } else {
856 return (offs & (esize - 1)) == 0;
857 }
858 }
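/*
 * For example (hypothetical element sizes): a 48-byte element is not a power
 * of two, so the check is `(offs % 48) == 0`; a 64-byte element takes the
 * cheaper mask path, `(offs & 63) == 0`.
 */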
859
860 __attribute__((always_inline))
861 static struct zone_page_metadata *
862 zone_allocated_element_resolve(zone_t zone, vm_offset_t addr,
863 vm_offset_t *pagep, zone_addr_kind_t *kindp)
864 {
865 struct zone_page_metadata *meta;
866 zone_addr_kind_t kind;
867 vm_offset_t page;
868 vm_offset_t esize = zone_elem_size(zone);
869
870 kind = zone_addr_kind(addr, esize);
871 page = trunc_page(addr);
872 meta = zone_meta_from_addr(addr, kind);
873
874 if (kind == ZONE_ADDR_NATIVE) {
875 if (meta->zm_secondary_page) {
876 if (meta->zm_percpu) {
877 zone_invalid_element_addr_panic(zone, addr);
878 }
879 page -= ptoa(meta->zm_page_count);
880 meta -= meta->zm_page_count;
881 }
882 } else if (!zone->allows_foreign) {
883 zone_page_metadata_foreign_confusion_panic(zone, addr);
884 #if __LP64__
885 } else if (!from_foreign_range(addr, esize)) {
886 zone_invalid_foreign_addr_panic(zone, addr);
887 #else
888 } else if (!pmap_kernel_va(addr)) {
889 zone_invalid_element_addr_panic(zone, addr);
890 #endif
891 }
892
893 if (!zone_allocated_element_offset_is_valid(zone, addr, page, kind)) {
894 zone_invalid_element_addr_panic(zone, addr);
895 }
896
897 if (!zone_has_index(zone, meta->zm_index)) {
898 zone_page_metadata_index_confusion_panic(zone, addr, meta);
899 }
900
901 if (kindp) {
902 *kindp = kind;
903 }
904 if (pagep) {
905 *pagep = page;
906 }
907 return meta;
908 }
909
910 __attribute__((always_inline))
911 void
912 zone_allocated_element_validate(zone_t zone, vm_offset_t addr)
913 {
914 zone_allocated_element_resolve(zone, addr, NULL, NULL);
915 }
916
917 __header_always_inline vm_offset_t
918 zone_page_meta_get_freelist(zone_t zone, struct zone_page_metadata *meta,
919 vm_offset_t page)
920 {
921 assert(!meta->zm_secondary_page);
922 if (meta->zm_freelist_offs == PAGE_METADATA_EMPTY_FREELIST) {
923 return 0;
924 }
925
926 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
927 if (meta->zm_freelist_offs + zone_elem_size(zone) > size) {
928 zone_metadata_corruption(zone, meta, "freelist corruption");
929 }
930
931 return page + meta->zm_freelist_offs;
932 }
933
934 __header_always_inline void
935 zone_page_meta_set_freelist(struct zone_page_metadata *meta,
936 vm_offset_t page, vm_offset_t addr)
937 {
938 assert(!meta->zm_secondary_page);
939 if (addr) {
940 meta->zm_freelist_offs = (uint16_t)(addr - page);
941 } else {
942 meta->zm_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
943 }
944 }
945
946 static bool
947 zone_page_meta_is_sane_element(zone_t zone, struct zone_page_metadata *meta,
948 vm_offset_t page, vm_offset_t element, zone_addr_kind_t kind)
949 {
950 if (element == 0) {
951 /* ends of the freelist are NULL */
952 return true;
953 }
954 if (element < page + ZONE_PAGE_FIRST_OFFSET(kind)) {
955 return false;
956 }
957 vm_size_t size = ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
958 if (element > page + size - zone_elem_size(zone)) {
959 return false;
960 }
961 return true;
962 }
963
964 /* Routine to get the size of a zone allocated address.
965 * If the address doesn't belong to the zone maps, returns 0.
966 */
967 vm_size_t
968 zone_element_size(void *addr, zone_t *z)
969 {
970 struct zone_page_metadata *meta;
971 struct zone *src_zone;
972
973 if (from_zone_map(addr, sizeof(void *))) {
974 meta = zone_native_meta_from_addr(addr);
975 src_zone = &zone_array[meta->zm_index];
976 if (z) {
977 *z = src_zone;
978 }
979 return zone_elem_size(src_zone);
980 }
981 #if CONFIG_GZALLOC
982 if (__improbable(gzalloc_enabled())) {
983 vm_size_t gzsize;
984 if (gzalloc_element_size(addr, z, &gzsize)) {
985 return gzsize;
986 }
987 }
988 #endif /* CONFIG_GZALLOC */
989
990 return 0;
991 }
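/*
 * Example use (hypothetical caller): discover the zone and size backing an
 * arbitrary pointer, treating 0 as "not zone memory".
 *
 *     zone_t z = ZONE_NULL;
 *     vm_size_t sz = zone_element_size(ptr, &z);
 *     if (sz == 0) {
 *         // `ptr` is neither zone-map memory nor a gzalloc allocation
 *     }
 */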
992
993 /* This function just formats the reason for the panics by redoing the checks */
994 __abortlike
995 static void
996 zone_require_panic(zone_t zone, void *addr)
997 {
998 uint32_t zindex;
999 zone_t other;
1000
1001 if (!from_zone_map(addr, zone_elem_size(zone))) {
1002 panic("zone_require failed: address not in a zone (addr: %p)", addr);
1003 }
1004
1005 zindex = zone_native_meta_from_addr(addr)->zm_index;
1006 other = &zone_array[zindex];
1007 if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1008 panic("zone_require failed: invalid zone index %d "
1009 "(addr: %p, expected: %s%s)", zindex,
1010 addr, zone_heap_name(zone), zone->z_name);
1011 } else {
1012 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1013 "(addr: %p, expected: %s%s)",
1014 zindex, zone_heap_name(other), other->z_name,
1015 addr, zone_heap_name(zone), zone->z_name);
1016 }
1017 }
1018
1019 __abortlike
1020 static void
1021 zone_id_require_panic(zone_id_t zid, void *addr)
1022 {
1023 zone_require_panic(&zone_array[zid], addr);
1024 }
1025
1026 /*
1027 * Routines to panic if a pointer is not mapped to an expected zone.
1028 * This can be used as a means of pinning an object to the zone it is expected
1029 * to be a part of. Causes a panic if the address does not belong to the
1030 * specified zone, does not belong to any zone, has been freed and therefore
1031 * unmapped from the zone, or the pointer contains an uninitialized value that
1032 * does not belong to any zone.
1033 *
1034 * Note that this can only work with collectable zones without foreign pages.
1035 */
1036 void
1037 zone_require(zone_t zone, void *addr)
1038 {
1039 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1040 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1041 return;
1042 }
1043 #if CONFIG_GZALLOC
1044 if (__probable(gzalloc_enabled())) {
1045 return;
1046 }
1047 #endif
1048 zone_require_panic(zone, addr);
1049 }
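/*
 * Example use (hypothetical caller): pin an object to the zone it is expected
 * to come from before trusting it; vm_map_entry_zone is the extern declared
 * near the top of this file.
 *
 *     zone_require(vm_map_entry_zone, (void *)entry);
 *     // only reached if `entry` is backed by vm_map_entry_zone (or gzalloc
 *     // is enabled); otherwise zone_require_panic() fires
 */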
1050
1051 void
1052 zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
1053 {
1054 if (__probable(from_general_submap(addr, esize) &&
1055 (zid == zone_native_meta_from_addr(addr)->zm_index))) {
1056 return;
1057 }
1058 #if CONFIG_GZALLOC
1059 if (__probable(gzalloc_enabled())) {
1060 return;
1061 }
1062 #endif
1063 zone_id_require_panic(zid, addr);
1064 }
1065
1066 bool
1067 zone_owns(zone_t zone, void *addr)
1068 {
1069 if (__probable(from_general_submap(addr, zone_elem_size(zone)) &&
1070 (zone_has_index(zone, zone_native_meta_from_addr(addr)->zm_index)))) {
1071 return true;
1072 }
1073 #if CONFIG_GZALLOC
1074 if (__probable(gzalloc_enabled())) {
1075 return true;
1076 }
1077 #endif
1078 return false;
1079 }
1080
1081 #pragma mark ZTAGS
1082 #if VM_MAX_TAG_ZONES
1083
1084 // for zones with tagging enabled:
1085
1086 // calculate a pointer to the tag base entry,
1087 // holding either a uint32_t with the first tag offset for a page in the zone map,
1088 // or two uint16_t tags if the page can only hold one or two elements
1089
1090 #define ZTAGBASE(zone, element) \
1091 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_info.zi_map_range.min_address)])
1092
1093 // pointer to the tag for an element
1094 #define ZTAG(zone, element) \
1095 ({ \
1096 vm_tag_t * result; \
1097 if ((zone)->tags_inline) { \
1098 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
1099 if ((page_mask & element) >= zone_elem_size(zone)) result++; \
1100 } else { \
1101 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / zone_elem_size((zone))]; \
1102 } \
1103 result; \
1104 })
1105
1106
1107 static vm_offset_t zone_tagbase_min;
1108 static vm_offset_t zone_tagbase_max;
1109 static vm_offset_t zone_tagbase_map_size;
1110 static vm_map_t zone_tagbase_map;
1111
1112 static vm_offset_t zone_tags_min;
1113 static vm_offset_t zone_tags_max;
1114 static vm_offset_t zone_tags_map_size;
1115 static vm_map_t zone_tags_map;
1116
1117 // simple heap allocator for allocating the tags for new memory
1118
1119 LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
1120
1121 enum{
1122 ztFreeIndexCount = 8,
1123 ztFreeIndexMax = (ztFreeIndexCount - 1),
1124 ztTagsPerBlock = 4
1125 };
1126
1127 struct ztBlock {
1128 #if __LITTLE_ENDIAN__
1129 uint64_t free:1,
1130 next:21,
1131 prev:21,
1132 size:21;
1133 #else
1134 // ztBlock needs free bit least significant
1135 #error !__LITTLE_ENDIAN__
1136 #endif
1137 };
1138 typedef struct ztBlock ztBlock;
1139
1140 static ztBlock * ztBlocks;
1141 static uint32_t ztBlocksCount;
1142 static uint32_t ztBlocksFree;
1143
1144 static uint32_t
1145 ztLog2up(uint32_t size)
1146 {
1147 if (1 == size) {
1148 size = 0;
1149 } else {
1150 size = 32 - __builtin_clz(size - 1);
1151 }
1152 return size;
1153 }
1154
1155 static uint32_t
1156 ztLog2down(uint32_t size)
1157 {
1158 size = 31 - __builtin_clz(size);
1159 return size;
1160 }
1161
1162 static void
1163 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
1164 {
1165 vm_map_offset_t addr = (vm_map_offset_t) address;
1166 vm_map_offset_t page, end;
1167
1168 page = trunc_page(addr);
1169 end = round_page(addr + size);
1170
1171 for (; page < end; page += page_size) {
1172 if (!pmap_find_phys(kernel_pmap, page)) {
1173 kern_return_t __unused
1174 ret = kernel_memory_populate(map, page, PAGE_SIZE,
1175 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
1176 assert(ret == KERN_SUCCESS);
1177 }
1178 }
1179 }
1180
1181 static boolean_t
1182 ztPresent(const void * address, size_t size)
1183 {
1184 vm_map_offset_t addr = (vm_map_offset_t) address;
1185 vm_map_offset_t page, end;
1186 boolean_t result;
1187
1188 page = trunc_page(addr);
1189 end = round_page(addr + size);
1190 for (result = TRUE; (page < end); page += page_size) {
1191 result = pmap_find_phys(kernel_pmap, page);
1192 if (!result) {
1193 break;
1194 }
1195 }
1196 return result;
1197 }
1198
1199
1200 void __unused
1201 ztDump(boolean_t sanity);
1202 void __unused
1203 ztDump(boolean_t sanity)
1204 {
1205 uint32_t q, cq, p;
1206
1207 for (q = 0; q <= ztFreeIndexMax; q++) {
1208 p = q;
1209 do{
1210 if (sanity) {
1211 cq = ztLog2down(ztBlocks[p].size);
1212 if (cq > ztFreeIndexMax) {
1213 cq = ztFreeIndexMax;
1214 }
1215 if (!ztBlocks[p].free
1216 || ((p != q) && (q != cq))
1217 || (ztBlocks[ztBlocks[p].next].prev != p)
1218 || (ztBlocks[ztBlocks[p].prev].next != p)) {
1219 kprintf("zterror at %d", p);
1220 ztDump(FALSE);
1221 kprintf("zterror at %d", p);
1222 assert(FALSE);
1223 }
1224 continue;
1225 }
1226 kprintf("zt[%03d]%c %d, %d, %d\n",
1227 p, ztBlocks[p].free ? 'F' : 'A',
1228 ztBlocks[p].next, ztBlocks[p].prev,
1229 ztBlocks[p].size);
1230 p = ztBlocks[p].next;
1231 if (p == q) {
1232 break;
1233 }
1234 }while (p != q);
1235 if (!sanity) {
1236 printf("\n");
1237 }
1238 }
1239 if (!sanity) {
1240 printf("-----------------------\n");
1241 }
1242 }
1243
1244
1245
1246 #define ZTBDEQ(idx) \
1247 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
1248 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
1249
1250 static void
1251 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
1252 {
1253 uint32_t q, w, p, size, merge;
1254
1255 assert(count);
1256 ztBlocksFree += count;
1257
1258 // merge with the following block (at index + count)
1259 merge = (index + count);
1260 if ((merge < ztBlocksCount)
1261 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1262 && ztBlocks[merge].free) {
1263 ZTBDEQ(merge);
1264 count += ztBlocks[merge].size;
1265 }
1266
1267 // merge with the preceding block (ending at index - 1)
1268 merge = (index - 1);
1269 if ((merge > ztFreeIndexMax)
1270 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
1271 && ztBlocks[merge].free) {
1272 size = ztBlocks[merge].size;
1273 count += size;
1274 index -= size;
1275 ZTBDEQ(index);
1276 }
1277
1278 q = ztLog2down(count);
1279 if (q > ztFreeIndexMax) {
1280 q = ztFreeIndexMax;
1281 }
1282 w = q;
1283 // queue in order of size
1284 while (TRUE) {
1285 p = ztBlocks[w].next;
1286 if (p == q) {
1287 break;
1288 }
1289 if (ztBlocks[p].size >= count) {
1290 break;
1291 }
1292 w = p;
1293 }
1294 ztBlocks[p].prev = index;
1295 ztBlocks[w].next = index;
1296
1297 // fault in first
1298 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1299
1300 // mark first & last with free flag and size
1301 ztBlocks[index].free = TRUE;
1302 ztBlocks[index].size = count;
1303 ztBlocks[index].prev = w;
1304 ztBlocks[index].next = p;
1305 if (count > 1) {
1306 index += (count - 1);
1307 // fault in last
1308 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
1309 ztBlocks[index].free = TRUE;
1310 ztBlocks[index].size = count;
1311 }
1312 }
1313
1314 static uint32_t
1315 ztAlloc(zone_t zone, uint32_t count)
1316 {
1317 uint32_t q, w, p, leftover;
1318
1319 assert(count);
1320
1321 q = ztLog2up(count);
1322 if (q > ztFreeIndexMax) {
1323 q = ztFreeIndexMax;
1324 }
1325 do{
1326 w = q;
1327 while (TRUE) {
1328 p = ztBlocks[w].next;
1329 if (p == q) {
1330 break;
1331 }
1332 if (ztBlocks[p].size >= count) {
1333 // dequeue, mark both ends allocated
1334 ztBlocks[w].next = ztBlocks[p].next;
1335 ztBlocks[ztBlocks[p].next].prev = w;
1336 ztBlocks[p].free = FALSE;
1337 ztBlocksFree -= ztBlocks[p].size;
1338 if (ztBlocks[p].size > 1) {
1339 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
1340 }
1341
1342 // fault all the allocation
1343 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
1344 // mark last as allocated
1345 if (count > 1) {
1346 ztBlocks[p + count - 1].free = FALSE;
1347 }
1348 // free remainder
1349 leftover = ztBlocks[p].size - count;
1350 if (leftover) {
1351 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
1352 }
1353
1354 return p;
1355 }
1356 w = p;
1357 }
1358 q++;
1359 }while (q <= ztFreeIndexMax);
1360
1361 return -1U;
1362 }
1363
1364 __startup_func
1365 static void
1366 zone_tagging_init(vm_size_t max_zonemap_size)
1367 {
1368 kern_return_t ret;
1369 vm_map_kernel_flags_t vmk_flags;
1370 uint32_t idx;
1371
1372 // allocate submaps VM_KERN_MEMORY_DIAG
1373
1374 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
1375 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1376 vmk_flags.vmkf_permanent = TRUE;
1377 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
1378 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1379 &zone_tagbase_map);
1380
1381 if (ret != KERN_SUCCESS) {
1382 panic("zone_init: kmem_suballoc failed");
1383 }
1384 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
1385
1386 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
1387 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1388 vmk_flags.vmkf_permanent = TRUE;
1389 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
1390 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
1391 &zone_tags_map);
1392
1393 if (ret != KERN_SUCCESS) {
1394 panic("zone_init: kmem_suballoc failed");
1395 }
1396 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
1397
1398 ztBlocks = (ztBlock *) zone_tags_min;
1399 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
1400
1401 // initialize the qheads
1402 lck_mtx_lock(&ztLock);
1403
1404 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
1405 for (idx = 0; idx < ztFreeIndexCount; idx++) {
1406 ztBlocks[idx].free = TRUE;
1407 ztBlocks[idx].next = idx;
1408 ztBlocks[idx].prev = idx;
1409 ztBlocks[idx].size = 0;
1410 }
1411 // free remaining space
1412 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
1413
1414 lck_mtx_unlock(&ztLock);
1415 }
1416
1417 static void
1418 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1419 {
1420 uint32_t * tagbase;
1421 uint32_t count, block, blocks, idx;
1422 size_t pages;
1423
1424 pages = atop(size);
1425 tagbase = ZTAGBASE(zone, mem);
1426
1427 lck_mtx_lock(&ztLock);
1428
1429 // fault tagbase
1430 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1431
1432 if (!zone->tags_inline) {
1433 // allocate tags
1434 count = (uint32_t)(size / zone_elem_size(zone));
1435 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1436 block = ztAlloc(zone, blocks);
1437 if (-1U == block) {
1438 ztDump(false);
1439 }
1440 assert(-1U != block);
1441 }
1442
1443 lck_mtx_unlock(&ztLock);
1444
1445 if (!zone->tags_inline) {
1446 // set tag base for each page
1447 block *= ztTagsPerBlock;
1448 for (idx = 0; idx < pages; idx++) {
1449 vm_offset_t esize = zone_elem_size(zone);
1450 tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
1451 }
1452 }
1453 }
1454
1455 static void
1456 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1457 {
1458 uint32_t * tagbase;
1459 uint32_t count, block, blocks, idx;
1460 size_t pages;
1461
1462 // set tag base for each page
1463 pages = atop(size);
1464 tagbase = ZTAGBASE(zone, mem);
1465 block = tagbase[0];
1466 for (idx = 0; idx < pages; idx++) {
1467 tagbase[idx] = 0xFFFFFFFF;
1468 }
1469
1470 lck_mtx_lock(&ztLock);
1471 if (!zone->tags_inline) {
1472 count = (uint32_t)(size / zone_elem_size(zone));
1473 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1474 assert(block != 0xFFFFFFFF);
1475 block /= ztTagsPerBlock;
1476 ztFree(NULL /* zone is unlocked */, block, blocks);
1477 }
1478
1479 lck_mtx_unlock(&ztLock);
1480 }
1481
1482 uint32_t
1483 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1484 {
1485 simple_lock(&all_zones_lock, &zone_locks_grp);
1486
1487 zone_index_foreach(idx) {
1488 zone_t z = &zone_array[idx];
1489 if (!z->tags) {
1490 continue;
1491 }
1492 if (tag_zone_index != z->tag_zone_index) {
1493 continue;
1494 }
1495
1496 *elem_size = zone_elem_size(z);
1497 simple_unlock(&all_zones_lock);
1498 return idx;
1499 }
1500
1501 simple_unlock(&all_zones_lock);
1502
1503 return -1U;
1504 }
1505
1506 #endif /* VM_MAX_TAG_ZONES */
1507 #pragma mark zalloc helpers
1508
1509 const char *
1510 zone_name(zone_t z)
1511 {
1512 return z->z_name;
1513 }
1514
1515 const char *
1516 zone_heap_name(zone_t z)
1517 {
1518 if (__probable(z->kalloc_heap < KHEAP_ID_COUNT)) {
1519 return kalloc_heap_names[z->kalloc_heap];
1520 }
1521 return "invalid";
1522 }
1523
1524 static inline vm_size_t
1525 zone_submaps_approx_size(void)
1526 {
1527 vm_size_t size = 0;
1528
1529 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
1530 size += zone_submaps[idx]->size;
1531 }
1532
1533 return size;
1534 }
1535
1536 bool
1537 zone_maps_owned(vm_address_t addr, vm_size_t size)
1538 {
1539 return from_zone_map(addr, size);
1540 }
1541
1542 void
1543 zone_map_sizes(
1544 vm_map_size_t *psize,
1545 vm_map_size_t *pfree,
1546 vm_map_size_t *plargest_free)
1547 {
1548 vm_map_sizes(zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP], psize, pfree, plargest_free);
1549 }
1550
1551 vm_map_t
1552 zone_submap(zone_t zone)
1553 {
1554 return submap_for_zone(zone);
1555 }
1556
1557 unsigned
1558 zpercpu_count(void)
1559 {
1560 return zpercpu_early_count;
1561 }
1562
1563 int
1564 track_this_zone(const char *zonename, const char *logname)
1565 {
1566 unsigned int len;
1567 const char *zc = zonename;
1568 const char *lc = logname;
1569
1570 /*
1571 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1572 */
1573
1574 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1575 /*
1576 * If the current characters don't match, check for a space in
1577 * the zone name and a corresponding period in the log name.
1578 * If that's not there, then the strings don't match.
1579 */
1580
1581 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1582 break;
1583 }
1584
1585 /*
1586 * The strings are equal so far. If we're at the end, then it's a match.
1587 */
1588
1589 if (*zc == '\0') {
1590 return TRUE;
1591 }
1592 }
1593
1594 return FALSE;
1595 }
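/*
 * Example, assuming a zone named "vm map entries": spaces in the zone name
 * may match periods in the boot-arg, so
 * track_this_zone("vm map entries", "vm.map.entries") returns TRUE, while
 * track_this_zone("vm map entries", "vm.objects") returns FALSE.
 */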
1596
1597 #if DEBUG || DEVELOPMENT
1598
1599 vm_size_t
1600 zone_element_info(void *addr, vm_tag_t * ptag)
1601 {
1602 vm_size_t size = 0;
1603 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1604 struct zone_page_metadata *meta;
1605 struct zone *src_zone;
1606
1607 if (from_zone_map(addr, sizeof(void *))) {
1608 meta = zone_native_meta_from_addr(addr);
1609 src_zone = &zone_array[meta->zm_index];
1610 #if VM_MAX_TAG_ZONES
1611 if (__improbable(src_zone->tags)) {
1612 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1613 }
1614 #endif /* VM_MAX_TAG_ZONES */
1615 size = zone_elem_size(src_zone);
1616 } else {
1617 #if CONFIG_GZALLOC
1618 gzalloc_element_size(addr, NULL, &size);
1619 #endif /* CONFIG_GZALLOC */
1620 }
1621 *ptag = tag;
1622 return size;
1623 }
1624
1625 #endif /* DEBUG || DEVELOPMENT */
1626
1627 /* Someone wrote to freed memory. */
1628 __abortlike
1629 static void
1630 zone_element_was_modified_panic(
1631 zone_t zone,
1632 vm_offset_t element,
1633 vm_offset_t found,
1634 vm_offset_t expected,
1635 vm_offset_t offset)
1636 {
1637 panic("a freed zone element has been modified in zone %s%s: "
1638 "expected %p but found %p, bits changed %p, "
1639 "at offset %d of %d in element %p, cookies %p %p",
1640 zone_heap_name(zone),
1641 zone->z_name,
1642 (void *) expected,
1643 (void *) found,
1644 (void *) (expected ^ found),
1645 (uint32_t) offset,
1646 (uint32_t) zone_elem_size(zone),
1647 (void *) element,
1648 (void *) zp_nopoison_cookie,
1649 (void *) zp_poisoned_cookie);
1650 }
1651
1652 /* The backup pointer is stored in the last pointer-sized location in an element. */
1653 __header_always_inline vm_offset_t *
1654 get_backup_ptr(vm_size_t elem_size, vm_offset_t *element)
1655 {
1656 return (vm_offset_t *)((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
1657 }
1658
1659 /*
1660 * The primary and backup pointers don't match.
1661 * Determine which one was likely the corrupted pointer, find out what it
1662 * probably should have been, and panic.
1663 */
1664 __abortlike
1665 static void
1666 backup_ptr_mismatch_panic(
1667 zone_t zone,
1668 struct zone_page_metadata *page_meta,
1669 vm_offset_t page,
1670 vm_offset_t element)
1671 {
1672 vm_offset_t primary = *(vm_offset_t *)element;
1673 vm_offset_t backup = *get_backup_ptr(zone_elem_size(zone), &element);
1674 vm_offset_t likely_backup;
1675 vm_offset_t likely_primary;
1676 zone_addr_kind_t kind = zone_addr_kind(page, zone_elem_size(zone));
1677
1678 likely_primary = primary ^ zp_nopoison_cookie;
1679 boolean_t sane_backup;
1680 boolean_t sane_primary = zone_page_meta_is_sane_element(zone, page_meta,
1681 page, likely_primary, kind);
1682 boolean_t element_was_poisoned = (backup & 0x1);
1683
1684 #if defined(__LP64__)
1685 /* We can inspect the tag in the upper bits for additional confirmation */
1686 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1687 element_was_poisoned = TRUE;
1688 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1689 element_was_poisoned = FALSE;
1690 }
1691 #endif
1692
1693 if (element_was_poisoned) {
1694 likely_backup = backup ^ zp_poisoned_cookie;
1695 } else {
1696 likely_backup = backup ^ zp_nopoison_cookie;
1697 }
1698 sane_backup = zone_page_meta_is_sane_element(zone, page_meta,
1699 page, likely_backup, kind);
1700
1701 /* The primary is definitely the corrupted one */
1702 if (!sane_primary && sane_backup) {
1703 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1704 }
1705
1706 /* The backup is definitely the corrupted one */
1707 if (sane_primary && !sane_backup) {
1708 zone_element_was_modified_panic(zone, element, backup,
1709 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1710 zone_elem_size(zone) - sizeof(vm_offset_t));
1711 }
1712
1713 /*
1714 * Not sure which is the corrupted one.
1715 * It's less likely that the backup pointer was overwritten with
1716 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1717 * primary pointer has been overwritten with a sane but incorrect address.
1718 */
1719 if (sane_primary && sane_backup) {
1720 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1721 }
1722
1723 /* Neither are sane, so just guess. */
1724 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1725 }
1726
1727 /*
1728 * zone_sequestered_page_get
1729 * z is locked
1730 */
1731 static struct zone_page_metadata *
1732 zone_sequestered_page_get(zone_t z, vm_offset_t *page)
1733 {
1734 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
1735
1736 if (!zone_pva_is_null(z->pages_sequester)) {
1737 if (os_sub_overflow(z->sequester_page_count, z->alloc_pages,
1738 &z->sequester_page_count)) {
1739 zone_accounting_panic(z, "sequester_page_count wrap-around");
1740 }
1741 return zone_meta_queue_pop(z, &z->pages_sequester, kind, page);
1742 }
1743
1744 return NULL;
1745 }
1746
1747 /*
1748 * zone_sequestered_page_populate
1749 * z is unlocked
1750 * page_meta is invalid on failure
1751 */
1752 static kern_return_t
1753 zone_sequestered_page_populate(zone_t z, struct zone_page_metadata *page_meta,
1754 vm_offset_t space, vm_size_t alloc_size, int zflags)
1755 {
1756 kern_return_t retval;
1757
1758 assert(alloc_size == ptoa(z->alloc_pages));
1759 retval = kernel_memory_populate(submap_for_zone(z), space, alloc_size,
1760 zflags, VM_KERN_MEMORY_ZONE);
1761 if (retval != KERN_SUCCESS) {
1762 lock_zone(z);
1763 zone_meta_queue_push(z, &z->pages_sequester, page_meta, ZONE_ADDR_NATIVE);
1764 z->sequester_page_count += z->alloc_pages;
1765 unlock_zone(z);
1766 }
1767 return retval;
1768 }
1769
1770 #pragma mark Zone poisoning/zeroing
1771
1772 /*
1773 * Initialize zone poisoning
1774 * called from zone_bootstrap before any allocations are made from zalloc
1775 */
1776 __startup_func
1777 static void
1778 zp_bootstrap(void)
1779 {
1780 char temp_buf[16];
1781
1782 /*
1783 * Initialize backup pointer random cookie for poisoned elements
1784 * Try not to call early_random() back to back; it may return
1785 * the same value if mach_absolute_time doesn't have sufficient time
1786 * to tick over between calls. <rdar://problem/11597395>
1787 * (This is only a problem on embedded devices)
1788 */
1789 zp_poisoned_cookie = (uintptr_t) early_random();
1790
1791 /* -zp: enable poisoning for every alloc and free */
1792 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
1793 zp_factor = 1;
1794 }
1795
1796 /* -no-zp: disable poisoning */
1797 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
1798 zp_factor = 0;
1799 printf("Zone poisoning disabled\n");
1800 }
1801
1802 /* Initialize backup pointer random cookie for unpoisoned elements */
1803 zp_nopoison_cookie = (uintptr_t) early_random();
1804
1805 #if MACH_ASSERT
1806 if (zp_poisoned_cookie == zp_nopoison_cookie) {
1807 panic("early_random() is broken: %p and %p are not random\n",
1808 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
1809 }
1810 #endif
1811
1812 /*
1813 * Use the last bit in the backup pointer to hint poisoning state
1814 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
1815 * the low bits are zero.
1816 */
1817 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
1818 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
1819
1820 #if defined(__LP64__)
1821 /*
1822 * Make backup pointers more obvious in GDB for 64 bit
1823 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
1824 * (0xFACADE = 0xFFFFFF ^ 0x053521)
1825 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
1826 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
1827 * by the sanity check, so it's OK for that part of the cookie to be predictable.
1828 *
1829 * TODO: Use #defines, xors, and shifts
1830 */
1831
1832 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
1833 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
1834
1835 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
1836 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
1837 #endif
1838
1839 /*
1840 * Initialize zp_min_size to two cachelines. Elements smaller than this will
1841 * be zeroed.
1842 */
1843 ml_cpu_info_t cpu_info;
1844 ml_cpu_get_info(&cpu_info);
1845 zp_min_size = 2 * cpu_info.cache_line_size;
1846 }
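
/*
 * Illustrative sketch (not part of XNU, added for exposition): the LP64
 * masking above makes backup pointers easy to spot in a debugger. The top
 * three bytes of a valid zone pointer are 0xFFFFFF, so XOR-ing with a cookie
 * whose top three bytes were forced to 0x053521 (resp. 0x3f0011) yields a
 * value that starts with 0xFACADE (resp. 0xC0FFEE). The function name and
 * every concrete value below are made up for the demonstration.
 */
#if defined(__LP64__)
__attribute__((unused))
static void
zp_cookie_demo(void)
{
	uintptr_t cookie = (0x1122334455667788ULL & 0x000000FFFFFFFFFF) |
	    0x0535210000000000;                   /* poisoned-style cookie */
	uintptr_t elem   = 0xFFFFFF8012345678ULL; /* hypothetical zone pointer */
	uintptr_t backup = elem ^ cookie;         /* value stored as the backup */

	/* 0xFFFFFF ^ 0x053521 == 0xFACADE, so the tag lands in the top bytes */
	assert((backup >> 40) == 0xFACADE);
}
#endif /* __LP64__ */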
1847
1848 inline uint32_t
1849 zone_poison_count_init(zone_t zone)
1850 {
1851 return zp_factor + (((uint32_t)zone_elem_size(zone)) >> zp_scale) ^
1852 (mach_absolute_time() & 0x7);
1853 }
1854
1855 #if ZALLOC_ENABLE_POISONING
1856 static bool
1857 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1858 {
1859 bool poison = false;
1860 uint32_t zp_count_local;
1861
1862 assert(!zone->percpu);
1863 if (zp_factor != 0) {
1864 /*
1865 * Poison the memory of every zp_count-th element before it ends up
1866 * on the freelist to catch use-after-free and use of uninitialized
1867 * memory.
1868 *
1869 * Every element is poisoned when zp_factor is set to 1.
1870 *
1871 */
1872 zp_count_local = os_atomic_load(zp_count, relaxed);
1873 if (__improbable(zp_count_local == 0 || zp_factor == 1)) {
1874 poison = true;
1875
1876 os_atomic_store(zp_count, zone_poison_count_init(zone), relaxed);
1877
1878 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
1879 vm_offset_t *element_cursor = ((vm_offset_t *) elem);
1880 vm_offset_t *end_cursor = (vm_offset_t *)(elem + zone_elem_size(zone));
1881
1882 for (; element_cursor < end_cursor; element_cursor++) {
1883 *element_cursor = ZONE_POISON;
1884 }
1885 } else {
1886 os_atomic_store(zp_count, zp_count_local - 1, relaxed);
1887 /*
1888 * Zero the first zp_min_size bytes of elements that aren't being poisoned.
1889 * Element size is larger than zp_min_size in this path, as elements
1890 * that are smaller are always fully zeroed.
1891 */
1892 bzero((void *) elem, zp_min_size);
1893 }
1894 }
1895 return poison;
1896 }
1897 #else
1898 static bool
1899 zfree_poison_element(zone_t zone, uint32_t *zp_count, vm_offset_t elem)
1900 {
1901 #pragma unused(zone, zp_count, elem)
1902 assert(!zone->percpu);
1903 return false;
1904 }
1905 #endif
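
/*
 * Minimal model (illustration only, not used by the allocator) of the
 * poison-then-verify cycle that zfree_poison_element() above and
 * zalloc_validate_element() below implement: on free, fill the element with
 * the ZONE_POISON pattern; on the next allocation, any word that no longer
 * matches the pattern indicates the element was modified while free.
 */
__attribute__((unused))
static bool
zp_poison_model(vm_offset_t elem, vm_size_t elem_size)
{
	vm_offset_t *cur = (vm_offset_t *)elem;
	vm_offset_t *end = (vm_offset_t *)(elem + elem_size);

	for (; cur < end; cur++) {              /* "free": poison every word */
		*cur = ZONE_POISON;
	}
	for (cur = (vm_offset_t *)elem; cur < end; cur++) {
		if (*cur != ZONE_POISON) {      /* "alloc": verify nothing changed */
			return false;           /* the real code panics here */
		}
	}
	return true;
}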
1906
1907 __attribute__((always_inline))
1908 static bool
1909 zfree_clear(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
1910 {
1911 assert(zone->zfree_clear_mem);
1912 if (zone->percpu) {
1913 zpercpu_foreach_cpu(i) {
1914 bzero((void *)(addr + ptoa(i)), elem_size);
1915 }
1916 } else {
1917 bzero((void *)addr, elem_size);
1918 }
1919
1920 return true;
1921 }
1922
1923 /*
1924 * Zero the element if zone has zfree_clear_mem flag set else poison
1925 * the element if zp_count hits 0.
1926 */
1927 __attribute__((always_inline))
1928 bool
1929 zfree_clear_or_poison(zone_t zone, uint32_t *zp_count, vm_offset_t addr)
1930 {
1931 vm_size_t elem_size = zone_elem_size(zone);
1932
1933 if (zone->zfree_clear_mem) {
1934 return zfree_clear(zone, addr, elem_size);
1935 }
1936
1937 return zfree_poison_element(zone, zp_count, (vm_offset_t)addr);
1938 }
1939
1940 /*
1941 * Clear out the old next pointer and backup to avoid leaking the zone
1942 * poisoning cookie and so that only values on the freelist have a valid
1943 * cookie.
1944 */
1945 void
1946 zone_clear_freelist_pointers(zone_t zone, vm_offset_t addr)
1947 {
1948 vm_offset_t perm_value = 0;
1949
1950 if (!zone->zfree_clear_mem) {
1951 perm_value = ZONE_POISON;
1952 }
1953
1954 vm_offset_t *primary = (vm_offset_t *) addr;
1955 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
1956
1957 *primary = perm_value;
1958 *backup = perm_value;
1959 }
1960
1961 #if ZALLOC_ENABLE_POISONING
1962 __abortlike
1963 static void
1964 zone_element_not_clear_panic(zone_t zone, void *addr)
1965 {
1966 panic("Zone element %p was modified after free for zone %s%s: "
1967 "Expected element to be cleared", addr, zone_heap_name(zone),
1968 zone->z_name);
1969 }
1970
1971 /*
1972 * Validate that the element was not tampered with while it was in the
1973 * freelist.
1974 */
1975 void
1976 zalloc_validate_element(zone_t zone, vm_offset_t addr, vm_size_t size, bool validate)
1977 {
1978 if (zone->percpu) {
1979 assert(zone->zfree_clear_mem);
1980 zpercpu_foreach_cpu(i) {
1981 if (memcmp_zero_ptr_aligned((void *)(addr + ptoa(i)), size)) {
1982 zone_element_not_clear_panic(zone, (void *)(addr + ptoa(i)));
1983 }
1984 }
1985 } else if (zone->zfree_clear_mem) {
1986 if (memcmp_zero_ptr_aligned((void *)addr, size)) {
1987 zone_element_not_clear_panic(zone, (void *)addr);
1988 }
1989 } else if (__improbable(validate)) {
1990 const vm_offset_t *p = (vm_offset_t *)addr;
1991 const vm_offset_t *end = (vm_offset_t *)(addr + size);
1992
1993 for (; p < end; p++) {
1994 if (*p != ZONE_POISON) {
1995 zone_element_was_modified_panic(zone, addr,
1996 *p, ZONE_POISON, (vm_offset_t)p - addr);
1997 }
1998 }
1999 } else {
2000 /*
2001 * If element wasn't poisoned or entirely cleared, validate that the
2002 * minimum bytes that were cleared on free haven't been corrupted.
2003 * addr is advanced by ptr size as we have already validated and cleared
2004 * the freelist pointer/zcache canary.
2005 */
2006 if (memcmp_zero_ptr_aligned((void *) (addr + sizeof(vm_offset_t)),
2007 zp_min_size - sizeof(vm_offset_t))) {
2008 zone_element_not_clear_panic(zone, (void *)addr);
2009 }
2010 }
2011 }
2012 #endif /* ZALLOC_ENABLE_POISONING */
2013
2014 #pragma mark Zone Leak Detection
2015
2016 /*
2017 * Zone leak debugging code
2018 *
2019 * When enabled, this code keeps a log to track allocations to a particular zone that have not
2020 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
2021 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
2022 * off by default.
2023 *
2024 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
2025 * is the name of the zone you wish to log.
2026 *
2027 * This code only tracks one zone, so you need to identify which one is leaking first.
2028 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
2029 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
2030 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
2031 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
2032 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
2033 * See the help in the kgmacros for usage info.
2034 *
2035 *
2036 * Zone corruption logging
2037 *
2038 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
2039 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
2040 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
2041 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
2042 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
2043 * corrupted to examine its history. This should lead to the source of the corruption.
2044 */
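
/*
 * Example (added for illustration; the zone names are hypothetical): to chase
 * a leak in a zone that zprint shows as bloated, one might boot with
 *
 *	zlog=vm.map.entries zrecs=2000
 *
 * and to investigate a corruption instead, switch the log to alloc+free
 * tracking with
 *
 *	-zc zlog=kalloc.48
 *
 * Both arguments are parsed below with PE_parse_boot_argn().
 */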
2045
2046 /* Returns TRUE if we rolled over the counter at factor */
2047 __header_always_inline bool
2048 sample_counter(volatile uint32_t *count_p, uint32_t factor)
2049 {
2050 uint32_t old_count, new_count = 0;
2051 if (count_p != NULL) {
2052 os_atomic_rmw_loop(count_p, old_count, new_count, relaxed, {
2053 new_count = old_count + 1;
2054 if (new_count >= factor) {
2055 new_count = 0;
2056 }
2057 });
2058 }
2059
2060 return new_count == 0;
2061 }
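
/*
 * Usage sketch (illustration only, not called anywhere): sample_counter()
 * returns true once every `factor` calls, which is how the leak detector
 * samples roughly one out of every zleak_sample_factor allocations instead
 * of backtracing all of them.
 */
__attribute__((unused))
static uint32_t
sample_counter_demo(void)
{
	static volatile uint32_t counter;
	uint32_t sampled = 0;

	for (uint32_t i = 0; i < 1000; i++) {
		if (sample_counter(&counter, 100)) {
			sampled++;      /* taken on every 100th iteration */
		}
	}
	return sampled;                 /* 10 of the 1000 calls sample */
}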
2062
2063 #if ZONE_ENABLE_LOGGING
2064 /* Log allocations and frees to help debug a zone element corruption */
2065 TUNABLE(bool, corruption_debug_flag, "-zc", false);
2066
2067 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
2068
2069 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
2070 static int num_zones_logged = 0;
2071
2072 /*
2073 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
2074 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
2075 * is the number of stacks suspected of leaking, we don't need many records.
2076 */
2077
2078 #if defined(__LP64__)
2079 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
2080 #else
2081 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
2082 #endif
2083 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
2084
2085 static TUNABLE(uint32_t, log_records, "zrecs", ZRECORDS_DEFAULT);
2086
2087 static void
2088 zone_enable_logging(zone_t z)
2089 {
2090 z->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH,
2091 (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2092
2093 if (z->zlog_btlog) {
2094 printf("zone: logging started for zone %s%s\n",
2095 zone_heap_name(z), z->z_name);
2096 } else {
2097 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2098 z->zone_logging = false;
2099 }
2100 }
2101
2102 /**
2103 * @function zone_setup_logging
2104 *
2105 * @abstract
2106 * Optionally sets up a zone for logging.
2107 *
2108 * @discussion
2109 * We recognize two boot-args:
2110 *
2111 * zlog=<zone_to_log>
2112 * zrecs=<num_records_in_log>
2113 *
2114 * The zlog arg is used to specify the zone name that should be logged,
2115 * and zrecs is used to control the size of the log.
2116 *
2117 * If zrecs is not specified, a default value is used.
2118 */
2119 static void
2120 zone_setup_logging(zone_t z)
2121 {
2122 char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
2123 char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2124 char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */
2125
2126 /*
2127 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2128 *
2129 * This prevents accidentally hogging too much kernel memory
2130 * and making the system unusable.
2131 */
2132 if (log_records > ZRECORDS_MAX) {
2133 log_records = ZRECORDS_MAX;
2134 }
2135
2136 /*
2137 * Append kalloc heap name to zone name (if zone is used by kalloc)
2138 */
2139 snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
2140
2141 /* zlog0 isn't allowed. */
2142 for (int i = 1; i <= max_num_zones_to_log; i++) {
2143 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2144
2145 if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val)) &&
2146 track_this_zone(zone_name, zlog_val)) {
2147 z->zone_logging = true;
2148 num_zones_logged++;
2149 break;
2150 }
2151 }
2152
2153 /*
2154 * Backwards compatibility with the old boot-arg used to specify single-zone
2155 * logging, i.e. zlog. This needs to happen after the newer zlogN checks
2156 * because the "zlog" prefix will also match all of the zlogN
2157 * boot-args.
2158 */
2159 if (!z->zone_logging &&
2160 PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val)) &&
2161 track_this_zone(zone_name, zlog_val)) {
2162 z->zone_logging = true;
2163 num_zones_logged++;
2164 }
2165
2166
2167 /*
2168 * If we want to log a zone, see if we need to allocate buffer space for
2169 * the log.
2170 *
2171 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
2172 * we have to defer allocation in that case.
2173 *
2174 * zone_init() will finish the job.
2175 *
2176 * If we want to log one of the VM related zones that's set up early on,
2177 * we will skip allocation of the log until zinit is called again later
2178 * on some other zone.
2179 */
2180 if (z->zone_logging && startup_phase >= STARTUP_SUB_KMEM_ALLOC) {
2181 zone_enable_logging(z);
2182 }
2183 }
2184
2185 /*
2186 * Each record in the log contains a pointer to the zone element it refers to,
2187 * and a small array to hold the pc's from the stack trace. A
2188 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
2189 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
2190 * If the log fills, old records are replaced as if it were a circular buffer.
2191 */
2192
2193
2194 /*
2195 * Decide if we want to log this zone by doing a string compare between a zone name and the name
2196 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
2197 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
2198 * match a space in the zone name.
2199 */
2200
2201 /*
2202 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
2203 * the buffer for the records has been allocated.
2204 */
2205
2206 #define DO_LOGGING(z) (z->zlog_btlog != NULL)
2207 #else /* !ZONE_ENABLE_LOGGING */
2208 #define DO_LOGGING(z) 0
2209 #endif /* !ZONE_ENABLE_LOGGING */
2210
2211 #if CONFIG_ZLEAKS
2212
2213 /*
2214 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
2215 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
2216 * backtrace. On every free, we examine the table to determine whether the allocation was being tracked,
2217 * and stop tracking it if it was.
2218 *
2219 * We track the allocations in the zallocations hash table, which stores the address that was returned from
2220 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
2221 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
2222 * backtraces - we don't store them more than once.
2223 *
2224 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
2225 * a large amount of virtual space.
2226 */
2227 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
2228 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
2229 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
2230 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
2231 uint32_t zleak_state = 0; /* State of collection, as above */
2232
2233 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
2234 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
2235 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
2236 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
2237
2238 /*
2239 * Counters for allocation statistics.
2240 */
2241
2242 /* Times two active records want to occupy the same spot */
2243 unsigned int z_alloc_collisions = 0;
2244 unsigned int z_trace_collisions = 0;
2245
2246 /* Times a new record lands on a spot previously occupied by a freed allocation */
2247 unsigned int z_alloc_overwrites = 0;
2248 unsigned int z_trace_overwrites = 0;
2249
2250 /* Times a new alloc or trace is put into the hash table */
2251 unsigned int z_alloc_recorded = 0;
2252 unsigned int z_trace_recorded = 0;
2253
2254 /* Times zleak_log returned false due to not being able to acquire the lock */
2255 unsigned int z_total_conflicts = 0;
2256
2257 /*
2258 * Structure for keeping track of an allocation
2259 * An allocation bucket is in use if its element is not NULL
2260 */
2261 struct zallocation {
2262 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
2263 vm_size_t za_size; /* how much memory did this allocation take up? */
2264 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
2265 /* TODO: #if this out */
2266 uint32_t za_hit_count; /* for determining effectiveness of hash function */
2267 };
2268
2269 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
2270 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
2271 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
2272
2273 vm_size_t zleak_max_zonemap_size;
2274
2275 /* Hashmaps of allocations and their corresponding traces */
2276 static struct zallocation* zallocations;
2277 static struct ztrace* ztraces;
2278
2279 /* not static so that panic can see this, see kern/debug.c */
2280 struct ztrace* top_ztrace;
2281
2282 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
2283 LCK_GRP_DECLARE(zleak_lock_grp, "zleak_lock");
2284 LCK_SPIN_DECLARE(zleak_lock, &zleak_lock_grp);
2285
2286 /*
2287 * Initializes the zone leak monitor. Called from zone_init()
2288 */
2289 __startup_func
2290 static void
2291 zleak_init(vm_size_t max_zonemap_size)
2292 {
2293 char scratch_buf[16];
2294 boolean_t zleak_enable_flag = FALSE;
2295
2296 zleak_max_zonemap_size = max_zonemap_size;
2297 zleak_global_tracking_threshold = max_zonemap_size / 2;
2298 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
2299
2300 #if CONFIG_EMBEDDED
2301 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
2302 zleak_enable_flag = TRUE;
2303 printf("zone leak detection enabled\n");
2304 } else {
2305 zleak_enable_flag = FALSE;
2306 printf("zone leak detection disabled\n");
2307 }
2308 #else /* CONFIG_EMBEDDED */
2309 /* -zleakoff (flag to disable zone leak monitor) */
2310 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
2311 zleak_enable_flag = FALSE;
2312 printf("zone leak detection disabled\n");
2313 } else {
2314 zleak_enable_flag = TRUE;
2315 printf("zone leak detection enabled\n");
2316 }
2317 #endif /* CONFIG_EMBEDDED */
2318
2319 /* zfactor=XXXX (override how often to sample the zone allocator) */
2320 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
2321 printf("Zone leak factor override: %u\n", zleak_sample_factor);
2322 }
2323
2324 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
2325 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
2326 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
2327 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
2328 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
2329 printf("Override isn't a power of two, bad things might happen!\n");
2330 }
2331 }
2332
2333 /* zleak-traces=XXXX (override number of buckets in ztraces) */
2334 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
2335 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
2336 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
2337 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
2338 printf("Override isn't a power of two, bad things might happen!\n");
2339 }
2340 }
2341
2342 if (zleak_enable_flag) {
2343 zleak_state = ZLEAK_STATE_ENABLED;
2344 }
2345 }
2346
2347 /*
2348 * Support for kern.zleak.active sysctl - a simplified
2349 * version of the zleak_state variable.
2350 */
2351 int
2352 get_zleak_state(void)
2353 {
2354 if (zleak_state & ZLEAK_STATE_FAILED) {
2355 return -1;
2356 }
2357 if (zleak_state & ZLEAK_STATE_ACTIVE) {
2358 return 1;
2359 }
2360 return 0;
2361 }
2362
2363 kern_return_t
2364 zleak_activate(void)
2365 {
2366 kern_return_t retval;
2367 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
2368 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
2369 void *allocations_ptr = NULL;
2370 void *traces_ptr = NULL;
2371
2372 /* Only one thread attempts to activate at a time */
2373 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2374 return KERN_SUCCESS;
2375 }
2376
2377 /* Indicate that we're doing the setup */
2378 lck_spin_lock(&zleak_lock);
2379 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
2380 lck_spin_unlock(&zleak_lock);
2381 return KERN_SUCCESS;
2382 }
2383
2384 zleak_state |= ZLEAK_STATE_ACTIVATING;
2385 lck_spin_unlock(&zleak_lock);
2386
2387 /* Allocate and zero tables */
2388 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
2389 if (retval != KERN_SUCCESS) {
2390 goto fail;
2391 }
2392
2393 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
2394 if (retval != KERN_SUCCESS) {
2395 goto fail;
2396 }
2397
2398 bzero(allocations_ptr, z_alloc_size);
2399 bzero(traces_ptr, z_trace_size);
2400
2401 /* Everything's set. Install tables, mark active. */
2402 zallocations = allocations_ptr;
2403 ztraces = traces_ptr;
2404
2405 /*
2406 * Initialize the top_ztrace to the first entry in ztraces,
2407 * so we don't have to check for null in zleak_log
2408 */
2409 top_ztrace = &ztraces[0];
2410
2411 /*
2412 * Note that we do need a barrier between installing
2413 * the tables and setting the active flag, because the zfree()
2414 * path accesses the table without a lock if we're active.
2415 */
2416 lck_spin_lock(&zleak_lock);
2417 zleak_state |= ZLEAK_STATE_ACTIVE;
2418 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2419 lck_spin_unlock(&zleak_lock);
2420
2421 return 0;
2422
2423 fail:
2424 /*
2425 * If we fail to allocate memory, don't further tax
2426 * the system by trying again.
2427 */
2428 lck_spin_lock(&zleak_lock);
2429 zleak_state |= ZLEAK_STATE_FAILED;
2430 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
2431 lck_spin_unlock(&zleak_lock);
2432
2433 if (allocations_ptr != NULL) {
2434 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
2435 }
2436
2437 if (traces_ptr != NULL) {
2438 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
2439 }
2440
2441 return retval;
2442 }
2443
2444 /*
2445 * TODO: What about allocations that never get deallocated,
2446 * especially ones with unique backtraces? Should we wait to record
2447 * until after boot has completed?
2448 * (How many persistent zallocs are there?)
2449 */
2450
2451 /*
2452 * This function records the allocation in the allocations table,
2453 * and stores the associated backtrace in the traces table
2454 * (or just increments the refcount if the trace is already recorded).
2455 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
2456 * the associated trace's refcount is decremented.
2457 * If the trace slot is occupied by a different trace, it returns without recording.
2458 * The refcount is incremented by the amount of memory the allocation consumes.
2459 * The return value indicates whether to try again next time.
2460 */
2461 static boolean_t
2462 zleak_log(uintptr_t* bt,
2463 uintptr_t addr,
2464 uint32_t depth,
2465 vm_size_t allocation_size)
2466 {
2467 /* Quit if there's someone else modifying the hash tables */
2468 if (!lck_spin_try_lock(&zleak_lock)) {
2469 z_total_conflicts++;
2470 return FALSE;
2471 }
2472
2473 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2474
2475 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
2476 struct ztrace* trace = &ztraces[trace_index];
2477
2478 allocation->za_hit_count++;
2479 trace->zt_hit_count++;
2480
2481 /*
2482 * If the allocation bucket we want to be in is occupied, and if the occupier
2483 * has the same trace as us, just bail.
2484 */
2485 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
2486 z_alloc_collisions++;
2487
2488 lck_spin_unlock(&zleak_lock);
2489 return TRUE;
2490 }
2491
2492 /* STEP 1: Store the backtrace in the traces array. */
2493 /* A size of zero indicates that the trace bucket is free. */
2494
2495 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
2496 /*
2497 * Different unique trace with same hash!
2498 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
2499 * and get out of the way for later chances
2500 */
2501 trace->zt_collisions++;
2502 z_trace_collisions++;
2503
2504 lck_spin_unlock(&zleak_lock);
2505 return TRUE;
2506 } else if (trace->zt_size > 0) {
2507 /* Same trace, already added, so increment refcount */
2508 trace->zt_size += allocation_size;
2509 } else {
2510 /* Found an unused trace bucket, record the trace here! */
2511 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
2512 z_trace_overwrites++;
2513 }
2514
2515 z_trace_recorded++;
2516 trace->zt_size = allocation_size;
2517 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2518
2519 trace->zt_depth = depth;
2520 trace->zt_collisions = 0;
2521 }
2522
2523 /* STEP 2: Store the allocation record in the allocations array. */
2524
2525 if (allocation->za_element != (uintptr_t) 0) {
2526 /*
2527 * Straight up replace any allocation record that was there. We don't want to do the work
2528 * to preserve the allocation entries that were there, because we only record a subset of the
2529 * allocations anyways.
2530 */
2531
2532 z_alloc_collisions++;
2533
2534 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2535 /* Knock off old allocation's size, not the new allocation */
2536 associated_trace->zt_size -= allocation->za_size;
2537 } else if (allocation->za_trace_index != 0) {
2538 /* Slot previously used but not currently in use */
2539 z_alloc_overwrites++;
2540 }
2541
2542 allocation->za_element = addr;
2543 allocation->za_trace_index = trace_index;
2544 allocation->za_size = allocation_size;
2545
2546 z_alloc_recorded++;
2547
2548 if (top_ztrace->zt_size < trace->zt_size) {
2549 top_ztrace = trace;
2550 }
2551
2552 lck_spin_unlock(&zleak_lock);
2553 return TRUE;
2554 }
2555
2556 /*
2557 * Free the allocation record and release the stacktrace.
2558 * This should be as fast as possible because it will be called for every free.
2559 */
2560 __attribute__((noinline))
2561 static void
2562 zleak_free(uintptr_t addr,
2563 vm_size_t allocation_size)
2564 {
2565 if (addr == (uintptr_t) 0) {
2566 return;
2567 }
2568
2569 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2570
2571 /* Double-checked locking: check to find out if we're interested, lock, check to make
2572 * sure it hasn't changed, then modify it, and release the lock.
2573 */
2574
2575 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2576 /* if the allocation was the one, grab the lock, check again, then delete it */
2577 lck_spin_lock(&zleak_lock);
2578
2579 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2580 struct ztrace *trace;
2581
2582 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2583 if (allocation->za_size != allocation_size) {
2584 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2585 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2586 }
2587
2588 trace = &ztraces[allocation->za_trace_index];
2589
2590 /* size of 0 indicates trace bucket is unused */
2591 if (trace->zt_size > 0) {
2592 trace->zt_size -= allocation_size;
2593 }
2594
2595 /* A NULL element means the allocation bucket is unused */
2596 allocation->za_element = 0;
2597 }
2598 lck_spin_unlock(&zleak_lock);
2599 }
2600 }
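
/*
 * Shape of the double-checked locking used above, with a hypothetical record
 * type (illustration only): the unlocked check keeps the common case of an
 * untracked address free of lock traffic, and the re-check under the lock
 * protects against a racing update between the two reads.
 */
struct dcl_record {
	uintptr_t key;
	uintptr_t value;
};

__attribute__((unused))
static void
double_checked_clear(struct dcl_record *rec, uintptr_t key, lck_spin_t *lock)
{
	if (rec->key != key) {          /* cheap unlocked filter */
		return;
	}
	lck_spin_lock(lock);
	if (rec->key == key) {          /* re-check now that the lock is held */
		rec->value = 0;
		rec->key = 0;
	}
	lck_spin_unlock(lock);
}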
2601
2602 #endif /* CONFIG_ZLEAKS */
2603
2604 /* These functions are outside of CONFIG_ZLEAKS because they are also used in
2605 * mbuf.c for mbuf leak-detection. This is why they lack the z_ prefix.
2606 */
2607
2608 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2609 uintptr_t
2610 hash_mix(uintptr_t x)
2611 {
2612 #ifndef __LP64__
2613 x += ~(x << 15);
2614 x ^= (x >> 10);
2615 x += (x << 3);
2616 x ^= (x >> 6);
2617 x += ~(x << 11);
2618 x ^= (x >> 16);
2619 #else
2620 x += ~(x << 32);
2621 x ^= (x >> 22);
2622 x += ~(x << 13);
2623 x ^= (x >> 8);
2624 x += (x << 3);
2625 x ^= (x >> 15);
2626 x += ~(x << 27);
2627 x ^= (x >> 31);
2628 #endif
2629 return x;
2630 }
2631
2632 uint32_t
2633 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2634 {
2635 uintptr_t hash = 0;
2636 uintptr_t mask = max_size - 1;
2637
2638 while (depth) {
2639 hash += bt[--depth];
2640 }
2641
2642 hash = hash_mix(hash) & mask;
2643
2644 assert(hash < max_size);
2645
2646 return (uint32_t) hash;
2647 }
2648
2649 /*
2650 * TODO: Determine how well distributed this is
2651 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0xFFFF, which makes a clean bitmask
2652 */
2653 uint32_t
2654 hashaddr(uintptr_t pt, uint32_t max_size)
2655 {
2656 uintptr_t hash = 0;
2657 uintptr_t mask = max_size - 1;
2658
2659 hash = hash_mix(pt) & mask;
2660
2661 assert(hash < max_size);
2662
2663 return (uint32_t) hash;
2664 }
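
/*
 * Illustration (not used by the allocator): for a power-of-two table size,
 * masking with (size - 1) is equivalent to reducing modulo the size, which
 * is why hashaddr() and hashbacktrace() require power-of-two bucket counts
 * and can mask instead of dividing.
 */
__attribute__((unused))
static void
hash_mask_demo(void)
{
	uint32_t  max_size = 0x10000;           /* 65536 buckets */
	uintptr_t h = hash_mix(0x12345678);

	assert((h & (max_size - 1)) == (h % max_size));
	assert((h & (max_size - 1)) < max_size);
}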
2665
2666 /* End of all leak-detection code */
2667 #pragma mark zone creation, configuration, destruction
2668
2669 static zone_t
2670 zone_init_defaults(zone_id_t zid)
2671 {
2672 zone_t z = &zone_array[zid];
2673
2674 z->page_count_max = ~0u;
2675 z->collectable = true;
2676 z->expandable = true;
2677 z->submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
2678
2679 simple_lock_init(&z->lock, 0);
2680
2681 return z;
2682 }
2683
2684 static bool
2685 zone_is_initializing(zone_t z)
2686 {
2687 return !z->z_self && !z->destroyed;
2688 }
2689
2690 static void
2691 zone_set_max(zone_t z, vm_size_t max)
2692 {
2693 #if KASAN_ZALLOC
2694 if (z->kasan_redzone) {
2695 /*
2696 * Adjust the max memory for the kasan redzones
2697 */
2698 max += (max / z->pcpu_elem_size) * z->kasan_redzone * 2;
2699 }
2700 #endif
2701 if (max < z->percpu ? 1 : z->alloc_pages) {
2702 max = z->percpu ? 1 : z->alloc_pages;
2703 } else {
2704 max = atop(round_page(max));
2705 }
2706 z->page_count_max = max;
2707 }
2708
2709 void
2710 zone_set_submap_idx(zone_t zone, unsigned int sub_map_idx)
2711 {
2712 if (!zone_is_initializing(zone)) {
2713 panic("%s: called after zone_create()", __func__);
2714 }
2715 if (sub_map_idx > zone_last_submap_idx) {
2716 panic("zone_set_submap_idx(%d) > %d", sub_map_idx, zone_last_submap_idx);
2717 }
2718 zone->submap_idx = sub_map_idx;
2719 }
2720
2721 void
2722 zone_set_noexpand(
2723 zone_t zone,
2724 vm_size_t max)
2725 {
2726 if (!zone_is_initializing(zone)) {
2727 panic("%s: called after zone_create()", __func__);
2728 }
2729 zone->expandable = false;
2730 zone_set_max(zone, max);
2731 }
2732
2733 void
2734 zone_set_exhaustible(
2735 zone_t zone,
2736 vm_size_t max)
2737 {
2738 if (!zone_is_initializing(zone)) {
2739 panic("%s: called after zone_create()", __func__);
2740 }
2741 zone->expandable = false;
2742 zone->exhaustible = true;
2743 zone_set_max(zone, max);
2744 }
2745
2746 /**
2747 * @function zone_create_find
2748 *
2749 * @abstract
2750 * Finds an unused zone for the given name and element size.
2751 *
2752 * @param name the zone name
2753 * @param size the element size (including redzones, ...)
2754 * @param flags the flags passed to @c zone_create*
2755 * @param zid the desired zone ID or ZONE_ID_ANY
2756 *
2757 * @returns a zone to initialize further.
2758 */
2759 static zone_t
2760 zone_create_find(
2761 const char *name,
2762 vm_size_t size,
2763 zone_create_flags_t flags,
2764 zone_id_t zid)
2765 {
2766 zone_id_t nzones;
2767 zone_t z;
2768
2769 simple_lock(&all_zones_lock, &zone_locks_grp);
2770
2771 nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
2772 assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
2773
2774 if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
2775 /*
2776 * The first time around, make sure the reserved zone IDs
2777 * have an initialized lock as zone_index_foreach() will
2778 * enumerate them.
2779 */
2780 while (nzones < ZONE_ID__FIRST_DYNAMIC) {
2781 zone_init_defaults(nzones++);
2782 }
2783
2784 os_atomic_store(&num_zones, nzones, release);
2785 }
2786
2787 if (zid != ZONE_ID_ANY) {
2788 if (zid >= ZONE_ID__FIRST_DYNAMIC) {
2789 panic("zone_create: invalid desired zone ID %d for %s",
2790 zid, name);
2791 }
2792 if (flags & ZC_DESTRUCTIBLE) {
2793 panic("zone_create: ID %d (%s) must be permanent", zid, name);
2794 }
2795 if (zone_array[zid].z_self) {
2796 panic("zone_create: creating zone ID %d (%s) twice", zid, name);
2797 }
2798 z = &zone_array[zid];
2799 } else {
2800 if (flags & ZC_DESTRUCTIBLE) {
2801 /*
2802 * If possible, find a previously zdestroy'ed zone in the
2803 * zone_array that we can reuse.
2804 */
2805 for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
2806 i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
2807 z = &zone_array[i];
2808
2809 /*
2810 * If the zone name and the element size are the
2811 * same, we can just reuse the old zone struct.
2812 */
2813 if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
2814 continue;
2815 }
2816 bitmap_clear(zone_destroyed_bitmap, i);
2817 z->destroyed = false;
2818 z->z_self = z;
2819 zid = (zone_id_t)i;
2820 goto out;
2821 }
2822 }
2823
2824 zid = nzones++;
2825 z = zone_init_defaults(zid);
2826
2827 /*
2828 * The release barrier pairs with the acquire in
2829 * zone_index_foreach() and makes sure that enumeration loops
2830 * always see an initialized zone lock.
2831 */
2832 os_atomic_store(&num_zones, nzones, release);
2833 }
2834
2835 out:
2836 num_zones_in_use++;
2837 simple_unlock(&all_zones_lock);
2838
2839 return z;
2840 }
2841
2842 __abortlike
2843 static void
2844 zone_create_panic(const char *name, const char *f1, const char *f2)
2845 {
2846 panic("zone_create: creating zone %s: flag %s and %s are incompatible",
2847 name, f1, f2);
2848 }
2849 #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
2850 if ((flags) & forbidden_flag) { \
2851 zone_create_panic(name, #current_flag, #forbidden_flag); \
2852 }
2853
2854 /*
2855 * Adjusts the size of the element based on minimum size, alignment
2856 * and kasan redzones
2857 */
2858 static vm_size_t
2859 zone_elem_adjust_size(
2860 const char *name __unused,
2861 vm_size_t elem_size,
2862 zone_create_flags_t flags,
2863 vm_size_t *redzone __unused)
2864 {
2865 vm_size_t size;
2866 /*
2867 * Adjust element size for minimum size and pointer alignment
2868 */
2869 size = (elem_size + sizeof(vm_offset_t) - 1) & -sizeof(vm_offset_t);
2870 if (((flags & ZC_PERCPU) == 0) && size < ZONE_MIN_ELEM_SIZE) {
2871 size = ZONE_MIN_ELEM_SIZE;
2872 }
2873
2874 #if KASAN_ZALLOC
2875 /*
2876 * Expand the zone allocation size to include the redzones.
2877 *
2878 * For page-multiple zones add a full guard page because they
2879 * likely require alignment.
2880 */
2881 vm_size_t redzone_tmp;
2882 if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
2883 redzone_tmp = 0;
2884 } else if ((size & PAGE_MASK) == 0) {
2885 if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
2886 panic("zone_create: zone %s can't provide more than PAGE_SIZE"
2887 "alignment", name);
2888 }
2889 redzone_tmp = PAGE_SIZE;
2890 } else if (flags & ZC_ALIGNMENT_REQUIRED) {
2891 redzone_tmp = 0;
2892 } else {
2893 redzone_tmp = KASAN_GUARD_SIZE;
2894 }
2895 size += redzone_tmp * 2;
2896 if (redzone) {
2897 *redzone = redzone_tmp;
2898 }
2899 #endif
2900 return size;
2901 }
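
/*
 * Sketch of the rounding used above (illustration only): adding
 * (alignment - 1) and masking with -alignment rounds a size up to the next
 * multiple of that alignment. With 8-byte pointers, a hypothetical 13-byte
 * element becomes 16 bytes before the minimum-size check is applied.
 */
__attribute__((unused))
static vm_size_t
round_to_pointer_demo(vm_size_t elem_size)
{
	vm_size_t rounded = (elem_size + sizeof(vm_offset_t) - 1) &
	    -sizeof(vm_offset_t);

	assert(rounded >= elem_size);
	assert(rounded % sizeof(vm_offset_t) == 0);
	return rounded;         /* e.g. round_to_pointer_demo(13) == 16 on LP64 */
}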
2902
2903 /*
2904 * Returns the allocation chunk size that has the least fragmentation
2905 */
2906 static vm_size_t
2907 zone_get_min_alloc_granule(
2908 vm_size_t elem_size,
2909 zone_create_flags_t flags)
2910 {
2911 vm_size_t alloc_granule = PAGE_SIZE;
2912 if (flags & ZC_PERCPU) {
2913 alloc_granule = PAGE_SIZE * zpercpu_count();
2914 if (PAGE_SIZE % elem_size > 256) {
2915 panic("zone_create: per-cpu zone has too much fragmentation");
2916 }
2917 } else if ((elem_size & PAGE_MASK) == 0) {
2918 /* zero fragmentation by definition */
2919 alloc_granule = elem_size;
2920 } else if (alloc_granule % elem_size == 0) {
2921 /* zero fragmentation by definition */
2922 } else {
2923 vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
2924 vm_size_t alloc_tmp = PAGE_SIZE;
2925 while ((alloc_tmp += PAGE_SIZE) <= ZONE_MAX_ALLOC_SIZE) {
2926 vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
2927 if (frag_tmp < frag) {
2928 frag = frag_tmp;
2929 alloc_granule = alloc_tmp;
2930 }
2931 }
2932 }
2933 return alloc_granule;
2934 }
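
/*
 * Worked example for the search above (illustration only, assuming 4K pages
 * and a hypothetical 1200-byte element): a 1-page chunk wastes 496 bytes
 * (12%), 2 pages waste 992 (12%), 3 pages waste 288 (2%) and 5 pages waste
 * only 80 bytes, so the loop scans every chunk size up to
 * ZONE_MAX_ALLOC_SIZE and keeps the one with the smallest integer waste
 * percentage. The helper below just exposes that percentage.
 */
__attribute__((unused))
static vm_size_t
chunk_waste_percent_demo(vm_size_t chunk_size, vm_size_t elem_size)
{
	/*
	 * integer waste percentage, exactly as computed by the loop above
	 * (elem_size is assumed to be non-zero), e.g.:
	 *
	 *	chunk_waste_percent_demo(4096, 1200)  == 12
	 *	chunk_waste_percent_demo(12288, 1200) == 2
	 *	chunk_waste_percent_demo(20480, 1200) == 0
	 */
	return (chunk_size % elem_size) * 100 / chunk_size;
}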
2935
2936 vm_size_t
2937 zone_get_foreign_alloc_size(
2938 const char *name __unused,
2939 vm_size_t elem_size,
2940 zone_create_flags_t flags,
2941 uint16_t min_pages)
2942 {
2943 vm_size_t adjusted_size = zone_elem_adjust_size(name, elem_size, flags,
2944 NULL);
2945 vm_size_t alloc_granule = zone_get_min_alloc_granule(adjusted_size,
2946 flags);
2947 vm_size_t min_size = min_pages * PAGE_SIZE;
2948 /*
2949 * Round up min_size to a multiple of alloc_granule
2950 */
2951 return ((min_size + alloc_granule - 1) / alloc_granule)
2952 * alloc_granule;
2953 }
2954
2955 zone_t
2956 zone_create_ext(
2957 const char *name,
2958 vm_size_t size,
2959 zone_create_flags_t flags,
2960 zone_id_t desired_zid,
2961 void (^extra_setup)(zone_t))
2962 {
2963 vm_size_t alloc;
2964 vm_size_t redzone;
2965 zone_t z;
2966
2967 if (size > ZONE_MAX_ALLOC_SIZE) {
2968 panic("zone_create: element size too large: %zd", (size_t)size);
2969 }
2970
2971 size = zone_elem_adjust_size(name, size, flags, &redzone);
2972 /*
2973 * Allocate the zone slot, return early if we found an older match.
2974 */
2975 z = zone_create_find(name, size, flags, desired_zid);
2976 if (__improbable(z->z_self)) {
2977 /* We found a zone to reuse */
2978 return z;
2979 }
2980
2981 /*
2982 * Initialize the zone properly.
2983 */
2984
2985 /*
2986 * If the kernel is post lockdown, copy the zone name passed in.
2987 * Else simply maintain a pointer to the name string as it can only
2988 * be a core XNU zone (no unloadable kext exists before lockdown).
2989 */
2990 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2991 size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2992 char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
2993 strlcpy(buf, name, nsz);
2994 z->z_name = buf;
2995 } else {
2996 z->z_name = name;
2997 }
2998 /*
2999 * If zone_init() hasn't run yet, the permanent zones do not exist.
3000 * We can limp along without properly initialized stats for a while,
3001 * zone_init() will rebuild the missing stats when it runs.
3002 */
3003 if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
3004 z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
3005 }
3006
3007 alloc = zone_get_min_alloc_granule(size, flags);
3008
3009 if (flags & ZC_KALLOC_HEAP) {
3010 size_t rem = (alloc % size) / (alloc / size);
3011
3012 /*
3013 * Try to grow the element size and spread elements more if the remaining
3014 * space is large enough.
3015 */
3016 size += rem & ~(KALLOC_MINALIGN - 1);
3017 }
3018
3019 z->pcpu_elem_size = z->z_elem_size = (uint16_t)size;
3020 z->alloc_pages = (uint16_t)atop(alloc);
3021 #if KASAN_ZALLOC
3022 z->kasan_redzone = redzone;
3023 if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
3024 z->kasan_fakestacks = true;
3025 }
3026 #endif
3027
3028 /*
3029 * Handle KPI flags
3030 */
3031 #if __LP64__
3032 if (flags & ZC_SEQUESTER) {
3033 z->va_sequester = true;
3034 }
3035 #endif
3036 /* ZC_CACHING applied after all configuration is done */
3037
3038 if (flags & ZC_PERCPU) {
3039 /*
3040 * ZC_CACHING is disallowed because it uses per-cpu zones for its
3041 * implementation and it would be circular. These allocations are
3042 * also quite expensive, so caching feels dangerous memory-wise too.
3043 *
3044 * ZC_ZFREE_CLEARMEM is forced because per-cpu zones allow for
3045 * pointer-sized allocations which poisoning doesn't support.
3046 */
3047 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_CACHING);
3048 zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_ALLOW_FOREIGN);
3049 z->percpu = true;
3050 z->gzalloc_exempt = true;
3051 z->zfree_clear_mem = true;
3052 z->pcpu_elem_size *= zpercpu_count();
3053 }
3054 if (flags & ZC_ZFREE_CLEARMEM) {
3055 z->zfree_clear_mem = true;
3056 }
3057 if (flags & ZC_NOGC) {
3058 z->collectable = false;
3059 }
3060 if (flags & ZC_NOENCRYPT) {
3061 z->noencrypt = true;
3062 }
3063 if (flags & ZC_ALIGNMENT_REQUIRED) {
3064 z->alignment_required = true;
3065 }
3066 if (flags & ZC_NOGZALLOC) {
3067 z->gzalloc_exempt = true;
3068 }
3069 if (flags & ZC_NOCALLOUT) {
3070 z->no_callout = true;
3071 }
3072 if (flags & ZC_DESTRUCTIBLE) {
3073 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_CACHING);
3074 zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_ALLOW_FOREIGN);
3075 z->destructible = true;
3076 }
3077
3078 /*
3079 * Handle Internal flags
3080 */
3081 if (flags & ZC_ALLOW_FOREIGN) {
3082 z->allows_foreign = true;
3083 }
3084 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3085 (flags & ZC_DATA_BUFFERS)) {
3086 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3087 }
3088 if (flags & ZC_KASAN_NOQUARANTINE) {
3089 z->kasan_noquarantine = true;
3090 }
3091 /* ZC_KASAN_NOREDZONE already handled */
3092
3093 /*
3094 * Then if there's extra tuning, do it
3095 */
3096 if (extra_setup) {
3097 extra_setup(z);
3098 }
3099
3100 /*
3101 * Configure debugging features
3102 */
3103 #if CONFIG_GZALLOC
3104 gzalloc_zone_init(z); /* might set z->gzalloc_tracked */
3105 #endif
3106 #if ZONE_ENABLE_LOGGING
3107 if (!z->gzalloc_tracked && num_zones_logged < max_num_zones_to_log) {
3108 /*
3109 * Check for and set up zone leak detection if requested via boot-args.
3110 * might set z->zone_logging
3111 */
3112 zone_setup_logging(z);
3113 }
3114 #endif /* ZONE_ENABLE_LOGGING */
3115 #if VM_MAX_TAG_ZONES
3116 if (!z->gzalloc_tracked && z->kalloc_heap && zone_tagging_on) {
3117 static int tag_zone_index;
3118 vm_offset_t esize = zone_elem_size(z);
3119 z->tags = true;
3120 z->tags_inline = (((page_size + esize - 1) / esize) <=
3121 (sizeof(uint32_t) / sizeof(uint16_t)));
3122 z->tag_zone_index = os_atomic_inc_orig(&tag_zone_index, relaxed);
3123 assert(z->tag_zone_index < VM_MAX_TAG_ZONES);
3124 }
3125 #endif
3126
3127 /*
3128 * Finally, fixup properties based on security policies, boot-args, ...
3129 */
3130 if ((ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) &&
3131 z->kalloc_heap == KHEAP_ID_DATA_BUFFERS) {
3132 z->submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
3133 }
3134 #if __LP64__
3135 if ((ZSECURITY_OPTIONS_SEQUESTER & zsecurity_options) &&
3136 (flags & ZC_NOSEQUESTER) == 0 &&
3137 z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP) {
3138 z->va_sequester = true;
3139 }
3140 #endif
3141 /*
3142 * Always clear zone elements smaller than a cacheline,
3143 * because it's pretty close to free.
3144 */
3145 if (size <= zp_min_size) {
3146 z->zfree_clear_mem = true;
3147 }
3148 if (zp_factor != 0 && !z->zfree_clear_mem) {
3149 z->zp_count = zone_poison_count_init(z);
3150 }
3151
3152 #if CONFIG_ZCACHE
3153 if ((flags & ZC_NOCACHING) == 0) {
3154 /*
3155 * Append kalloc heap name to zone name (if zone is used by kalloc)
3156 */
3157 char temp_zone_name[MAX_ZONE_NAME] = "";
3158 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3159
3160 /* Check if boot-arg specified it should have a cache */
3161 if (track_this_zone(temp_zone_name, cache_zone_name)) {
3162 flags |= ZC_CACHING;
3163 } else if (zcc_kalloc && z->kalloc_heap) {
3164 flags |= ZC_CACHING;
3165 }
3166 }
3167 if ((flags & ZC_CACHING) &&
3168 !z->tags && !z->zone_logging && !z->gzalloc_tracked) {
3169 zcache_init(z);
3170 }
3171 #endif /* CONFIG_ZCACHE */
3172
3173 lock_zone(z);
3174 z->z_self = z;
3175 unlock_zone(z);
3176
3177 return z;
3178 }
3179
3180 __startup_func
3181 void
3182 zone_create_startup(struct zone_create_startup_spec *spec)
3183 {
3184 *spec->z_var = zone_create_ext(spec->z_name, spec->z_size,
3185 spec->z_flags, spec->z_zid, spec->z_setup);
3186 }
3187
3188 /*
3189 * The first 4 fields of a zone_view and a zone alias, so that the zone_or_view_t
3190 * union works. Trust but verify.
3191 */
3192 #define zalloc_check_zov_alias(f1, f2) \
3193 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
3194 zalloc_check_zov_alias(z_self, zv_zone);
3195 zalloc_check_zov_alias(z_stats, zv_stats);
3196 zalloc_check_zov_alias(z_name, zv_name);
3197 zalloc_check_zov_alias(z_views, zv_next);
3198 #undef zalloc_check_zov_alias
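
/*
 * Generic shape of the check above, with hypothetical types (illustration
 * only): when two structures are meant to be read through a common union,
 * a static_assert on offsetof() turns accidental layout drift into a
 * compile-time error instead of a silent type-punning bug.
 */
struct zov_demo_a { void *first; unsigned long second; };
struct zov_demo_b { void *head;  unsigned long count; };
static_assert(offsetof(struct zov_demo_a, second) ==
    offsetof(struct zov_demo_b, count));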
3199
3200 __startup_func
3201 void
3202 zone_view_startup_init(struct zone_view_startup_spec *spec)
3203 {
3204 struct kalloc_heap *heap = NULL;
3205 zone_view_t zv = spec->zv_view;
3206 zone_t z;
3207
3208 switch (spec->zv_heapid) {
3209 case KHEAP_ID_DEFAULT:
3210 heap = KHEAP_DEFAULT;
3211 break;
3212 case KHEAP_ID_DATA_BUFFERS:
3213 heap = KHEAP_DATA_BUFFERS;
3214 break;
3215 case KHEAP_ID_KEXT:
3216 heap = KHEAP_KEXT;
3217 break;
3218 default:
3219 heap = NULL;
3220 }
3221
3222 if (heap) {
3223 z = kalloc_heap_zone_for_size(heap, spec->zv_size);
3224 assert(z);
3225 } else {
3226 z = spec->zv_zone;
3227 assert(spec->zv_size <= zone_elem_size(z));
3228 }
3229
3230 zv->zv_zone = z;
3231 zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
3232 zv->zv_next = z->z_views;
3233 if (z->z_views == NULL && z->kalloc_heap == KHEAP_ID_NONE) {
3234 /*
3235 * count the raw view for zones not in a heap,
3236 * kalloc_heap_init() already counts it for its members.
3237 */
3238 zone_view_count += 2;
3239 } else {
3240 zone_view_count += 1;
3241 }
3242 z->z_views = zv;
3243 }
3244
3245 zone_t
3246 zone_create(
3247 const char *name,
3248 vm_size_t size,
3249 zone_create_flags_t flags)
3250 {
3251 return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
3252 }
3253
3254 zone_t
3255 zinit(
3256 vm_size_t size, /* the size of an element */
3257 vm_size_t max, /* maximum memory to use */
3258 vm_size_t alloc __unused, /* allocation size */
3259 const char *name) /* a name for the zone */
3260 {
3261 zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
3262 zone_set_max(z, max);
3263 return z;
3264 }
3265
3266 void
3267 zdestroy(zone_t z)
3268 {
3269 unsigned int zindex = zone_index(z);
3270
3271 lock_zone(z);
3272
3273 if (!z->destructible || zone_caching_enabled(z) || z->allows_foreign) {
3274 panic("zdestroy: Zone %s%s isn't destructible",
3275 zone_heap_name(z), z->z_name);
3276 }
3277
3278 if (!z->z_self || z->expanding_no_vm_priv || z->expanding_vm_priv ||
3279 z->async_pending || z->waiting) {
3280 panic("zdestroy: Zone %s%s in an invalid state for destruction",
3281 zone_heap_name(z), z->z_name);
3282 }
3283
3284 #if !KASAN_ZALLOC
3285 /*
3286 * Unset the valid bit. We'll hit an assert failure on further operations
3287 * on this zone, until zinit() is called again.
3288 *
3289 * Leave the zone valid for KASan as we will see zfree's on quarantined free
3290 * elements even after the zone is destroyed.
3291 */
3292 z->z_self = NULL;
3293 #endif
3294 z->destroyed = true;
3295 unlock_zone(z);
3296
3297 /* Dump all the free elements */
3298 zone_drop_free_elements(z);
3299
3300 #if CONFIG_GZALLOC
3301 if (__improbable(z->gzalloc_tracked)) {
3302 /* If the zone is gzalloc managed dump all the elements in the free cache */
3303 gzalloc_empty_free_cache(z);
3304 }
3305 #endif
3306
3307 lock_zone(z);
3308
3309 while (!zone_pva_is_null(z->pages_sequester)) {
3310 struct zone_page_metadata *page_meta;
3311 vm_offset_t free_addr;
3312
3313 page_meta = zone_sequestered_page_get(z, &free_addr);
3314 unlock_zone(z);
3315 kmem_free(submap_for_zone(z), free_addr, ptoa(z->alloc_pages));
3316 lock_zone(z);
3317 }
3318
3319 #if !KASAN_ZALLOC
3320 /* Assert that all counts are zero */
3321 if (z->countavail || z->countfree || zone_size_wired(z) ||
3322 z->allfree_page_count || z->sequester_page_count) {
3323 panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
3324 zone_heap_name(z), z->z_name);
3325 }
3326
3327 /* consistency check: make sure everything is indeed empty */
3328 assert(zone_pva_is_null(z->pages_any_free_foreign));
3329 assert(zone_pva_is_null(z->pages_all_used_foreign));
3330 assert(zone_pva_is_null(z->pages_all_free));
3331 assert(zone_pva_is_null(z->pages_intermediate));
3332 assert(zone_pva_is_null(z->pages_all_used));
3333 assert(zone_pva_is_null(z->pages_sequester));
3334 #endif
3335
3336 unlock_zone(z);
3337
3338 simple_lock(&all_zones_lock, &zone_locks_grp);
3339
3340 assert(!bitmap_test(zone_destroyed_bitmap, zindex));
3341 /* Mark the zone as empty in the bitmap */
3342 bitmap_set(zone_destroyed_bitmap, zindex);
3343 num_zones_in_use--;
3344 assert(num_zones_in_use > 0);
3345
3346 simple_unlock(&all_zones_lock);
3347 }
3348
3349 #pragma mark zone (re)fill, jetsam
3350
3351 /*
3352 * Dealing with zone allocations from the mach VM code.
3353 *
3354 * The implementation of the mach VM itself uses the zone allocator
3355 * for things like the vm_map_entry data structure. In order to prevent
3356 * an infinite recursion problem when adding more pages to a zone, zalloc
3357 * uses a replenish thread to refill the VM layer's zones before they have
3358 * too few remaining free entries. The reserved remaining free entries
3359 * guarantee that the VM routines can get entries from already mapped pages.
3360 *
3361 * In order for that to work, the number of allocations in the nested
3362 * case has to be bounded. There are currently 2 replenish zones, and
3363 * if each needs 1 element of each zone to add a new page to itself, that
3364 * gives us a minimum reserve of 2 elements.
3365 *
3366 * There is also a deadlock issue with the zone garbage collection thread,
3367 * or any thread that is trying to free zone pages. While holding
3368 * the kernel's map lock they may need to allocate new VM map entries, hence
3369 * we need enough reserve to allow them to get past the point of holding the
3370 * map lock. After freeing that page, the GC thread will wait in drop_free_elements()
3371 * until the replenish threads can finish. Since there's only 1 GC thread at a time,
3372 * that adds a minimum of 1 to the reserve size.
3373 *
3374 * Since the minimum amount you can add to a zone is 1 page, we'll use 16K (from ARM)
3375 * as the refill size on all platforms.
3376 *
3377 * When a refill zone drops to half that available, i.e. REFILL_SIZE / 2,
3378 * zalloc_ext() will wake the replenish thread. The replenish thread runs
3379 * until at least REFILL_SIZE worth of free elements exist, before sleeping again.
3380 * In the meantime threads may continue to use the reserve until there are only REFILL_SIZE / 4
3381 * elements left. Below that point only the replenish threads themselves and the GC
3382 * thread may continue to use from the reserve.
3383 */
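
/*
 * Simplified sketch of the reserve policy described above (illustration
 * only; the real logic lives in zalloc_ext() and zone_replenish_thread(),
 * and thread privilege is reduced to a bool here): decide whether a caller
 * may take from the reserve and whether the replenish thread needs a wakeup,
 * given the zone's free count and its refill target.
 */
__attribute__((unused))
static bool
replenish_policy_demo(uint32_t countfree, uint32_t refill_target,
    bool zone_priv_thread, bool *wake_replenish)
{
	*wake_replenish = (countfree < refill_target / 2);

	if (countfree > refill_target / 4) {
		return true;    /* any thread may still allocate */
	}
	/* below a quarter of the target, only replenish/GC threads may dip in */
	return zone_priv_thread;
}
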
3384 static unsigned zone_replenish_loops;
3385 static unsigned zone_replenish_wakeups;
3386 static unsigned zone_replenish_wakeups_initiated;
3387 static unsigned zone_replenish_throttle_count;
3388
3389 #define ZONE_REPLENISH_TARGET (16 * 1024)
3390 static unsigned zone_replenish_active = 0; /* count of zones currently replenishing */
3391 static unsigned zone_replenish_max_threads = 0;
3392
3393 LCK_GRP_DECLARE(zone_replenish_lock_grp, "zone_replenish_lock");
3394 LCK_SPIN_DECLARE(zone_replenish_lock, &zone_replenish_lock_grp);
3395
3396 __abortlike
3397 static void
3398 zone_replenish_panic(zone_t zone, kern_return_t kr)
3399 {
3400 panic_include_zprint = TRUE;
3401 #if CONFIG_ZLEAKS
3402 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3403 panic_include_ztrace = TRUE;
3404 }
3405 #endif /* CONFIG_ZLEAKS */
3406 if (kr == KERN_NO_SPACE) {
3407 zone_t zone_largest = zone_find_largest();
3408 panic("zalloc: zone map exhausted while allocating from zone %s%s, "
3409 "likely due to memory leak in zone %s%s "
3410 "(%lu total bytes, %d elements allocated)",
3411 zone_heap_name(zone), zone->z_name,
3412 zone_heap_name(zone_largest), zone_largest->z_name,
3413 (unsigned long)zone_size_wired(zone_largest),
3414 zone_count_allocated(zone_largest));
3415 }
3416 panic("zalloc: %s%s (%d elements) retry fail %d",
3417 zone_heap_name(zone), zone->z_name,
3418 zone_count_allocated(zone), kr);
3419 }
3420
3421 static void
3422 zone_replenish_locked(zone_t z, zalloc_flags_t flags, bool asynchronously)
3423 {
3424 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3425 vm_offset_t space, alloc_size;
3426 uint32_t retry = 0;
3427 kern_return_t kr;
3428
3429 if (z->noencrypt) {
3430 kmaflags |= KMA_NOENCRYPT;
3431 }
3432 if (flags & Z_NOPAGEWAIT) {
3433 kmaflags |= KMA_NOPAGEWAIT;
3434 }
3435 if (z->permanent) {
3436 kmaflags |= KMA_PERMANENT;
3437 }
3438
3439 for (;;) {
3440 struct zone_page_metadata *page_meta = NULL;
3441
3442 /*
3443 * Try to allocate our regular chunk of pages,
3444 * unless the system is under massive pressure
3445 * and we're looking for more than 2 pages.
3446 */
3447 if (!z->percpu && z->alloc_pages > 2 && (vm_pool_low() || retry > 0)) {
3448 alloc_size = round_page(zone_elem_size(z));
3449 } else {
3450 alloc_size = ptoa(z->alloc_pages);
3451 page_meta = zone_sequestered_page_get(z, &space);
3452 }
3453
3454 unlock_zone(z);
3455
3456 #if CONFIG_ZLEAKS
3457 /*
3458 * Do the zone leak activation here because zleak_activate()
3459 * may block, and can't be done on the way out.
3460 */
3461 if (__improbable(zleak_state & ZLEAK_STATE_ENABLED)) {
3462 if (!(zleak_state & ZLEAK_STATE_ACTIVE) &&
3463 zone_submaps_approx_size() >= zleak_global_tracking_threshold) {
3464 kr = zleak_activate();
3465 if (kr != KERN_SUCCESS) {
3466 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3467 }
3468 }
3469 }
3470 #endif /* CONFIG_ZLEAKS */
3471
3472 /*
3473 * Trigger jetsams via the vm_pageout_garbage_collect thread if
3474 * we're running out of zone memory
3475 */
3476 if (is_zone_map_nearing_exhaustion()) {
3477 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3478 }
3479
3480 if (page_meta) {
3481 kr = zone_sequestered_page_populate(z, page_meta, space,
3482 alloc_size, kmaflags);
3483 } else {
3484 if (z->submap_idx == Z_SUBMAP_IDX_GENERAL_MAP && z->kalloc_heap != KHEAP_ID_NONE) {
3485 kmaflags |= KMA_KHEAP;
3486 }
3487 kr = kernel_memory_allocate(submap_for_zone(z),
3488 &space, alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3489 }
3490
3491 #if !__LP64__
3492 if (kr == KERN_NO_SPACE && z->allows_foreign) {
3493 /*
3494 * For zones allowing foreign pages, fall back to the kernel map
3495 */
3496 kr = kernel_memory_allocate(kernel_map, &space,
3497 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3498 }
3499 #endif
3500
3501 if (kr == KERN_SUCCESS) {
3502 break;
3503 }
3504
3505 if (flags & Z_NOPAGEWAIT) {
3506 lock_zone(z);
3507 return;
3508 }
3509
3510 if (asynchronously) {
3511 assert_wait_timeout(&z->prio_refill_count,
3512 THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
3513 thread_block(THREAD_CONTINUE_NULL);
3514 } else if (++retry == 3) {
3515 zone_replenish_panic(z, kr);
3516 }
3517
3518 lock_zone(z);
3519 }
3520
3521 zcram_and_lock(z, space, alloc_size);
3522
3523 #if CONFIG_ZLEAKS
3524 if (__improbable(zleak_state & ZLEAK_STATE_ACTIVE)) {
3525 if (!z->zleak_on &&
3526 zone_size_wired(z) >= zleak_per_zone_tracking_threshold) {
3527 z->zleak_on = true;
3528 }
3529 }
3530 #endif /* CONFIG_ZLEAKS */
3531 }
3532
3533 /*
3534 * High priority VM privileged thread used to asynchronously refill a given zone.
3535 * These are needed for data structures used by the lower level VM itself. The
3536 * replenish thread maintains a reserve of elements, so that the VM will never
3537 * block in the zone allocator.
3538 */
3539 __dead2
3540 static void
3541 zone_replenish_thread(void *_z, wait_result_t __unused wr)
3542 {
3543 zone_t z = _z;
3544
3545 current_thread()->options |= (TH_OPT_VMPRIV | TH_OPT_ZONE_PRIV);
3546
3547 for (;;) {
3548 lock_zone(z);
3549 assert(z->z_self == z);
3550 assert(z->zone_replenishing);
3551 assert(z->prio_refill_count != 0);
3552
3553 while (z->countfree < z->prio_refill_count) {
3554 assert(!z->expanding_no_vm_priv);
3555 assert(!z->expanding_vm_priv);
3556
3557 zone_replenish_locked(z, Z_WAITOK, true);
3558
3559 assert(z->z_self == z);
3560 zone_replenish_loops++;
3561 }
3562
3563 /* Wakeup any potentially throttled allocations. */
3564 thread_wakeup(z);
3565
3566 assert_wait(&z->prio_refill_count, THREAD_UNINT);
3567
3568 /*
3569 * We finished refilling the zone, so decrement the active count
3570 * and wake up any waiting GC threads.
3571 */
3572 lck_spin_lock(&zone_replenish_lock);
3573 assert(zone_replenish_active > 0);
3574 if (--zone_replenish_active == 0) {
3575 thread_wakeup((event_t)&zone_replenish_active);
3576 }
3577 lck_spin_unlock(&zone_replenish_lock);
3578
3579 z->zone_replenishing = false;
3580 unlock_zone(z);
3581
3582 thread_block(THREAD_CONTINUE_NULL);
3583 zone_replenish_wakeups++;
3584 }
3585 }
3586
3587 void
3588 zone_prio_refill_configure(zone_t z)
3589 {
3590 thread_t th;
3591 kern_return_t tres;
3592
3593 lock_zone(z);
3594 assert(!z->prio_refill_count && !z->destructible);
3595 z->prio_refill_count = (uint16_t)(ZONE_REPLENISH_TARGET / zone_elem_size(z));
3596 z->zone_replenishing = true;
3597 unlock_zone(z);
3598
3599 lck_spin_lock(&zone_replenish_lock);
3600 ++zone_replenish_max_threads;
3601 ++zone_replenish_active;
3602 lck_spin_unlock(&zone_replenish_lock);
3603 OSMemoryBarrier();
3604
3605 tres = kernel_thread_start_priority(zone_replenish_thread, z,
3606 MAXPRI_KERNEL, &th);
3607 if (tres != KERN_SUCCESS) {
3608 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
3609 }
3610
3611 thread_deallocate(th);
3612 }
3613
3614 static void
3615 zone_randomize_freelist(zone_t zone, struct zone_page_metadata *meta,
3616 vm_offset_t size, zone_addr_kind_t kind, unsigned int *entropy_buffer)
3617 {
3618 const vm_size_t elem_size = zone_elem_size(zone);
3619 vm_offset_t left, right, head, base;
3620 vm_offset_t element;
3621
3622 left = ZONE_PAGE_FIRST_OFFSET(kind);
3623 right = size - ((size - left) % elem_size);
3624 head = 0;
3625 base = zone_meta_to_addr(meta, kind);
3626
3627 while (left < right) {
3628 if (zone_leaks_scan_enable || __improbable(zone->tags) ||
3629 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
3630 element = base + left;
3631 left += elem_size;
3632 } else {
3633 right -= elem_size;
3634 element = base + right;
3635 }
3636
3637 vm_offset_t *primary = (vm_offset_t *)element;
3638 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
3639
3640 *primary = *backup = head ^ zp_nopoison_cookie;
3641 head = element;
3642 }
3643
3644 meta->zm_freelist_offs = (uint16_t)(head - base);
3645 }
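
/*
 * A minimal, self-contained sketch of the XOR-cookie freelist encoding that
 * zone_randomize_freelist(), zfree_direct_locked() and zalloc_direct_locked()
 * rely on.  The demo_* names and cookie values below are hypothetical and are
 * only meant to illustrate the scheme; the block is guarded out so it is
 * never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>
#include <stdbool.h>

static const uintptr_t demo_cookie_nopoison = 0x5a5a5a5a5a5a5a5aUL;
static const uintptr_t demo_cookie_poisoned = 0xa5a5a5a5a5a5a5a5UL;

/* Write both the primary and the backup next-pointer of a free element. */
static void
demo_link(uintptr_t *primary, uintptr_t *backup, uintptr_t next, bool poisoned)
{
	*primary = next ^ demo_cookie_nopoison;
	*backup  = next ^ (poisoned ? demo_cookie_poisoned : demo_cookie_nopoison);
}

/* Recover the next pointer and report whether the element was poisoned. */
static uintptr_t
demo_unlink(const uintptr_t *primary, const uintptr_t *backup, bool *poisoned)
{
	uintptr_t next = *primary ^ demo_cookie_nopoison;

	*poisoned = false;
	if (*primary != *backup) {
		if (next != (*backup ^ demo_cookie_poisoned)) {
			return 0;           /* neither cookie matches: corruption */
		}
		*poisoned = true;           /* backup carried the poisoned cookie */
	}
	return next;
}
#endif /* illustration only */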
3646
3647 /*
3648 * Cram the given memory into the specified zone. Update the zone page count accordingly.
3649 */
3650 static void
3651 zcram_and_lock(zone_t zone, vm_offset_t newmem, vm_size_t size)
3652 {
3653 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
3654 struct zone_page_metadata *meta;
3655 zone_addr_kind_t kind;
3656 uint32_t pg_count = (uint32_t)atop(size);
3657 uint32_t zindex = zone_index(zone);
3658 uint32_t free_count;
3659 uint16_t empty_freelist_offs = PAGE_METADATA_EMPTY_FREELIST;
3660
3661 /* Basic sanity checks */
3662 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
3663 assert((newmem & PAGE_MASK) == 0);
3664 assert((size & PAGE_MASK) == 0);
3665
3666 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START,
3667 zindex, size);
3668
3669 kind = zone_addr_kind(newmem, size);
3670 #if DEBUG || DEVELOPMENT
3671 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
3672 kprintf("zcram(%p[%s%s], 0x%lx%s, 0x%lx)\n", zone,
3673 zone_heap_name(zone), zone->z_name, (uintptr_t)newmem,
3674 kind == ZONE_ADDR_FOREIGN ? "[F]" : "", (uintptr_t)size);
3675 }
3676 #endif /* DEBUG || DEVELOPMENT */
3677
3678 /*
3679 * Initialize the metadata for all pages. We don't need the zone lock
3680 * here because we are not manipulating any zone related state yet.
3681 *
3682 * This includes randomizing the freelists as the metadata isn't
3683 * published yet.
3684 */
3685
3686 if (kind == ZONE_ADDR_NATIVE) {
3687 /*
3688 * We're being called by zfill,
3689 * zone_replenish_thread or vm_page_more_fictitious,
3690 *
3691 * each of which only allocates either a single page or
3692 * `alloc_pages` worth at a time.
3693 */
3694 assert(pg_count <= zone->alloc_pages);
3695
3696 /*
3697 * Make sure the range of metadata entries we're about to init
3698 * has proper physical backing, then initialize them.
3699 */
3700 meta = zone_meta_from_addr(newmem, kind);
3701 zone_meta_populate(meta, meta + pg_count);
3702
3703 if (zone->permanent) {
3704 empty_freelist_offs = 0;
3705 }
3706
3707 meta[0] = (struct zone_page_metadata){
3708 .zm_index = zindex,
3709 .zm_page_count = pg_count,
3710 .zm_percpu = zone->percpu,
3711 .zm_freelist_offs = empty_freelist_offs,
3712 };
3713
3714 for (uint32_t i = 1; i < pg_count; i++) {
3715 meta[i] = (struct zone_page_metadata){
3716 .zm_index = zindex,
3717 .zm_page_count = i,
3718 .zm_percpu = zone->percpu,
3719 .zm_secondary_page = true,
3720 .zm_freelist_offs = empty_freelist_offs,
3721 };
3722 }
3723
3724 if (!zone->permanent) {
3725 zone_randomize_freelist(zone, meta,
3726 zone->percpu ? PAGE_SIZE : size, kind, entropy_buffer);
3727 }
3728 } else {
3729 if (!zone->allows_foreign || !from_foreign_range(newmem, size)) {
3730 panic("zcram_and_lock: foreign memory [%lx] being crammed is "
3731 "outside of foreign range", (uintptr_t)newmem);
3732 }
3733
3734 /*
3735 * We cannot support elements larger than page size for foreign
3736 * memory because we put metadata on the page itself for each
3737 * page of foreign memory.
3738 *
3739 * We need to do this in order to be able to reach the metadata
3740 * when any element is freed.
3741 */
3742 assert(!zone->percpu && !zone->permanent);
3743 assert(zone_elem_size(zone) <= PAGE_SIZE - sizeof(struct zone_page_metadata));
3744
3745 bzero((void *)newmem, size);
3746
3747 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3748 meta = (struct zone_page_metadata *)(newmem + offs);
3749 *meta = (struct zone_page_metadata){
3750 .zm_index = zindex,
3751 .zm_page_count = 1,
3752 .zm_freelist_offs = empty_freelist_offs,
3753 };
3754 meta->zm_foreign_cookie[0] = ZONE_FOREIGN_COOKIE;
3755 zone_randomize_freelist(zone, meta, PAGE_SIZE, kind,
3756 entropy_buffer);
3757 }
3758 }
3759
3760 #if VM_MAX_TAG_ZONES
3761 if (__improbable(zone->tags)) {
3762 assert(kind == ZONE_ADDR_NATIVE && !zone->percpu);
3763 ztMemoryAdd(zone, newmem, size);
3764 }
3765 #endif /* VM_MAX_TAG_ZONES */
3766
3767 /*
3768 * Insert the initialized pages / metadatas into the right lists.
3769 */
3770
3771 lock_zone(zone);
3772 assert(zone->z_self == zone);
3773
3774 zone->page_count += pg_count;
3775 if (zone->page_count_hwm < zone->page_count) {
3776 zone->page_count_hwm = zone->page_count;
3777 }
3778 os_atomic_add(&zones_phys_page_count, pg_count, relaxed);
3779
3780 if (kind == ZONE_ADDR_NATIVE) {
3781 os_atomic_add(&zones_phys_page_mapped_count, pg_count, relaxed);
3782 if (zone->permanent) {
3783 zone_meta_queue_push(zone, &zone->pages_intermediate, meta, kind);
3784 } else {
3785 zone_meta_queue_push(zone, &zone->pages_all_free, meta, kind);
3786 zone->allfree_page_count += meta->zm_page_count;
3787 }
3788 free_count = zone_elem_count(zone, size, kind);
3789 zone->countfree += free_count;
3790 zone->countavail += free_count;
3791 } else {
3792 free_count = zone_elem_count(zone, PAGE_SIZE, kind);
3793 for (vm_offset_t offs = 0; offs < size; offs += PAGE_SIZE) {
3794 meta = (struct zone_page_metadata *)(newmem + offs);
3795 zone_meta_queue_push(zone, &zone->pages_any_free_foreign, meta, kind);
3796 zone->countfree += free_count;
3797 zone->countavail += free_count;
3798 }
3799 }
3800
3801 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zindex);
3802 }
3803
3804 void
3805 zcram(zone_t zone, vm_offset_t newmem, vm_size_t size)
3806 {
3807 zcram_and_lock(zone, newmem, size);
3808 unlock_zone(zone);
3809 }
3810
3811 /*
3812 * Fill a zone with enough memory to contain at least nelem elements.
3813 * Return the number of elements actually put into the zone, which may
3814 * be more than the caller asked for since the memory allocation is
3815 * rounded up to the next zone allocation size.
3816 */
3817 int
3818 zfill(
3819 zone_t zone,
3820 int nelem)
3821 {
3822 kern_return_t kr;
3823 vm_offset_t memory;
3824
3825 vm_size_t alloc_size = ptoa(zone->alloc_pages);
3826 vm_size_t nalloc_inc = zone_elem_count(zone, alloc_size, ZONE_ADDR_NATIVE);
3827 vm_size_t nalloc = 0, goal = MAX(0, nelem);
3828 int kmaflags = KMA_KOBJECT | KMA_ZERO;
3829
3830 if (zone->noencrypt) {
3831 kmaflags |= KMA_NOENCRYPT;
3832 }
3833
3834 assert(!zone->allows_foreign && !zone->permanent);
3835
3836 /*
3837 * Trigger jetsams via the vm_pageout_garbage_collect thread if we're
3838 * running out of zone memory
3839 */
3840 if (is_zone_map_nearing_exhaustion()) {
3841 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3842 }
3843
3844 if (zone->va_sequester) {
3845 lock_zone(zone);
3846
3847 do {
3848 struct zone_page_metadata *page_meta;
3849 page_meta = zone_sequestered_page_get(zone, &memory);
3850 if (NULL == page_meta) {
3851 break;
3852 }
3853 unlock_zone(zone);
3854
3855 kr = zone_sequestered_page_populate(zone, page_meta,
3856 memory, alloc_size, kmaflags);
3857 if (KERN_SUCCESS != kr) {
3858 goto out_nolock;
3859 }
3860
3861 zcram_and_lock(zone, memory, alloc_size);
3862 nalloc += nalloc_inc;
3863 } while (nalloc < goal);
3864
3865 unlock_zone(zone);
3866 }
3867
3868 out_nolock:
3869 while (nalloc < goal) {
3870 kr = kernel_memory_allocate(submap_for_zone(zone), &memory,
3871 alloc_size, 0, kmaflags, VM_KERN_MEMORY_ZONE);
3872 if (kr != KERN_SUCCESS) {
3873 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
3874 __func__, (unsigned long)(nalloc * alloc_size));
3875 break;
3876 }
3877
3878 zcram(zone, memory, alloc_size);
3879 nalloc += nalloc_inc;
3880 }
3881
3882 return (int)nalloc;
3883 }
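
/*
 * Hedged sketch of zfill()'s rounding behavior: memory arrives in chunks of
 * zone->alloc_pages pages, each holding `per_chunk` elements, so the count
 * actually added rounds `nelem` up to a multiple of `per_chunk` (assuming no
 * allocation failure and ignoring the VA-sequestered path).  demo_zfill_count()
 * is a hypothetical stand-in, never compiled.
 */
#if 0 /* illustration only */
static int
demo_zfill_count(int nelem, int per_chunk)
{
	int nalloc = 0;

	while (nalloc < nelem) {
		nalloc += per_chunk;    /* one zcram() of alloc_size bytes */
	}
	return nalloc;                  /* e.g. nelem = 100, per_chunk = 32 -> 128 */
}
#endif /* illustration only */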
3884
3885 /*
3886 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3887 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3888 */
3889 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3890
3891 /*
3892 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3893 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3894 */
3895 TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
3896 ZONE_MAP_JETSAM_LIMIT_DEFAULT);
3897
3898 void
3899 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3900 {
3901 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3902 *current_size = ptoa_64(phys_pages);
3903 *capacity = zone_phys_mapped_max;
3904 }
3905
3906 void
3907 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3908 {
3909 zone_t largest_zone = zone_find_largest();
3910
3911 /*
3912 * Append kalloc heap name to zone name (if zone is used by kalloc)
3913 */
3914 snprintf(zone_name, zone_name_len, "%s%s",
3915 zone_heap_name(largest_zone), largest_zone->z_name);
3916
3917 *zone_size = zone_size_wired(largest_zone);
3918 }
3919
3920 boolean_t
3921 is_zone_map_nearing_exhaustion(void)
3922 {
3923 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_mapped_count, relaxed);
3924 return ptoa_64(phys_pages) > (zone_phys_mapped_max * zone_map_jetsam_limit) / 100;
3925 }
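
/*
 * Worked sketch of the check above: with zone_map_jetsam_limit at its default
 * of 95, a 2GiB physical budget starts triggering jetsams once more than
 * roughly 1.9GiB of zone pages are mapped.  demo_nearing_exhaustion() is a
 * hypothetical stand-in using the same integer arithmetic, never compiled.
 */
#if 0 /* illustration only */
#include <stdbool.h>
#include <stdint.h>

static bool
demo_nearing_exhaustion(uint64_t mapped_bytes, uint64_t capacity_bytes,
    unsigned int limit_pct)
{
	return mapped_bytes > (capacity_bytes * limit_pct) / 100;
}
/* demo_nearing_exhaustion(2040ULL << 20, 2048ULL << 20, 95) is true */
#endif /* illustration only */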
3926
3927
3928 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3929
3930 /*
3931 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3932 * to walk through the jetsam priority bands and kill processes.
3933 */
3934 static void
3935 kill_process_in_largest_zone(void)
3936 {
3937 pid_t pid = -1;
3938 zone_t largest_zone = zone_find_largest();
3939
3940 printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, map size %lld, capacity %lld [jetsam limit %d%%]\n",
3941 ptoa_64(os_atomic_load(&zones_phys_page_mapped_count, relaxed)), ptoa_64(zone_phys_mapped_max),
3942 ptoa_64(os_atomic_load(&zones_phys_page_count, relaxed)),
3943 (uint64_t)zone_submaps_approx_size(),
3944 (uint64_t)zone_range_size(&zone_info.zi_map_range),
3945 zone_map_jetsam_limit);
3946 printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
3947 largest_zone->z_name, (uintptr_t)zone_size_wired(largest_zone));
3948
3949 /*
3950 * We want to make sure we don't call this function from userspace,
3951 * or we could end up trying to synchronously kill the process
3952 * whose context we're in, causing the system to hang.
3953 */
3954 assert(current_task() == kernel_task);
3955
3956 /*
3957 * If vm_object_zone is the largest, check to see if the number of
3958 * elements in vm_map_entry_zone is comparable.
3959 *
3960 * If so, consider vm_map_entry_zone as the largest. This lets us target
3961 * a specific process to jetsam to quickly recover from the zone map
3962 * bloat.
3963 */
3964 if (largest_zone == vm_object_zone) {
3965 unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
3966 unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
3967 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3968 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3969 largest_zone = vm_map_entry_zone;
3970 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
3971 (uintptr_t)zone_size_wired(largest_zone));
3972 }
3973 }
3974
3975 /* TODO: Extend this to check for the largest process in other zones as well. */
3976 if (largest_zone == vm_map_entry_zone) {
3977 pid = find_largest_process_vm_map_entries();
3978 } else {
3979 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
3980 "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
3981 largest_zone->z_name);
3982 }
3983 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3984 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3985 }
3986 }
3987
3988 #pragma mark zalloc module init
3989
3990 /*
3991 * Initialize the "zone of zones" which uses fixed memory allocated
3992 * earlier in memory initialization. zone_bootstrap is called
3993 * before zone_init.
3994 */
3995 __startup_func
3996 void
3997 zone_bootstrap(void)
3998 {
3999 /* Validate struct zone_page_metadata expectations */
4000 if ((1U << ZONE_PAGECOUNT_BITS) <
4001 atop(ZONE_MAX_ALLOC_SIZE) * sizeof(struct zone_page_metadata)) {
4002 panic("ZONE_PAGECOUNT_BITS is not large enough to hold page counts");
4003 }
4004
4005 /* Validate struct zone_packed_virtual_address expectations */
4006 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
4007 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
4008 panic("zone_pva_t can't pack a kernel page address in 31 bits");
4009 }
4010
4011 zpercpu_early_count = ml_early_cpu_max_number() + 1;
4012
4013 /* Set up zone element poisoning */
4014 zp_bootstrap();
4015
4016 random_bool_init(&zone_bool_gen);
4017
4018 /*
4019 * The KASAN quarantine for kalloc doesn't understand heaps
4020 * and trips the heap confusion panics. At the end of the day,
4021 * all these security measures do double duty with KASAN.
4022 *
4023 * On 32bit kernels, these protections are just too expensive.
4024 */
4025 #if !defined(__LP64__) || KASAN_ZALLOC
4026 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER;
4027 zsecurity_options &= ~ZSECURITY_OPTIONS_SUBMAP_USER_DATA;
4028 zsecurity_options &= ~ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC;
4029 #endif
4030
4031 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
4032
4033 #if CONFIG_ZCACHE
4034 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
4035 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
4036 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
4037 }
4038 #endif /* CONFIG_ZCACHE */
4039 }
4040
4041 #if __LP64__
4042 #if CONFIG_EMBEDDED
4043 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (32ULL * 1024ULL * 1024 * 1024)
4044 #else
4045 #define ZONE_MAP_VIRTUAL_SIZE_LP64 (128ULL * 1024ULL * 1024 * 1024)
4046 #endif
4047 #endif /* __LP64__ */
4048
4049 #define SINGLE_GUARD 16384
4050 #define MULTI_GUARD (3 * SINGLE_GUARD)
4051
4052 #if __LP64__
4053 static inline vm_offset_t
4054 zone_restricted_va_max(void)
4055 {
4056 vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
4057 vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
4058
4059 return trunc_page(MIN(compressor_max, vm_page_max));
4060 }
4061 #endif
4062
4063 __startup_func
4064 static void
4065 zone_tunables_fixup(void)
4066 {
4067 if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
4068 zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
4069 }
4070 }
4071 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
4072
4073 __startup_func
4074 static vm_size_t
4075 zone_phys_size_max(void)
4076 {
4077 mach_vm_size_t zsize;
4078 vm_size_t zsizearg;
4079
4080 if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
4081 zsize = zsizearg * (1024ULL * 1024);
4082 } else {
4083 zsize = sane_size >> 2; /* Set target zone size as 1/4 of physical memory */
4084 #if defined(__LP64__)
4085 zsize += zsize >> 1;
4086 #endif /* __LP64__ */
4087 }
4088
4089 if (zsize < CONFIG_ZONE_MAP_MIN) {
4090 zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */
4091 }
4092 if (zsize > sane_size >> 1) {
4093 zsize = sane_size >> 1; /* Clamp to half of RAM max */
4094 }
4095 if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
4096 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
4097 vm_size_t orig_zsize = zsize;
4098 zsize = ZONE_MAP_MAX;
4099 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
4100 (uintptr_t)orig_zsize, (uintptr_t)zsize);
4101 }
4102
4103 assert((vm_size_t) zsize == zsize);
4104 return (vm_size_t)trunc_page(zsize);
4105 }
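
/*
 * Worked example of the default sizing above (no "zsize" boot-arg), assuming
 * an LP64 device with 8GiB of usable RAM: 8GiB / 4 = 2GiB, plus half of that
 * again gives a 3GiB target, which already sits inside the
 * [CONFIG_ZONE_MAP_MIN, RAM / 2] clamp.  demo_default_zsize() is a
 * hypothetical stand-in, never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>

static uint64_t
demo_default_zsize(uint64_t sane_size, uint64_t zmin, uint64_t zmax)
{
	uint64_t zsize = sane_size >> 2;        /* 1/4 of physical memory   */

	zsize += zsize >> 1;                    /* LP64: grow to 3/8 of RAM */
	if (zsize < zmin) {
		zsize = zmin;
	}
	if (zsize > sane_size >> 1) {
		zsize = sane_size >> 1;         /* never more than half of RAM */
	}
	return zsize < zmax ? zsize : zmax;     /* platform maximum clip    */
}
#endif /* illustration only */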
4106
4107 __startup_func
4108 static struct zone_map_range
4109 zone_init_allocate_va(vm_offset_t *submap_min, vm_size_t size, bool guard)
4110 {
4111 struct zone_map_range r;
4112 kern_return_t kr;
4113
4114 if (guard) {
4115 vm_map_offset_t addr = *submap_min;
4116 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4117
4118 vmk_flags.vmkf_permanent = TRUE;
4119 kr = vm_map_enter(kernel_map, &addr, size, 0,
4120 VM_FLAGS_FIXED, vmk_flags, VM_KERN_MEMORY_ZONE, kernel_object,
4121 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
4122 *submap_min = (vm_offset_t)addr;
4123 } else {
4124 kr = kernel_memory_allocate(kernel_map, submap_min, size,
4125 0, KMA_KOBJECT | KMA_PAGEABLE | KMA_VAONLY, VM_KERN_MEMORY_ZONE);
4126 }
4127 if (kr != KERN_SUCCESS) {
4128 panic("zone_init_allocate_va(0x%lx:0x%zx) failed: %d",
4129 (uintptr_t)*submap_min, (size_t)size, kr);
4130 }
4131
4132 r.min_address = *submap_min;
4133 *submap_min += size;
4134 r.max_address = *submap_min;
4135
4136 return r;
4137 }
4138
4139 __startup_func
4140 static void
4141 zone_submap_init(
4142 vm_offset_t *submap_min,
4143 unsigned idx,
4144 uint64_t zone_sub_map_numer,
4145 uint64_t *remaining_denom,
4146 vm_offset_t *remaining_size,
4147 vm_size_t guard_size)
4148 {
4149 vm_offset_t submap_start, submap_end;
4150 vm_size_t submap_size;
4151 vm_map_t submap;
4152 kern_return_t kr;
4153
4154 submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
4155 *remaining_denom);
4156 submap_start = *submap_min;
4157 submap_end = submap_start + submap_size;
4158
4159 #if defined(__LP64__)
4160 if (idx == Z_SUBMAP_IDX_VA_RESTRICTED_MAP) {
4161 vm_offset_t restricted_va_max = zone_restricted_va_max();
4162 if (submap_end > restricted_va_max) {
4163 #if DEBUG || DEVELOPMENT
4164 printf("zone_init: submap[%d] clipped to %zdM of %zdM\n", idx,
4165 (size_t)(restricted_va_max - submap_start) >> 20,
4166 (size_t)submap_size >> 20);
4167 #endif /* DEBUG || DEVELOPMENT */
4168 guard_size += submap_end - restricted_va_max;
4169 *remaining_size -= submap_end - restricted_va_max;
4170 submap_end = restricted_va_max;
4171 submap_size = restricted_va_max - submap_start;
4172 }
4173
4174 vm_packing_verify_range("vm_compressor",
4175 submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
4176 vm_packing_verify_range("vm_page",
4177 submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
4178 }
4179 #endif /* defined(__LP64__) */
4180
4181 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
4182 vmk_flags.vmkf_permanent = TRUE;
4183 kr = kmem_suballoc(kernel_map, submap_min, submap_size,
4184 FALSE, VM_FLAGS_FIXED, vmk_flags,
4185 VM_KERN_MEMORY_ZONE, &submap);
4186 if (kr != KERN_SUCCESS) {
4187 panic("kmem_suballoc(kernel_map[%d] %p:%p) failed: %d",
4188 idx, (void *)submap_start, (void *)submap_end, kr);
4189 }
4190
4191 #if DEBUG || DEVELOPMENT
4192 printf("zone_init: submap[%d] %p:%p (%zuM)\n",
4193 idx, (void *)submap_start, (void *)submap_end,
4194 (size_t)submap_size >> 20);
4195 #endif /* DEBUG || DEVELOPMENT */
4196
4197 zone_submaps[idx] = submap;
4198 *submap_min = submap_end;
4199 *remaining_size -= submap_size;
4200 *remaining_denom -= zone_sub_map_numer;
4201
4202 zone_init_allocate_va(submap_min, guard_size, true);
4203 }
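
/*
 * Minimal sketch of the proportional VA split performed above for each submap
 * in turn: every step takes numer/denom of the VA that is still unassigned,
 * then removes its share from both the size and the denominator, so weights
 * of 20/40/40 carve roughly 20%, 40% and 40% of the original range (before
 * guard pages and page truncation).  demo_carve() is a hypothetical stand-in,
 * never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>

static uint64_t
demo_carve(uint64_t *remaining_size, uint64_t *remaining_denom, uint64_t numer)
{
	uint64_t share = numer * *remaining_size / *remaining_denom;

	*remaining_size -= share;
	*remaining_denom -= numer;
	return share;
}
/*
 * With remaining_size = 100 and weights {20, 40, 40}:
 * 20 (20/100 of 100), then 40 (40/80 of 80), then 40 (40/40 of 40).
 */
#endif /* illustration only */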
4204
4205 /* Global initialization of Zone Allocator.
4206 * Runs after zone_bootstrap.
4207 */
4208 __startup_func
4209 static void
4210 zone_init(void)
4211 {
4212 vm_size_t zone_meta_size;
4213 vm_size_t zone_map_size;
4214 vm_size_t remaining_size;
4215 vm_offset_t submap_min = 0;
4216
4217 if (ZSECURITY_OPTIONS_SUBMAP_USER_DATA & zsecurity_options) {
4218 zone_last_submap_idx = Z_SUBMAP_IDX_BAG_OF_BYTES_MAP;
4219 } else {
4220 zone_last_submap_idx = Z_SUBMAP_IDX_GENERAL_MAP;
4221 }
4222 zone_phys_mapped_max = zone_phys_size_max();
4223
4224 #if __LP64__
4225 zone_map_size = ZONE_MAP_VIRTUAL_SIZE_LP64;
4226 #else
4227 zone_map_size = zone_phys_mapped_max;
4228 #endif
4229 zone_meta_size = round_page(atop(zone_map_size) *
4230 sizeof(struct zone_page_metadata));
4231
4232 /*
4233 * Zone "map" setup:
4234 *
4235 * [ VA_RESTRICTED ] <-- LP64 only
4236 * [ SINGLE_GUARD ] <-- LP64 only
4237 * [ meta ]
4238 * [ SINGLE_GUARD ]
4239 * [ map<i> ] \ for each extra map
4240 * [ MULTI_GUARD ] /
4241 */
4242 remaining_size = zone_map_size;
4243 #if defined(__LP64__)
4244 remaining_size -= SINGLE_GUARD;
4245 #endif
4246 remaining_size -= zone_meta_size + SINGLE_GUARD;
4247 remaining_size -= MULTI_GUARD * (zone_last_submap_idx -
4248 Z_SUBMAP_IDX_GENERAL_MAP + 1);
4249
4250 #if VM_MAX_TAG_ZONES
4251 if (zone_tagging_on) {
4252 zone_tagging_init(zone_map_size);
4253 }
4254 #endif
4255
4256 uint64_t remaining_denom = 0;
4257 uint64_t zone_sub_map_numer[Z_SUBMAP_IDX_COUNT] = {
4258 #ifdef __LP64__
4259 [Z_SUBMAP_IDX_VA_RESTRICTED_MAP] = 20,
4260 #endif /* defined(__LP64__) */
4261 [Z_SUBMAP_IDX_GENERAL_MAP] = 40,
4262 [Z_SUBMAP_IDX_BAG_OF_BYTES_MAP] = 40,
4263 };
4264
4265 for (unsigned idx = 0; idx <= zone_last_submap_idx; idx++) {
4266 #if DEBUG || DEVELOPMENT
4267 char submap_name[MAX_SUBMAP_NAME];
4268 snprintf(submap_name, MAX_SUBMAP_NAME, "submap%d", idx);
4269 PE_parse_boot_argn(submap_name, &zone_sub_map_numer[idx], sizeof(uint64_t));
4270 #endif
4271 remaining_denom += zone_sub_map_numer[idx];
4272 }
4273
4274 /*
4275 * And now allocate the various pieces of VA and submaps.
4276 *
4277 * Make a first allocation of contiguous VA that we'll deallocate,
4278 * then carve out memory in that range again linearly.
4279 * The kernel is still single threaded at this stage.
4280 */
4281
4282 struct zone_map_range *map_range = &zone_info.zi_map_range;
4283
4284 *map_range = zone_init_allocate_va(&submap_min, zone_map_size, false);
4285 submap_min = map_range->min_address;
4286 kmem_free(kernel_map, submap_min, zone_map_size);
4287
4288 #if defined(__LP64__)
4289 /*
4290 * Allocate `Z_SUBMAP_IDX_VA_RESTRICTED_MAP` first because its VA range
4291 * can't go beyond RESTRICTED_VA_MAX for the vm_page_t packing to work.
4292 */
4293 zone_submap_init(&submap_min, Z_SUBMAP_IDX_VA_RESTRICTED_MAP,
4294 zone_sub_map_numer[Z_SUBMAP_IDX_VA_RESTRICTED_MAP], &remaining_denom,
4295 &remaining_size, SINGLE_GUARD);
4296 #endif /* defined(__LP64__) */
4297
4298 /*
4299 * Allocate metadata array
4300 */
4301 zone_info.zi_meta_range =
4302 zone_init_allocate_va(&submap_min, zone_meta_size, true);
4303 zone_init_allocate_va(&submap_min, SINGLE_GUARD, true);
4304
4305 zone_info.zi_array_base =
4306 (struct zone_page_metadata *)zone_info.zi_meta_range.min_address -
4307 zone_pva_from_addr(map_range->min_address).packed_address;
4308
4309 /*
4310 * Allocate other submaps
4311 */
4312 for (unsigned idx = Z_SUBMAP_IDX_GENERAL_MAP; idx <= zone_last_submap_idx; idx++) {
4313 zone_submap_init(&submap_min, idx, zone_sub_map_numer[idx],
4314 &remaining_denom, &remaining_size, MULTI_GUARD);
4315 }
4316
4317 vm_map_t general_map = zone_submaps[Z_SUBMAP_IDX_GENERAL_MAP];
4318 zone_info.zi_general_range.min_address = vm_map_min(general_map);
4319 zone_info.zi_general_range.max_address = vm_map_max(general_map);
4320
4321 assert(submap_min == map_range->max_address);
4322
4323 #if CONFIG_GZALLOC
4324 gzalloc_init(zone_map_size);
4325 #endif
4326
4327 zone_create_flags_t kma_flags = ZC_NOCACHING |
4328 ZC_NOGC | ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT |
4329 ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
4330
4331 (void)zone_create_ext("vm.permanent", 1, kma_flags,
4332 ZONE_ID_PERMANENT, ^(zone_t z){
4333 z->permanent = true;
4334 z->z_elem_size = 1;
4335 z->pcpu_elem_size = 1;
4336 #if defined(__LP64__)
4337 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4338 #endif
4339 });
4340 (void)zone_create_ext("vm.permanent.percpu", 1, kma_flags | ZC_PERCPU,
4341 ZONE_ID_PERCPU_PERMANENT, ^(zone_t z){
4342 z->permanent = true;
4343 z->z_elem_size = 1;
4344 z->pcpu_elem_size = zpercpu_count();
4345 #if defined(__LP64__)
4346 z->submap_idx = Z_SUBMAP_IDX_VA_RESTRICTED_MAP;
4347 #endif
4348 });
4349
4350 /*
4351 * Now fix the zones that are missing their zone stats;
4352 * we don't really know if zfree()s happened, so our stats
4353 * are slightly off for early boot. ¯\_(ツ)_/¯
4354 */
4355 zone_index_foreach(idx) {
4356 zone_t tz = &zone_array[idx];
4357
4358 if (tz->z_self) {
4359 zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
4360
4361 zpercpu_get_cpu(zs, 0)->zs_mem_allocated +=
4362 (tz->countavail - tz->countfree) *
4363 zone_elem_size(tz);
4364 assert(tz->z_stats == NULL);
4365 tz->z_stats = zs;
4366 #if ZONE_ENABLE_LOGGING
4367 if (tz->zone_logging && !tz->zlog_btlog) {
4368 zone_enable_logging(tz);
4369 }
4370 #endif
4371 }
4372 }
4373
4374 #if CONFIG_ZLEAKS
4375 /*
4376 * Initialize the zone leak monitor
4377 */
4378 zleak_init(zone_map_size);
4379 #endif /* CONFIG_ZLEAKS */
4380
4381 #if VM_MAX_TAG_ZONES
4382 if (zone_tagging_on) {
4383 vm_allocation_zones_init();
4384 }
4385 #endif
4386 }
4387 STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
4388
4389 __startup_func
4390 static void
4391 zone_set_foreign_range(
4392 vm_offset_t range_min,
4393 vm_offset_t range_max)
4394 {
4395 zone_info.zi_foreign_range.min_address = range_min;
4396 zone_info.zi_foreign_range.max_address = range_max;
4397 }
4398
4399 __startup_func
4400 vm_offset_t
4401 zone_foreign_mem_init(vm_size_t size)
4402 {
4403 vm_offset_t mem = (vm_offset_t) pmap_steal_memory(size);
4404 zone_set_foreign_range(mem, mem + size);
4405 return mem;
4406 }
4407
4408 #pragma mark zalloc
4409
4410 #if KASAN_ZALLOC
4411 /*
4412 * Called from zfree() to add the element being freed to the KASan quarantine.
4413 *
4414 * Returns true if the newly-freed element made it into the quarantine without
4415 * displacing another, false otherwise. In the latter case, addrp points to the
4416 * address of the displaced element, which will be freed by the zone.
4417 */
4418 static bool
4419 kasan_quarantine_freed_element(
4420 zone_t *zonep, /* the zone the element is being freed to */
4421 void **addrp) /* address of the element being freed */
4422 {
4423 zone_t zone = *zonep;
4424 void *addr = *addrp;
4425
4426 /*
4427 * Resize back to the real allocation size and hand off to the KASan
4428 * quarantine. `addr` may then point to a different allocation, if the
4429 * current element replaced another in the quarantine. The zone then
4430 * takes ownership of the swapped out free element.
4431 */
4432 vm_size_t usersz = zone_elem_size(zone) - 2 * zone->kasan_redzone;
4433 vm_size_t sz = usersz;
4434
4435 if (addr && zone->kasan_redzone) {
4436 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
4437 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
4438 assert(sz == zone_elem_size(zone));
4439 }
4440 if (addr && !zone->kasan_noquarantine) {
4441 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
4442 if (!addr) {
4443 return TRUE;
4444 }
4445 }
4446 if (addr && zone->kasan_noquarantine) {
4447 kasan_unpoison(addr, zone_elem_size(zone));
4448 }
4449 *addrp = addr;
4450 return FALSE;
4451 }
4452
4453 #endif /* KASAN_ZALLOC */
4454
4455 static inline bool
4456 zone_needs_async_refill(zone_t zone)
4457 {
4458 if (zone->countfree != 0 || zone->async_pending || zone->no_callout) {
4459 return false;
4460 }
4461
4462 return zone->expandable || zone->page_count < zone->page_count_max;
4463 }
4464
4465 __attribute__((noinline))
4466 static void
4467 zone_refill_synchronously_locked(
4468 zone_t zone,
4469 zalloc_flags_t flags)
4470 {
4471 thread_t thr = current_thread();
4472 bool set_expanding_vm_priv = false;
4473 zone_pva_t orig = zone->pages_intermediate;
4474
4475 while ((flags & Z_NOWAIT) == 0 && (zone->permanent
4476 ? zone_pva_is_equal(zone->pages_intermediate, orig)
4477 : zone->countfree == 0)) {
4478 /*
4479 * zone is empty, try to expand it
4480 *
4481 * Note that we now allow up to 2 threads (1 vm_privileged and
4482 * 1 non-vm_privileged) to expand the zone concurrently...
4483 *
4484 * this is necessary to avoid stalling vm_privileged threads
4485 * running critical code necessary to continue
4486 * compressing/swapping pages (i.e. making new free pages) from
4487 * stalling behind non-vm_privileged threads waiting to acquire
4488 * free pages when the vm_page_free_count is below the
4489 * vm_page_free_reserved limit.
4490 */
4491 if ((zone->expanding_no_vm_priv || zone->expanding_vm_priv) &&
4492 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->expanding_vm_priv)) {
4493 /*
4494 * This is a non-vm_privileged thread and a non-vm_privileged or
4495 * a vm_privileged thread is already expanding the zone...
4496 * OR
4497 * this is a vm_privileged thread and a vm_privileged thread is
4498 * already expanding the zone...
4499 *
4500 * In either case wait for a thread to finish, then try again.
4501 */
4502 zone->waiting = true;
4503 assert_wait(zone, THREAD_UNINT);
4504 unlock_zone(zone);
4505 thread_block(THREAD_CONTINUE_NULL);
4506 lock_zone(zone);
4507 continue;
4508 }
4509
4510 if (zone->page_count >= zone->page_count_max) {
4511 if (zone->exhaustible) {
4512 break;
4513 }
4514 if (zone->expandable) {
4515 /*
4516 * If we're expandable, just don't go through this again.
4517 */
4518 zone->page_count_max = ~0u;
4519 } else {
4520 unlock_zone(zone);
4521
4522 panic_include_zprint = true;
4523 #if CONFIG_ZLEAKS
4524 if (zleak_state & ZLEAK_STATE_ACTIVE) {
4525 panic_include_ztrace = true;
4526 }
4527 #endif /* CONFIG_ZLEAKS */
4528 panic("zalloc: zone \"%s\" empty.", zone->z_name);
4529 }
4530 }
4531
4532 /*
4533 * It is possible that a BG thread is refilling/expanding the zone
4534 * and gets pre-empted during that operation. That blocks all other
4535 * threads from making progress leading to a watchdog timeout. To
4536 * avoid that, boost the thread priority using the rwlock boost
4537 */
4538 set_thread_rwlock_boost();
4539
4540 if ((thr->options & TH_OPT_VMPRIV)) {
4541 zone->expanding_vm_priv = true;
4542 set_expanding_vm_priv = true;
4543 } else {
4544 zone->expanding_no_vm_priv = true;
4545 }
4546
4547 zone_replenish_locked(zone, flags, false);
4548
4549 if (set_expanding_vm_priv == true) {
4550 zone->expanding_vm_priv = false;
4551 } else {
4552 zone->expanding_no_vm_priv = false;
4553 }
4554
4555 if (zone->waiting) {
4556 zone->waiting = false;
4557 thread_wakeup(zone);
4558 }
4559 clear_thread_rwlock_boost();
4560
4561 if (zone->countfree == 0) {
4562 assert(flags & Z_NOPAGEWAIT);
4563 break;
4564 }
4565 }
4566
4567 if ((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) &&
4568 zone_needs_async_refill(zone) && !vm_pool_low()) {
4569 zone->async_pending = true;
4570 unlock_zone(zone);
4571 thread_call_enter(&call_async_alloc);
4572 lock_zone(zone);
4573 assert(zone->z_self == zone);
4574 }
4575 }
4576
4577 __attribute__((noinline))
4578 static void
4579 zone_refill_asynchronously_locked(zone_t zone)
4580 {
4581 uint32_t min_free = zone->prio_refill_count / 2;
4582 uint32_t resv_free = zone->prio_refill_count / 4;
4583 thread_t thr = current_thread();
4584
4585 /*
4586 * Nothing to do if there are plenty of elements.
4587 */
4588 while (zone->countfree <= min_free) {
4589 /*
4590 * Wakeup the replenish thread if not running.
4591 */
4592 if (!zone->zone_replenishing) {
4593 lck_spin_lock(&zone_replenish_lock);
4594 assert(zone_replenish_active < zone_replenish_max_threads);
4595 ++zone_replenish_active;
4596 lck_spin_unlock(&zone_replenish_lock);
4597 zone->zone_replenishing = true;
4598 zone_replenish_wakeups_initiated++;
4599 thread_wakeup(&zone->prio_refill_count);
4600 }
4601
4602 /*
4603 * We'll let VM_PRIV threads continue to allocate until the
4604 * reserve drops to 25%. After that only TH_OPT_ZONE_PRIV threads
4605 * may continue.
4606 *
4607 * TH_OPT_ZONE_PRIV threads are the GC thread and the replenish threads themselves.
4608 * Replenish threads *need* to use the reserve. GC threads need to
4609 * get through the current allocation, but then will wait at a higher
4610 * level after they've dropped any locks which would deadlock the
4611 * replenish thread.
4612 */
4613 if ((zone->countfree > resv_free && (thr->options & TH_OPT_VMPRIV)) ||
4614 (thr->options & TH_OPT_ZONE_PRIV)) {
4615 break;
4616 }
4617
4618 /*
4619 * Wait for the replenish threads to add more elements for us to allocate from.
4620 */
4621 zone_replenish_throttle_count++;
4622 unlock_zone(zone);
4623 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
4624 thread_block(THREAD_CONTINUE_NULL);
4625 lock_zone(zone);
4626
4627 assert(zone->z_self == zone);
4628 }
4629
4630 /*
4631 * If we're here because of zone_gc(), we didn't wait for
4632 * zone_replenish_thread to finish. So we need to ensure that we
4633 * will successfully grab an element. This only applies to zones
4634 * that have a replenish thread configured.
4635 *
4636 * The value of (refill_level / 2) in the previous bit of code should have
4637 * given us headroom even though this thread didn't wait.
4638 */
4639 if (thr->options & TH_OPT_ZONE_PRIV) {
4640 assert(zone->countfree != 0);
4641 }
4642 }
4643
4644 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
4645 __attribute__((noinline))
4646 static void
4647 zalloc_log_or_trace_leaks(zone_t zone, vm_offset_t addr)
4648 {
4649 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
4650 unsigned int numsaved = 0;
4651
4652 #if ZONE_ENABLE_LOGGING
4653 if (DO_LOGGING(zone)) {
4654 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4655 btlog_add_entry(zone->zlog_btlog, (void *)addr,
4656 ZOP_ALLOC, (void **)zbt, numsaved);
4657 }
4658 #endif
4659
4660 #if CONFIG_ZLEAKS
4661 /*
4662 * Zone leak detection: capture a backtrace every zleak_sample_factor
4663 * allocations in this zone.
4664 */
4665 if (__improbable(zone->zleak_on)) {
4666 if (sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
4667 /* Avoid backtracing twice if zone logging is on */
4668 if (numsaved == 0) {
4669 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4670 }
4671 /* Sampling can fail if another sample is happening at the same time in a different zone. */
4672 if (!zleak_log(zbt, addr, numsaved, zone_elem_size(zone))) {
4673 /* If it failed, roll back the counter so we sample the next allocation instead. */
4674 zone->zleak_capture = zleak_sample_factor;
4675 }
4676 }
4677 }
4678
4679 if (__improbable(zone_leaks_scan_enable &&
4680 !(zone_elem_size(zone) & (sizeof(uintptr_t) - 1)))) {
4681 unsigned int count, idx;
4682 /* Fill element, from tail, with backtrace in reverse order */
4683 if (numsaved == 0) {
4684 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4685 }
4686 count = (unsigned int)(zone_elem_size(zone) / sizeof(uintptr_t));
4687 if (count >= numsaved) {
4688 count = numsaved - 1;
4689 }
4690 for (idx = 0; idx < count; idx++) {
4691 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
4692 }
4693 }
4694 #endif /* CONFIG_ZLEAKS */
4695 }
4696
4697 static inline bool
4698 zalloc_should_log_or_trace_leaks(zone_t zone, vm_size_t elem_size)
4699 {
4700 #if ZONE_ENABLE_LOGGING
4701 if (DO_LOGGING(zone)) {
4702 return true;
4703 }
4704 #endif
4705 #if CONFIG_ZLEAKS
4706 /*
4707 * Zone leak detection: capture a backtrace every zleak_sample_factor
4708 * allocations in this zone.
4709 */
4710 if (zone->zleak_on) {
4711 return true;
4712 }
4713 if (zone_leaks_scan_enable && !(elem_size & (sizeof(uintptr_t) - 1))) {
4714 return true;
4715 }
4716 #endif /* CONFIG_ZLEAKS */
4717 return false;
4718 }
4719 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
4720 #if ZONE_ENABLE_LOGGING
4721
4722 __attribute__((noinline))
4723 static void
4724 zfree_log_trace(zone_t zone, vm_offset_t addr)
4725 {
4726 /*
4727 * See if we're doing logging on this zone.
4728 *
4729 * There are two styles of logging used depending on
4730 * whether we're trying to catch a leak or corruption.
4731 */
4732 if (__improbable(DO_LOGGING(zone))) {
4733 if (corruption_debug_flag) {
4734 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4735 unsigned int numsaved;
4736 /*
4737 * We're logging to catch a corruption.
4738 *
4739 * Add a record of this zfree operation to log.
4740 */
4741 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
4742 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE,
4743 (void **)zbt, numsaved);
4744 } else {
4745 /*
4746 * We're logging to catch a leak.
4747 *
4748 * Remove any record we might have for this element
4749 * since it's being freed. Note that we may not find it
4750 * if the buffer overflowed and that's OK.
4751 *
4752 * Since the log is of a limited size, old records get
4753 * overwritten if there are more zallocs than zfrees.
4754 */
4755 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
4756 }
4757 }
4758 }
4759 #endif /* ZONE_ENABLE_LOGGING */
4760
4761 /*
4762 * Removes an element from the zone's free list; the caller must ensure the zone has free elements.
4763 * Verifies that the next-pointer and backup next-pointer are intact,
4764 * and verifies that a poisoned element hasn't been modified.
4765 */
4766 vm_offset_t
4767 zalloc_direct_locked(
4768 zone_t zone,
4769 zalloc_flags_t flags __unused,
4770 vm_size_t waste __unused)
4771 {
4772 struct zone_page_metadata *page_meta;
4773 zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
4774 vm_offset_t element, page, validate_bit = 0;
4775
4776 /* if zone is empty, bail */
4777 if (!zone_pva_is_null(zone->pages_any_free_foreign)) {
4778 kind = ZONE_ADDR_FOREIGN;
4779 page_meta = zone_pva_to_meta(zone->pages_any_free_foreign, kind);
4780 page = (vm_offset_t)page_meta;
4781 } else if (!zone_pva_is_null(zone->pages_intermediate)) {
4782 page_meta = zone_pva_to_meta(zone->pages_intermediate, kind);
4783 page = zone_pva_to_addr(zone->pages_intermediate);
4784 } else if (!zone_pva_is_null(zone->pages_all_free)) {
4785 page_meta = zone_pva_to_meta(zone->pages_all_free, kind);
4786 page = zone_pva_to_addr(zone->pages_all_free);
4787 if (os_sub_overflow(zone->allfree_page_count,
4788 page_meta->zm_page_count, &zone->allfree_page_count)) {
4789 zone_accounting_panic(zone, "allfree_page_count wrap-around");
4790 }
4791 } else {
4792 zone_accounting_panic(zone, "countfree corruption");
4793 }
4794
4795 if (!zone_has_index(zone, page_meta->zm_index)) {
4796 zone_page_metadata_index_confusion_panic(zone, page, page_meta);
4797 }
4798
4799 element = zone_page_meta_get_freelist(zone, page_meta, page);
4800
4801 vm_offset_t *primary = (vm_offset_t *) element;
4802 vm_offset_t *backup = get_backup_ptr(zone_elem_size(zone), primary);
4803
4804 /*
4805 * since the primary next pointer is xor'ed with zp_nopoison_cookie
4806 * for obfuscation, retrieve the original value back
4807 */
4808 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
4809 vm_offset_t next_element_primary = *primary;
4810 vm_offset_t next_element_backup = *backup;
4811
4812 /*
4813 * backup_ptr_mismatch_panic will determine what next_element
4814 * should have been, and print it appropriately
4815 */
4816 if (!zone_page_meta_is_sane_element(zone, page_meta, page, next_element, kind)) {
4817 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4818 }
4819
4820 /* Check the backup pointer for the regular cookie */
4821 if (__improbable(next_element_primary != next_element_backup)) {
4822 /* Check for the poisoned cookie instead */
4823 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
4824 /* Neither cookie is valid, corruption has occurred */
4825 backup_ptr_mismatch_panic(zone, page_meta, page, element);
4826 }
4827
4828 /*
4829 * Element was marked as poisoned, so check its integrity before using it.
4830 */
4831 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4832 } else if (zone->zfree_clear_mem) {
4833 validate_bit = ZALLOC_ELEMENT_NEEDS_VALIDATION;
4834 }
4835
4836 /* Remove this element from the free list */
4837 zone_page_meta_set_freelist(page_meta, page, next_element);
4838
4839 if (kind == ZONE_ADDR_FOREIGN) {
4840 if (next_element == 0) {
4841 /* last foreign element allocated on page, move to all_used_foreign */
4842 zone_meta_requeue(zone, &zone->pages_all_used_foreign, page_meta, kind);
4843 }
4844 } else if (next_element == 0) {
4845 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
4846 } else if (page_meta->zm_alloc_count == 0) {
4847 /* remove from free, move to intermediate */
4848 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
4849 }
4850
4851 if (os_add_overflow(page_meta->zm_alloc_count, 1,
4852 &page_meta->zm_alloc_count)) {
4853 /*
4854 * This will not catch a lot of errors; the proper check
4855 * would be against the number of elements this run should
4856 * have, which is expensive to count.
4857 *
4858 * But zm_alloc_count is a 16 bit number which an attacker
4859 * could theoretically find valuable to wrap around,
4860 * so catch this.
4861 */
4862 zone_page_meta_accounting_panic(zone, page_meta,
4863 "zm_alloc_count overflow");
4864 }
4865 if (os_sub_overflow(zone->countfree, 1, &zone->countfree)) {
4866 zone_accounting_panic(zone, "countfree wrap-around");
4867 }
4868
4869 #if VM_MAX_TAG_ZONES
4870 if (__improbable(zone->tags)) {
4871 vm_tag_t tag = zalloc_flags_get_tag(flags);
4872 // set the tag with b0 clear so the block remains inuse
4873 ZTAG(zone, element)[0] = (vm_tag_t)(tag << 1);
4874 vm_tag_update_zone_size(tag, zone->tag_zone_index,
4875 zone_elem_size(zone), waste);
4876 }
4877 #endif /* VM_MAX_TAG_ZONES */
4878 #if KASAN_ZALLOC
4879 if (zone->percpu) {
4880 zpercpu_foreach_cpu(i) {
4881 kasan_poison_range(element + ptoa(i),
4882 zone_elem_size(zone), ASAN_VALID);
4883 }
4884 } else {
4885 kasan_poison_range(element, zone_elem_size(zone), ASAN_VALID);
4886 }
4887 #endif
4888
4889 return element | validate_bit;
4890 }
4891
4892 /*
4893 * zalloc returns an element from the specified zone.
4894 */
4895 void *
4896 zalloc_ext(
4897 zone_t zone,
4898 zone_stats_t zstats,
4899 zalloc_flags_t flags,
4900 vm_size_t waste)
4901 {
4902 vm_offset_t addr = 0;
4903 vm_size_t elem_size = zone_elem_size(zone);
4904
4905 /*
4906 * KASan uses zalloc() for fakestack, which can be called anywhere.
4907 * However, we make sure these calls can never block.
4908 */
4909 assert(zone->kasan_fakestacks ||
4910 ml_get_interrupts_enabled() ||
4911 ml_is_quiescing() ||
4912 debug_mode_active() ||
4913 startup_phase < STARTUP_SUB_EARLY_BOOT);
4914
4915 /*
4916 * Make sure Z_NOFAIL was not obviously misused
4917 */
4918 if ((flags & Z_NOFAIL) && !zone->prio_refill_count) {
4919 assert(!zone->exhaustible && (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
4920 }
4921
4922 #if CONFIG_ZCACHE
4923 /*
4924 * Note: if zone caching is on, gzalloc and tags aren't used
4925 * so we can always check this first
4926 */
4927 if (zone_caching_enabled(zone)) {
4928 addr = zcache_alloc_from_cpu_cache(zone, zstats, waste);
4929 if (__probable(addr)) {
4930 goto allocated_from_cache;
4931 }
4932 }
4933 #endif /* CONFIG_ZCACHE */
4934
4935 #if CONFIG_GZALLOC
4936 if (__improbable(zone->gzalloc_tracked)) {
4937 addr = gzalloc_alloc(zone, zstats, flags);
4938 goto allocated_from_gzalloc;
4939 }
4940 #endif /* CONFIG_GZALLOC */
4941 #if VM_MAX_TAG_ZONES
4942 if (__improbable(zone->tags)) {
4943 vm_tag_t tag = zalloc_flags_get_tag(flags);
4944 if (tag == VM_KERN_MEMORY_NONE) {
4945 /*
4946 * zone views into heaps can lead to a site-less call
4947 * and we fall back to KALLOC as a tag for those.
4948 */
4949 tag = VM_KERN_MEMORY_KALLOC;
4950 flags |= Z_VM_TAG(tag);
4951 }
4952 vm_tag_will_update_zone(tag, zone->tag_zone_index);
4953 }
4954 #endif /* VM_MAX_TAG_ZONES */
4955
4956 lock_zone(zone);
4957 assert(zone->z_self == zone);
4958
4959 /*
4960 * Check if we need another thread to replenish the zone or
4961 * if we have to wait for a replenish thread to finish.
4962 * This is used for elements, like vm_map_entry, which are
4963 * needed themselves to implement zalloc().
4964 */
4965 if (__improbable(zone->prio_refill_count &&
4966 zone->countfree <= zone->prio_refill_count / 2)) {
4967 zone_refill_asynchronously_locked(zone);
4968 } else if (__improbable(zone->countfree == 0)) {
4969 zone_refill_synchronously_locked(zone, flags);
4970 if (__improbable(zone->countfree == 0)) {
4971 unlock_zone(zone);
4972 if (__improbable(flags & Z_NOFAIL)) {
4973 zone_nofail_panic(zone);
4974 }
4975 goto out_nomem;
4976 }
4977 }
4978
4979 addr = zalloc_direct_locked(zone, flags, waste);
4980 if (__probable(zstats != NULL)) {
4981 /*
4982 * The few vm zones used before zone_init() runs do not have
4983 * per-cpu stats yet
4984 */
4985 int cpu = cpu_number();
4986 zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += elem_size;
4987 #if ZALLOC_DETAILED_STATS
4988 if (waste) {
4989 zpercpu_get_cpu(zstats, cpu)->zs_mem_wasted += waste;
4990 }
4991 #endif /* ZALLOC_DETAILED_STATS */
4992 }
4993
4994 unlock_zone(zone);
4995
4996 #if ZALLOC_ENABLE_POISONING
4997 bool validate = addr & ZALLOC_ELEMENT_NEEDS_VALIDATION;
4998 #endif
4999 addr &= ~ZALLOC_ELEMENT_NEEDS_VALIDATION;
5000 zone_clear_freelist_pointers(zone, addr);
5001 #if ZALLOC_ENABLE_POISONING
5002 /*
5003 * Note: percpu zones do not respect ZONE_MIN_ELEM_SIZE,
5004 * so we will check the first word even if we just
5005 * cleared it.
5006 */
5007 zalloc_validate_element(zone, addr, elem_size - sizeof(vm_offset_t),
5008 validate);
5009 #endif /* ZALLOC_ENABLE_POISONING */
5010
5011 allocated_from_cache:
5012 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
5013 if (__improbable(zalloc_should_log_or_trace_leaks(zone, elem_size))) {
5014 zalloc_log_or_trace_leaks(zone, addr);
5015 }
5016 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
5017
5018 #if CONFIG_GZALLOC
5019 allocated_from_gzalloc:
5020 #endif
5021 #if KASAN_ZALLOC
5022 if (zone->kasan_redzone) {
5023 addr = kasan_alloc(addr, elem_size,
5024 elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
5025 elem_size -= 2 * zone->kasan_redzone;
5026 }
5027 /*
5028 * Initialize buffer with unique pattern only if memory
5029 * wasn't expected to be zeroed.
5030 */
5031 if (!zone->zfree_clear_mem && !(flags & Z_ZERO)) {
5032 kasan_leak_init(addr, elem_size);
5033 }
5034 #endif /* KASAN_ZALLOC */
5035 if ((flags & Z_ZERO) && !zone->zfree_clear_mem) {
5036 bzero((void *)addr, elem_size);
5037 }
5038
5039 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, elem_size, addr);
5040
5041 out_nomem:
5042 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5043 return (void *)addr;
5044 }
5045
5046 void *
5047 zalloc(union zone_or_view zov)
5048 {
5049 return zalloc_flags(zov, Z_WAITOK);
5050 }
5051
5052 void *
5053 zalloc_noblock(union zone_or_view zov)
5054 {
5055 return zalloc_flags(zov, Z_NOWAIT);
5056 }
5057
5058 void *
5059 zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
5060 {
5061 zone_t zone = zov.zov_view->zv_zone;
5062 zone_stats_t zstats = zov.zov_view->zv_stats;
5063 assert(!zone->percpu);
5064 return zalloc_ext(zone, zstats, flags, 0);
5065 }
5066
5067 void *
5068 zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
5069 {
5070 zone_t zone = zov.zov_view->zv_zone;
5071 zone_stats_t zstats = zov.zov_view->zv_stats;
5072 assert(zone->percpu);
5073 return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, 0));
5074 }
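
/*
 * Hedged usage sketch of the allocation entry points above, assuming the
 * zone_create()/zalloc_flags()/zfree() declarations from kern/zalloc.h at
 * this revision.  "demo_widget" and its zone are hypothetical; this only
 * illustrates the calling convention and is never compiled.
 */
#if 0 /* illustration only */
struct demo_widget {
	uint64_t dw_id;
	void    *dw_payload;
};

static zone_t demo_widget_zone;

static void
demo_widget_zone_init(void)
{
	demo_widget_zone = zone_create("demo.widget",
	    sizeof(struct demo_widget), ZC_ZFREE_CLEARMEM);
}

static struct demo_widget *
demo_widget_alloc(void)
{
	/* Z_WAITOK may block; Z_ZERO returns zeroed memory */
	return zalloc_flags(demo_widget_zone, Z_WAITOK | Z_ZERO);
}

static void
demo_widget_free(struct demo_widget *dw)
{
	zfree(demo_widget_zone, dw);
}
#endif /* illustration only */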
5075
5076 static void *
5077 _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
5078 {
5079 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5080 struct zone_page_metadata *page_meta;
5081 vm_offset_t offs, addr;
5082 zone_pva_t pva;
5083
5084 assert(ml_get_interrupts_enabled() ||
5085 ml_is_quiescing() ||
5086 debug_mode_active() ||
5087 startup_phase < STARTUP_SUB_EARLY_BOOT);
5088
5089 size = (size + mask) & ~mask;
5090 assert(size <= PAGE_SIZE);
5091
5092 lock_zone(zone);
5093 assert(zone->z_self == zone);
5094
5095 for (;;) {
5096 pva = zone->pages_intermediate;
5097 while (!zone_pva_is_null(pva)) {
5098 page_meta = zone_pva_to_meta(pva, kind);
5099 if (page_meta->zm_freelist_offs + size <= PAGE_SIZE) {
5100 goto found;
5101 }
5102 pva = page_meta->zm_page_next;
5103 }
5104
5105 zone_refill_synchronously_locked(zone, Z_WAITOK);
5106 }
5107
5108 found:
5109 offs = (page_meta->zm_freelist_offs + mask) & ~mask;
5110 page_meta->zm_freelist_offs = offs + size;
5111 page_meta->zm_alloc_count += size;
5112 zone->countfree -= size;
5113 if (__probable(zone->z_stats)) {
5114 zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
5115 }
5116
5117 if (page_meta->zm_alloc_count >= PAGE_SIZE - sizeof(vm_offset_t)) {
5118 zone_meta_requeue(zone, &zone->pages_all_used, page_meta, kind);
5119 }
5120
5121 unlock_zone(zone);
5122
5123 addr = offs + zone_pva_to_addr(pva);
5124
5125 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
5126 return (void *)addr;
5127 }
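
/*
 * Small sketch of the alignment step used by _zalloc_permanent() above: the
 * caller's mask is (alignment - 1), so rounding the current freelist offset
 * up to the next aligned boundary is a single add-and-mask.  demo_align_up()
 * is hypothetical and never compiled.
 */
#if 0 /* illustration only */
#include <stdint.h>

static uint64_t
demo_align_up(uint64_t offs, uint64_t mask)
{
	return (offs + mask) & ~mask;   /* e.g. demo_align_up(19, 15) == 32 */
}
#endif /* illustration only */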
5128
5129 static void *
5130 _zalloc_permanent_large(size_t size, vm_offset_t mask)
5131 {
5132 kern_return_t kr;
5133 vm_offset_t addr;
5134
5135 kr = kernel_memory_allocate(kernel_map, &addr, size, mask,
5136 KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO,
5137 VM_KERN_MEMORY_KALLOC);
5138 if (kr != 0) {
5139 panic("zalloc_permanent: unable to allocate %zd bytes (%d)",
5140 size, kr);
5141 }
5142 return (void *)addr;
5143 }
5144
5145 void *
5146 zalloc_permanent(vm_size_t size, vm_offset_t mask)
5147 {
5148 if (size <= PAGE_SIZE) {
5149 zone_t zone = &zone_array[ZONE_ID_PERMANENT];
5150 return _zalloc_permanent(zone, size, mask);
5151 }
5152 return _zalloc_permanent_large(size, mask);
5153 }
5154
5155 void *
5156 zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
5157 {
5158 zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
5159 return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
5160 }
5161
5162 void
5163 zalloc_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
5164 {
5165 zone_index_foreach(i) {
5166 zone_t z = &zone_array[i];
5167
5168 if (z->no_callout) {
5169 /* async_pending will never be set */
5170 continue;
5171 }
5172
5173 lock_zone(z);
5174 if (z->z_self && z->async_pending) {
5175 z->async_pending = false;
5176 zone_refill_synchronously_locked(z, Z_WAITOK);
5177 }
5178 unlock_zone(z);
5179 }
5180 }
5181
5182 /*
5183 * Adds the element to the head of the zone's free list
5184 * Keeps a backup next-pointer at the end of the element
5185 */
5186 void
5187 zfree_direct_locked(zone_t zone, vm_offset_t element, bool poison)
5188 {
5189 struct zone_page_metadata *page_meta;
5190 vm_offset_t page, old_head;
5191 zone_addr_kind_t kind;
5192 vm_size_t elem_size = zone_elem_size(zone);
5193
5194 vm_offset_t *primary = (vm_offset_t *) element;
5195 vm_offset_t *backup = get_backup_ptr(elem_size, primary);
5196
5197 page_meta = zone_allocated_element_resolve(zone, element, &page, &kind);
5198 old_head = zone_page_meta_get_freelist(zone, page_meta, page);
5199
5200 if (__improbable(old_head == element)) {
5201 panic("zfree: double free of %p to zone %s%s\n",
5202 (void *) element, zone_heap_name(zone), zone->z_name);
5203 }
5204
5205 #if ZALLOC_ENABLE_POISONING
5206 if (poison && elem_size < ZONE_MIN_ELEM_SIZE) {
5207 assert(zone->percpu);
5208 poison = false;
5209 }
5210 #else
5211 poison = false;
5212 #endif
5213
5214 /*
5215 * Always write a redundant next pointer.
5216 * So that it is more difficult to forge, xor it with a random cookie.
5217 * A poisoned element is indicated by using zp_poisoned_cookie
5218 * instead of zp_nopoison_cookie.
5219 */
5220
5221 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
5222
5223 /*
5224 * Insert this element at the head of the free list. We also xor the
5225 * primary pointer with the zp_nopoison_cookie to make sure a free
5226 * element does not provide the location of the next free element directly.
5227 */
5228 *primary = old_head ^ zp_nopoison_cookie;
5229
5230 #if VM_MAX_TAG_ZONES
5231 if (__improbable(zone->tags)) {
5232 vm_tag_t tag = (ZTAG(zone, element)[0] >> 1);
5233 // set the tag with b0 clear so the block remains inuse
5234 ZTAG(zone, element)[0] = 0xFFFE;
5235 vm_tag_update_zone_size(tag, zone->tag_zone_index,
5236 -((int64_t)elem_size), 0);
5237 }
5238 #endif /* VM_MAX_TAG_ZONES */
5239
5240 zone_page_meta_set_freelist(page_meta, page, element);
5241 if (os_sub_overflow(page_meta->zm_alloc_count, 1,
5242 &page_meta->zm_alloc_count)) {
5243 zone_page_meta_accounting_panic(zone, page_meta,
5244 "alloc_count wrap-around");
5245 }
5246 zone->countfree++;
5247
5248 if (kind == ZONE_ADDR_FOREIGN) {
5249 if (old_head == 0) {
5250 /* first foreign element freed on page, move from all_used_foreign */
5251 zone_meta_requeue(zone, &zone->pages_any_free_foreign, page_meta, kind);
5252 }
5253 } else if (page_meta->zm_alloc_count == 0) {
5254 /* whether the page was on the intermediate or all_used queue, move it to free */
5255 zone_meta_requeue(zone, &zone->pages_all_free, page_meta, kind);
5256 zone->allfree_page_count += page_meta->zm_page_count;
5257 } else if (old_head == 0) {
5258 /* first free element on page, move from all_used */
5259 zone_meta_requeue(zone, &zone->pages_intermediate, page_meta, kind);
5260 }
5261
5262 #if KASAN_ZALLOC
5263 if (zone->percpu) {
5264 zpercpu_foreach_cpu(i) {
5265 kasan_poison_range(element + ptoa(i), elem_size,
5266 ASAN_HEAP_FREED);
5267 }
5268 } else {
5269 kasan_poison_range(element, elem_size, ASAN_HEAP_FREED);
5270 }
5271 #endif
5272 }
5273
5274 void
5275 zfree_ext(zone_t zone, zone_stats_t zstats, void *addr)
5276 {
5277 vm_offset_t elem = (vm_offset_t)addr;
5278 vm_size_t elem_size = zone_elem_size(zone);
5279 bool poison = false;
5280
5281 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
5282 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, elem_size, elem);
5283
5284 #if KASAN_ZALLOC
5285 if (kasan_quarantine_freed_element(&zone, &addr)) {
5286 return;
5287 }
5288 /*
5289 * kasan_quarantine_freed_element() might return a different
5290 * {zone, addr} than the one being freed for kalloc heaps.
5291 *
5292 * Make sure we reload everything.
5293 */
5294 elem = (vm_offset_t)addr;
5295 elem_size = zone_elem_size(zone);
5296 #endif
5297
5298 #if CONFIG_ZLEAKS
5299 /*
5300 * Zone leak detection: un-track the allocation
5301 */
5302 if (__improbable(zone->zleak_on)) {
5303 zleak_free(elem, elem_size);
5304 }
5305 #endif /* CONFIG_ZLEAKS */
5306
5307 #if CONFIG_ZCACHE
5308 /*
5309 * Note: if zone caching is on, gzalloc and tags aren't used
5310 * so we can always check this first
5311 */
5312 if (zone_caching_enabled(zone)) {
5313 return zcache_free_to_cpu_cache(zone, zstats, (vm_offset_t)addr);
5314 }
5315 #endif /* CONFIG_ZCACHE */
5316
5317 #if CONFIG_GZALLOC
5318 if (__improbable(zone->gzalloc_tracked)) {
5319 return gzalloc_free(zone, zstats, addr);
5320 }
5321 #endif /* CONFIG_GZALLOC */
5322
5323 #if ZONE_ENABLE_LOGGING
5324 if (__improbable(DO_LOGGING(zone))) {
5325 zfree_log_trace(zone, elem);
5326 }
5327 #endif /* ZONE_ENABLE_LOGGING */
5328
5329 if (zone->zfree_clear_mem) {
5330 poison = zfree_clear(zone, elem, elem_size);
5331 }
5332
5333 lock_zone(zone);
5334 assert(zone->z_self == zone);
5335
5336 if (!poison) {
5337 poison = zfree_poison_element(zone, &zone->zp_count, elem);
5338 }
5339
5340 if (__probable(zstats != NULL)) {
5341 /*
5342 * The few vm zones used before zone_init() runs do not have
5343 * per-cpu stats yet
5344 */
5345 zpercpu_get(zstats)->zs_mem_freed += elem_size;
5346 }
5347
5348 zfree_direct_locked(zone, elem, poison);
5349
5350 unlock_zone(zone);
5351 }
5352
5353 void
5354 (zfree)(union zone_or_view zov, void *addr)
5355 {
5356 zone_t zone = zov.zov_view->zv_zone;
5357 zone_stats_t zstats = zov.zov_view->zv_stats;
5358 assert(!zone->percpu);
5359 zfree_ext(zone, zstats, addr);
5360 }
5361
5362 void
5363 zfree_percpu(union zone_or_view zov, void *addr)
5364 {
5365 zone_t zone = zov.zov_view->zv_zone;
5366 zone_stats_t zstats = zov.zov_view->zv_stats;
5367 assert(zone->percpu);
5368 zfree_ext(zone, zstats, (void *)__zpcpu_demangle(addr));
5369 }
5370
5371 #pragma mark vm integration, MIG routines
5372
5373 /*
5374 * Drops (i.e. frees) the pages on a zone's all-free page queue.
5375 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
5376 */
5377 static void
5378 zone_drop_free_elements(zone_t z)
5379 {
5380 const zone_addr_kind_t kind = ZONE_ADDR_NATIVE;
5381 unsigned int total_freed_pages = 0;
5382 struct zone_page_metadata *page_meta, *seq_meta;
5383 vm_address_t page_addr;
5384 vm_size_t size_to_free;
5385 vm_size_t free_count;
5386 uint32_t page_count;
5387
5388 current_thread()->options |= TH_OPT_ZONE_PRIV;
5389 lock_zone(z);
5390
5391 while (!zone_pva_is_null(z->pages_all_free)) {
5392 /*
5393 * If any replenishment threads are running, defer to them,
5394 * so that we don't deplete reserved zones.
5395 *
5396 * The timing of the check isn't super important, as there are
5397 * enough reserves to allow freeing an extra page_meta.
5398 *
5399 * Hence, we can check without grabbing the lock every time
5400 * through the loop. We do need the lock however to avoid
5401 * missing a wakeup when we decide to block.
5402 */
5403 if (zone_replenish_active > 0) {
5404 lck_spin_lock(&zone_replenish_lock);
5405 if (zone_replenish_active > 0) {
5406 assert_wait(&zone_replenish_active, THREAD_UNINT);
5407 lck_spin_unlock(&zone_replenish_lock);
5408 unlock_zone(z);
5409 thread_block(THREAD_CONTINUE_NULL);
5410 lock_zone(z);
5411 continue;
5412 }
5413 lck_spin_unlock(&zone_replenish_lock);
5414 }
5415
5416 page_meta = zone_pva_to_meta(z->pages_all_free, kind);
5417 page_count = page_meta->zm_page_count;
5418 free_count = zone_elem_count(z, ptoa(page_count), kind);
5419
5420 /*
5421 * Don't drain zones with async refill to below the refill
5422 * threshold, as they need some reserve to function properly.
5423 */
5424 if (!z->destroyed && z->prio_refill_count &&
5425 (vm_size_t)(z->countfree - free_count) < z->prio_refill_count) {
5426 break;
5427 }
5428
5429 zone_meta_queue_pop(z, &z->pages_all_free, kind, &page_addr);
5430
5431 if (os_sub_overflow(z->countfree, free_count, &z->countfree)) {
5432 zone_accounting_panic(z, "countfree wrap-around");
5433 }
5434 if (os_sub_overflow(z->countavail, free_count, &z->countavail)) {
5435 zone_accounting_panic(z, "countavail wrap-around");
5436 }
5437 if (os_sub_overflow(z->allfree_page_count, page_count,
5438 &z->allfree_page_count)) {
5439 zone_accounting_panic(z, "allfree_page_count wrap-around");
5440 }
5441 if (os_sub_overflow(z->page_count, page_count, &z->page_count)) {
5442 zone_accounting_panic(z, "page_count wrap-around");
5443 }
5444
5445 os_atomic_sub(&zones_phys_page_count, page_count, relaxed);
5446 os_atomic_sub(&zones_phys_page_mapped_count, page_count, relaxed);
5447
5448 bzero(page_meta, sizeof(*page_meta) * page_count);
5449 seq_meta = page_meta;
5450 page_meta = NULL; /* page_meta fields are zeroed, prevent reuse */
5451
5452 unlock_zone(z);
5453
5454 /* Free the pages this metadata describes and account for them */
5455 total_freed_pages += page_count;
5456 size_to_free = ptoa(page_count);
5457 #if KASAN_ZALLOC
5458 kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
5459 #endif
5460 #if VM_MAX_TAG_ZONES
5461 if (z->tags) {
5462 ztMemoryRemove(z, page_addr, size_to_free);
5463 }
5464 #endif /* VM_MAX_TAG_ZONES */
5465
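/*
 * If the zone sequesters VA and this run of pages is exactly one
 * allocation chunk, only the physical pages are released and the now
 * empty virtual range is kept, to be re-queued on pages_sequester once
 * the zone lock is retaken. Otherwise the whole mapping is freed and
 * seq_meta is cleared so no metadata gets re-queued.
 */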
5466 if (z->va_sequester && z->alloc_pages == page_count) {
5467 kernel_memory_depopulate(submap_for_zone(z), page_addr,
5468 size_to_free, KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
5469 } else {
5470 kmem_free(submap_for_zone(z), page_addr, size_to_free);
5471 seq_meta = NULL;
5472 }
5473 thread_yield_to_preemption();
5474
5475 lock_zone(z);
5476
5477 if (seq_meta) {
5478 zone_meta_queue_push(z, &z->pages_sequester, seq_meta, kind);
5479 z->sequester_page_count += page_count;
5480 }
5481 }
5482 if (z->destroyed) {
5483 assert(zone_pva_is_null(z->pages_all_free));
5484 assert(z->allfree_page_count == 0);
5485 }
5486 unlock_zone(z);
5487 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
5488
5489 #if DEBUG || DEVELOPMENT
5490 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5491 kprintf("zone_gc() of zone %s%s freed %lu elements, %d pages\n",
5492 zone_heap_name(z), z->z_name,
5493 (unsigned long)(ptoa(total_freed_pages) / z->pcpu_elem_size),
5494 total_freed_pages);
5495 }
5496 #endif /* DEBUG || DEVELOPMENT */
5497 }
5498
5499 /*
5500 * Zone garbage collection
5501 *
5502 * zone_gc() walks the free elements of every zone marked collectable,
5503 * looking for reclaimable pages. It is called by consider_zone_gc()
5504 * when the system begins to run out of memory.
5505 *
5506 * We should ensure that zone_gc() never blocks.
5507 */
5508 void
5509 zone_gc(boolean_t consider_jetsams)
5510 {
5511 if (consider_jetsams) {
5512 kill_process_in_largest_zone();
5513 /*
5514 * If we do end up jetsamming something, we need to do a zone_gc so that
5515 * we can reclaim free zone elements and update the zone map size.
5516 * Fall through.
5517 */
5518 }
5519
5520 lck_mtx_lock(&zone_gc_lock);
5521
5522 #if DEBUG || DEVELOPMENT
5523 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
5524 kprintf("zone_gc() starting...\n");
5525 }
5526 #endif /* DEBUG || DEVELOPMENT */
5527
5528 zone_index_foreach(i) {
5529 zone_t z = &zone_array[i];
5530
5531 if (!z->collectable) {
5532 continue;
5533 }
5534 #if CONFIG_ZCACHE
5535 if (zone_caching_enabled(z)) {
5536 zcache_drain_depot(z);
5537 }
5538 #endif /* CONFIG_ZCACHE */
5539 if (zone_pva_is_null(z->pages_all_free)) {
5540 continue;
5541 }
5542
5543 zone_drop_free_elements(z);
5544 }
5545
5546 lck_mtx_unlock(&zone_gc_lock);
5547 }
5548
5549 /*
5550 * consider_zone_gc:
5551 *
5552 * Called by the pageout daemon when the system needs more free pages.
5553 */
5554
5555 void
5556 consider_zone_gc(boolean_t consider_jetsams)
5557 {
5558 /*
5559 * One-time reclaim of kernel_map resources we allocated in
5560 * early boot.
5561 *
5562 * Use atomic exchange in case multiple threads race into here.
5563 */
5564 vm_offset_t deallocate_kaddr;
5565 if (kmapoff_kaddr != 0 &&
5566 (deallocate_kaddr = os_atomic_xchg(&kmapoff_kaddr, 0, relaxed)) != 0) {
5567 vm_deallocate(kernel_map, deallocate_kaddr, ptoa_64(kmapoff_pgcnt));
5568 }
5569
5570 zone_gc(consider_jetsams);
5571 }
5572
5573 /*
5574 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
5575 * requesting zone information.
5576 * Frees unused pages towards the end of the region, and zeroes out unused
5577 * space on the last page.
5578 */
5579 static vm_map_copy_t
5580 create_vm_map_copy(
5581 vm_offset_t start_addr,
5582 vm_size_t total_size,
5583 vm_size_t used_size)
5584 {
5585 kern_return_t kr;
5586 vm_offset_t end_addr;
5587 vm_size_t free_size;
5588 vm_map_copy_t copy;
5589
5590 if (used_size != total_size) {
5591 end_addr = start_addr + used_size;
5592 free_size = total_size - (round_page(end_addr) - start_addr);
5593
5594 if (free_size >= PAGE_SIZE) {
5595 kmem_free(ipc_kernel_map,
5596 round_page(end_addr), free_size);
5597 }
5598 bzero((char *) end_addr, round_page(end_addr) - end_addr);
5599 }
5600
5601 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
5602 (vm_map_size_t)used_size, TRUE, &copy);
5603 assert(kr == KERN_SUCCESS);
5604
5605 return copy;
5606 }
5607
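/*
 * get_zone_info: snapshot a zone under its lock and fill in the MIG
 * name and/or info structure for it. Returns FALSE if the zone is not
 * (or is no longer) initialized.
 */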
5608 static boolean_t
5609 get_zone_info(
5610 zone_t z,
5611 mach_zone_name_t *zn,
5612 mach_zone_info_t *zi)
5613 {
5614 struct zone zcopy;
5615
5616 assert(z != ZONE_NULL);
5617 lock_zone(z);
5618 if (!z->z_self) {
5619 unlock_zone(z);
5620 return FALSE;
5621 }
5622 zcopy = *z;
5623 unlock_zone(z);
5624
5625 if (zn != NULL) {
5626 /*
5627 * Append kalloc heap name to zone name (if zone is used by kalloc)
5628 */
5629 char temp_zone_name[MAX_ZONE_NAME] = "";
5630 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5631 zone_heap_name(z), z->z_name);
5632
5633 /* assuming here the name data is static */
5634 (void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
5635 strlen(temp_zone_name) + 1);
5636 }
5637
5638 if (zi != NULL) {
5639 *zi = (mach_zone_info_t) {
5640 .mzi_count = zone_count_allocated(&zcopy),
5641 .mzi_cur_size = ptoa_64(zcopy.page_count),
5642 // max_size for zprint is now high-watermark of pages used
5643 .mzi_max_size = ptoa_64(zcopy.page_count_hwm),
5644 .mzi_elem_size = zcopy.pcpu_elem_size,
5645 .mzi_alloc_size = ptoa_64(zcopy.alloc_pages),
5646 .mzi_exhaustible = (uint64_t)zcopy.exhaustible,
5647 };
5648 zpercpu_foreach(zs, zcopy.z_stats) {
5649 zi->mzi_sum_size += zs->zs_mem_allocated;
5650 }
5651 if (zcopy.collectable) {
5652 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
5653 ptoa_64(zcopy.allfree_page_count));
5654 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
5655 }
5656 }
5657
5658 return TRUE;
5659 }
5660
5661 kern_return_t
5662 task_zone_info(
5663 __unused task_t task,
5664 __unused mach_zone_name_array_t *namesp,
5665 __unused mach_msg_type_number_t *namesCntp,
5666 __unused task_zone_info_array_t *infop,
5667 __unused mach_msg_type_number_t *infoCntp)
5668 {
5669 return KERN_FAILURE;
5670 }
5671
5672 kern_return_t
5673 mach_zone_info(
5674 host_priv_t host,
5675 mach_zone_name_array_t *namesp,
5676 mach_msg_type_number_t *namesCntp,
5677 mach_zone_info_array_t *infop,
5678 mach_msg_type_number_t *infoCntp)
5679 {
5680 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
5681 }
5682
5683
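/*
 * mach_memory_info: copy out one mach_zone_name_t / mach_zone_info_t pair
 * per initialized zone and, when the caller asks for it, a vm_page_diagnose()
 * report, all returned as out-of-line vm_map_copy_t descriptors built from
 * pageable ipc_kernel_map buffers.
 */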
5684 kern_return_t
5685 mach_memory_info(
5686 host_priv_t host,
5687 mach_zone_name_array_t *namesp,
5688 mach_msg_type_number_t *namesCntp,
5689 mach_zone_info_array_t *infop,
5690 mach_msg_type_number_t *infoCntp,
5691 mach_memory_info_array_t *memoryInfop,
5692 mach_msg_type_number_t *memoryInfoCntp)
5693 {
5694 mach_zone_name_t *names;
5695 vm_offset_t names_addr;
5696 vm_size_t names_size;
5697
5698 mach_zone_info_t *info;
5699 vm_offset_t info_addr;
5700 vm_size_t info_size;
5701
5702 mach_memory_info_t *memory_info;
5703 vm_offset_t memory_info_addr;
5704 vm_size_t memory_info_size;
5705 vm_size_t memory_info_vmsize;
5706 unsigned int num_info;
5707
5708 unsigned int max_zones, used_zones, i;
5709 mach_zone_name_t *zn;
5710 mach_zone_info_t *zi;
5711 kern_return_t kr;
5712
5713 uint64_t zones_collectable_bytes = 0;
5714
5715 if (host == HOST_NULL) {
5716 return KERN_INVALID_HOST;
5717 }
5718 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5719 if (!PE_i_can_has_debugger(NULL)) {
5720 return KERN_INVALID_HOST;
5721 }
5722 #endif
5723
5724 /*
5725 * We assume that zones aren't freed once allocated.
5726 * We won't pick up any zones that are allocated later.
5727 */
5728
5729 max_zones = os_atomic_load(&num_zones, relaxed);
5730
5731 names_size = round_page(max_zones * sizeof *names);
5732 kr = kmem_alloc_pageable(ipc_kernel_map,
5733 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5734 if (kr != KERN_SUCCESS) {
5735 return kr;
5736 }
5737 names = (mach_zone_name_t *) names_addr;
5738
5739 info_size = round_page(max_zones * sizeof *info);
5740 kr = kmem_alloc_pageable(ipc_kernel_map,
5741 &info_addr, info_size, VM_KERN_MEMORY_IPC);
5742 if (kr != KERN_SUCCESS) {
5743 kmem_free(ipc_kernel_map,
5744 names_addr, names_size);
5745 return kr;
5746 }
5747 info = (mach_zone_info_t *) info_addr;
5748
5749 zn = &names[0];
5750 zi = &info[0];
5751
5752 used_zones = max_zones;
5753 for (i = 0; i < max_zones; i++) {
5754 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
5755 used_zones--;
5756 continue;
5757 }
5758 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
5759 zn++;
5760 zi++;
5761 }
5762
5763 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
5764 *namesCntp = used_zones;
5765
5766 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
5767 *infoCntp = used_zones;
5768
5769 num_info = 0;
5770 memory_info_addr = 0;
5771
5772 if (memoryInfop && memoryInfoCntp) {
5773 vm_map_copy_t copy;
5774 num_info = vm_page_diagnose_estimate();
5775 memory_info_size = num_info * sizeof(*memory_info);
5776 memory_info_vmsize = round_page(memory_info_size);
5777 kr = kmem_alloc_pageable(ipc_kernel_map,
5778 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
5779 if (kr != KERN_SUCCESS) {
5780 return kr;
5781 }
5782
5783 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
5784 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
5785 assert(kr == KERN_SUCCESS);
5786
5787 memory_info = (mach_memory_info_t *) memory_info_addr;
5788 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
5789
5790 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
5791 assert(kr == KERN_SUCCESS);
5792
5793 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
5794 (vm_map_size_t)memory_info_size, TRUE, &copy);
5795 assert(kr == KERN_SUCCESS);
5796
5797 *memoryInfop = (mach_memory_info_t *) copy;
5798 *memoryInfoCntp = num_info;
5799 }
5800
5801 return KERN_SUCCESS;
5802 }
5803
5804 kern_return_t
5805 mach_zone_info_for_zone(
5806 host_priv_t host,
5807 mach_zone_name_t name,
5808 mach_zone_info_t *infop)
5809 {
5810 zone_t zone_ptr;
5811
5812 if (host == HOST_NULL) {
5813 return KERN_INVALID_HOST;
5814 }
5815 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5816 if (!PE_i_can_has_debugger(NULL)) {
5817 return KERN_INVALID_HOST;
5818 }
5819 #endif
5820
5821 if (infop == NULL) {
5822 return KERN_INVALID_ARGUMENT;
5823 }
5824
5825 zone_ptr = ZONE_NULL;
5826 zone_index_foreach(i) {
5827 zone_t z = &(zone_array[i]);
5828 assert(z != ZONE_NULL);
5829
5830 /*
5831 * Append kalloc heap name to zone name (if zone is used by kalloc)
5832 */
5833 char temp_zone_name[MAX_ZONE_NAME] = "";
5834 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5835 zone_heap_name(z), z->z_name);
5836
5837 /* Find the requested zone by name */
5838 if (track_this_zone(temp_zone_name, name.mzn_name)) {
5839 zone_ptr = z;
5840 break;
5841 }
5842 }
5843
5844 /* No zones found with the requested zone name */
5845 if (zone_ptr == ZONE_NULL) {
5846 return KERN_INVALID_ARGUMENT;
5847 }
5848
5849 if (get_zone_info(zone_ptr, NULL, infop)) {
5850 return KERN_SUCCESS;
5851 }
5852 return KERN_FAILURE;
5853 }
5854
5855 kern_return_t
5856 mach_zone_info_for_largest_zone(
5857 host_priv_t host,
5858 mach_zone_name_t *namep,
5859 mach_zone_info_t *infop)
5860 {
5861 if (host == HOST_NULL) {
5862 return KERN_INVALID_HOST;
5863 }
5864 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
5865 if (!PE_i_can_has_debugger(NULL)) {
5866 return KERN_INVALID_HOST;
5867 }
5868 #endif
5869
5870 if (namep == NULL || infop == NULL) {
5871 return KERN_INVALID_ARGUMENT;
5872 }
5873
5874 if (get_zone_info(zone_find_largest(), namep, infop)) {
5875 return KERN_SUCCESS;
5876 }
5877 return KERN_FAILURE;
5878 }
5879
5880 uint64_t
5881 get_zones_collectable_bytes(void)
5882 {
5883 uint64_t zones_collectable_bytes = 0;
5884 mach_zone_info_t zi;
5885
5886 zone_index_foreach(i) {
5887 if (get_zone_info(&zone_array[i], NULL, &zi)) {
5888 zones_collectable_bytes +=
5889 GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
5890 }
5891 }
5892
5893 return zones_collectable_bytes;
5894 }
5895
5896 kern_return_t
5897 mach_zone_get_zlog_zones(
5898 host_priv_t host,
5899 mach_zone_name_array_t *namesp,
5900 mach_msg_type_number_t *namesCntp)
5901 {
5902 #if ZONE_ENABLE_LOGGING
5903 unsigned int max_zones, logged_zones, i;
5904 kern_return_t kr;
5905 zone_t zone_ptr;
5906 mach_zone_name_t *names;
5907 vm_offset_t names_addr;
5908 vm_size_t names_size;
5909
5910 if (host == HOST_NULL) {
5911 return KERN_INVALID_HOST;
5912 }
5913
5914 if (namesp == NULL || namesCntp == NULL) {
5915 return KERN_INVALID_ARGUMENT;
5916 }
5917
5918 max_zones = os_atomic_load(&num_zones, relaxed);
5919
5920 names_size = round_page(max_zones * sizeof *names);
5921 kr = kmem_alloc_pageable(ipc_kernel_map,
5922 &names_addr, names_size, VM_KERN_MEMORY_IPC);
5923 if (kr != KERN_SUCCESS) {
5924 return kr;
5925 }
5926 names = (mach_zone_name_t *) names_addr;
5927
5928 zone_ptr = ZONE_NULL;
5929 logged_zones = 0;
5930 for (i = 0; i < max_zones; i++) {
5931 zone_t z = &(zone_array[i]);
5932 assert(z != ZONE_NULL);
5933
5934 /* Copy out the zone name if zone logging is enabled */
5935 if (z->zlog_btlog) {
5936 get_zone_info(z, &names[logged_zones], NULL);
5937 logged_zones++;
5938 }
5939 }
5940
5941 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
5942 *namesCntp = logged_zones;
5943
5944 return KERN_SUCCESS;
5945
5946 #else /* ZONE_ENABLE_LOGGING */
5947 #pragma unused(host, namesp, namesCntp)
5948 return KERN_FAILURE;
5949 #endif /* ZONE_ENABLE_LOGGING */
5950 }
5951
5952 kern_return_t
5953 mach_zone_get_btlog_records(
5954 host_priv_t host,
5955 mach_zone_name_t name,
5956 zone_btrecord_array_t *recsp,
5957 mach_msg_type_number_t *recsCntp)
5958 {
5959 #if DEBUG || DEVELOPMENT
5960 unsigned int numrecs = 0;
5961 zone_btrecord_t *recs;
5962 kern_return_t kr;
5963 zone_t zone_ptr;
5964 vm_offset_t recs_addr;
5965 vm_size_t recs_size;
5966
5967 if (host == HOST_NULL) {
5968 return KERN_INVALID_HOST;
5969 }
5970
5971 if (recsp == NULL || recsCntp == NULL) {
5972 return KERN_INVALID_ARGUMENT;
5973 }
5974
5975 zone_ptr = ZONE_NULL;
5976 zone_index_foreach(i) {
5977 zone_t z = &zone_array[i];
5978
5979 /*
5980 * Append kalloc heap name to zone name (if zone is used by kalloc)
5981 */
5982 char temp_zone_name[MAX_ZONE_NAME] = "";
5983 snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
5984 zone_heap_name(z), z->z_name);
5985
5986 /* Find the requested zone by name */
5987 if (track_this_zone(temp_zone_name, name.mzn_name)) {
5988 zone_ptr = z;
5989 break;
5990 }
5991 }
5992
5993 /* No zones found with the requested zone name */
5994 if (zone_ptr == ZONE_NULL) {
5995 return KERN_INVALID_ARGUMENT;
5996 }
5997
5998 /* Logging not turned on for the requested zone */
5999 if (!DO_LOGGING(zone_ptr)) {
6000 return KERN_FAILURE;
6001 }
6002
6003 /* Allocate memory for btlog records */
6004 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
6005 recs_size = round_page(numrecs * sizeof *recs);
6006
6007 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
6008 if (kr != KERN_SUCCESS) {
6009 return kr;
6010 }
6011
6012 /*
6013 * We will call get_btlog_records() below which populates this region while holding a spinlock
6014 * (the btlog lock). So these pages need to be wired.
6015 */
6016 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
6017 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
6018 assert(kr == KERN_SUCCESS);
6019
6020 recs = (zone_btrecord_t *)recs_addr;
6021 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
6022
6023 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
6024 assert(kr == KERN_SUCCESS);
6025
6026 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
6027 *recsCntp = numrecs;
6028
6029 return KERN_SUCCESS;
6030
6031 #else /* DEBUG || DEVELOPMENT */
6032 #pragma unused(host, name, recsp, recsCntp)
6033 return KERN_FAILURE;
6034 #endif /* DEBUG || DEVELOPMENT */
6035 }
6036
6037
6038 #if DEBUG || DEVELOPMENT
6039
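/*
 * mach_memory_info_check: sanity-check that the wired memory attributed by
 * vm_page_diagnose(), together with the zone totals, accounts for the
 * top-level wired page count; the shortfall is printed.
 */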
6040 kern_return_t
6041 mach_memory_info_check(void)
6042 {
6043 mach_memory_info_t * memory_info;
6044 mach_memory_info_t * info;
6045 unsigned int num_info;
6046 vm_offset_t memory_info_addr;
6047 kern_return_t kr;
6048 size_t memory_info_size, memory_info_vmsize;
6049 uint64_t top_wired, zonestotal, total;
6050
6051 num_info = vm_page_diagnose_estimate();
6052 memory_info_size = num_info * sizeof(*memory_info);
6053 memory_info_vmsize = round_page(memory_info_size);
6054 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
6055 assert(kr == KERN_SUCCESS);
6056
6057 memory_info = (mach_memory_info_t *) memory_info_addr;
6058 vm_page_diagnose(memory_info, num_info, 0);
6059
6060 top_wired = total = zonestotal = 0;
6061 zone_index_foreach(idx) {
6062 zonestotal += zone_size_wired(&zone_array[idx]);
6063 }
6064
6065 for (uint32_t idx = 0; idx < num_info; idx++) {
6066 info = &memory_info[idx];
6067 if (!info->size) {
6068 continue;
6069 }
6070 if (VM_KERN_COUNT_WIRED == info->site) {
6071 top_wired = info->size;
6072 }
6073 if (VM_KERN_SITE_HIDE & info->flags) {
6074 continue;
6075 }
6076 if (!(VM_KERN_SITE_WIRED & info->flags)) {
6077 continue;
6078 }
6079 total += info->size;
6080 }
6081 total += zonestotal;
6082
6083 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n",
6084 total, top_wired, zonestotal, top_wired - total);
6085
6086 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
6087
6088 return kr;
6089 }
6090
6091 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
6092
6093 #endif /* DEBUG || DEVELOPMENT */
6094
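/*
 * mach_zone_force_gc: MIG entry point that triggers a zone GC pass on
 * DEBUG/DEVELOPMENT kernels (after giving the buffer cache a chance to
 * release its elements); it is a no-op on RELEASE kernels.
 */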
6095 kern_return_t
6096 mach_zone_force_gc(
6097 host_t host)
6098 {
6099 if (host == HOST_NULL) {
6100 return KERN_INVALID_HOST;
6101 }
6102
6103 #if DEBUG || DEVELOPMENT
6104 /* Callout to buffer cache GC to drop elements in the apfs zones */
6105 if (consider_buffer_cache_collect != NULL) {
6106 (void)(*consider_buffer_cache_collect)(0);
6107 }
6108 consider_zone_gc(FALSE);
6109 #endif /* DEBUG || DEVELOPMENT */
6110 return KERN_SUCCESS;
6111 }
6112
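/*
 * zone_find_largest: return the zone with the largest wired footprint,
 * as measured by zone_size_wired().
 */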
6113 zone_t
6114 zone_find_largest(void)
6115 {
6116 uint32_t largest_idx = 0;
6117 vm_offset_t largest_size = zone_size_wired(&zone_array[0]);
6118
6119 zone_index_foreach(i) {
6120 vm_offset_t size = zone_size_wired(&zone_array[i]);
6121 if (size > largest_size) {
6122 largest_idx = i;
6123 largest_size = size;
6124 }
6125 }
6126
6127 return &zone_array[largest_idx];
6128 }
6129
6130 #pragma mark - tests
6131 #if DEBUG || DEVELOPMENT
6132
6133 /*
6134 * Used for the sysctl kern.run_zone_test, which is not thread-safe. Ensure only
6135 * one thread goes through at a time; otherwise we can end up with multiple test
6136 * zones (if a second zinit() comes through before zdestroy()), which could lead
6137 * us to run out of zones.
6138 */
6139 SIMPLE_LOCK_DECLARE(zone_test_lock, 0);
6140 static boolean_t zone_test_running = FALSE;
6141 static zone_t test_zone_ptr = NULL;
6142
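/*
 * zone_copy_allocations: walk one page queue of a zone, mark every element
 * found on each chunk's freelist in the scratch bitmap, then copy out the
 * addresses of the remaining (allocated) elements. Returns the new end of
 * the output array.
 */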
6143 static uintptr_t *
6144 zone_copy_allocations(zone_t z, uintptr_t *elems, bitmap_t *bits,
6145 zone_pva_t page_index, zone_addr_kind_t kind)
6146 {
6147 vm_offset_t free, first, end, page;
6148 struct zone_page_metadata *meta;
6149
6150 while (!zone_pva_is_null(page_index)) {
6151 page = zone_pva_to_addr(page_index);
6152 meta = zone_pva_to_meta(page_index, kind);
6153 end = page + ptoa(meta->zm_percpu ? 1 : meta->zm_page_count);
6154 first = page + ZONE_PAGE_FIRST_OFFSET(kind);
6155
6156 bitmap_clear(bits, (uint32_t)((end - first) / zone_elem_size(z)));
6157
6158 // construct bitmap of all freed elements
6159 free = zone_page_meta_get_freelist(z, meta, page);
6160 while (free) {
6161 bitmap_set(bits, (uint32_t)((free - first) / zone_elem_size(z)));
6162
6163 // next free element
6164 free = *(vm_offset_t *)free ^ zp_nopoison_cookie;
6165 }
6166
6167 for (unsigned i = 0; first < end; i++, first += zone_elem_size(z)) {
6168 if (!bitmap_test(bits, i)) {
6169 *elems++ = INSTANCE_PUT(first);
6170 }
6171 }
6172
6173 page_index = meta->zm_page_next;
6174 }
6175 return elems;
6176 }
6177
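/*
 * zone_leaks: snapshot every allocated element of the named zone, let
 * zone_leaks_scan() flag the ones that are still referenced, and report the
 * rest through `proc', using btlog backtraces when zone logging is enabled
 * and whatever backtrace bytes remain inside the element otherwise.
 */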
6178 kern_return_t
6179 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
6180 {
6181 uintptr_t zbt[MAX_ZTRACE_DEPTH];
6182 zone_t zone = NULL;
6183 uintptr_t * array;
6184 uintptr_t * next;
6185 uintptr_t element, bt;
6186 uint32_t idx, count, found;
6187 uint32_t btidx, btcount, nobtcount, btfound;
6188 uint32_t elemSize;
6189 uint64_t maxElems;
6190 kern_return_t kr;
6191 bitmap_t *bits;
6192
6193 zone_index_foreach(i) {
6194 if (!strncmp(zoneName, zone_array[i].z_name, nameLen)) {
6195 zone = &zone_array[i];
6196 break;
6197 }
6198 }
6199 if (zone == NULL) {
6200 return KERN_INVALID_NAME;
6201 }
6202
6203 elemSize = zone_elem_size(zone);
6204 maxElems = (zone->countavail + 1) & ~1ul;
6205
6206 if ((ptoa(zone->percpu ? 1 : zone->alloc_pages) % elemSize) &&
6207 !zone_leaks_scan_enable) {
6208 return KERN_INVALID_CAPABILITY;
6209 }
6210
6211 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
6212 maxElems * sizeof(uintptr_t) + BITMAP_LEN(ZONE_CHUNK_MAXELEMENTS),
6213 VM_KERN_MEMORY_DIAG);
6214 if (KERN_SUCCESS != kr) {
6215 return kr;
6216 }
6217
6218 /* maxElems is a multiple of 2, so the bitmap that follows is always aligned */
6219 bits = CAST_DOWN_EXPLICIT(bitmap_t *, array + maxElems);
6220
6221 lock_zone(zone);
6222
6223 next = array;
6224 next = zone_copy_allocations(zone, next, bits,
6225 zone->pages_any_free_foreign, ZONE_ADDR_FOREIGN);
6226 next = zone_copy_allocations(zone, next, bits,
6227 zone->pages_all_used_foreign, ZONE_ADDR_FOREIGN);
6228 next = zone_copy_allocations(zone, next, bits,
6229 zone->pages_intermediate, ZONE_ADDR_NATIVE);
6230 next = zone_copy_allocations(zone, next, bits,
6231 zone->pages_all_used, ZONE_ADDR_NATIVE);
6232 count = (uint32_t)(next - array);
6233
6234 unlock_zone(zone);
6235
6236 zone_leaks_scan(array, count, zone_elem_size(zone), &found);
6237 assert(found <= count);
6238
6239 for (idx = 0; idx < count; idx++) {
6240 element = array[idx];
6241 if (kInstanceFlagReferenced & element) {
6242 continue;
6243 }
6244 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6245 }
6246
6247 #if ZONE_ENABLE_LOGGING
6248 if (zone->zlog_btlog && !corruption_debug_flag) {
6249 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
6250 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
6251 }
6252 #endif /* ZONE_ENABLE_LOGGING */
6253
6254 for (nobtcount = idx = 0; idx < count; idx++) {
6255 element = array[idx];
6256 if (!element) {
6257 continue;
6258 }
6259 if (kInstanceFlagReferenced & element) {
6260 continue;
6261 }
6262 element = INSTANCE_PUT(element) & ~kInstanceFlags;
6263
6264 // see if we can find any backtrace left in the element
6265 btcount = (typeof(btcount))(zone_elem_size(zone) / sizeof(uintptr_t));
6266 if (btcount >= MAX_ZTRACE_DEPTH) {
6267 btcount = MAX_ZTRACE_DEPTH - 1;
6268 }
6269 for (btfound = btidx = 0; btidx < btcount; btidx++) {
6270 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
6271 if (!VM_KERNEL_IS_SLID(bt)) {
6272 break;
6273 }
6274 zbt[btfound++] = bt;
6275 }
6276 if (btfound) {
6277 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
6278 } else {
6279 nobtcount++;
6280 }
6281 }
6282 if (nobtcount) {
6283 // fake backtrace when we found nothing
6284 zbt[0] = (uintptr_t) &zalloc;
6285 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
6286 }
6287
6288 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
6289
6290 return KERN_SUCCESS;
6291 }
6292
6293 boolean_t
6294 run_zone_test(void)
6295 {
6296 unsigned int i = 0, max_iter = 5;
6297 void * test_ptr;
6298 zone_t test_zone;
6299
6300 simple_lock(&zone_test_lock, &zone_locks_grp);
6301 if (!zone_test_running) {
6302 zone_test_running = TRUE;
6303 } else {
6304 simple_unlock(&zone_test_lock);
6305 printf("run_zone_test: Test already running.\n");
6306 return FALSE;
6307 }
6308 simple_unlock(&zone_test_lock);
6309
6310 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
6311
6312 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
6313 do {
6314 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
6315 if (test_zone == NULL) {
6316 printf("run_zone_test: zinit() failed\n");
6317 return FALSE;
6318 }
6319
6320 #if KASAN_ZALLOC
6321 if (test_zone_ptr == NULL && test_zone->countfree != 0) {
6322 #else
6323 if (test_zone->countfree != 0) {
6324 #endif
6325 printf("run_zone_test: free count is not zero\n");
6326 return FALSE;
6327 }
6328
6329 if (test_zone_ptr == NULL) {
6330 /* Stash the zone pointer returned on the first zinit() */
6331 printf("run_zone_test: zone created for the first time\n");
6332 test_zone_ptr = test_zone;
6333 } else if (test_zone != test_zone_ptr) {
6334 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
6335 return FALSE;
6336 }
6337
6338 test_ptr = zalloc(test_zone);
6339 if (test_ptr == NULL) {
6340 printf("run_zone_test: zalloc() failed\n");
6341 return FALSE;
6342 }
6343 zfree(test_zone, test_ptr);
6344
6345 zdestroy(test_zone);
6346 i++;
6347
6348 printf("run_zone_test: Iteration %d successful\n", i);
6349 } while (i < max_iter);
6350
6351 /* test Z_VA_SEQUESTER */
6352 if (zsecurity_options & ZSECURITY_OPTIONS_SEQUESTER) {
6353 int idx, num_allocs = 8;
6354 vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
6355 void *allocs[num_allocs];
6356 vm_offset_t phys_pages = os_atomic_load(&zones_phys_page_count, relaxed);
6357 vm_size_t zone_map_size = zone_range_size(&zone_info.zi_map_range);
6358
6359 test_zone = zone_create("test_zone_sysctl", elem_size,
6360 ZC_DESTRUCTIBLE | ZC_SEQUESTER);
6361 if (test_zone == NULL) {
6362 printf("run_zone_test: zinit() failed\n");
6363 return FALSE;
6364 }
6365
6366 for (idx = 0; idx < num_allocs; idx++) {
6367 allocs[idx] = zalloc(test_zone);
6368 assert(NULL != allocs[idx]);
6369 printf("alloc[%d] %p\n", idx, allocs[idx]);
6370 }
6371 for (idx = 0; idx < num_allocs; idx++) {
6372 zfree(test_zone, allocs[idx]);
6373 }
6374 assert(!zone_pva_is_null(test_zone->pages_all_free));
6375
6376 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6377 vm_page_wire_count, vm_page_free_count,
6378 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6379 zone_gc(FALSE);
6380 printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %qd%%\n",
6381 vm_page_wire_count, vm_page_free_count,
6382 (100ULL * ptoa_64(phys_pages)) / zone_map_size);
6383 unsigned int allva = 0;
6384 zone_index_foreach(zidx) {
6385 zone_t z = &zone_array[zidx];
6386 lock_zone(z);
6387 allva += z->page_count;
6388 if (!z->sequester_page_count) {
6389 unlock_zone(z);
6390 continue;
6391 }
6392 unsigned count = 0;
6393 uint64_t size;
6394 zone_pva_t pg = z->pages_sequester;
6395 struct zone_page_metadata *page_meta;
6396 while (pg.packed_address) {
6397 page_meta = zone_pva_to_meta(pg, ZONE_ADDR_NATIVE);
6398 count += z->alloc_pages;
6399 pg = page_meta->zm_page_next;
6400 }
6401 assert(count == z->sequester_page_count);
6402 size = zone_size_wired(z);
6403 if (!size) {
6404 size = 1;
6405 }
6406 printf("%s%s: seq %d, res %d, %qd %%\n",
6407 zone_heap_name(z), z->z_name, z->sequester_page_count,
6408 z->page_count, zone_size_allocated(z) * 100ULL / size);
6409 unlock_zone(z);
6410 }
6411
6412 printf("total va: %d\n", allva);
6413
6414 assert(zone_pva_is_null(test_zone->pages_all_free));
6415 assert(!zone_pva_is_null(test_zone->pages_sequester));
6416 assert(2 == test_zone->sequester_page_count);
6417 for (idx = 0; idx < num_allocs; idx++) {
6418 assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
6419 }
6420 for (idx = 0; idx < num_allocs; idx++) {
6421 allocs[idx] = zalloc(test_zone);
6422 assert(allocs[idx]);
6423 printf("alloc[%d] %p\n", idx, allocs[idx]);
6424 }
6425 assert(zone_pva_is_null(test_zone->pages_sequester));
6426 assert(0 == test_zone->sequester_page_count);
6427 for (idx = 0; idx < num_allocs; idx++) {
6428 zfree(test_zone, allocs[idx]);
6429 }
6430 zdestroy(test_zone);
6431 } else {
6432 printf("run_zone_test: skipping sequester test (not enabled)\n");
6433 }
6434
6435 printf("run_zone_test: Test passed\n");
6436
6437 simple_lock(&zone_test_lock, &zone_locks_grp);
6438 zone_test_running = FALSE;
6439 simple_unlock(&zone_test_lock);
6440
6441 return TRUE;
6442 }
6443
6444 /*
6445 * Routines to test that zone garbage collection and zone replenish threads
6446 * running at the same time don't cause problems.
6447 */
6448
6449 void
6450 zone_gc_replenish_test(void)
6451 {
6452 zone_gc(FALSE);
6453 }
6454
6455
6456 void
6457 zone_alloc_replenish_test(void)
6458 {
6459 zone_t z = NULL;
6460 struct data { struct data *next; } *node, *list = NULL;
6461
6462 /*
6463 * Find a zone that has a replenish thread
6464 */
6465 zone_index_foreach(i) {
6466 zone_t candidate = &zone_array[i];
6467 if (candidate->prio_refill_count &&
6468 zone_elem_size(candidate) >= sizeof(struct data)) {
6469 z = candidate;
6470 break;
6471 }
6472 }
6473 if (z == NULL) {
6474 printf("Couldn't find a replenish zone\n");
6475 return;
6476 }
6477
6478 for (uint32_t i = 0; i < 2000; ++i) { /* something big enough to go past replenishment */
6479 node = zalloc(z);
6480 node->next = list;
6481 list = node;
6482 }
6483
6484 /*
6485 * release the memory we allocated
6486 */
6487 while (list != NULL) {
6488 node = list;
6489 list = list->next;
6490 zfree(z, node);
6491 }
6492 }
6493
6494 #endif /* DEBUG || DEVELOPMENT */