1 /*
2 * Copyright (c) 2000-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65 #include <zone_debug.h>
66
67 #include <mach/mach_types.h>
68 #include <mach/vm_param.h>
69 #include <mach/kern_return.h>
70 #include <mach/mach_host_server.h>
71 #include <mach/task_server.h>
72 #include <mach/machine/vm_types.h>
73 #include <mach/vm_map.h>
74 #include <mach/sdt.h>
75
76 #include <kern/bits.h>
77 #include <kern/kern_types.h>
78 #include <kern/assert.h>
79 #include <kern/backtrace.h>
80 #include <kern/host.h>
81 #include <kern/macro_help.h>
82 #include <kern/sched.h>
83 #include <kern/locks.h>
84 #include <kern/sched_prim.h>
85 #include <kern/misc_protos.h>
86 #include <kern/thread_call.h>
87 #include <kern/zalloc.h>
88 #include <kern/kalloc.h>
89
90 #include <prng/random.h>
91
92 #include <vm/pmap.h>
93 #include <vm/vm_map.h>
94 #include <vm/vm_kern.h>
95 #include <vm/vm_page.h>
96
97 #include <pexpert/pexpert.h>
98
99 #include <machine/machparam.h>
100 #include <machine/machine_routines.h> /* ml_cpu_get_info */
101
102 #include <libkern/OSDebug.h>
103 #include <libkern/OSAtomic.h>
104 #include <libkern/section_keywords.h>
105 #include <sys/kdebug.h>
106
107 #include <san/kasan.h>
108
109 /*
110 * The zone_locks_grp allows for collecting lock statistics.
111 * All locks are associated with this group in zinit.
112 * Look at tools/lockstat for debugging lock contention.
113 */
114
115 lck_grp_t zone_locks_grp;
116 lck_grp_attr_t zone_locks_grp_attr;
117
118 /*
119 * ZONE_ALIAS_ADDR (deprecated)
120 */
121
122 #define from_zone_map(addr, size) \
123 ((vm_offset_t)(addr) >= zone_map_min_address && \
124 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address )
125
126 /*
127 * Zone Corruption Debugging
128 *
129 * We use three techniques to detect modification of a zone element
130 * after it's been freed.
131 *
132 * (1) Check the freelist next pointer for sanity.
133 * (2) Store a backup of the next pointer at the end of the element,
134 * and compare it to the primary next pointer when the element is allocated
135 * to detect corruption of the freelist due to use-after-free bugs.
136 * The backup pointer is also XORed with a per-boot random cookie.
137 * (3) Poison the freed element by overwriting it with 0xdeadbeef,
138 * and check for that value when the element is being reused to make sure
139 * no part of the element has been modified while it was on the freelist.
140 * This will also help catch read-after-frees, as code will now dereference
141 * 0xdeadbeef instead of a valid but freed pointer.
142 *
143 * (1) and (2) occur for every allocation and free to a zone.
144 * This is done to make it slightly more difficult for an attacker to
145 * manipulate the freelist to behave in a specific way.
146 *
147 * Poisoning (3) occurs periodically for every N frees (counted per-zone)
148 * and on every free for zones smaller than a cacheline. If -zp
149 * is passed as a boot arg, poisoning occurs for every free.
150 *
151 * Performance slowdown is inversely proportional to the frequency of poisoning,
152 * with a 4-5% hit around N=1, down to ~0.3% at N=16 and just "noise" at N=32
153 * and higher. You can expect to find a 100% reproducible bug in an average of
154 * N tries, with a standard deviation of about N, but you will want to set
155 * "-zp" to always poison every free if you are attempting to reproduce
156 * a known bug.
157 *
158 * For a more heavyweight, but finer-grained method of detecting misuse
159 * of zone memory, look up the "Guard mode" zone allocator in gzalloc.c.
160 *
161 * Zone Corruption Logging
162 *
163 * You can also track where corruptions come from by using the boot-arguments
164 * "zlog=<zone name to log> -zc". Search for "Zone corruption logging" later
165 * in this document for more implementation and usage information.
166 *
167 * Zone Leak Detection
168 *
169 * To debug leaks of zone memory, use the zone leak detection tool 'zleaks'
170 * found later in this file via the showtopztrace and showz* macros in kgmacros,
171 * or use zlog without the -zc argument.
172 *
173 */
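/*
 * For example, combining the options described above in boot-args (the zone
 * name is illustrative):
 *
 *	-zp zlog=kalloc.32 -zc
 *
 * poisons every free and logs allocations and frees of the kalloc.32 zone
 * for corruption debugging.
 */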
174
175 /* Returns TRUE if we rolled over the counter at factor */
176 static inline boolean_t
177 sample_counter(volatile uint32_t * count_p, uint32_t factor)
178 {
179 uint32_t old_count, new_count;
180 boolean_t rolled_over;
181
182 do {
183 new_count = old_count = *count_p;
184
185 if (++new_count >= factor) {
186 rolled_over = TRUE;
187 new_count = 0;
188 } else {
189 rolled_over = FALSE;
190 }
191 } while (!OSCompareAndSwap(old_count, new_count, count_p));
192
193 return rolled_over;
194 }
195
196 #if defined(__LP64__)
197 #define ZP_POISON 0xdeadbeefdeadbeef
198 #else
199 #define ZP_POISON 0xdeadbeef
200 #endif
201
202 boolean_t zfree_poison_element(zone_t zone, vm_offset_t elem);
203 void zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr);
204
205 #define ZP_DEFAULT_SAMPLING_FACTOR 16
206 #define ZP_DEFAULT_SCALE_FACTOR 4
207
208 /*
209 * A zp_factor of 0 indicates zone poisoning is disabled;
210 * however, we still poison zones smaller than zp_tiny_zone_limit (a cacheline).
211 * Passing the -no-zp boot-arg disables even this behavior.
212 * In all cases, we record and check the integrity of a backup pointer.
213 */
214
215 /* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
216 #if DEBUG
217 #define DEFAULT_ZP_FACTOR (1)
218 #else
219 #define DEFAULT_ZP_FACTOR (0)
220 #endif
221 uint32_t zp_factor = DEFAULT_ZP_FACTOR;
222
223 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
224 uint32_t zp_scale = 0;
225
226 /* set in zp_init, zero indicates -no-zp boot-arg */
227 vm_size_t zp_tiny_zone_limit = 0;
228
229 /* initialized to a per-boot random value in zp_init */
230 uintptr_t zp_poisoned_cookie = 0;
231 uintptr_t zp_nopoison_cookie = 0;
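/*
 * A minimal sketch of how sample_counter() and the knobs above combine to
 * gate periodic poisoning. The helper below is hypothetical (the real
 * decision is made in zfree_poison_element() further down), and zp_count is
 * assumed to be the zone's per-zone sample counter.
 */
#if 0 /* sketch only */
static boolean_t
should_poison_on_free(zone_t zone)
{
	if (zone->elem_size <= zp_tiny_zone_limit) {
		return TRUE;    /* tiny zones are poisoned on every free */
	}
	if (zp_factor != 0 && sample_counter(&zone->zp_count, zp_factor)) {
		return TRUE;    /* counter rolled over: poison this free */
	}
	return FALSE;
}
#endif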
232
233 #if VM_MAX_TAG_ZONES
234 boolean_t zone_tagging_on;
235 #endif /* VM_MAX_TAG_ZONES */
236
237 SECURITY_READ_ONLY_LATE(boolean_t) copyio_zalloc_check = TRUE;
238 static struct bool_gen zone_bool_gen;
239
240 /*
241 * initialize zone poisoning
242 * called from zone_bootstrap before any allocations are made from zalloc
243 */
244 static inline void
245 zp_init(void)
246 {
247 char temp_buf[16];
248
249 /*
250 * Initialize backup pointer random cookie for poisoned elements
251 * Try not to call early_random() back to back, it may return
252 * the same value if mach_absolute_time doesn't have sufficient time
253 * to tick over between calls. <rdar://problem/11597395>
254 * (This is only a problem on embedded devices)
255 */
256 zp_poisoned_cookie = (uintptr_t) early_random();
257
258 /*
259 * Always poison zones smaller than a cacheline,
260 * because it's pretty close to free
261 */
262 ml_cpu_info_t cpu_info;
263 ml_cpu_get_info(&cpu_info);
264 zp_tiny_zone_limit = (vm_size_t) cpu_info.cache_line_size;
265
266 zp_factor = ZP_DEFAULT_SAMPLING_FACTOR;
267 zp_scale = ZP_DEFAULT_SCALE_FACTOR;
268
269 //TODO: Bigger permutation?
270 /*
271 * Permute the default factor +/- 1 to make it less predictable
272 * This adds or subtracts ~4 poisoned objects per 1000 frees.
273 */
274 if (zp_factor != 0) {
275 uint32_t rand_bits = early_random() & 0x3;
276
277 if (rand_bits == 0x1) {
278 zp_factor += 1;
279 } else if (rand_bits == 0x2) {
280 zp_factor -= 1;
281 }
282 /* if 0x0 or 0x3, leave it alone */
283 }
284
285 /* -zp: enable poisoning for every alloc and free */
286 if (PE_parse_boot_argn("-zp", temp_buf, sizeof(temp_buf))) {
287 zp_factor = 1;
288 }
289
290 /* -no-zp: disable poisoning completely even for tiny zones */
291 if (PE_parse_boot_argn("-no-zp", temp_buf, sizeof(temp_buf))) {
292 zp_factor = 0;
293 zp_tiny_zone_limit = 0;
294 printf("Zone poisoning disabled\n");
295 }
296
297 /* zp-factor=XXXX: override how often to poison freed zone elements */
298 if (PE_parse_boot_argn("zp-factor", &zp_factor, sizeof(zp_factor))) {
299 printf("Zone poisoning factor override: %u\n", zp_factor);
300 }
301
302 /* zp-scale=XXXX: override how much zone size scales zp-factor by */
303 if (PE_parse_boot_argn("zp-scale", &zp_scale, sizeof(zp_scale))) {
304 printf("Zone poisoning scale factor override: %u\n", zp_scale);
305 }
306
307 /* Initialize backup pointer random cookie for unpoisoned elements */
308 zp_nopoison_cookie = (uintptr_t) early_random();
309
310 #if MACH_ASSERT
311 if (zp_poisoned_cookie == zp_nopoison_cookie) {
312 panic("early_random() is broken: %p and %p are not random\n",
313 (void *) zp_poisoned_cookie, (void *) zp_nopoison_cookie);
314 }
315 #endif
316
317 /*
318 * Use the last bit in the backup pointer to hint poisoning state
319 * to backup_ptr_mismatch_panic. Valid zone pointers are aligned, so
320 * the low bits are zero.
321 */
322 zp_poisoned_cookie |= (uintptr_t)0x1ULL;
323 zp_nopoison_cookie &= ~((uintptr_t)0x1ULL);
324
325 #if defined(__LP64__)
326 /*
327 * Make backup pointers more obvious in GDB for 64 bit
328 * by making 0xFFFFFF... ^ cookie = 0xFACADE...
329 * (0xFACADE = 0xFFFFFF ^ 0x053521)
330 * (0xC0FFEE = 0xFFFFFF ^ 0x3f0011)
331 * The high 3 bytes of a zone pointer are always 0xFFFFFF, and are checked
332 * by the sanity check, so it's OK for that part of the cookie to be predictable.
333 *
334 * TODO: Use #defines, xors, and shifts
335 */
336
337 zp_poisoned_cookie &= 0x000000FFFFFFFFFF;
338 zp_poisoned_cookie |= 0x0535210000000000; /* 0xFACADE */
339
340 zp_nopoison_cookie &= 0x000000FFFFFFFFFF;
341 zp_nopoison_cookie |= 0x3f00110000000000; /* 0xC0FFEE */
342 #endif
343 }
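/*
 * Worked example of the 64-bit cookie tagging above: for an unpoisoned free
 * element whose next free element is at 0xFFFFFF80012345C0 (the high 3 bytes
 * of a zone pointer are always 0xFFFFFF), the stored backup pointer is
 *
 *	next ^ zp_nopoison_cookie = 0xC0FFEExxxxxxxxxx	(0xFFFFFF ^ 0x3f0011)
 *
 * while a poisoned element's backup pointer starts with 0xFACADE
 * (0xFFFFFF ^ 0x053521). The 'x' digits depend on the per-boot random low
 * bits of the cookie.
 */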
344
345 /*
346 * These macros are used to keep track of the number
347 * of pages being used by the zone currently. The
348 * z->page_count is not protected by the zone lock.
349 */
350 #define ZONE_PAGE_COUNT_INCR(z, count) \
351 { \
352 OSAddAtomic64(count, &(z->page_count)); \
353 }
354
355 #define ZONE_PAGE_COUNT_DECR(z, count) \
356 { \
357 OSAddAtomic64(-count, &(z->page_count)); \
358 }
359
360 vm_map_t zone_map = VM_MAP_NULL;
361
362 /* for is_sane_zone_element and garbage collection */
363
364 vm_offset_t zone_map_min_address = 0; /* initialized in zone_init */
365 vm_offset_t zone_map_max_address = 0;
366
367 /* Globals for random boolean generator for elements in free list */
368 #define MAX_ENTROPY_PER_ZCRAM 4
369
370 /* VM region for all metadata structures */
371 vm_offset_t zone_metadata_region_min = 0;
372 vm_offset_t zone_metadata_region_max = 0;
373 decl_lck_mtx_data(static, zone_metadata_region_lck);
374 lck_attr_t zone_metadata_lock_attr;
375 lck_mtx_ext_t zone_metadata_region_lck_ext;
376
377 /* Helpful for walking through a zone's free element list. */
378 struct zone_free_element {
379 struct zone_free_element *next;
380 /* ... */
381 /* void *backup_ptr; */
382 };
383
384 #if CONFIG_ZCACHE
385
386 /*
387 * Decides whether per-cpu zone caching is to be enabled for all zones.
388 * Can be set to TRUE via the boot-arg '-zcache_all'.
389 */
390 bool cache_all_zones = FALSE;
391
392 /*
393 * Specifies a single zone to enable CPU caching for.
394 * Can be set using boot-args: zcc_enable_for_zone_name=<zone>
395 */
396 static char cache_zone_name[MAX_ZONE_NAME];
397
398 static inline bool
399 zone_caching_enabled(zone_t z)
400 {
401 return z->cpu_cache_enabled && !z->tags && !z->zleak_on;
402 }
403
404 #endif /* CONFIG_ZCACHE */
405
406 /*
407 * Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap
408 */
409 decl_simple_lock_data(, all_zones_lock);
410 unsigned int num_zones_in_use;
411 unsigned int num_zones;
412
413 #if KASAN
414 #define MAX_ZONES 512
415 #else /* !KASAN */
416 #define MAX_ZONES 320
417 #endif/* !KASAN */
418 struct zone zone_array[MAX_ZONES];
419
420 /* Used to keep track of empty slots in the zone_array */
421 bitmap_t zone_empty_bitmap[BITMAP_LEN(MAX_ZONES)];
422
423 #if DEBUG || DEVELOPMENT
424 /*
425 * Used for sysctl kern.run_zone_test which is not thread-safe. Ensure only one thread goes through at a time.
426 * Otherwise we can end up with multiple test zones (if a second zinit() comes through before zdestroy()), which could lead us to
427 * run out of zones.
428 */
429 decl_simple_lock_data(, zone_test_lock);
430 static boolean_t zone_test_running = FALSE;
431 static zone_t test_zone_ptr = NULL;
432 #endif /* DEBUG || DEVELOPMENT */
433
434 #define PAGE_METADATA_GET_ZINDEX(page_meta) \
435 (page_meta->zindex)
436
437 #define PAGE_METADATA_GET_ZONE(page_meta) \
438 (&(zone_array[page_meta->zindex]))
439
440 #define PAGE_METADATA_SET_ZINDEX(page_meta, index) \
441 page_meta->zindex = (index);
442
443 struct zone_page_metadata {
444 queue_chain_t pages; /* linkage pointer for metadata lists */
445
446 /* Union for maintaining start of element free list and real metadata (for multipage allocations) */
447 union {
448 /*
449 * The start of the freelist can be maintained as a 32-bit offset instead of a pointer because
450 * the free elements would be at max ZONE_MAX_ALLOC_SIZE bytes away from the metadata. Offset
451 * from start of the allocation chunk to free element list head.
452 */
453 uint32_t freelist_offset;
454 /*
455 * This field is used to look up the real metadata for multipage allocations, where we mark the
456 * metadata for all pages except the first as "fake" metadata using MULTIPAGE_METADATA_MAGIC.
457 * Offset from this fake metadata back to the real metadata of the allocation chunk (negative offset).
458 */
459 uint32_t real_metadata_offset;
460 };
461
462 /*
463 * For the first page in the allocation chunk, this represents the total number of free elements in
464 * the chunk.
465 */
466 uint16_t free_count;
467 unsigned zindex : ZINDEX_BITS; /* Zone index within the zone_array */
468 unsigned page_count : PAGECOUNT_BITS; /* Count of pages within the allocation chunk */
469 };
470
471 /* Macro to get page index (within zone_map) of page containing element */
472 #define PAGE_INDEX_FOR_ELEMENT(element) \
473 (((vm_offset_t)trunc_page(element) - zone_map_min_address) / PAGE_SIZE)
474
475 /* Macro to get metadata structure given a page index in zone_map */
476 #define PAGE_METADATA_FOR_PAGE_INDEX(index) \
477 (zone_metadata_region_min + ((index) * sizeof(struct zone_page_metadata)))
478
479 /* Macro to get index (within zone_map) for given metadata */
480 #define PAGE_INDEX_FOR_METADATA(page_meta) \
481 (((vm_offset_t)page_meta - zone_metadata_region_min) / sizeof(struct zone_page_metadata))
482
483 /* Macro to get page for given page index in zone_map */
484 #define PAGE_FOR_PAGE_INDEX(index) \
485 (zone_map_min_address + (PAGE_SIZE * (index)))
486
487 /* Macro to get the actual metadata for a given address */
488 #define PAGE_METADATA_FOR_ELEMENT(element) \
489 (struct zone_page_metadata *)(PAGE_METADATA_FOR_PAGE_INDEX(PAGE_INDEX_FOR_ELEMENT(element)))
490
491 /* Magic value to indicate empty element free list */
492 #define PAGE_METADATA_EMPTY_FREELIST ((uint32_t)(~0))
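/*
 * Putting the macros above together: for an element address 'e' inside the
 * zone map, its metadata is found purely arithmetically as
 *
 *	idx  = (trunc_page(e) - zone_map_min_address) / PAGE_SIZE;
 *	meta = (struct zone_page_metadata *)
 *	        (zone_metadata_region_min + idx * sizeof(struct zone_page_metadata));
 *
 * i.e. one metadata slot per zone_map page, laid out linearly in the
 * zone_metadata_region. Foreign and multipage cases are handled separately
 * below.
 */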
493
494 vm_map_copy_t create_vm_map_copy(vm_offset_t start_addr, vm_size_t total_size, vm_size_t used_size);
495 boolean_t get_zone_info(zone_t z, mach_zone_name_t *zn, mach_zone_info_t *zi);
496 boolean_t is_zone_map_nearing_exhaustion(void);
497 extern void vm_pageout_garbage_collect(int collect);
498
499 static inline void *
500 page_metadata_get_freelist(struct zone_page_metadata *page_meta)
501 {
502 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
503 if (page_meta->freelist_offset == PAGE_METADATA_EMPTY_FREELIST) {
504 return NULL;
505 } else {
506 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
507 return (void *)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)) + page_meta->freelist_offset);
508 } else {
509 return (void *)((vm_offset_t)page_meta + page_meta->freelist_offset);
510 }
511 }
512 }
513
514 static inline void
515 page_metadata_set_freelist(struct zone_page_metadata *page_meta, void *addr)
516 {
517 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
518 if (addr == NULL) {
519 page_meta->freelist_offset = PAGE_METADATA_EMPTY_FREELIST;
520 } else {
521 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
522 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
523 } else {
524 page_meta->freelist_offset = (uint32_t)((vm_offset_t)(addr) - (vm_offset_t)page_meta);
525 }
526 }
527 }
528
529 static inline struct zone_page_metadata *
530 page_metadata_get_realmeta(struct zone_page_metadata *page_meta)
531 {
532 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
533 return (struct zone_page_metadata *)((vm_offset_t)page_meta - page_meta->real_metadata_offset);
534 }
535
536 static inline void
537 page_metadata_set_realmeta(struct zone_page_metadata *page_meta, struct zone_page_metadata *real_meta)
538 {
539 assert(PAGE_METADATA_GET_ZINDEX(page_meta) == MULTIPAGE_METADATA_MAGIC);
540 assert(PAGE_METADATA_GET_ZINDEX(real_meta) != MULTIPAGE_METADATA_MAGIC);
541 assert((vm_offset_t)page_meta > (vm_offset_t)real_meta);
542 vm_offset_t offset = (vm_offset_t)page_meta - (vm_offset_t)real_meta;
543 assert(offset <= UINT32_MAX);
544 page_meta->real_metadata_offset = (uint32_t)offset;
545 }
546
547 /* The backup pointer is stored in the last pointer-sized location in an element. */
548 static inline vm_offset_t *
549 get_backup_ptr(vm_size_t elem_size,
550 vm_offset_t *element)
551 {
552 return (vm_offset_t *) ((vm_offset_t)element + elem_size - sizeof(vm_offset_t));
553 }
554
555 /*
556 * Routine to populate the pages backing a metadata structure in the zone_metadata_region.
557 * Must be called without the zone lock held as it might potentially block.
558 */
559 static inline void
560 zone_populate_metadata_page(struct zone_page_metadata *page_meta)
561 {
562 vm_offset_t page_metadata_begin = trunc_page(page_meta);
563 vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata));
564
565 for (; page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) {
566 #if !KASAN
567 /*
568 * This can race with another thread doing a populate on the same metadata
569 * page, where we see an updated pmap but unmapped KASan shadow, causing a
570 * fault in the shadow when we first access the metadata page. Avoid this
571 * by always synchronizing on the zone_metadata_region lock with KASan.
572 */
573 if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
574 continue;
575 }
576 #endif
577 /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
578 lck_mtx_lock(&zone_metadata_region_lck);
579 if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
580 kern_return_t __assert_only ret = kernel_memory_populate(zone_map,
581 page_metadata_begin,
582 PAGE_SIZE,
583 KMA_KOBJECT,
584 VM_KERN_MEMORY_OSFMK);
585
586 /* should not fail with the given arguments */
587 assert(ret == KERN_SUCCESS);
588 }
589 lck_mtx_unlock(&zone_metadata_region_lck);
590 }
591 return;
592 }
593
594 static inline uint16_t
595 get_metadata_alloc_count(struct zone_page_metadata *page_meta)
596 {
597 assert(PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC);
598 struct zone *z = PAGE_METADATA_GET_ZONE(page_meta);
599 return (page_meta->page_count * PAGE_SIZE) / z->elem_size;
600 }
601
602 /*
603 * Routine to look up the metadata for any given address.
604 * If init is marked as TRUE, this should be called without holding the zone lock
605 * since the initialization might block.
606 */
607 static inline struct zone_page_metadata *
608 get_zone_page_metadata(struct zone_free_element *element, boolean_t init)
609 {
610 struct zone_page_metadata *page_meta = 0;
611
612 if (from_zone_map(element, sizeof(struct zone_free_element))) {
613 page_meta = (struct zone_page_metadata *)(PAGE_METADATA_FOR_ELEMENT(element));
614 if (init) {
615 zone_populate_metadata_page(page_meta);
616 }
617 } else {
618 page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element));
619 }
620 if (init) {
621 bzero((char *)page_meta, sizeof(struct zone_page_metadata));
622 }
623 return (PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta);
624 }
625
626 /* Routine to get the page for a given metadata */
627 static inline vm_offset_t
628 get_zone_page(struct zone_page_metadata *page_meta)
629 {
630 if (from_zone_map(page_meta, sizeof(struct zone_page_metadata))) {
631 return (vm_offset_t)(PAGE_FOR_PAGE_INDEX(PAGE_INDEX_FOR_METADATA(page_meta)));
632 } else {
633 return (vm_offset_t)(trunc_page(page_meta));
634 }
635 }
636
637 /*
638 * Routine to panic if a pointer is not mapped to an expected zone.
639 * This can be used as a means of pinning an object to the zone it is expected
640 * to be a part of. Causes a panic if the address does not belong to any
641 * specified zone, does not belong to any zone, has been freed and therefore
642 * unmapped from the zone, or the pointer contains an uninitialized value that
643 * does not belong to any zone.
644 */
645
646 void
647 zone_require(void *addr, zone_t expected_zone)
648 {
649 struct zone *src_zone = NULL;
650 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
651
652 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
653 if (__improbable(src_zone == NULL)) {
654 panic("Address not in a zone for zone_require check (addr: %p)", addr);
655 }
656
657 if (__improbable(src_zone != expected_zone)) {
658 panic("Address not in expected zone for zone_require check (addr: %p, zone: %s)", addr, src_zone->zone_name);
659 }
660 }
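/*
 * Usage sketch (hypothetical names): before trusting a pointer handed in
 * from elsewhere, a caller can pin it to the zone it is supposed to come
 * from:
 *
 *	zone_require(obj, my_object_zone);
 *
 * which panics unless 'obj' was allocated from my_object_zone.
 */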
661
662 /*
663 * ZTAGS
664 */
665
666 #if VM_MAX_TAG_ZONES
667
668 // for zones with tagging enabled:
669
670 // calculate a pointer to the tag base entry,
671 // holding either a uint32_t giving the first tag offset for a page in the zone map,
672 // or two uint16_t tags if the page can only hold one or two elements
673
674 #define ZTAGBASE(zone, element) \
675 (&((uint32_t *)zone_tagbase_min)[atop((element) - zone_map_min_address)])
676
677 // pointer to the tag for an element
678 #define ZTAG(zone, element) \
679 ({ \
680 vm_tag_t * result; \
681 if ((zone)->tags_inline) { \
682 result = (vm_tag_t *) ZTAGBASE((zone), (element)); \
683 if ((page_mask & element) >= (zone)->elem_size) result++; \
684 } else { \
685 result = &((vm_tag_t *)zone_tags_min)[ZTAGBASE((zone), (element))[0] + ((element) & page_mask) / (zone)->elem_size]; \
686 } \
687 result; \
688 })
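/*
 * The stored tag keeps bit 0 as allocation state, so callers shift by one
 * (see try_alloc_from_zone() and zone_element_info() below):
 *
 *	ZTAG(zone, element)[0] = (tag << 1);	// store: b0 clear means in use
 *	tag = ZTAG(zone, element)[0] >> 1;	// read the tag back
 */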
689
690
691 static vm_offset_t zone_tagbase_min;
692 static vm_offset_t zone_tagbase_max;
693 static vm_offset_t zone_tagbase_map_size;
694 static vm_map_t zone_tagbase_map;
695
696 static vm_offset_t zone_tags_min;
697 static vm_offset_t zone_tags_max;
698 static vm_offset_t zone_tags_map_size;
699 static vm_map_t zone_tags_map;
700
701 // simple heap allocator for allocating the tags for new memory
702
703 decl_lck_mtx_data(, ztLock); /* heap lock */
704 enum{
705 ztFreeIndexCount = 8,
706 ztFreeIndexMax = (ztFreeIndexCount - 1),
707 ztTagsPerBlock = 4
708 };
709
710 struct ztBlock {
711 #if __LITTLE_ENDIAN__
712 uint64_t free:1,
713 next:21,
714 prev:21,
715 size:21;
716 #else
717 // ztBlock needs free bit least significant
718 #error !__LITTLE_ENDIAN__
719 #endif
720 };
721 typedef struct ztBlock ztBlock;
722
723 static ztBlock * ztBlocks;
724 static uint32_t ztBlocksCount;
725 static uint32_t ztBlocksFree;
726
727 static uint32_t
728 ztLog2up(uint32_t size)
729 {
730 if (1 == size) {
731 size = 0;
732 } else {
733 size = 32 - __builtin_clz(size - 1);
734 }
735 return size;
736 }
737
738 static uint32_t
739 ztLog2down(uint32_t size)
740 {
741 size = 31 - __builtin_clz(size);
742 return size;
743 }
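/*
 * For example: ztLog2up(5) == 3 and ztLog2down(5) == 2 (ceiling and floor of
 * log2 respectively), and ztLog2up(1) == 0 by the special case above.
 */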
744
745 static void
746 ztFault(vm_map_t map, const void * address, size_t size, uint32_t flags)
747 {
748 vm_map_offset_t addr = (vm_map_offset_t) address;
749 vm_map_offset_t page, end;
750
751 page = trunc_page(addr);
752 end = round_page(addr + size);
753
754 for (; page < end; page += page_size) {
755 if (!pmap_find_phys(kernel_pmap, page)) {
756 kern_return_t __unused
757 ret = kernel_memory_populate(map, page, PAGE_SIZE,
758 KMA_KOBJECT | flags, VM_KERN_MEMORY_DIAG);
759 assert(ret == KERN_SUCCESS);
760 }
761 }
762 }
763
764 static boolean_t
765 ztPresent(const void * address, size_t size)
766 {
767 vm_map_offset_t addr = (vm_map_offset_t) address;
768 vm_map_offset_t page, end;
769 boolean_t result;
770
771 page = trunc_page(addr);
772 end = round_page(addr + size);
773 for (result = TRUE; (page < end); page += page_size) {
774 result = pmap_find_phys(kernel_pmap, page);
775 if (!result) {
776 break;
777 }
778 }
779 return result;
780 }
781
782
783 void __unused
784 ztDump(boolean_t sanity);
785 void __unused
786 ztDump(boolean_t sanity)
787 {
788 uint32_t q, cq, p;
789
790 for (q = 0; q <= ztFreeIndexMax; q++) {
791 p = q;
792 do{
793 if (sanity) {
794 cq = ztLog2down(ztBlocks[p].size);
795 if (cq > ztFreeIndexMax) {
796 cq = ztFreeIndexMax;
797 }
798 if (!ztBlocks[p].free
799 || ((p != q) && (q != cq))
800 || (ztBlocks[ztBlocks[p].next].prev != p)
801 || (ztBlocks[ztBlocks[p].prev].next != p)) {
802 kprintf("zterror at %d", p);
803 ztDump(FALSE);
804 kprintf("zterror at %d", p);
805 assert(FALSE);
806 }
807 continue;
808 }
809 kprintf("zt[%03d]%c %d, %d, %d\n",
810 p, ztBlocks[p].free ? 'F' : 'A',
811 ztBlocks[p].next, ztBlocks[p].prev,
812 ztBlocks[p].size);
813 p = ztBlocks[p].next;
814 if (p == q) {
815 break;
816 }
817 }while (p != q);
818 if (!sanity) {
819 printf("\n");
820 }
821 }
822 if (!sanity) {
823 printf("-----------------------\n");
824 }
825 }
826
827
828
829 #define ZTBDEQ(idx) \
830 ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
831 ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev;
832
833 static void
834 ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
835 {
836 uint32_t q, w, p, size, merge;
837
838 assert(count);
839 ztBlocksFree += count;
840
841 	// merge with following
842 merge = (index + count);
843 if ((merge < ztBlocksCount)
844 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
845 && ztBlocks[merge].free) {
846 ZTBDEQ(merge);
847 count += ztBlocks[merge].size;
848 }
849
850 	// merge with preceding
851 merge = (index - 1);
852 if ((merge > ztFreeIndexMax)
853 && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
854 && ztBlocks[merge].free) {
855 size = ztBlocks[merge].size;
856 count += size;
857 index -= size;
858 ZTBDEQ(index);
859 }
860
861 q = ztLog2down(count);
862 if (q > ztFreeIndexMax) {
863 q = ztFreeIndexMax;
864 }
865 w = q;
866 // queue in order of size
867 while (TRUE) {
868 p = ztBlocks[w].next;
869 if (p == q) {
870 break;
871 }
872 if (ztBlocks[p].size >= count) {
873 break;
874 }
875 w = p;
876 }
877 ztBlocks[p].prev = index;
878 ztBlocks[w].next = index;
879
880 // fault in first
881 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
882
883 // mark first & last with free flag and size
884 ztBlocks[index].free = TRUE;
885 ztBlocks[index].size = count;
886 ztBlocks[index].prev = w;
887 ztBlocks[index].next = p;
888 if (count > 1) {
889 index += (count - 1);
890 // fault in last
891 ztFault(zone_tags_map, &ztBlocks[index], sizeof(ztBlocks[index]), 0);
892 ztBlocks[index].free = TRUE;
893 ztBlocks[index].size = count;
894 }
895 }
896
897 static uint32_t
898 ztAlloc(zone_t zone, uint32_t count)
899 {
900 uint32_t q, w, p, leftover;
901
902 assert(count);
903
904 q = ztLog2up(count);
905 if (q > ztFreeIndexMax) {
906 q = ztFreeIndexMax;
907 }
908 do{
909 w = q;
910 while (TRUE) {
911 p = ztBlocks[w].next;
912 if (p == q) {
913 break;
914 }
915 if (ztBlocks[p].size >= count) {
916 // dequeue, mark both ends allocated
917 ztBlocks[w].next = ztBlocks[p].next;
918 ztBlocks[ztBlocks[p].next].prev = w;
919 ztBlocks[p].free = FALSE;
920 ztBlocksFree -= ztBlocks[p].size;
921 if (ztBlocks[p].size > 1) {
922 ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
923 }
924
925 // fault all the allocation
926 ztFault(zone_tags_map, &ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
927 // mark last as allocated
928 if (count > 1) {
929 ztBlocks[p + count - 1].free = FALSE;
930 }
931 // free remainder
932 leftover = ztBlocks[p].size - count;
933 if (leftover) {
934 ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
935 }
936
937 return p;
938 }
939 w = p;
940 }
941 q++;
942 }while (q <= ztFreeIndexMax);
943
944 return -1U;
945 }
946
947 static void
948 ztInit(vm_size_t max_zonemap_size, lck_grp_t * group)
949 {
950 kern_return_t ret;
951 vm_map_kernel_flags_t vmk_flags;
952 uint32_t idx;
953
954 lck_mtx_init(&ztLock, group, LCK_ATTR_NULL);
955
956 // allocate submaps VM_KERN_MEMORY_DIAG
957
958 zone_tagbase_map_size = atop(max_zonemap_size) * sizeof(uint32_t);
959 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
960 vmk_flags.vmkf_permanent = TRUE;
961 ret = kmem_suballoc(kernel_map, &zone_tagbase_min, zone_tagbase_map_size,
962 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
963 &zone_tagbase_map);
964
965 if (ret != KERN_SUCCESS) {
966 panic("zone_init: kmem_suballoc failed");
967 }
968 zone_tagbase_max = zone_tagbase_min + round_page(zone_tagbase_map_size);
969
970 zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
971 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
972 vmk_flags.vmkf_permanent = TRUE;
973 ret = kmem_suballoc(kernel_map, &zone_tags_min, zone_tags_map_size,
974 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_DIAG,
975 &zone_tags_map);
976
977 if (ret != KERN_SUCCESS) {
978 panic("zone_init: kmem_suballoc failed");
979 }
980 zone_tags_max = zone_tags_min + round_page(zone_tags_map_size);
981
982 ztBlocks = (ztBlock *) zone_tags_min;
983 ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));
984
985 // initialize the qheads
986 lck_mtx_lock(&ztLock);
987
988 ztFault(zone_tags_map, &ztBlocks[0], sizeof(ztBlocks[0]), 0);
989 for (idx = 0; idx < ztFreeIndexCount; idx++) {
990 ztBlocks[idx].free = TRUE;
991 ztBlocks[idx].next = idx;
992 ztBlocks[idx].prev = idx;
993 ztBlocks[idx].size = 0;
994 }
995 // free remaining space
996 ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);
997
998 lck_mtx_unlock(&ztLock);
999 }
1000
1001 static void
1002 ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
1003 {
1004 uint32_t * tagbase;
1005 uint32_t count, block, blocks, idx;
1006 size_t pages;
1007
1008 pages = atop(size);
1009 tagbase = ZTAGBASE(zone, mem);
1010
1011 lck_mtx_lock(&ztLock);
1012
1013 // fault tagbase
1014 ztFault(zone_tagbase_map, tagbase, pages * sizeof(uint32_t), 0);
1015
1016 if (!zone->tags_inline) {
1017 // allocate tags
1018 count = (uint32_t)(size / zone->elem_size);
1019 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1020 block = ztAlloc(zone, blocks);
1021 if (-1U == block) {
1022 ztDump(false);
1023 }
1024 assert(-1U != block);
1025 }
1026
1027 lck_mtx_unlock(&ztLock);
1028
1029 if (!zone->tags_inline) {
1030 // set tag base for each page
1031 block *= ztTagsPerBlock;
1032 for (idx = 0; idx < pages; idx++) {
1033 tagbase[idx] = block + (uint32_t)((ptoa(idx) + (zone->elem_size - 1)) / zone->elem_size);
1034 }
1035 }
1036 }
1037
1038 static void
1039 ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
1040 {
1041 uint32_t * tagbase;
1042 uint32_t count, block, blocks, idx;
1043 size_t pages;
1044
1045 // set tag base for each page
1046 pages = atop(size);
1047 tagbase = ZTAGBASE(zone, mem);
1048 block = tagbase[0];
1049 for (idx = 0; idx < pages; idx++) {
1050 tagbase[idx] = 0xFFFFFFFF;
1051 }
1052
1053 lck_mtx_lock(&ztLock);
1054 if (!zone->tags_inline) {
1055 count = (uint32_t)(size / zone->elem_size);
1056 blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
1057 assert(block != 0xFFFFFFFF);
1058 block /= ztTagsPerBlock;
1059 ztFree(NULL /* zone is unlocked */, block, blocks);
1060 }
1061
1062 lck_mtx_unlock(&ztLock);
1063 }
1064
1065 uint32_t
1066 zone_index_from_tag_index(uint32_t tag_zone_index, vm_size_t * elem_size)
1067 {
1068 zone_t z;
1069 uint32_t idx;
1070
1071 simple_lock(&all_zones_lock, &zone_locks_grp);
1072
1073 for (idx = 0; idx < num_zones; idx++) {
1074 z = &(zone_array[idx]);
1075 if (!z->tags) {
1076 continue;
1077 }
1078 if (tag_zone_index != z->tag_zone_index) {
1079 continue;
1080 }
1081 *elem_size = z->elem_size;
1082 break;
1083 }
1084
1085 simple_unlock(&all_zones_lock);
1086
1087 if (idx == num_zones) {
1088 idx = -1U;
1089 }
1090
1091 return idx;
1092 }
1093
1094 #endif /* VM_MAX_TAG_ZONES */
1095
1096 /* Routine to get the size of a zone-allocated address. If the address doesn't belong to the
1097 * zone_map, returns 0.
1098 */
1099 vm_size_t
1100 zone_element_size(void *addr, zone_t *z)
1101 {
1102 struct zone *src_zone;
1103 if (from_zone_map(addr, sizeof(void *))) {
1104 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1105 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1106 if (z) {
1107 *z = src_zone;
1108 }
1109 return src_zone->elem_size;
1110 } else {
1111 #if CONFIG_GZALLOC
1112 vm_size_t gzsize;
1113 if (gzalloc_element_size(addr, z, &gzsize)) {
1114 return gzsize;
1115 }
1116 #endif /* CONFIG_GZALLOC */
1117
1118 return 0;
1119 }
1120 }
1121
1122 #if DEBUG || DEVELOPMENT
1123
1124 vm_size_t
1125 zone_element_info(void *addr, vm_tag_t * ptag)
1126 {
1127 vm_size_t size = 0;
1128 vm_tag_t tag = VM_KERN_MEMORY_NONE;
1129 struct zone * src_zone;
1130
1131 if (from_zone_map(addr, sizeof(void *))) {
1132 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
1133 src_zone = PAGE_METADATA_GET_ZONE(page_meta);
1134 #if VM_MAX_TAG_ZONES
1135 if (__improbable(src_zone->tags)) {
1136 tag = (ZTAG(src_zone, (vm_offset_t) addr)[0] >> 1);
1137 }
1138 #endif /* VM_MAX_TAG_ZONES */
1139 size = src_zone->elem_size;
1140 } else {
1141 #if CONFIG_GZALLOC
1142 gzalloc_element_size(addr, NULL, &size);
1143 #endif /* CONFIG_GZALLOC */
1144 }
1145 *ptag = tag;
1146 return size;
1147 }
1148
1149 #endif /* DEBUG || DEVELOPMENT */
1150
1151 /*
1152 * Zone checking helper function.
1153 * A pointer that satisfies these conditions is OK to be a freelist next pointer;
1154 * a pointer that doesn't satisfy these conditions indicates corruption.
1155 */
1156 static inline boolean_t
1157 is_sane_zone_ptr(zone_t zone,
1158 vm_offset_t addr,
1159 size_t obj_size)
1160 {
1161 /* Must be aligned to pointer boundary */
1162 if (__improbable((addr & (sizeof(vm_offset_t) - 1)) != 0)) {
1163 return FALSE;
1164 }
1165
1166 /* Must be a kernel address */
1167 if (__improbable(!pmap_kernel_va(addr))) {
1168 return FALSE;
1169 }
1170
1171 /* Must be from zone map if the zone only uses memory from the zone_map */
1172 /*
1173 * TODO: Remove the zone->collectable check when every
1174 * zone using foreign memory is properly tagged with allows_foreign
1175 */
1176 if (zone->collectable && !zone->allows_foreign) {
1177 /* check if addr is from zone map */
1178 if (addr >= zone_map_min_address &&
1179 (addr + obj_size - 1) < zone_map_max_address) {
1180 return TRUE;
1181 }
1182
1183 return FALSE;
1184 }
1185
1186 return TRUE;
1187 }
1188
1189 static inline boolean_t
1190 is_sane_zone_page_metadata(zone_t zone,
1191 vm_offset_t page_meta)
1192 {
1193 /* NULL page metadata structures are invalid */
1194 if (page_meta == 0) {
1195 return FALSE;
1196 }
1197 return is_sane_zone_ptr(zone, page_meta, sizeof(struct zone_page_metadata));
1198 }
1199
1200 static inline boolean_t
1201 is_sane_zone_element(zone_t zone,
1202 vm_offset_t addr)
1203 {
1204 /* NULL is OK because it indicates the tail of the list */
1205 if (addr == 0) {
1206 return TRUE;
1207 }
1208 return is_sane_zone_ptr(zone, addr, zone->elem_size);
1209 }
1210
1211 /* Someone wrote to freed memory. */
1212 __dead2
1213 static inline void
1214 zone_element_was_modified_panic(zone_t zone,
1215 vm_offset_t element,
1216 vm_offset_t found,
1217 vm_offset_t expected,
1218 vm_offset_t offset)
1219 {
1220 panic("a freed zone element has been modified in zone %s: expected %p but found %p, bits changed %p, at offset %d of %d in element %p, cookies %p %p",
1221 zone->zone_name,
1222 (void *) expected,
1223 (void *) found,
1224 (void *) (expected ^ found),
1225 (uint32_t) offset,
1226 (uint32_t) zone->elem_size,
1227 (void *) element,
1228 (void *) zp_nopoison_cookie,
1229 (void *) zp_poisoned_cookie);
1230 }
1231
1232 /*
1233 * The primary and backup pointers don't match.
1234 * Determine which one was likely the corrupted pointer, find out what it
1235 * probably should have been, and panic.
1236 */
1237 __dead2
1238 static void
1239 backup_ptr_mismatch_panic(zone_t zone,
1240 vm_offset_t element,
1241 vm_offset_t primary,
1242 vm_offset_t backup)
1243 {
1244 vm_offset_t likely_backup;
1245 vm_offset_t likely_primary;
1246
1247 likely_primary = primary ^ zp_nopoison_cookie;
1248 boolean_t sane_backup;
1249 boolean_t sane_primary = is_sane_zone_element(zone, likely_primary);
1250 boolean_t element_was_poisoned = (backup & 0x1) ? TRUE : FALSE;
1251
1252 #if defined(__LP64__)
1253 /* We can inspect the tag in the upper bits for additional confirmation */
1254 if ((backup & 0xFFFFFF0000000000) == 0xFACADE0000000000) {
1255 element_was_poisoned = TRUE;
1256 } else if ((backup & 0xFFFFFF0000000000) == 0xC0FFEE0000000000) {
1257 element_was_poisoned = FALSE;
1258 }
1259 #endif
1260
1261 if (element_was_poisoned) {
1262 likely_backup = backup ^ zp_poisoned_cookie;
1263 sane_backup = is_sane_zone_element(zone, likely_backup);
1264 } else {
1265 likely_backup = backup ^ zp_nopoison_cookie;
1266 sane_backup = is_sane_zone_element(zone, likely_backup);
1267 }
1268
1269 /* The primary is definitely the corrupted one */
1270 if (!sane_primary && sane_backup) {
1271 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1272 }
1273
1274 /* The backup is definitely the corrupted one */
1275 if (sane_primary && !sane_backup) {
1276 zone_element_was_modified_panic(zone, element, backup,
1277 (likely_primary ^ (element_was_poisoned ? zp_poisoned_cookie : zp_nopoison_cookie)),
1278 zone->elem_size - sizeof(vm_offset_t));
1279 }
1280
1281 /*
1282 * Not sure which is the corrupted one.
1283 * It's less likely that the backup pointer was overwritten with
1284 * ( (sane address) ^ (valid cookie) ), so we'll guess that the
1285 * primary pointer has been overwritten with a sane but incorrect address.
1286 */
1287 if (sane_primary && sane_backup) {
1288 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1289 }
1290
1291 	/* Neither is sane, so just guess. */
1292 zone_element_was_modified_panic(zone, element, primary, (likely_backup ^ zp_nopoison_cookie), 0);
1293 }
1294
1295 /*
1296 * Adds the element to the head of the zone's free list
1297 * Keeps a backup next-pointer at the end of the element
1298 */
1299 static inline void
1300 free_to_zone(zone_t zone,
1301 vm_offset_t element,
1302 boolean_t poison)
1303 {
1304 vm_offset_t old_head;
1305 struct zone_page_metadata *page_meta;
1306
1307 vm_offset_t *primary = (vm_offset_t *) element;
1308 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1309
1310 page_meta = get_zone_page_metadata((struct zone_free_element *)element, FALSE);
1311 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1312 old_head = (vm_offset_t)page_metadata_get_freelist(page_meta);
1313
1314 if (__improbable(!is_sane_zone_element(zone, old_head))) {
1315 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1316 (void *) old_head, zone->zone_name);
1317 }
1318
1319 if (__improbable(!is_sane_zone_element(zone, element))) {
1320 panic("zfree: freeing invalid pointer %p to zone %s\n",
1321 (void *) element, zone->zone_name);
1322 }
1323
1324 if (__improbable(old_head == element)) {
1325 panic("zfree: double free of %p to zone %s\n",
1326 (void *) element, zone->zone_name);
1327 }
1328 /*
1329 * Always write a redundant next pointer
1330 * So that it is more difficult to forge, xor it with a random cookie
1331 * A poisoned element is indicated by using zp_poisoned_cookie
1332 * instead of zp_nopoison_cookie
1333 */
1334
1335 *backup = old_head ^ (poison ? zp_poisoned_cookie : zp_nopoison_cookie);
1336
1337 /*
1338 * Insert this element at the head of the free list. We also xor the
1339 * primary pointer with the zp_nopoison_cookie to make sure a free
1340 * element does not provide the location of the next free element directly.
1341 */
1342 *primary = old_head ^ zp_nopoison_cookie;
1343 page_metadata_set_freelist(page_meta, (struct zone_free_element *)element);
1344 page_meta->free_count++;
1345 if (zone->allows_foreign && !from_zone_map(element, zone->elem_size)) {
1346 if (page_meta->free_count == 1) {
1347 /* first foreign element freed on page, move from all_used */
1348 re_queue_tail(&zone->pages.any_free_foreign, &(page_meta->pages));
1349 } else {
1350 /* no other list transitions */
1351 }
1352 } else if (page_meta->free_count == get_metadata_alloc_count(page_meta)) {
1353 		/* whether the page was on the intermediate or all_used queue, move it to free */
1354 re_queue_tail(&zone->pages.all_free, &(page_meta->pages));
1355 zone->count_all_free_pages += page_meta->page_count;
1356 } else if (page_meta->free_count == 1) {
1357 /* first free element on page, move from all_used */
1358 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1359 }
1360 zone->count--;
1361 zone->countfree++;
1362
1363 #if KASAN_ZALLOC
1364 kasan_poison_range(element, zone->elem_size, ASAN_HEAP_FREED);
1365 #endif
1366 }
1367
1368
1369 /*
1370 * Removes an element from the zone's free list, returning 0 if the free list is empty.
1371 * Verifies that the next-pointer and backup next-pointer are intact,
1372 * and verifies that a poisoned element hasn't been modified.
1373 */
1374 static inline vm_offset_t
1375 try_alloc_from_zone(zone_t zone,
1376 vm_tag_t tag __unused,
1377 boolean_t* check_poison)
1378 {
1379 vm_offset_t element;
1380 struct zone_page_metadata *page_meta;
1381
1382 *check_poison = FALSE;
1383
1384 /* if zone is empty, bail */
1385 if (zone->allows_foreign && !queue_empty(&zone->pages.any_free_foreign)) {
1386 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
1387 } else if (!queue_empty(&zone->pages.intermediate)) {
1388 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
1389 } else if (!queue_empty(&zone->pages.all_free)) {
1390 page_meta = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
1391 assert(zone->count_all_free_pages >= page_meta->page_count);
1392 zone->count_all_free_pages -= page_meta->page_count;
1393 } else {
1394 return 0;
1395 }
1396 	/* Check if page_meta passes is_sane_zone_page_metadata */
1397 if (__improbable(!is_sane_zone_page_metadata(zone, (vm_offset_t)page_meta))) {
1398 panic("zalloc: invalid metadata structure %p for freelist of zone %s\n",
1399 (void *) page_meta, zone->zone_name);
1400 }
1401 assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
1402 element = (vm_offset_t)page_metadata_get_freelist(page_meta);
1403
1404 if (__improbable(!is_sane_zone_ptr(zone, element, zone->elem_size))) {
1405 panic("zfree: invalid head pointer %p for freelist of zone %s\n",
1406 (void *) element, zone->zone_name);
1407 }
1408
1409 vm_offset_t *primary = (vm_offset_t *) element;
1410 vm_offset_t *backup = get_backup_ptr(zone->elem_size, primary);
1411
1412 /*
1413 * Since the primary next pointer is xor'ed with zp_nopoison_cookie
1414 * for obfuscation, retrieve the original value back
1415 */
1416 vm_offset_t next_element = *primary ^ zp_nopoison_cookie;
1417 vm_offset_t next_element_primary = *primary;
1418 vm_offset_t next_element_backup = *backup;
1419
1420 /*
1421 * backup_ptr_mismatch_panic will determine what next_element
1422 * should have been, and print it appropriately
1423 */
1424 if (__improbable(!is_sane_zone_element(zone, next_element))) {
1425 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1426 }
1427
1428 /* Check the backup pointer for the regular cookie */
1429 if (__improbable(next_element != (next_element_backup ^ zp_nopoison_cookie))) {
1430 /* Check for the poisoned cookie instead */
1431 if (__improbable(next_element != (next_element_backup ^ zp_poisoned_cookie))) {
1432 /* Neither cookie is valid, corruption has occurred */
1433 backup_ptr_mismatch_panic(zone, element, next_element_primary, next_element_backup);
1434 }
1435
1436 /*
1437 * Element was marked as poisoned, so check its integrity before using it.
1438 */
1439 *check_poison = TRUE;
1440 }
1441
1442 /* Make sure the page_meta is at the correct offset from the start of page */
1443 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)element, FALSE))) {
1444 panic("zalloc: Incorrect metadata %p found in zone %s page queue. Expected metadata: %p\n",
1445 page_meta, zone->zone_name, get_zone_page_metadata((struct zone_free_element *)element, FALSE));
1446 }
1447
1448 /* Make sure next_element belongs to the same page as page_meta */
1449 if (next_element) {
1450 if (__improbable(page_meta != get_zone_page_metadata((struct zone_free_element *)next_element, FALSE))) {
1451 panic("zalloc: next element pointer %p for element %p points to invalid element for zone %s\n",
1452 (void *)next_element, (void *)element, zone->zone_name);
1453 }
1454 }
1455
1456 /* Remove this element from the free list */
1457 page_metadata_set_freelist(page_meta, (struct zone_free_element *)next_element);
1458 page_meta->free_count--;
1459
1460 if (page_meta->free_count == 0) {
1461 /* move to all used */
1462 re_queue_tail(&zone->pages.all_used, &(page_meta->pages));
1463 } else {
1464 if (!zone->allows_foreign || from_zone_map(element, zone->elem_size)) {
1465 if (get_metadata_alloc_count(page_meta) == page_meta->free_count + 1) {
1466 /* remove from free, move to intermediate */
1467 re_queue_tail(&zone->pages.intermediate, &(page_meta->pages));
1468 }
1469 }
1470 }
1471 zone->countfree--;
1472 zone->count++;
1473 zone->sum_count++;
1474
1475 #if VM_MAX_TAG_ZONES
1476 if (__improbable(zone->tags)) {
1477 		// set the tag with b0 clear so the block remains in use
1478 ZTAG(zone, element)[0] = (tag << 1);
1479 }
1480 #endif /* VM_MAX_TAG_ZONES */
1481
1482
1483 #if KASAN_ZALLOC
1484 kasan_poison_range(element, zone->elem_size, ASAN_VALID);
1485 #endif
1486
1487 return element;
1488 }
1489
1490 /*
1491 * End of zone poisoning
1492 */
1493
1494 /*
1495 * Zone info options
1496 */
1497 #define ZINFO_SLOTS MAX_ZONES /* for now */
1498
1499 zone_t zone_find_largest(void);
1500
1501 /*
1502 * Async allocation of zones
1503 * This mechanism allows for bootstrapping an empty zone which is set up with
1504 * non-blocking flags. The first call to zalloc_noblock() will kick off a thread_call
1505 * to zalloc_async. We perform a zalloc() (which may block) and then an immediate free.
1506 * This will prime the zone for the next use.
1507 *
1508 * Currently the thread_callout function (zalloc_async) will loop through all zones
1509 * looking for any zone with async_pending set and do the work for it.
1510 *
1511 * NOTE: If the calling thread for zalloc_noblock is lower priority than thread_call,
1512 * then zalloc_noblock to an empty zone may succeed.
1513 */
1514 void zalloc_async(
1515 thread_call_param_t p0,
1516 thread_call_param_t p1);
1517
1518 static thread_call_data_t call_async_alloc;
1519
1520 /*
1521 * Align elements that use the zone page list to 32 byte boundaries.
1522 */
1523 #define ZONE_ELEMENT_ALIGNMENT 32
1524
1525 #define zone_wakeup(zone) thread_wakeup((event_t)(zone))
1526 #define zone_sleep(zone) \
1527 (void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN_ALWAYS, (event_t)(zone), THREAD_UNINT);
1528
1529
1530 #define lock_zone_init(zone) \
1531 MACRO_BEGIN \
1532 lck_attr_setdefault(&(zone)->lock_attr); \
1533 lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext, \
1534 &zone_locks_grp, &(zone)->lock_attr); \
1535 MACRO_END
1536
1537 #define lock_try_zone(zone) lck_mtx_try_lock_spin(&zone->lock)
1538
1539 /*
1540 * Exclude more than one concurrent garbage collection
1541 */
1542 decl_lck_mtx_data(, zone_gc_lock);
1543
1544 lck_attr_t zone_gc_lck_attr;
1545 lck_grp_t zone_gc_lck_grp;
1546 lck_grp_attr_t zone_gc_lck_grp_attr;
1547 lck_mtx_ext_t zone_gc_lck_ext;
1548
1549 boolean_t zone_gc_allowed = TRUE;
1550 boolean_t panic_include_zprint = FALSE;
1551
1552 mach_memory_info_t *panic_kext_memory_info = NULL;
1553 vm_size_t panic_kext_memory_size = 0;
1554
1555 #define ZALLOC_DEBUG_ZONEGC 0x00000001
1556 #define ZALLOC_DEBUG_ZCRAM 0x00000002
1557
1558 #if DEBUG || DEVELOPMENT
1559 static uint32_t zalloc_debug = 0;
1560 #endif
1561
1562 /*
1563 * Zone leak debugging code
1564 *
1565 * When enabled, this code keeps a log to track allocations to a particular zone that have not
1566 * yet been freed. Examining this log will reveal the source of a zone leak. The log is allocated
1567 * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is
1568 * off by default.
1569 *
1570 * Enable the logging via the boot-args. Add the parameter "zlog=<zone>" to boot-args where <zone>
1571 * is the name of the zone you wish to log.
1572 *
1573 * This code only tracks one zone, so you need to identify which one is leaking first.
1574 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
1575 * garbage collector. Note that the zone name printed in the panic message is not necessarily the one
1576 * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This
1577 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The
1578 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
1579 * See the help in the kgmacros for usage info.
1580 *
1581 *
1582 * Zone corruption logging
1583 *
1584 * Logging can also be used to help identify the source of a zone corruption. First, identify the zone
1585 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args. When -zc is used in conjunction
1586 * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the
1587 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
1588 * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been
1589 * corrupted to examine its history. This should lead to the source of the corruption.
1590 */
1591
1592 static boolean_t log_records_init = FALSE;
1593 static int log_records; /* size of the log, expressed in number of records */
1594
1595 #define MAX_NUM_ZONES_ALLOWED_LOGGING 10 /* Maximum 10 zones can be logged at once */
1596
1597 static int max_num_zones_to_log = MAX_NUM_ZONES_ALLOWED_LOGGING;
1598 static int num_zones_logged = 0;
1599
1600 static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */
1601
1602 /* Log allocations and frees to help debug a zone element corruption */
1603 boolean_t corruption_debug_flag = DEBUG; /* enabled by "-zc" boot-arg */
1604 /* Make pointer-scanning leak detection possible for all zones */
1605
1606 #if DEBUG || DEVELOPMENT
1607 boolean_t leak_scan_debug_flag = FALSE; /* enabled by "-zl" boot-arg */
1608 #endif /* DEBUG || DEVELOPMENT */
1609
1610
1611 /*
1612 * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to
1613 * the number of records you want in the log. For example, "zrecs=10" sets it to 10 records. Since this
1614 * is the number of stacks suspected of leaking, we don't need many records.
1615 */
1616
1617 #if defined(__LP64__)
1618 #define ZRECORDS_MAX 2560 /* Max records allowed in the log */
1619 #else
1620 #define ZRECORDS_MAX 1536 /* Max records allowed in the log */
1621 #endif
1622 #define ZRECORDS_DEFAULT 1024 /* default records in log if zrecs is not specified in boot-args */
1623
1624 /*
1625 * Each record in the log contains a pointer to the zone element it refers to,
1626 * and a small array to hold the pc's from the stack trace. A
1627 * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging,
1628 * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees.
1629 * If the log fills, old records are replaced as if it were a circular buffer.
1630 */
1631
1632
1633 /*
1634 * Decide if we want to log this zone by doing a string compare between a zone name and the name
1635 * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not
1636 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
1637 * match a space in the zone name.
1638 */
1639
1640 int
1641 track_this_zone(const char *zonename, const char *logname)
1642 {
1643 unsigned int len;
1644 const char *zc = zonename;
1645 const char *lc = logname;
1646
1647 /*
1648 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
1649 */
1650
1651 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
1652 /*
1653 * If the current characters don't match, check for a space
1654 * in the zone name and a corresponding period in the log name.
1655 * If that's not there, then the strings don't match.
1656 */
1657
1658 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
1659 break;
1660 }
1661
1662 /*
1663 * The strings are equal so far. If we're at the end, then it's a match.
1664 */
1665
1666 if (*zc == '\0') {
1667 return TRUE;
1668 }
1669 }
1670
1671 return FALSE;
1672 }
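/*
 * For example, since boot-args cannot contain spaces, "zlog=vm.objects"
 * selects the zone named "vm objects":
 *
 *	track_this_zone("vm objects", "vm.objects")	returns TRUE
 *	track_this_zone("vm objects", "vm_objects")	returns FALSE
 */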
1673
1674
1675 /*
1676 * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and
1677 * the buffer for the records has been allocated.
1678 */
1679
1680 #define DO_LOGGING(z) (z->zone_logging == TRUE && z->zlog_btlog)
1681
1682 extern boolean_t kmem_alloc_ready;
1683
1684 #if CONFIG_ZLEAKS
1685 #pragma mark -
1686 #pragma mark Zone Leak Detection
1687
1688 /*
1689 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
1690 * allocations made by the zone allocator. Every zleak_sample_factor allocations in each zone, we capture a
1691 * backtrace. On every free, we examine the table and, if the allocation was being tracked,
1692 * stop tracking it.
1693 *
1694 * We track the allocations in the zallocations hash table, which stores the address that was returned from
1695 * the zone allocator. Each stored entry in the zallocations table points to an entry in the ztraces table, which
1696 * stores the backtrace associated with that allocation. This provides uniquing for the relatively large
1697 * backtraces - we don't store them more than once.
1698 *
1699 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
1700 * a large amount of virtual space.
1701 */
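/*
 * Illustrative sketch of the sampling step: with the default zleak_sample_factor
 * of 1000, roughly one allocation in a thousand per zone has its backtrace
 * recorded, along the lines of:
 *
 *   if (zone->zleak_on &&
 *       sample_counter(&zone->zleak_capture, zleak_sample_factor)) {
 *           // capture a backtrace and hand it to zleak_log()
 *   }
 *
 * The real check lives in zalloc_internal() further down in this file.
 */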
1702 #define ZLEAK_STATE_ENABLED 0x01 /* Zone leak monitoring should be turned on if zone_map fills up. */
1703 #define ZLEAK_STATE_ACTIVE 0x02 /* We are actively collecting traces. */
1704 #define ZLEAK_STATE_ACTIVATING 0x04 /* Some thread is doing setup; others should move along. */
1705 #define ZLEAK_STATE_FAILED 0x08 /* Attempt to allocate tables failed. We will not try again. */
1706 uint32_t zleak_state = 0; /* State of collection, as above */
1707
1708 boolean_t panic_include_ztrace = FALSE; /* Enable zleak logging on panic */
1709 vm_size_t zleak_global_tracking_threshold; /* Size of zone map at which to start collecting data */
1710 vm_size_t zleak_per_zone_tracking_threshold; /* Size a zone will have before we will collect data on it */
1711 unsigned int zleak_sample_factor = 1000; /* Allocations per sample attempt */
1712
1713 /*
1714 * Counters for allocation statistics.
1715 */
1716
1717 /* Times two active records want to occupy the same spot */
1718 unsigned int z_alloc_collisions = 0;
1719 unsigned int z_trace_collisions = 0;
1720
1721 /* Times a new record lands on a spot previously occupied by a freed allocation */
1722 unsigned int z_alloc_overwrites = 0;
1723 unsigned int z_trace_overwrites = 0;
1724
1725 /* Times a new alloc or trace is put into the hash table */
1726 unsigned int z_alloc_recorded = 0;
1727 unsigned int z_trace_recorded = 0;
1728
1729 /* Times zleak_log returned false due to not being able to acquire the lock */
1730 unsigned int z_total_conflicts = 0;
1731
1732
1733 #pragma mark struct zallocation
1734 /*
1735 * Structure for keeping track of an allocation
1736 * An allocation bucket is in use if its element is not NULL
1737 */
1738 struct zallocation {
1739 uintptr_t za_element; /* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
1740 vm_size_t za_size; /* how much memory did this allocation take up? */
1741 uint32_t za_trace_index; /* index into ztraces for backtrace associated with allocation */
1742 /* TODO: #if this out */
1743 uint32_t za_hit_count; /* for determining effectiveness of hash function */
1744 };
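/*
 * For reference, a sketch of the companion trace bucket, inferred from the
 * fields used below (the actual definition lives in the zalloc header, not here):
 *
 *   struct ztrace {
 *       vm_size_t  zt_size;         // total bytes of live allocations pointing at this trace
 *       uint32_t   zt_depth;        // number of valid frames in zt_stack
 *       uintptr_t  zt_stack[MAX_ZTRACE_DEPTH];  // captured return addresses
 *       uint32_t   zt_collisions;   // times a different trace hashed to this occupied bucket
 *       uint32_t   zt_hit_count;    // for judging hash distribution, like za_hit_count
 *   };
 */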
1745
1746 /* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
1747 uint32_t zleak_alloc_buckets = CONFIG_ZLEAK_ALLOCATION_MAP_NUM;
1748 uint32_t zleak_trace_buckets = CONFIG_ZLEAK_TRACE_MAP_NUM;
1749
1750 vm_size_t zleak_max_zonemap_size;
1751
1752 /* Hashmaps of allocations and their corresponding traces */
1753 static struct zallocation* zallocations;
1754 static struct ztrace* ztraces;
1755
1756 /* not static so that panic can see this, see kern/debug.c */
1757 struct ztrace* top_ztrace;
1758
1759 /* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
1760 static lck_spin_t zleak_lock;
1761 static lck_attr_t zleak_lock_attr;
1762 static lck_grp_t zleak_lock_grp;
1763 static lck_grp_attr_t zleak_lock_grp_attr;
1764
1765 /*
1766 * Initializes the zone leak monitor. Called from zone_init()
1767 */
1768 static void
1769 zleak_init(vm_size_t max_zonemap_size)
1770 {
1771 char scratch_buf[16];
1772 boolean_t zleak_enable_flag = FALSE;
1773
1774 zleak_max_zonemap_size = max_zonemap_size;
1775 zleak_global_tracking_threshold = max_zonemap_size / 2;
1776 zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;
1777
1778 #if CONFIG_EMBEDDED
1779 if (PE_parse_boot_argn("-zleakon", scratch_buf, sizeof(scratch_buf))) {
1780 zleak_enable_flag = TRUE;
1781 printf("zone leak detection enabled\n");
1782 } else {
1783 zleak_enable_flag = FALSE;
1784 printf("zone leak detection disabled\n");
1785 }
1786 #else /* CONFIG_EMBEDDED */
1787 /* -zleakoff (flag to disable zone leak monitor) */
1788 if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
1789 zleak_enable_flag = FALSE;
1790 printf("zone leak detection disabled\n");
1791 } else {
1792 zleak_enable_flag = TRUE;
1793 printf("zone leak detection enabled\n");
1794 }
1795 #endif /* CONFIG_EMBEDDED */
1796
1797 /* zfactor=XXXX (override how often to sample the zone allocator) */
1798 if (PE_parse_boot_argn("zfactor", &zleak_sample_factor, sizeof(zleak_sample_factor))) {
1799 printf("Zone leak factor override: %u\n", zleak_sample_factor);
1800 }
1801
1802 /* zleak-allocs=XXXX (override number of buckets in zallocations) */
1803 if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
1804 printf("Zone leak alloc buckets override: %u\n", zleak_alloc_buckets);
1805 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1806 if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets - 1))) {
1807 printf("Override isn't a power of two, bad things might happen!\n");
1808 }
1809 }
1810
1811 /* zleak-traces=XXXX (override number of buckets in ztraces) */
1812 if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
1813 printf("Zone leak trace buckets override: %u\n", zleak_trace_buckets);
1814 /* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
1815 if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets - 1))) {
1816 printf("Override isn't a power of two, bad things might happen!\n");
1817 }
1818 }
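/*
 * Worked example of the power-of-two check used above (illustrative only):
 *   0x01000 & (0x01000 - 1) == 0x01000 & 0x00FFF == 0       -> power of two, OK
 *   0x01800 & (0x01800 - 1) == 0x01800 & 0x017FF == 0x1000  -> not a power of two
 */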
1819
1820 /* allocate the zleak_lock */
1821 lck_grp_attr_setdefault(&zleak_lock_grp_attr);
1822 lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
1823 lck_attr_setdefault(&zleak_lock_attr);
1824 lck_spin_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);
1825
1826 if (zleak_enable_flag) {
1827 zleak_state = ZLEAK_STATE_ENABLED;
1828 }
1829 }
1830
1831 #if CONFIG_ZLEAKS
1832
1833 /*
1834 * Support for kern.zleak.active sysctl - a simplified
1835 * version of the zleak_state variable.
1836 */
1837 int
1838 get_zleak_state(void)
1839 {
1840 if (zleak_state & ZLEAK_STATE_FAILED) {
1841 return -1;
1842 }
1843 if (zleak_state & ZLEAK_STATE_ACTIVE) {
1844 return 1;
1845 }
1846 return 0;
1847 }
1848
1849 #endif
1850
1851
1852 kern_return_t
1853 zleak_activate(void)
1854 {
1855 kern_return_t retval;
1856 vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
1857 vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
1858 void *allocations_ptr = NULL;
1859 void *traces_ptr = NULL;
1860
1861 /* Only one thread attempts to activate at a time */
1862 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1863 return KERN_SUCCESS;
1864 }
1865
1866 /* Indicate that we're doing the setup */
1867 lck_spin_lock(&zleak_lock);
1868 if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
1869 lck_spin_unlock(&zleak_lock);
1870 return KERN_SUCCESS;
1871 }
1872
1873 zleak_state |= ZLEAK_STATE_ACTIVATING;
1874 lck_spin_unlock(&zleak_lock);
1875
1876 /* Allocate and zero tables */
1877 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size, VM_KERN_MEMORY_OSFMK);
1878 if (retval != KERN_SUCCESS) {
1879 goto fail;
1880 }
1881
1882 retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size, VM_KERN_MEMORY_OSFMK);
1883 if (retval != KERN_SUCCESS) {
1884 goto fail;
1885 }
1886
1887 bzero(allocations_ptr, z_alloc_size);
1888 bzero(traces_ptr, z_trace_size);
1889
1890 /* Everything's set. Install tables, mark active. */
1891 zallocations = allocations_ptr;
1892 ztraces = traces_ptr;
1893
1894 /*
1895 * Initialize the top_ztrace to the first entry in ztraces,
1896 * so we don't have to check for null in zleak_log
1897 */
1898 top_ztrace = &ztraces[0];
1899
1900 /*
1901 * Note that we do need a barrier between installing
1902 * the tables and setting the active flag, because the zfree()
1903 * path accesses the table without a lock if we're active.
1904 */
1905 lck_spin_lock(&zleak_lock);
1906 zleak_state |= ZLEAK_STATE_ACTIVE;
1907 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1908 lck_spin_unlock(&zleak_lock);
1909
1910 return KERN_SUCCESS;
1911
1912 fail:
1913 /*
1914 * If we fail to allocate memory, don't further tax
1915 * the system by trying again.
1916 */
1917 lck_spin_lock(&zleak_lock);
1918 zleak_state |= ZLEAK_STATE_FAILED;
1919 zleak_state &= ~ZLEAK_STATE_ACTIVATING;
1920 lck_spin_unlock(&zleak_lock);
1921
1922 if (allocations_ptr != NULL) {
1923 kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
1924 }
1925
1926 if (traces_ptr != NULL) {
1927 kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
1928 }
1929
1930 return retval;
1931 }
1932
1933 /*
1934 * TODO: What about allocations that never get deallocated,
1935 * especially ones with unique backtraces? Should we wait to record
1936 * until after boot has completed?
1937 * (How many persistent zallocs are there?)
1938 */
1939
1940 /*
1941 * This function records the allocation in the allocations table,
1942 * and stores the associated backtrace in the traces table
1943 * (or just increments the trace's refcount if that trace is already recorded).
1944 * If the allocation slot is already in use, the old allocation is replaced with the new one, and
1945 * the old allocation's trace has its refcount decremented.
1946 * If the trace slot is occupied by a different backtrace, the function bails without recording.
1947 * The refcount is weighted: it is incremented by the number of bytes the allocation consumes.
1948 * The return value tells the caller whether to try again next time; FALSE means the lock was busy.
1949 */
1950 static boolean_t
1951 zleak_log(uintptr_t* bt,
1952 uintptr_t addr,
1953 uint32_t depth,
1954 vm_size_t allocation_size)
1955 {
1956 /* Quit if there's someone else modifying the hash tables */
1957 if (!lck_spin_try_lock(&zleak_lock)) {
1958 z_total_conflicts++;
1959 return FALSE;
1960 }
1961
1962 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
1963
1964 uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
1965 struct ztrace* trace = &ztraces[trace_index];
1966
1967 allocation->za_hit_count++;
1968 trace->zt_hit_count++;
1969
1970 /*
1971 * If the allocation bucket we want to be in is occupied, and if the occupier
1972 * has the same trace as us, just bail.
1973 */
1974 if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
1975 z_alloc_collisions++;
1976
1977 lck_spin_unlock(&zleak_lock);
1978 return TRUE;
1979 }
1980
1981 /* STEP 1: Store the backtrace in the traces array. */
1982 /* A size of zero indicates that the trace bucket is free. */
1983
1984 if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0) {
1985 /*
1986 * Different unique trace with same hash!
1987 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
1988 * and get out of the way for later chances
1989 */
1990 trace->zt_collisions++;
1991 z_trace_collisions++;
1992
1993 lck_spin_unlock(&zleak_lock);
1994 return TRUE;
1995 } else if (trace->zt_size > 0) {
1996 /* Same trace, already added, so increment refcount */
1997 trace->zt_size += allocation_size;
1998 } else {
1999 /* Found an unused trace bucket, record the trace here! */
2000 if (trace->zt_depth != 0) { /* if this slot was previously used but not currently in use */
2001 z_trace_overwrites++;
2002 }
2003
2004 z_trace_recorded++;
2005 trace->zt_size = allocation_size;
2006 memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)));
2007
2008 trace->zt_depth = depth;
2009 trace->zt_collisions = 0;
2010 }
2011
2012 /* STEP 2: Store the allocation record in the allocations array. */
2013
2014 if (allocation->za_element != (uintptr_t) 0) {
2015 /*
2016 * Straight up replace any allocation record that was there. We don't want to do the work
2017 * to preserve the allocation entries that were there, because we only record a subset of the
2018 * allocations anyway.
2019 */
2020
2021 z_alloc_collisions++;
2022
2023 struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
2024 /* Knock off old allocation's size, not the new allocation */
2025 associated_trace->zt_size -= allocation->za_size;
2026 } else if (allocation->za_trace_index != 0) {
2027 /* Slot previously used but not currently in use */
2028 z_alloc_overwrites++;
2029 }
2030
2031 allocation->za_element = addr;
2032 allocation->za_trace_index = trace_index;
2033 allocation->za_size = allocation_size;
2034
2035 z_alloc_recorded++;
2036
2037 if (top_ztrace->zt_size < trace->zt_size) {
2038 top_ztrace = trace;
2039 }
2040
2041 lck_spin_unlock(&zleak_lock);
2042 return TRUE;
2043 }
2044
2045 /*
2046 * Free the allocation record and release the stacktrace.
2047 * This should be as fast as possible because it will be called for every free.
2048 */
2049 static void
2050 zleak_free(uintptr_t addr,
2051 vm_size_t allocation_size)
2052 {
2053 if (addr == (uintptr_t) 0) {
2054 return;
2055 }
2056
2057 struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];
2058
2059 /* Double-checked locking: check to find out if we're interested, lock, check to make
2060 * sure it hasn't changed, then modify it, and release the lock.
2061 */
2062
2063 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2064 /* if the allocation was the one, grab the lock, check again, then delete it */
2065 lck_spin_lock(&zleak_lock);
2066
2067 if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
2068 struct ztrace *trace;
2069
2070 /* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
2071 if (allocation->za_size != allocation_size) {
2072 panic("Freeing as size %lu memory that was allocated with size %lu\n",
2073 (uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
2074 }
2075
2076 trace = &ztraces[allocation->za_trace_index];
2077
2078 /* size of 0 indicates trace bucket is unused */
2079 if (trace->zt_size > 0) {
2080 trace->zt_size -= allocation_size;
2081 }
2082
2083 /* A NULL element means the allocation bucket is unused */
2084 allocation->za_element = 0;
2085 }
2086 lck_spin_unlock(&zleak_lock);
2087 }
2088 }
2089
2090 #endif /* CONFIG_ZLEAKS */
2091
2092 /* These functions live outside of CONFIG_ZLEAKS because they are also used in
2093 * mbuf.c for mbuf leak detection. This is why they lack the z_ prefix.
2094 */
2095
2096 /* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
2097 uintptr_t
2098 hash_mix(uintptr_t x)
2099 {
2100 #ifndef __LP64__
2101 x += ~(x << 15);
2102 x ^= (x >> 10);
2103 x += (x << 3);
2104 x ^= (x >> 6);
2105 x += ~(x << 11);
2106 x ^= (x >> 16);
2107 #else
2108 x += ~(x << 32);
2109 x ^= (x >> 22);
2110 x += ~(x << 13);
2111 x ^= (x >> 8);
2112 x += (x << 3);
2113 x ^= (x >> 15);
2114 x += ~(x << 27);
2115 x ^= (x >> 31);
2116 #endif
2117 return x;
2118 }
2119
2120 uint32_t
2121 hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
2122 {
2123 uintptr_t hash = 0;
2124 uintptr_t mask = max_size - 1;
2125
2126 while (depth) {
2127 hash += bt[--depth];
2128 }
2129
2130 hash = hash_mix(hash) & mask;
2131
2132 assert(hash < max_size);
2133
2134 return (uint32_t) hash;
2135 }
2136
2137 /*
2138 * TODO: Determine how well distributed this is.
2139 * max_size must be a power of 2, e.g. 0x10000, because 0x10000 - 1 is 0x0FFFF, which makes a clean bitmask.
2140 */
2141 uint32_t
2142 hashaddr(uintptr_t pt, uint32_t max_size)
2143 {
2144 uintptr_t hash = 0;
2145 uintptr_t mask = max_size - 1;
2146
2147 hash = hash_mix(pt) & mask;
2148
2149 assert(hash < max_size);
2150
2151 return (uint32_t) hash;
2152 }
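/*
 * Worked example (illustrative value): with max_size == 0x10000 buckets,
 * mask == 0xFFFF, so hashaddr() reduces the mixed pointer to its low 16 bits:
 *
 *   hash_mix(ptr) == 0xFFFFFF8012345678 (made-up value)  ->  bucket 0x5678
 */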
2153
2154 /* End of all leak-detection code */
2155 #pragma mark -
2156
2157 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
2158 #define ZONE_ALLOC_FRAG_PERCENT(alloc_size, ele_size) (((alloc_size % ele_size) * 100) / alloc_size)
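/*
 * Worked example (illustrative, assuming 4 KB pages): for a 1536-byte element,
 *   ZONE_ALLOC_FRAG_PERCENT(4096, 1536)  == (1024 * 100) / 4096  == 25
 *   ZONE_ALLOC_FRAG_PERCENT(8192, 1536)  == ( 512 * 100) / 8192  ==  6
 *   ZONE_ALLOC_FRAG_PERCENT(12288, 1536) == (   0 * 100) / 12288 ==  0
 * so the allocation-size loop in zinit() below settles on a 12 KB chunk.
 */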
2159
2160 /* Used to manage copying in of new zone names */
2161 static vm_offset_t zone_names_start;
2162 static vm_offset_t zone_names_next;
2163
2164 static vm_size_t
2165 compute_element_size(vm_size_t requested_size)
2166 {
2167 vm_size_t element_size = requested_size;
2168
2169 /* Zone elements must fit both a next pointer and a backup pointer */
2170 vm_size_t minimum_element_size = sizeof(vm_offset_t) * 2;
2171 if (element_size < minimum_element_size) {
2172 element_size = minimum_element_size;
2173 }
2174
2175 /*
2176 * Round element size to a multiple of sizeof(pointer)
2177 * This also enforces that allocations will be aligned on pointer boundaries
2178 */
2179 element_size = ((element_size - 1) + sizeof(vm_offset_t)) -
2180 ((element_size - 1) % sizeof(vm_offset_t));
2181
2182 return element_size;
2183 }
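/*
 * Worked examples (illustrative, assuming LP64 where sizeof(vm_offset_t) == 8):
 *   compute_element_size(10) -> 16   (bumped up to hold the two freelist pointers)
 *   compute_element_size(20) -> 24   (rounded up to a multiple of 8)
 *   compute_element_size(64) -> 64   (already pointer-aligned)
 */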
2184
2185 #if KASAN_ZALLOC
2186
2187 /*
2188 * Called from zinit().
2189 *
2190 * Fixes up the zone's element size to incorporate the redzones.
2191 */
2192 static void
2193 kasan_update_element_size_for_redzone(
2194 zone_t zone, /* the zone that needs to be updated */
2195 vm_size_t *size, /* requested zone element size */
2196 vm_size_t *max, /* maximum memory to use */
2197 const char *name) /* zone name */
2198 {
2199 /* Expand the zone allocation size to include the redzones. For page-multiple
2200 * zones add a full guard page because they likely require alignment. kalloc
2201 * and fakestack handle their own KASan state, so ignore those zones. */
2202 /* XXX: remove this when zinit_with_options() is a thing */
2203 const char *kalloc_name = "kalloc.";
2204 const char *fakestack_name = "fakestack.";
2205 if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) {
2206 zone->kasan_redzone = 0;
2207 } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) {
2208 zone->kasan_redzone = 0;
2209 } else {
2210 if ((*size % PAGE_SIZE) != 0) {
2211 zone->kasan_redzone = KASAN_GUARD_SIZE;
2212 } else {
2213 zone->kasan_redzone = PAGE_SIZE;
2214 }
2215 *max = (*max / *size) * (*size + zone->kasan_redzone * 2);
2216 *size += zone->kasan_redzone * 2;
2217 }
2218 }
2219
2220 /*
2221 * Called from zalloc_internal() to fix up the address of the newly
2222 * allocated element.
2223 *
2224 * Returns the element address skipping over the redzone on the left.
2225 */
2226 static vm_offset_t
2227 kasan_fixup_allocated_element_address(
2228 zone_t zone, /* the zone the element belongs to */
2229 vm_offset_t addr) /* address of the element, including the redzone */
2230 {
2231 /* Fixup the return address to skip the redzone */
2232 if (zone->kasan_redzone) {
2233 addr = kasan_alloc(addr, zone->elem_size,
2234 zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
2235 }
2236 return addr;
2237 }
2238
2239 /*
2240 * Called from zfree() to add the element being freed to the KASan quarantine.
2241 *
2242 * Returns true if the newly-freed element made it into the quarantine without
2243 * displacing another, false otherwise. In the latter case, addrp points to the
2244 * address of the displaced element, which will be freed by the zone.
2245 */
2246 static bool
2247 kasan_quarantine_freed_element(
2248 zone_t *zonep, /* the zone the element is being freed to */
2249 void **addrp) /* address of the element being freed */
2250 {
2251 zone_t zone = *zonep;
2252 void *addr = *addrp;
2253
2254 /*
2255 * Resize back to the real allocation size and hand off to the KASan
2256 * quarantine. `addr` may then point to a different allocation, if the
2257 * current element replaced another in the quarantine. The zone then
2258 * takes ownership of the swapped out free element.
2259 */
2260 vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone;
2261 vm_size_t sz = usersz;
2262
2263 if (addr && zone->kasan_redzone) {
2264 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
2265 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
2266 assert(sz == zone->elem_size);
2267 }
2268 if (addr && zone->kasan_quarantine) {
2269 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
2270 if (!addr) {
2271 return TRUE;
2272 }
2273 }
2274 *addrp = addr;
2275 return FALSE;
2276 }
2277
2278 #endif /* KASAN_ZALLOC */
2279
2280 /*
2281 * zinit initializes a new zone. The zone data structures themselves
2282 * are stored in a zone, which is initially a static structure that
2283 * is initialized by zone_init.
2284 */
2285
2286 zone_t
2287 zinit(
2288 vm_size_t size, /* the size of an element */
2289 vm_size_t max, /* maximum memory to use */
2290 vm_size_t alloc, /* allocation size */
2291 const char *name) /* a name for the zone */
2292 {
2293 zone_t z;
2294
2295 size = compute_element_size(size);
2296
2297 simple_lock(&all_zones_lock, &zone_locks_grp);
2298
2299 assert(num_zones < MAX_ZONES);
2300 assert(num_zones_in_use <= num_zones);
2301
2302 /* If possible, find a previously zdestroy'ed zone in the zone_array that we can reuse instead of initializing a new zone. */
2303 for (int index = bitmap_first(zone_empty_bitmap, MAX_ZONES);
2304 index >= 0 && index < (int)num_zones;
2305 index = bitmap_next(zone_empty_bitmap, index)) {
2306 z = &(zone_array[index]);
2307
2308 /*
2309 * If the zone name and the element size are the same, we can just reuse the old zone struct.
2310 * Otherwise hand out a new zone from the zone_array.
2311 */
2312 if (!strcmp(z->zone_name, name)) {
2313 vm_size_t old_size = z->elem_size;
2314 #if KASAN_ZALLOC
2315 old_size -= z->kasan_redzone * 2;
2316 #endif
2317 if (old_size == size) {
2318 /* Clear the empty bit for this zone, increment num_zones_in_use, and mark the zone as valid again. */
2319 bitmap_clear(zone_empty_bitmap, index);
2320 num_zones_in_use++;
2321 z->zone_valid = TRUE;
2322 z->zone_destruction = FALSE;
2323
2324 /* All other state is already set up since the zone was previously in use. Return early. */
2325 simple_unlock(&all_zones_lock);
2326 return z;
2327 }
2328 }
2329 }
2330
2331 /* If we're here, it means we didn't find a zone above that we could simply reuse. Set up a new zone. */
2332
2333 /* Clear the empty bit for the new zone */
2334 bitmap_clear(zone_empty_bitmap, num_zones);
2335
2336 z = &(zone_array[num_zones]);
2337 z->index = num_zones;
2338
2339 num_zones++;
2340 num_zones_in_use++;
2341
2342 /*
2343 * Initialize the zone lock here before dropping the all_zones_lock. Otherwise we could race with
2344 * zalloc_async() and try to grab the zone lock before it has been initialized, causing a panic.
2345 */
2346 lock_zone_init(z);
2347
2348 simple_unlock(&all_zones_lock);
2349
2350 #if KASAN_ZALLOC
2351 kasan_update_element_size_for_redzone(z, &size, &max, name);
2352 #endif
2353
2354 max = round_page(max);
2355
2356 vm_size_t best_alloc = PAGE_SIZE;
2357
2358 if ((size % PAGE_SIZE) == 0) {
2359 /* zero fragmentation by definition */
2360 best_alloc = size;
2361 } else {
2362 vm_size_t alloc_size;
2363 for (alloc_size = (2 * PAGE_SIZE); alloc_size <= ZONE_MAX_ALLOC_SIZE; alloc_size += PAGE_SIZE) {
2364 if (ZONE_ALLOC_FRAG_PERCENT(alloc_size, size) < ZONE_ALLOC_FRAG_PERCENT(best_alloc, size)) {
2365 best_alloc = alloc_size;
2366 }
2367 }
2368 }
2369
2370 alloc = best_alloc;
2371 if (max && (max < alloc)) {
2372 max = alloc;
2373 }
2374
2375 z->free_elements = NULL;
2376 queue_init(&z->pages.any_free_foreign);
2377 queue_init(&z->pages.all_free);
2378 queue_init(&z->pages.intermediate);
2379 queue_init(&z->pages.all_used);
2380 z->cur_size = 0;
2381 z->page_count = 0;
2382 z->max_size = max;
2383 z->elem_size = size;
2384 z->alloc_size = alloc;
2385 z->count = 0;
2386 z->countfree = 0;
2387 z->count_all_free_pages = 0;
2388 z->sum_count = 0LL;
2389 z->doing_alloc_without_vm_priv = FALSE;
2390 z->doing_alloc_with_vm_priv = FALSE;
2391 z->exhaustible = FALSE;
2392 z->collectable = TRUE;
2393 z->allows_foreign = FALSE;
2394 z->expandable = TRUE;
2395 z->waiting = FALSE;
2396 z->async_pending = FALSE;
2397 z->caller_acct = TRUE;
2398 z->noencrypt = FALSE;
2399 z->no_callout = FALSE;
2400 z->async_prio_refill = FALSE;
2401 z->gzalloc_exempt = FALSE;
2402 z->alignment_required = FALSE;
2403 z->zone_replenishing = FALSE;
2404 z->prio_refill_watermark = 0;
2405 z->zone_replenish_thread = NULL;
2406 z->zp_count = 0;
2407 z->kasan_quarantine = TRUE;
2408 z->zone_valid = TRUE;
2409 z->zone_destruction = FALSE;
2410 z->cpu_cache_enabled = FALSE;
2411 z->clear_memory = FALSE;
2412
2413 #if CONFIG_ZLEAKS
2414 z->zleak_capture = 0;
2415 z->zleak_on = FALSE;
2416 #endif /* CONFIG_ZLEAKS */
2417
2418 /*
2419 * If the VM is ready to handle kmem_alloc requests, copy the zone name passed in.
2420 *
2421 * Else simply maintain a pointer to the name string. The only zones we'll actually have
2422 * to do this for would be the VM-related zones that are created very early on before any
2423 * kexts can be loaded (or unloaded). So we should be fine with just a pointer in this case.
2424 */
2425 if (kmem_alloc_ready) {
2426 size_t len = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
2427
2428 if (zone_names_start == 0 || ((zone_names_next - zone_names_start) + len) > PAGE_SIZE) {
2429 printf("zalloc: allocating memory for zone names buffer\n");
2430 kern_return_t retval = kmem_alloc_kobject(kernel_map, &zone_names_start,
2431 PAGE_SIZE, VM_KERN_MEMORY_OSFMK);
2432 if (retval != KERN_SUCCESS) {
2433 panic("zalloc: zone_names memory allocation failed");
2434 }
2435 bzero((char *)zone_names_start, PAGE_SIZE);
2436 zone_names_next = zone_names_start;
2437 }
2438
2439 strlcpy((char *)zone_names_next, name, len);
2440 z->zone_name = (char *)zone_names_next;
2441 zone_names_next += len;
2442 } else {
2443 z->zone_name = name;
2444 }
2445
2446 /*
2447 * Check for and set up zone leak detection if requested via boot-args. We recognize two
2448 * boot-args:
2449 *
2450 * zlog=<zone_to_log>
2451 * zrecs=<num_records_in_log>
2452 *
2453 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
2454 * control the size of the log. If zrecs is not specified, a default value is used.
2455 */
2456
2457 if (num_zones_logged < max_num_zones_to_log) {
2458 int i = 1; /* zlog0 isn't allowed. */
2459 boolean_t zone_logging_enabled = FALSE;
2460 char zlog_name[MAX_ZONE_NAME] = ""; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
2461
2462 while (i <= max_num_zones_to_log) {
2463 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
2464
2465 if (PE_parse_boot_argn(zlog_name, zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2466 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2467 if (z->zone_valid) {
2468 z->zone_logging = TRUE;
2469 zone_logging_enabled = TRUE;
2470 num_zones_logged++;
2471 break;
2472 }
2473 }
2474 }
2475 i++;
2476 }
2477
2478 if (zone_logging_enabled == FALSE) {
2479 /*
2480 * Backwards compatibility with the old boot-arg used to specify single-zone logging, i.e. zlog.
2481 * Needs to happen after the newer zlogn checks because the prefix will match all the zlogn
2482 * boot-args.
2483 */
2484 if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
2485 if (track_this_zone(z->zone_name, zone_name_to_log)) {
2486 if (z->zone_valid) {
2487 z->zone_logging = TRUE;
2488 zone_logging_enabled = TRUE;
2489 num_zones_logged++;
2490 }
2491 }
2492 }
2493 }
2494
2495 if (log_records_init == FALSE && zone_logging_enabled == TRUE) {
2496 if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {
2497 /*
2498 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
2499 * This prevents accidentally hogging too much kernel memory and making the system
2500 * unusable.
2501 */
2502
2503 log_records = MIN(ZRECORDS_MAX, log_records);
2504 log_records_init = TRUE;
2505 } else {
2506 log_records = ZRECORDS_DEFAULT;
2507 log_records_init = TRUE;
2508 }
2509 }
2510
2511 /*
2512 * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are
2513 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. kmem_alloc_ready is set to
2514 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one
2515 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
2516 * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized
2517 * right now.
2518 */
2519 if (kmem_alloc_ready) {
2520 zone_t curr_zone = NULL;
2521 unsigned int max_zones = 0, zone_idx = 0;
2522
2523 simple_lock(&all_zones_lock, &zone_locks_grp);
2524 max_zones = num_zones;
2525 simple_unlock(&all_zones_lock);
2526
2527 for (zone_idx = 0; zone_idx < max_zones; zone_idx++) {
2528 curr_zone = &(zone_array[zone_idx]);
2529
2530 if (!curr_zone->zone_valid) {
2531 continue;
2532 }
2533
2534 /*
2535 * We work with the zone unlocked here because we could end up needing the zone lock to
2536 * enable logging for this zone e.g. need a VM object to allocate memory to enable logging for the
2537 * VM objects zone.
2538 *
2539 * We don't expect these zones to be needed at this early a time in boot and so take this chance.
2540 */
2541 if (curr_zone->zone_logging && curr_zone->zlog_btlog == NULL) {
2542 curr_zone->zlog_btlog = btlog_create(log_records, MAX_ZTRACE_DEPTH, (corruption_debug_flag == FALSE) /* caller_will_remove_entries_for_element? */);
2543
2544 if (curr_zone->zlog_btlog) {
2545 printf("zone: logging started for zone %s\n", curr_zone->zone_name);
2546 } else {
2547 printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
2548 curr_zone->zone_logging = FALSE;
2549 }
2550 }
2551 }
2552 }
2553 }
2554
2555 #if CONFIG_GZALLOC
2556 gzalloc_zone_init(z);
2557 #endif
2558
2559 #if CONFIG_ZCACHE
2560 /* Check if boot-arg specified it should have a cache */
2561 if (cache_all_zones || track_this_zone(name, cache_zone_name)) {
2562 zone_change(z, Z_CACHING_ENABLED, TRUE);
2563 }
2564 #endif
2565
2566 return z;
2567 }
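/*
 * Minimal usage sketch (hypothetical subsystem, type, and sizes; not part of the
 * original source): a caller sets up a zone once, then obtains elements with
 * zalloc() and releases them with zfree().
 */
#if 0   /* illustration only */
static zone_t widget_zone;                               /* hypothetical */

void
widget_zone_setup(void)
{
	widget_zone = zinit(sizeof(struct widget),       /* element size */
	    8192 * sizeof(struct widget),                /* max memory the zone may use */
	    0,                                           /* requested alloc size; zinit picks its own */
	    "widgets");                                  /* zone name */
}
#endif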
2568 unsigned zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
2569
2570 static void zone_replenish_thread(zone_t);
2571
2572 /* High priority VM privileged thread used to asynchronously refill a designated
2573 * zone, such as the reserved VM map entry zone.
2574 */
2575 __dead2
2576 static void
2577 zone_replenish_thread(zone_t z)
2578 {
2579 vm_size_t free_size;
2580 current_thread()->options |= TH_OPT_VMPRIV;
2581
2582 for (;;) {
2583 lock_zone(z);
2584 assert(z->zone_valid);
2585 z->zone_replenishing = TRUE;
2586 assert(z->prio_refill_watermark != 0);
2587 while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
2588 assert(z->doing_alloc_without_vm_priv == FALSE);
2589 assert(z->doing_alloc_with_vm_priv == FALSE);
2590 assert(z->async_prio_refill == TRUE);
2591
2592 unlock_zone(z);
2593 int zflags = KMA_KOBJECT | KMA_NOPAGEWAIT;
2594 vm_offset_t space, alloc_size;
2595 kern_return_t kr;
2596
2597 if (vm_pool_low()) {
2598 alloc_size = round_page(z->elem_size);
2599 } else {
2600 alloc_size = z->alloc_size;
2601 }
2602
2603 if (z->noencrypt) {
2604 zflags |= KMA_NOENCRYPT;
2605 }
2606
2607 if (z->clear_memory) {
2608 zflags |= KMA_ZERO;
2609 }
2610
2611 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2612 if (is_zone_map_nearing_exhaustion()) {
2613 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2614 }
2615
2616 kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2617
2618 if (kr == KERN_SUCCESS) {
2619 zcram(z, space, alloc_size);
2620 } else if (kr == KERN_RESOURCE_SHORTAGE) {
2621 VM_PAGE_WAIT();
2622 } else if (kr == KERN_NO_SPACE) {
2623 kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2624 if (kr == KERN_SUCCESS) {
2625 zcram(z, space, alloc_size);
2626 } else {
2627 assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
2628 thread_block(THREAD_CONTINUE_NULL);
2629 }
2630 }
2631
2632 lock_zone(z);
2633 assert(z->zone_valid);
2634 zone_replenish_loops++;
2635 }
2636
2637 z->zone_replenishing = FALSE;
2638 /* Signal any potential throttled consumers, terminating
2639 * their timer-bounded waits.
2640 */
2641 thread_wakeup(z);
2642
2643 assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
2644 unlock_zone(z);
2645 thread_block(THREAD_CONTINUE_NULL);
2646 zone_replenish_wakeups++;
2647 }
2648 }
2649
2650 void
2651 zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark)
2652 {
2653 z->prio_refill_watermark = low_water_mark;
2654
2655 z->async_prio_refill = TRUE;
2656 OSMemoryBarrier();
2657 kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);
2658
2659 if (tres != KERN_SUCCESS) {
2660 panic("zone_prio_refill_configure, thread create: 0x%x", tres);
2661 }
2662
2663 thread_deallocate(z->zone_replenish_thread);
2664 }
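/*
 * Usage sketch (the watermark value is hypothetical): this is meant for zones
 * whose elements are themselves needed to satisfy allocations, such as the
 * reserved VM map entry zone mentioned above:
 *
 *   zone_prio_refill_configure(reserved_zone, 6000);
 *
 * After this call, zalloc_internal() below wakes zone_replenish_thread()
 * whenever the zone's free space drops below 6000 elements' worth of memory.
 */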
2665
2666 void
2667 zdestroy(zone_t z)
2668 {
2669 unsigned int zindex;
2670
2671 assert(z != NULL);
2672
2673 lock_zone(z);
2674 assert(z->zone_valid);
2675
2676 /* Assert that the zone does not have any allocations in flight */
2677 assert(z->doing_alloc_without_vm_priv == FALSE);
2678 assert(z->doing_alloc_with_vm_priv == FALSE);
2679 assert(z->async_pending == FALSE);
2680 assert(z->waiting == FALSE);
2681 assert(z->async_prio_refill == FALSE);
2682
2683 #if !KASAN_ZALLOC
2684 /*
2685 * Unset the valid bit. We'll hit an assert failure on further operations on this zone, until zinit() is called again.
2686 * Leave the zone valid for KASan as we will see zfree's on quarantined free elements even after the zone is destroyed.
2687 */
2688 z->zone_valid = FALSE;
2689 #endif
2690 z->zone_destruction = TRUE;
2691 unlock_zone(z);
2692
2693 #if CONFIG_ZCACHE
2694 /* Zone caching is not supported for zones that can be destroyed; panic if it is enabled. */
2695 if (zone_caching_enabled(z)) {
2696 panic("zdestroy: Zone caching enabled for zone %s", z->zone_name);
2697 }
2698 #endif /* CONFIG_ZCACHE */
2699
2700 /* Dump all the free elements */
2701 drop_free_elements(z);
2702
2703 #if CONFIG_GZALLOC
2704 /* If the zone is gzalloc managed dump all the elements in the free cache */
2705 gzalloc_empty_free_cache(z);
2706 #endif
2707
2708 lock_zone(z);
2709
2710 #if !KASAN_ZALLOC
2711 /* Assert that all counts are zero */
2712 assert(z->count == 0);
2713 assert(z->countfree == 0);
2714 assert(z->cur_size == 0);
2715 assert(z->page_count == 0);
2716 assert(z->count_all_free_pages == 0);
2717
2718 /* Assert that all queues except the foreign queue are empty. The zone allocator doesn't know how to free up foreign memory. */
2719 assert(queue_empty(&z->pages.all_used));
2720 assert(queue_empty(&z->pages.intermediate));
2721 assert(queue_empty(&z->pages.all_free));
2722 #endif
2723
2724 zindex = z->index;
2725
2726 unlock_zone(z);
2727
2728 simple_lock(&all_zones_lock, &zone_locks_grp);
2729
2730 assert(!bitmap_test(zone_empty_bitmap, zindex));
2731 /* Mark the zone as empty in the bitmap */
2732 bitmap_set(zone_empty_bitmap, zindex);
2733 num_zones_in_use--;
2734 assert(num_zones_in_use > 0);
2735
2736 simple_unlock(&all_zones_lock);
2737 }
2738
2739 /* Initialize the metadata for an allocation chunk */
2740 static inline void
2741 zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadata *chunk_metadata)
2742 {
2743 struct zone_page_metadata *page_metadata;
2744
2745 /* The first page is the real metadata for this allocation chunk. We mark the others as fake metadata */
2746 size -= PAGE_SIZE;
2747 newmem += PAGE_SIZE;
2748
2749 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2750 page_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2751 assert(page_metadata != chunk_metadata);
2752 PAGE_METADATA_SET_ZINDEX(page_metadata, MULTIPAGE_METADATA_MAGIC);
2753 page_metadata_set_realmeta(page_metadata, chunk_metadata);
2754 page_metadata->free_count = 0;
2755 }
2756 return;
2757 }
2758
2759
2760 static void
2761 random_free_to_zone(
2762 zone_t zone,
2763 vm_offset_t newmem,
2764 vm_offset_t first_element_offset,
2765 int element_count,
2766 unsigned int *entropy_buffer)
2767 {
2768 vm_offset_t last_element_offset;
2769 vm_offset_t element_addr;
2770 vm_size_t elem_size;
2771 int index;
2772
2773 assert(element_count && element_count <= ZONE_CHUNK_MAXELEMENTS);
2774 elem_size = zone->elem_size;
2775 last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size);
2776 for (index = 0; index < element_count; index++) {
2777 assert(first_element_offset <= last_element_offset);
2778 if (
2779 #if DEBUG || DEVELOPMENT
2780 leak_scan_debug_flag || __improbable(zone->tags) ||
2781 #endif /* DEBUG || DEVELOPMENT */
2782 random_bool_gen_bits(&zone_bool_gen, entropy_buffer, MAX_ENTROPY_PER_ZCRAM, 1)) {
2783 element_addr = newmem + first_element_offset;
2784 first_element_offset += elem_size;
2785 } else {
2786 element_addr = newmem + last_element_offset;
2787 last_element_offset -= elem_size;
2788 }
2789 if (element_addr != (vm_offset_t)zone) {
2790 zone->count++; /* compensate for free_to_zone */
2791 free_to_zone(zone, element_addr, FALSE);
2792 }
2793 zone->cur_size += elem_size;
2794 }
2795 }
2796
2797 /*
2798 * Cram the given memory into the specified zone. Update the zone page count accordingly.
2799 */
2800 void
2801 zcram(
2802 zone_t zone,
2803 vm_offset_t newmem,
2804 vm_size_t size)
2805 {
2806 vm_size_t elem_size;
2807 boolean_t from_zm = FALSE;
2808 int element_count;
2809 unsigned int entropy_buffer[MAX_ENTROPY_PER_ZCRAM] = { 0 };
2810
2811 /* Basic sanity checks */
2812 assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
2813 assert(!zone->collectable || zone->allows_foreign
2814 || (from_zone_map(newmem, size)));
2815
2816 elem_size = zone->elem_size;
2817
2818 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_START, zone->index, size);
2819
2820 if (from_zone_map(newmem, size)) {
2821 from_zm = TRUE;
2822 }
2823
2824 if (!from_zm) {
2825 /* We cannot support elements larger than page size for foreign memory because we
2826 * put metadata on the page itself for each page of foreign memory. We need to do
2827 * this in order to be able to reach the metadata when any element is freed
2828 */
2829 assert((zone->allows_foreign == TRUE) && (zone->elem_size <= (PAGE_SIZE - sizeof(struct zone_page_metadata))));
2830 }
2831
2832 #if DEBUG || DEVELOPMENT
2833 if (zalloc_debug & ZALLOC_DEBUG_ZCRAM) {
2834 kprintf("zcram(%p[%s], 0x%lx%s, 0x%lx)\n", zone, zone->zone_name,
2835 (unsigned long)newmem, from_zm ? "" : "[F]", (unsigned long)size);
2836 }
2837 #endif /* DEBUG || DEVELOPMENT */
2838
2839 ZONE_PAGE_COUNT_INCR(zone, (size / PAGE_SIZE));
2840
2841 /*
2842 * Initialize the metadata for all pages. We don't need the zone lock
2843 * here because we are not manipulating any zone related state yet.
2844 */
2845
2846 struct zone_page_metadata *chunk_metadata;
2847 size_t zone_page_metadata_size = sizeof(struct zone_page_metadata);
2848
2849 assert((newmem & PAGE_MASK) == 0);
2850 assert((size & PAGE_MASK) == 0);
2851
2852 chunk_metadata = get_zone_page_metadata((struct zone_free_element *)newmem, TRUE);
2853 chunk_metadata->pages.next = NULL;
2854 chunk_metadata->pages.prev = NULL;
2855 page_metadata_set_freelist(chunk_metadata, 0);
2856 PAGE_METADATA_SET_ZINDEX(chunk_metadata, zone->index);
2857 chunk_metadata->free_count = 0;
2858 assert((size / PAGE_SIZE) <= ZONE_CHUNK_MAXPAGES);
2859 chunk_metadata->page_count = (unsigned)(size / PAGE_SIZE);
2860
2861 zcram_metadata_init(newmem, size, chunk_metadata);
2862
2863 #if VM_MAX_TAG_ZONES
2864 if (__improbable(zone->tags)) {
2865 assert(from_zm);
2866 ztMemoryAdd(zone, newmem, size);
2867 }
2868 #endif /* VM_MAX_TAG_ZONES */
2869
2870 lock_zone(zone);
2871 assert(zone->zone_valid);
2872 enqueue_tail(&zone->pages.all_used, &(chunk_metadata->pages));
2873
2874 if (!from_zm) {
2875 /* For foreign memory, each page carries its own metadata at the start of the page,
2876 * so lay out the elements page by page, with each page's elements starting just past
2877 * the (suitably aligned) per-page metadata.
2878 */
2879
2880 for (; size > 0; newmem += PAGE_SIZE, size -= PAGE_SIZE) {
2881 vm_offset_t first_element_offset = 0;
2882 if (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT == 0) {
2883 first_element_offset = zone_page_metadata_size;
2884 } else {
2885 first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT));
2886 }
2887 element_count = (unsigned int)((PAGE_SIZE - first_element_offset) / elem_size);
2888 random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer);
2889 }
2890 } else {
2891 element_count = (unsigned int)(size / elem_size);
2892 random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer);
2893 }
2894 unlock_zone(zone);
2895
2896 KDBG(MACHDBG_CODE(DBG_MACH_ZALLOC, ZALLOC_ZCRAM) | DBG_FUNC_END, zone->index);
2897 }
2898
2899 /*
2900 * Fill a zone with enough memory to contain at least nelem elements.
2901 * Return the number of elements actually put into the zone, which may
2902 * be more than the caller asked for since the memory allocation is
2903 * rounded up to the next zone allocation size.
2904 */
2905 int
2906 zfill(
2907 zone_t zone,
2908 int nelem)
2909 {
2910 kern_return_t kr;
2911 vm_offset_t memory;
2912
2913 vm_size_t alloc_size = zone->alloc_size;
2914 vm_size_t elem_per_alloc = alloc_size / zone->elem_size;
2915 vm_size_t nalloc = (nelem + elem_per_alloc - 1) / elem_per_alloc;
2916 int zflags = KMA_KOBJECT;
2917
2918 if (zone->clear_memory) {
2919 zflags |= KMA_ZERO;
2920 }
2921
2922 /* Don't mix-and-match zfill with foreign memory */
2923 assert(!zone->allows_foreign);
2924
2925 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
2926 if (is_zone_map_nearing_exhaustion()) {
2927 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2928 }
2929
2930 kr = kernel_memory_allocate(zone_map, &memory, nalloc * alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
2931 if (kr != KERN_SUCCESS) {
2932 printf("%s: kernel_memory_allocate() of %lu bytes failed\n",
2933 __func__, (unsigned long)(nalloc * alloc_size));
2934 return 0;
2935 }
2936
2937 for (vm_size_t i = 0; i < nalloc; i++) {
2938 zcram(zone, memory + i * alloc_size, alloc_size);
2939 }
2940
2941 return (int)(nalloc * elem_per_alloc);
2942 }
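/*
 * Worked example (illustrative sizes): for a zone with elem_size == 256 and
 * alloc_size == 4096, elem_per_alloc == 16. Asking for nelem == 50 gives
 * nalloc == (50 + 15) / 16 == 4 allocation chunks, so zfill() crams 4 * 4096
 * bytes and returns 64 -- more elements than requested, as documented above.
 */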
2943
2944 /*
2945 * Initialize the "zone of zones" which uses fixed memory allocated
2946 * earlier in memory initialization. zone_bootstrap is called
2947 * before zone_init.
2948 */
2949 void
2950 zone_bootstrap(void)
2951 {
2952 char temp_buf[16];
2953
2954 #if DEBUG || DEVELOPMENT
2955 if (!PE_parse_boot_argn("zalloc_debug", &zalloc_debug, sizeof(zalloc_debug))) {
2956 zalloc_debug = 0;
2957 }
2958 #endif /* DEBUG || DEVELOPMENT */
2959
2960 /* Set up zone element poisoning */
2961 zp_init();
2962
2963 random_bool_init(&zone_bool_gen);
2964
2965 /* should zlog log to debug zone corruption instead of leaks? */
2966 if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
2967 corruption_debug_flag = TRUE;
2968 }
2969
2970 #if DEBUG || DEVELOPMENT
2971 /* should perform zone element size checking in copyin/copyout? */
2972 if (PE_parse_boot_argn("-no-copyio-zalloc-check", temp_buf, sizeof(temp_buf))) {
2973 copyio_zalloc_check = FALSE;
2974 }
2975 #if VM_MAX_TAG_ZONES
2976 /* enable tags for zones that ask for it */
2977 if (PE_parse_boot_argn("-zt", temp_buf, sizeof(temp_buf))) {
2978 zone_tagging_on = TRUE;
2979 }
2980 #endif /* VM_MAX_TAG_ZONES */
2981 /* disable element location randomization in a page */
2982 if (PE_parse_boot_argn("-zl", temp_buf, sizeof(temp_buf))) {
2983 leak_scan_debug_flag = TRUE;
2984 }
2985 #endif
2986
2987 simple_lock_init(&all_zones_lock, 0);
2988
2989 num_zones_in_use = 0;
2990 num_zones = 0;
2991 /* Mark all zones as empty */
2992 bitmap_full(zone_empty_bitmap, BITMAP_LEN(MAX_ZONES));
2993 zone_names_next = zone_names_start = 0;
2994
2995 #if DEBUG || DEVELOPMENT
2996 simple_lock_init(&zone_test_lock, 0);
2997 #endif /* DEBUG || DEVELOPMENT */
2998
2999 thread_call_setup(&call_async_alloc, zalloc_async, NULL);
3000
3001 /* initializing global lock group for zones */
3002 lck_grp_attr_setdefault(&zone_locks_grp_attr);
3003 lck_grp_init(&zone_locks_grp, "zone_locks", &zone_locks_grp_attr);
3004
3005 lck_attr_setdefault(&zone_metadata_lock_attr);
3006 lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr);
3007
3008 #if CONFIG_ZCACHE
3009 /* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
3010 if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
3011 printf("zcache: caching enabled for zone %s\n", cache_zone_name);
3012 }
3013
3014 /* -zcache_all: enable per-cpu zone caching for all zones, overrides 'zcc_enable_for_zone_name'. */
3015 if (PE_parse_boot_argn("-zcache_all", temp_buf, sizeof(temp_buf))) {
3016 cache_all_zones = TRUE;
3017 printf("zcache: caching enabled for all zones\n");
3018 }
3019 #endif /* CONFIG_ZCACHE */
3020 }
3021
3022 /*
3023 * We're being very conservative here and picking a value of 95%. We might need to lower this if
3024 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
3025 */
3026 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
3027
3028 /*
3029 * Trigger zone-map-exhaustion jetsams if the zone map is X% full, where X=zone_map_jetsam_limit.
3030 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
3031 */
3032 unsigned int zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
3033
3034 /*
3035 * Returns pid of the task with the largest number of VM map entries.
3036 */
3037 extern pid_t find_largest_process_vm_map_entries(void);
3038
3039 /*
3040 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
3041 * For any other pid we try to kill that process synchronously.
3042 */
3043 boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
3044
3045 void
3046 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
3047 {
3048 *current_size = zone_map->size;
3049 *capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
3050 }
3051
3052 void
3053 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
3054 {
3055 zone_t largest_zone = zone_find_largest();
3056 strlcpy(zone_name, largest_zone->zone_name, zone_name_len);
3057 *zone_size = largest_zone->cur_size;
3058 }
3059
3060 boolean_t
3061 is_zone_map_nearing_exhaustion(void)
3062 {
3063 uint64_t size = zone_map->size;
3064 uint64_t capacity = vm_map_max(zone_map) - vm_map_min(zone_map);
3065 if (size > ((capacity * zone_map_jetsam_limit) / 100)) {
3066 return TRUE;
3067 }
3068 return FALSE;
3069 }
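/*
 * Worked example (illustrative capacity): with a 1 GB zone map and the default
 * zone_map_jetsam_limit of 95, the threshold is (1073741824 * 95) / 100 ==
 * 1020054732 bytes, so is_zone_map_nearing_exhaustion() starts returning TRUE
 * once less than roughly 51 MB of the zone map remains unused.
 */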
3070
3071 extern zone_t vm_map_entry_zone;
3072 extern zone_t vm_object_zone;
3073
3074 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
3075
3076 /*
3077 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
3078 * to walk through the jetsam priority bands and kill processes.
3079 */
3080 static void
3081 kill_process_in_largest_zone(void)
3082 {
3083 pid_t pid = -1;
3084 zone_t largest_zone = zone_find_largest();
3085
3086 printf("zone_map_exhaustion: Zone map size %lld, capacity %lld [jetsam limit %d%%]\n", (uint64_t)zone_map->size,
3087 (uint64_t)(vm_map_max(zone_map) - vm_map_min(zone_map)), zone_map_jetsam_limit);
3088 printf("zone_map_exhaustion: Largest zone %s, size %lu\n", largest_zone->zone_name, (uintptr_t)largest_zone->cur_size);
3089
3090 /*
3091 * We want to make sure we don't call this function from userspace. Otherwise we could end up trying to synchronously kill the process
3092 * whose context we're in, causing the system to hang.
3093 */
3094 assert(current_task() == kernel_task);
3095
3096 /*
3097 * If vm_object_zone is the largest, check to see if the number of elements in vm_map_entry_zone is comparable. If so, consider
3098 * vm_map_entry_zone as the largest. This lets us target a specific process to jetsam to quickly recover from the zone map bloat.
3099 */
3100 if (largest_zone == vm_object_zone) {
3101 unsigned int vm_object_zone_count = vm_object_zone->count;
3102 unsigned int vm_map_entry_zone_count = vm_map_entry_zone->count;
3103 /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
3104 if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
3105 largest_zone = vm_map_entry_zone;
3106 printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n", (uintptr_t)largest_zone->cur_size);
3107 }
3108 }
3109
3110 /* TODO: Extend this to check for the largest process in other zones as well. */
3111 if (largest_zone == vm_map_entry_zone) {
3112 pid = find_largest_process_vm_map_entries();
3113 } else {
3114 printf("zone_map_exhaustion: Nothing to do for the largest zone [%s]. Waking up memorystatus thread.\n", largest_zone->zone_name);
3115 }
3116 if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
3117 printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
3118 }
3119 }
3120
3121 /* Global initialization of Zone Allocator.
3122 * Runs after zone_bootstrap.
3123 */
3124 void
3125 zone_init(
3126 vm_size_t max_zonemap_size)
3127 {
3128 kern_return_t retval;
3129 vm_offset_t zone_min;
3130 vm_offset_t zone_max;
3131 vm_offset_t zone_metadata_space;
3132 unsigned int zone_pages;
3133 vm_map_kernel_flags_t vmk_flags;
3134
3135 #if VM_MAX_TAG_ZONES
3136 if (zone_tagging_on) {
3137 ztInit(max_zonemap_size, &zone_locks_grp);
3138 }
3139 #endif
3140
3141 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
3142 vmk_flags.vmkf_permanent = TRUE;
3143 retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
3144 FALSE, VM_FLAGS_ANYWHERE, vmk_flags, VM_KERN_MEMORY_ZONE,
3145 &zone_map);
3146
3147 if (retval != KERN_SUCCESS) {
3148 panic("zone_init: kmem_suballoc failed");
3149 }
3150 zone_max = zone_min + round_page(max_zonemap_size);
3151
3152 #if CONFIG_GZALLOC
3153 gzalloc_init(max_zonemap_size);
3154 #endif
3155
3156 /*
3157 * Set up garbage collection information:
3158 */
3159 zone_map_min_address = zone_min;
3160 zone_map_max_address = zone_max;
3161
3162 zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
3163 zone_metadata_space = round_page(zone_pages * sizeof(struct zone_page_metadata));
3164 retval = kernel_memory_allocate(zone_map, &zone_metadata_region_min, zone_metadata_space,
3165 0, KMA_KOBJECT | KMA_VAONLY | KMA_PERMANENT, VM_KERN_MEMORY_OSFMK);
3166 if (retval != KERN_SUCCESS) {
3167 panic("zone_init: zone_metadata_region initialization failed!");
3168 }
3169 zone_metadata_region_max = zone_metadata_region_min + zone_metadata_space;
3170
3171 #if defined(__LP64__)
3172 /*
3173 * ensure that any vm_page_t that gets created from
3174 * the vm_page zone can be packed properly (see vm_page.h
3175 * for the packing requirements).
3176 */
3177 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_metadata_region_max))) != (vm_page_t)zone_metadata_region_max) {
3178 panic("VM_PAGE_PACK_PTR failed on zone_metadata_region_max - %p", (void *)zone_metadata_region_max);
3179 }
3180
3181 if ((vm_page_t)(VM_PAGE_UNPACK_PTR(VM_PAGE_PACK_PTR(zone_map_max_address))) != (vm_page_t)zone_map_max_address) {
3182 panic("VM_PAGE_PACK_PTR failed on zone_map_max_address - %p", (void *)zone_map_max_address);
3183 }
3184 #endif
3185
3186 lck_grp_attr_setdefault(&zone_gc_lck_grp_attr);
3187 lck_grp_init(&zone_gc_lck_grp, "zone_gc", &zone_gc_lck_grp_attr);
3188 lck_attr_setdefault(&zone_gc_lck_attr);
3189 lck_mtx_init_ext(&zone_gc_lock, &zone_gc_lck_ext, &zone_gc_lck_grp, &zone_gc_lck_attr);
3190
3191 #if CONFIG_ZLEAKS
3192 /*
3193 * Initialize the zone leak monitor
3194 */
3195 zleak_init(max_zonemap_size);
3196 #endif /* CONFIG_ZLEAKS */
3197
3198 #if VM_MAX_TAG_ZONES
3199 if (zone_tagging_on) {
3200 vm_allocation_zones_init();
3201 }
3202 #endif
3203
3204 int jetsam_limit_temp = 0;
3205 if (PE_parse_boot_argn("zone_map_jetsam_limit", &jetsam_limit_temp, sizeof(jetsam_limit_temp)) &&
3206 jetsam_limit_temp > 0 && jetsam_limit_temp <= 100) {
3207 zone_map_jetsam_limit = jetsam_limit_temp;
3208 }
3209 }
3210
3211 #pragma mark -
3212 #pragma mark zalloc_canblock
3213
3214 extern boolean_t early_boot_complete;
3215
3216 void
3217 zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr)
3218 {
3219 vm_offset_t inner_size = zone->elem_size;
3220 if (__improbable(check_poison && addr)) {
3221 vm_offset_t *element_cursor = ((vm_offset_t *) addr) + 1;
3222 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *) addr);
3223
3224 for (; element_cursor < backup; element_cursor++) {
3225 if (__improbable(*element_cursor != ZP_POISON)) {
3226 zone_element_was_modified_panic(zone,
3227 addr,
3228 *element_cursor,
3229 ZP_POISON,
3230 ((vm_offset_t)element_cursor) - addr);
3231 }
3232 }
3233 }
3234
3235 if (addr) {
3236 /*
3237 * Clear out the old next pointer and backup to avoid leaking the cookie
3238 * and so that only values on the freelist have a valid cookie
3239 */
3240
3241 vm_offset_t *primary = (vm_offset_t *) addr;
3242 vm_offset_t *backup = get_backup_ptr(inner_size, primary);
3243
3244 *primary = ZP_POISON;
3245 *backup = ZP_POISON;
3246 }
3247 }
3248
3249 /*
3250 * When deleting page mappings from the kernel map, it might be necessary to split
3251 * apart an existing vm_map_entry. That means that a "free" operation will need to
3252 * *allocate* new vm_map_entry structures before it can free a page.
3253 *
3254 * This reserve here is the number of elements which are held back from everyone except
3255 * the zone_gc thread. This is done so the zone_gc thread should never have to wait for
3256 * the zone replenish thread for vm_map_entry structs. If it did, it could wind up
3257 * in a deadlock.
3258 */
3259 #define VM_MAP_ENTRY_RESERVE_CNT 8
3260
3261 /*
3262 * zalloc returns an element from the specified zone.
3263 */
3264 static void *
3265 zalloc_internal(
3266 zone_t zone,
3267 boolean_t canblock,
3268 boolean_t nopagewait,
3269 vm_size_t
3270 #if !VM_MAX_TAG_ZONES
3271 __unused
3272 #endif
3273 reqsize,
3274 vm_tag_t tag)
3275 {
3276 vm_offset_t addr = 0;
3277 kern_return_t retval;
3278 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
3279 unsigned int numsaved = 0;
3280 thread_t thr = current_thread();
3281 boolean_t check_poison = FALSE;
3282 boolean_t set_doing_alloc_with_vm_priv = FALSE;
3283
3284 #if CONFIG_ZLEAKS
3285 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */
3286 #endif /* CONFIG_ZLEAKS */
3287
3288 #if KASAN
3289 /*
3290 * KASan uses zalloc() for fakestack, which can be called anywhere. However,
3291 * we make sure these calls can never block.
3292 */
3293 boolean_t irq_safe = FALSE;
3294 const char *fakestack_name = "fakestack.";
3295 if (strncmp(zone->zone_name, fakestack_name, strlen(fakestack_name)) == 0) {
3296 irq_safe = TRUE;
3297 }
3298 #elif MACH_ASSERT
3299 /* In every other case, zalloc() from interrupt context is unsafe. */
3300 const boolean_t irq_safe = FALSE;
3301 #endif
3302
3303 assert(zone != ZONE_NULL);
3304 assert(irq_safe || ml_get_interrupts_enabled() || ml_is_quiescing() || debug_mode_active() || !early_boot_complete);
3305
3306 #if CONFIG_GZALLOC
3307 addr = gzalloc_alloc(zone, canblock);
3308 #endif
3309 /*
3310 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3311 */
3312 if (__improbable(DO_LOGGING(zone))) {
3313 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
3314 }
3315
3316 #if CONFIG_ZLEAKS
3317 /*
3318 * Zone leak detection: capture a backtrace every zleak_sample_factor
3319 * allocations in this zone.
3320 */
3321 if (__improbable(zone->zleak_on && sample_counter(&zone->zleak_capture, zleak_sample_factor) == TRUE)) {
3322 /* Avoid backtracing twice if zone logging is on */
3323 if (numsaved == 0) {
3324 zleak_tracedepth = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
3325 } else {
3326 zleak_tracedepth = numsaved;
3327 }
3328 }
3329 #endif /* CONFIG_ZLEAKS */
3330
3331 #if VM_MAX_TAG_ZONES
3332 if (__improbable(zone->tags)) {
3333 vm_tag_will_update_zone(tag, zone->tag_zone_index);
3334 }
3335 #endif /* VM_MAX_TAG_ZONES */
3336
3337 #if CONFIG_ZCACHE
3338 if (__probable(addr == 0)) {
3339 if (zone_caching_enabled(zone)) {
3340 addr = zcache_alloc_from_cpu_cache(zone);
3341 if (addr) {
3342 #if KASAN_ZALLOC
3343 addr = kasan_fixup_allocated_element_address(zone, addr);
3344 #endif
3345 if (__improbable(DO_LOGGING(zone) && addr)) {
3346 btlog_add_entry(zone->zlog_btlog, (void *)addr,
3347 ZOP_ALLOC, (void **)zbt, numsaved);
3348 }
3349 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
3350 return (void *)addr;
3351 }
3352 }
3353 }
3354 #endif /* CONFIG_ZCACHE */
3355
3356 lock_zone(zone);
3357 assert(zone->zone_valid);
3358
3359 /*
3360 * Check if we need another thread to replenish the zone.
3361 * This is used for elements, like vm_map_entry, which are
3362 * needed themselves to implement zalloc().
3363 */
3364 if (zone->async_prio_refill && zone->zone_replenish_thread) {
3365 vm_size_t curr_free;
3366 vm_size_t refill_level;
3367 const vm_size_t reserved_min = VM_MAP_ENTRY_RESERVE_CNT * zone->elem_size;
3368
3369 for (;;) {
3370 curr_free = (zone->cur_size - (zone->count * zone->elem_size));
3371 refill_level = zone->prio_refill_watermark * zone->elem_size;
3372
3373 /*
3374 * Nothing to do if there are plenty of elements.
3375 */
3376 if (curr_free > refill_level) {
3377 break;
3378 }
3379
3380 /*
3381 * Wake up the replenish thread.
3382 */
3383 zone_replenish_wakeups_initiated++;
3384 thread_wakeup(&zone->zone_replenish_thread);
3385
3386 /*
3387 * If we:
3388 * - still have headroom, more than half the refill amount, or
3389 * - this is a VMPRIV thread and we're still above reserved, or
3390 * - this is the zone garbage collection thread which may use the reserve
3391 * then we don't have to wait for the replenish thread.
3392 *
3393 * The reserve for the garbage collection thread is to avoid a deadlock
3394 * on the zone_map_lock between the replenish thread and GC thread.
3395 */
3396 if (curr_free > refill_level / 2 ||
3397 ((thr->options & TH_OPT_VMPRIV) && curr_free > reserved_min) ||
3398 (thr->options & TH_OPT_ZONE_GC)) {
3399 break;
3400 }
3401 zone_replenish_throttle_count++;
3402 unlock_zone(zone);
3403 assert_wait_timeout(zone, THREAD_UNINT, 1, NSEC_PER_MSEC);
3404 thread_block(THREAD_CONTINUE_NULL);
3405 lock_zone(zone);
3406
3407 assert(zone->zone_valid);
3408 }
3409 }
3410
3411 if (__probable(addr == 0)) {
3412 addr = try_alloc_from_zone(zone, tag, &check_poison);
3413 }
3414
3415 /* If we're here because of zone_gc(), we didn't wait for zone_replenish_thread to finish.
3416 * So we need to ensure that we did successfully grab an element. And we only need to assert
3417 * this for zones that have a replenish thread configured (in this case, the Reserved VM map
3418 * entries zone). The value of reserved_min in the previous bit of code should have given us
3419 * headroom even though the GC thread didn't wait.
3420 */
3421 if ((thr->options & TH_OPT_ZONE_GC) && zone->async_prio_refill) {
3422 assert(addr != 0);
3423 }
3424
3425 while ((addr == 0) && canblock) {
3426 /*
3427 * zone is empty, try to expand it
3428 *
3429 * Note that we now allow up to 2 threads (1 vm_privileged and 1 non-vm_privileged)
3430 * to expand the zone concurrently... this is necessary to keep
3431 * vm_privileged threads running critical code needed to continue compressing/swapping
3432 * pages (i.e. making new free pages) from stalling behind non-vm_privileged threads
3433 * waiting to acquire free pages when the vm_page_free_count is below the
3434 * vm_page_free_reserved limit.
3435 */
3436 if ((zone->doing_alloc_without_vm_priv || zone->doing_alloc_with_vm_priv) &&
3437 (((thr->options & TH_OPT_VMPRIV) == 0) || zone->doing_alloc_with_vm_priv)) {
3438 /*
3439 * This is a non-vm_privileged thread and a non-vm_privileged or
3440 * a vm_privileged thread is already expanding the zone...
3441 * OR
3442 * this is a vm_privileged thread and a vm_privileged thread is
3443 * already expanding the zone...
3444 *
3445 * In either case wait for a thread to finish, then try again.
3446 */
3447 zone->waiting = TRUE;
3448 zone_sleep(zone);
3449 } else {
3450 vm_offset_t space;
3451 vm_size_t alloc_size;
3452 int retry = 0;
3453
3454 if ((zone->cur_size + zone->elem_size) >
3455 zone->max_size) {
3456 if (zone->exhaustible) {
3457 break;
3458 }
3459 if (zone->expandable) {
3460 /*
3461 * We're willing to overflow certain
3462 * zones, but not without complaining.
3463 *
3464 * This is best used in conjunction
3465 * with the collectable flag. What we
3466 * want is an assurance we can get the
3467 * memory back, assuming there's no
3468 * leak.
3469 */
3470 zone->max_size += (zone->max_size >> 1);
3471 } else {
3472 unlock_zone(zone);
3473
3474 panic_include_zprint = TRUE;
3475 #if CONFIG_ZLEAKS
3476 if (zleak_state & ZLEAK_STATE_ACTIVE) {
3477 panic_include_ztrace = TRUE;
3478 }
3479 #endif /* CONFIG_ZLEAKS */
3480 panic("zalloc: zone \"%s\" empty.", zone->zone_name);
3481 }
3482 }
3483 /*
3484 * It is possible that a BG thread is refilling/expanding the zone
3485 * and gets pre-empted during that operation. That blocks all other
3486 * threads from making progress, leading to a watchdog timeout. To
3487 * avoid that, boost the thread priority using the rwlock boost.
3488 */
3489 set_thread_rwlock_boost();
3490
3491 if ((thr->options & TH_OPT_VMPRIV)) {
3492 zone->doing_alloc_with_vm_priv = TRUE;
3493 set_doing_alloc_with_vm_priv = TRUE;
3494 } else {
3495 zone->doing_alloc_without_vm_priv = TRUE;
3496 }
3497 unlock_zone(zone);
3498
3499 for (;;) {
3500 int zflags = KMA_KOBJECT | KMA_NOPAGEWAIT;
3501
3502 if (vm_pool_low() || retry >= 1) {
3503 alloc_size =
3504 round_page(zone->elem_size);
3505 } else {
3506 alloc_size = zone->alloc_size;
3507 }
3508
3509 if (zone->noencrypt) {
3510 zflags |= KMA_NOENCRYPT;
3511 }
3512
3513 if (zone->clear_memory) {
3514 zflags |= KMA_ZERO;
3515 }
3516
3517 /* Trigger jetsams via the vm_pageout_garbage_collect thread if we're running out of zone memory */
3518 if (is_zone_map_nearing_exhaustion()) {
3519 thread_wakeup((event_t) &vm_pageout_garbage_collect);
3520 }
3521
3522 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags, VM_KERN_MEMORY_ZONE);
3523 if (retval == KERN_SUCCESS) {
3524 #if CONFIG_ZLEAKS
3525 if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
3526 if (zone_map->size >= zleak_global_tracking_threshold) {
3527 kern_return_t kr;
3528
3529 kr = zleak_activate();
3530 if (kr != KERN_SUCCESS) {
3531 printf("Failed to activate live zone leak debugging (%d).\n", kr);
3532 }
3533 }
3534 }
3535
3536 if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
3537 if (zone->cur_size > zleak_per_zone_tracking_threshold) {
3538 zone->zleak_on = TRUE;
3539 }
3540 }
3541 #endif /* CONFIG_ZLEAKS */
3542 zcram(zone, space, alloc_size);
3543
3544 break;
3545 } else if (retval != KERN_RESOURCE_SHORTAGE) {
3546 retry++;
3547
3548 if (retry == 3) {
3549 panic_include_zprint = TRUE;
3550 #if CONFIG_ZLEAKS
3551 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
3552 panic_include_ztrace = TRUE;
3553 }
3554 #endif /* CONFIG_ZLEAKS */
3555 if (retval == KERN_NO_SPACE) {
3556 zone_t zone_largest = zone_find_largest();
3557 panic("zalloc: zone map exhausted while allocating from zone %s, likely due to memory leak in zone %s (%lu total bytes, %d elements allocated)",
3558 zone->zone_name, zone_largest->zone_name,
3559 (unsigned long)zone_largest->cur_size, zone_largest->count);
3560 }
3561 panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval);
3562 }
3563 } else {
3564 break;
3565 }
3566 }
3567 lock_zone(zone);
3568 assert(zone->zone_valid);
3569
3570 if (set_doing_alloc_with_vm_priv == TRUE) {
3571 zone->doing_alloc_with_vm_priv = FALSE;
3572 } else {
3573 zone->doing_alloc_without_vm_priv = FALSE;
3574 }
3575
3576 if (zone->waiting) {
3577 zone->waiting = FALSE;
3578 zone_wakeup(zone);
3579 }
3580 clear_thread_rwlock_boost();
3581
3582 addr = try_alloc_from_zone(zone, tag, &check_poison);
3583 if (addr == 0 &&
3584 retval == KERN_RESOURCE_SHORTAGE) {
3585 if (nopagewait == TRUE) {
3586 break; /* out of the main while loop */
3587 }
3588 unlock_zone(zone);
3589
3590 VM_PAGE_WAIT();
3591 lock_zone(zone);
3592 assert(zone->zone_valid);
3593 }
3594 }
3595 if (addr == 0) {
3596 addr = try_alloc_from_zone(zone, tag, &check_poison);
3597 }
3598 }
3599
3600 #if CONFIG_ZLEAKS
3601 /* Zone leak detection:
3602 * If we're sampling this allocation, add it to the zleaks hash table.
3603 */
3604 if (addr && zleak_tracedepth > 0) {
3605 /* Sampling can fail if another sample is happening at the same time in a different zone. */
3606 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
3607 /* If it failed, roll back the counter so we sample the next allocation instead. */
3608 zone->zleak_capture = zleak_sample_factor;
3609 }
3610 }
3611 #endif /* CONFIG_ZLEAKS */
3612
3613
3614 if ((addr == 0) && (!canblock || nopagewait) && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
3615 zone->async_pending = TRUE;
3616 unlock_zone(zone);
3617 thread_call_enter(&call_async_alloc);
3618 lock_zone(zone);
3619 assert(zone->zone_valid);
3620 addr = try_alloc_from_zone(zone, tag, &check_poison);
3621 }
3622
3623 #if VM_MAX_TAG_ZONES
3624 if (__improbable(zone->tags) && addr) {
3625 if (reqsize) {
3626 reqsize = zone->elem_size - reqsize;
3627 }
3628 vm_tag_update_zone_size(tag, zone->tag_zone_index, zone->elem_size, reqsize);
3629 }
3630 #endif /* VM_MAX_TAG_ZONES */
3631
3632 unlock_zone(zone);
3633
3634 if (__improbable(DO_LOGGING(zone) && addr)) {
3635 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
3636 }
3637
3638 zalloc_poison_element(check_poison, zone, addr);
3639
3640 if (addr) {
3641 #if DEBUG || DEVELOPMENT
3642 if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) {
3643 unsigned int count, idx;
3644 /* Fill element, from tail, with backtrace in reverse order */
3645 if (numsaved == 0) {
3646 numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH, NULL);
3647 }
3648 count = (unsigned int)(zone->elem_size / sizeof(uintptr_t));
3649 if (count >= numsaved) {
3650 count = numsaved - 1;
3651 }
3652 for (idx = 0; idx < count; idx++) {
3653 ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
3654 }
3655 }
3656 #endif /* DEBUG || DEVELOPMENT */
3657 }
3658
3659 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
3660
3661
3662 #if KASAN_ZALLOC
3663 addr = kasan_fixup_allocated_element_address(zone, addr);
3664 #endif
3665
3666 DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
3667
3668 return (void *)addr;
3669 }
3670
3671 void *
3672 zalloc(zone_t zone)
3673 {
3674 return zalloc_internal(zone, TRUE, FALSE, 0, VM_KERN_MEMORY_NONE);
3675 }
3676
3677 void *
3678 zalloc_noblock(zone_t zone)
3679 {
3680 return zalloc_internal(zone, FALSE, FALSE, 0, VM_KERN_MEMORY_NONE);
3681 }
3682
3683 void *
3684 zalloc_nopagewait(zone_t zone)
3685 {
3686 return zalloc_internal(zone, TRUE, TRUE, 0, VM_KERN_MEMORY_NONE);
3687 }
3688
3689 void *
3690 zalloc_canblock_tag(zone_t zone, boolean_t canblock, vm_size_t reqsize, vm_tag_t tag)
3691 {
3692 return zalloc_internal(zone, canblock, FALSE, reqsize, tag);
3693 }
3694
3695 void *
3696 zalloc_canblock(zone_t zone, boolean_t canblock)
3697 {
3698 return zalloc_internal(zone, canblock, FALSE, 0, VM_KERN_MEMORY_NONE);
3699 }
3700
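/*
 * Illustrative usage sketch for the wrappers above (hypothetical caller, not
 * part of this file; "my_widget" and "my.widget" are made-up names). It shows
 * a subsystem creating its own zone and choosing between the blocking and
 * non-blocking entry points.
 *
 *	struct my_widget {
 *		uint64_t w_id;
 *		void    *w_data;
 *	};
 *
 *	static zone_t my_widget_zone;
 *
 *	static void
 *	my_widget_init(void)
 *	{
 *		my_widget_zone = zinit(sizeof(struct my_widget),
 *		    1024 * sizeof(struct my_widget),	// max size in bytes
 *		    16 * sizeof(struct my_widget),	// allocation chunk in bytes
 *		    "my.widget");
 *	}
 *
 *	static struct my_widget *
 *	my_widget_create(boolean_t can_sleep)
 *	{
 *		// zalloc() may block to expand the zone; zalloc_noblock() returns
 *		// NULL instead and is appropriate in contexts that cannot sleep.
 *		return can_sleep ? zalloc(my_widget_zone)
 *		    : zalloc_noblock(my_widget_zone);
 *	}
 *
 *	static void
 *	my_widget_destroy(struct my_widget *w)
 *	{
 *		zfree(my_widget_zone, w);
 *	}
 */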
3701 void *
3702 zalloc_attempt(zone_t zone)
3703 {
3704 boolean_t check_poison = FALSE;
3705 vm_offset_t addr = try_alloc_from_zone(zone, VM_KERN_MEMORY_NONE, &check_poison);
3706 zalloc_poison_element(check_poison, zone, addr);
3707 return (void *)addr;
3708 }
3709
3710 void
3711 zfree_direct(zone_t zone, vm_offset_t elem)
3712 {
3713 boolean_t poison = zfree_poison_element(zone, elem);
3714 free_to_zone(zone, elem, poison);
3715 }
3716
3717
3718 void
3719 zalloc_async(
3720 __unused thread_call_param_t p0,
3721 __unused thread_call_param_t p1)
3722 {
3723 zone_t current_z = NULL;
3724 unsigned int max_zones, i;
3725 void *elt = NULL;
3726 boolean_t pending = FALSE;
3727
3728 simple_lock(&all_zones_lock, &zone_locks_grp);
3729 max_zones = num_zones;
3730 simple_unlock(&all_zones_lock);
3731 for (i = 0; i < max_zones; i++) {
3732 current_z = &(zone_array[i]);
3733
3734 if (current_z->no_callout == TRUE) {
3735 /* async_pending will never be set */
3736 continue;
3737 }
3738
3739 lock_zone(current_z);
3740 if (current_z->zone_valid && current_z->async_pending == TRUE) {
3741 current_z->async_pending = FALSE;
3742 pending = TRUE;
3743 }
3744 unlock_zone(current_z);
3745
3746 if (pending == TRUE) {
3747 elt = zalloc_canblock_tag(current_z, TRUE, 0, VM_KERN_MEMORY_OSFMK);
3748 zfree(current_z, elt);
3749 pending = FALSE;
3750 }
3751 }
3752 }
3753
3754 /*
3755 * zget returns an element from the specified zone
3756 * and immediately returns NULL, without blocking, if none is available.
3757 */
3758 void *
3759 zget(
3760 zone_t zone)
3761 {
3762 return zalloc_internal(zone, FALSE, TRUE, 0, VM_KERN_MEMORY_NONE);
3763 }
3764
3765 /* Keep this FALSE by default. Large memory machines run orders of magnitude
3766 * slower in debug mode when true. Use the debugger to enable if needed. */
3767 /* static */ boolean_t zone_check = FALSE;
3768
3769 static void
3770 zone_check_freelist(zone_t zone, vm_offset_t elem)
3771 {
3772 struct zone_free_element *this;
3773 struct zone_page_metadata *thispage;
3774
3775 if (zone->allows_foreign) {
3776 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.any_free_foreign);
3777 !queue_end(&zone->pages.any_free_foreign, &(thispage->pages));
3778 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3779 for (this = page_metadata_get_freelist(thispage);
3780 this != NULL;
3781 this = this->next) {
3782 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3783 panic("zone_check_freelist");
3784 }
3785 }
3786 }
3787 }
3788 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.all_free);
3789 !queue_end(&zone->pages.all_free, &(thispage->pages));
3790 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3791 for (this = page_metadata_get_freelist(thispage);
3792 this != NULL;
3793 this = this->next) {
3794 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3795 panic("zone_check_freelist");
3796 }
3797 }
3798 }
3799 for (thispage = (struct zone_page_metadata *)queue_first(&zone->pages.intermediate);
3800 !queue_end(&zone->pages.intermediate, &(thispage->pages));
3801 thispage = (struct zone_page_metadata *)queue_next(&(thispage->pages))) {
3802 for (this = page_metadata_get_freelist(thispage);
3803 this != NULL;
3804 this = this->next) {
3805 if (!is_sane_zone_element(zone, (vm_address_t)this) || (vm_address_t)this == elem) {
3806 panic("zone_check_freelist");
3807 }
3808 }
3809 }
3810 }
3811
3812 boolean_t
3813 zfree_poison_element(zone_t zone, vm_offset_t elem)
3814 {
3815 boolean_t poison = FALSE;
3816 if (zp_factor != 0 || zp_tiny_zone_limit != 0) {
3817 /*
3818 * Poison the memory before it ends up on the freelist to catch
3819 * use-after-free and use of uninitialized memory
3820 *
3821 * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
3822 * Also poison larger elements periodically
3823 */
3824
3825 vm_offset_t inner_size = zone->elem_size;
3826
3827 uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
3828
3829 if (inner_size <= zp_tiny_zone_limit) {
3830 poison = TRUE;
3831 } else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE) {
3832 poison = TRUE;
3833 }
3834
3835 if (__improbable(poison)) {
3836 /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
3837 /* Poison everything but primary and backup */
3838 vm_offset_t *element_cursor = ((vm_offset_t *) elem) + 1;
3839 vm_offset_t *backup = get_backup_ptr(inner_size, (vm_offset_t *)elem);
3840
3841 for (; element_cursor < backup; element_cursor++) {
3842 *element_cursor = ZP_POISON;
3843 }
3844 }
3845 }
3846 return poison;
3847 }
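/*
 * Worked example of the sampling policy above (illustrative numbers only; the
 * real zp_factor, zp_scale and zp_tiny_zone_limit are chosen at boot and may
 * be overridden by boot-args): with zp_factor = 16 and zp_scale = 4, a
 * 256-byte element gives sample_factor = 16 + (256 >> 4) = 32, so roughly one
 * in every 32 frees of that zone gets poisoned, while an element at or below
 * zp_tiny_zone_limit is poisoned on every free.
 */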
3848 void
3849 (zfree)(
3850 zone_t zone,
3851 void *addr)
3852 {
3853 vm_offset_t elem = (vm_offset_t) addr;
3854 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */
3855 unsigned int numsaved = 0;
3856 boolean_t gzfreed = FALSE;
3857 boolean_t poison = FALSE;
3858 #if VM_MAX_TAG_ZONES
3859 vm_tag_t tag;
3860 #endif /* VM_MAX_TAG_ZONES */
3861
3862 assert(zone != ZONE_NULL);
3863 DTRACE_VM2(zfree, zone_t, zone, void*, addr);
3864 #if KASAN_ZALLOC
3865 if (kasan_quarantine_freed_element(&zone, &addr)) {
3866 return;
3867 }
3868 elem = (vm_offset_t)addr;
3869 #endif
3870
3871 /*
3872 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
3873 */
3874
3875 if (__improbable(DO_LOGGING(zone) && corruption_debug_flag)) {
3876 numsaved = OSBacktrace((void *)zbt, MAX_ZTRACE_DEPTH);
3877 }
3878
3879 #if MACH_ASSERT
3880 /* Basic sanity checks */
3881 if (zone == ZONE_NULL || elem == (vm_offset_t)0) {
3882 panic("zfree: NULL");
3883 }
3884 #endif
3885
3886 #if CONFIG_GZALLOC
3887 gzfreed = gzalloc_free(zone, addr);
3888 #endif
3889
3890 if (!gzfreed) {
3891 struct zone_page_metadata *page_meta = get_zone_page_metadata((struct zone_free_element *)addr, FALSE);
3892 if (zone != PAGE_METADATA_GET_ZONE(page_meta)) {
3893 panic("Element %p from zone %s caught being freed to wrong zone %s\n", addr, PAGE_METADATA_GET_ZONE(page_meta)->zone_name, zone->zone_name);
3894 }
3895 }
3896
3897 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
3898
3899 if (__improbable(!gzfreed && zone->collectable && !zone->allows_foreign &&
3900 !from_zone_map(elem, zone->elem_size))) {
3901 panic("zfree: non-allocated memory in collectable zone!");
3902 }
3903
3904 if (!gzfreed) {
3905 poison = zfree_poison_element(zone, elem);
3906 }
3907
3908 /*
3909 * See if we're doing logging on this zone. There are two styles of logging used depending on
3910 * whether we're trying to catch a leak or corruption. See comments above in zalloc for details.
3911 */
3912
3913 if (__improbable(DO_LOGGING(zone))) {
3914 if (corruption_debug_flag) {
3915 /*
3916 * We're logging to catch a corruption. Add a record of this zfree operation
3917 * to the log.
3918 */
3919 btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_FREE, (void **)zbt, numsaved);
3920 } else {
3921 /*
3922 * We're logging to catch a leak. Remove any record we might have for this
3923 * element since it's being freed. Note that we may not find it if the buffer
3924 * overflowed and that's OK. Since the log is of a limited size, old records
3925 * get overwritten if there are more zallocs than zfrees.
3926 */
3927 btlog_remove_entries_for_element(zone->zlog_btlog, (void *)addr);
3928 }
3929 }
3930
3931 #if CONFIG_ZCACHE
3932 if (zone_caching_enabled(zone)) {
3933 int __assert_only ret = zcache_free_to_cpu_cache(zone, addr);
3934 assert(ret != FALSE);
3935 return;
3936 }
3937 #endif /* CONFIG_ZCACHE */
3938
3939 lock_zone(zone);
3940 assert(zone->zone_valid);
3941
3942 if (zone_check) {
3943 zone_check_freelist(zone, elem);
3944 }
3945
3946 if (__probable(!gzfreed)) {
3947 #if VM_MAX_TAG_ZONES
3948 if (__improbable(zone->tags)) {
3949 tag = (ZTAG(zone, elem)[0] >> 1);
3950 // set the tag with b0 clear so the block remains in use
3951 ZTAG(zone, elem)[0] = 0xFFFE;
3952 }
3953 #endif /* VM_MAX_TAG_ZONES */
3954 free_to_zone(zone, elem, poison);
3955 }
3956
3957 if (__improbable(zone->count < 0)) {
3958 panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
3959 zone->zone_name, addr);
3960 }
3961
3962 #if CONFIG_ZLEAKS
3963 /*
3964 * Zone leak detection: un-track the allocation
3965 */
3966 if (zone->zleak_on) {
3967 zleak_free(elem, zone->elem_size);
3968 }
3969 #endif /* CONFIG_ZLEAKS */
3970
3971 #if VM_MAX_TAG_ZONES
3972 if (__improbable(zone->tags) && __probable(!gzfreed)) {
3973 vm_tag_update_zone_size(tag, zone->tag_zone_index, -((int64_t)zone->elem_size), 0);
3974 }
3975 #endif /* VM_MAX_TAG_ZONES */
3976
3977 unlock_zone(zone);
3978 }
3979
3980 /* Change a zone's flags.
3981 * This routine must be called immediately after zinit.
3982 */
3983 void
3984 zone_change(
3985 zone_t zone,
3986 unsigned int item,
3987 boolean_t value)
3988 {
3989 assert( zone != ZONE_NULL );
3990 assert( value == TRUE || value == FALSE );
3991
3992 switch (item) {
3993 case Z_NOENCRYPT:
3994 zone->noencrypt = value;
3995 break;
3996 case Z_EXHAUST:
3997 zone->exhaustible = value;
3998 break;
3999 case Z_COLLECT:
4000 zone->collectable = value;
4001 break;
4002 case Z_EXPAND:
4003 zone->expandable = value;
4004 break;
4005 case Z_FOREIGN:
4006 zone->allows_foreign = value;
4007 break;
4008 case Z_CALLERACCT:
4009 zone->caller_acct = value;
4010 break;
4011 case Z_NOCALLOUT:
4012 zone->no_callout = value;
4013 break;
4014 case Z_TAGS_ENABLED:
4015 #if VM_MAX_TAG_ZONES
4016 {
4017 static int tag_zone_index;
4018 zone->tags = TRUE;
4019 zone->tags_inline = (((page_size + zone->elem_size - 1) / zone->elem_size) <= (sizeof(uint32_t) / sizeof(uint16_t)));
4020 zone->tag_zone_index = OSAddAtomic(1, &tag_zone_index);
4021 }
4022 #endif /* VM_MAX_TAG_ZONES */
4023 break;
4024 case Z_GZALLOC_EXEMPT:
4025 zone->gzalloc_exempt = value;
4026 #if CONFIG_GZALLOC
4027 gzalloc_reconfigure(zone);
4028 #endif
4029 break;
4030 case Z_ALIGNMENT_REQUIRED:
4031 zone->alignment_required = value;
4032 #if KASAN_ZALLOC
4033 if (zone->kasan_redzone == KASAN_GUARD_SIZE) {
4034 /* Don't disturb alignment with the redzone for zones with
4035 * specific alignment requirements. */
4036 zone->elem_size -= zone->kasan_redzone * 2;
4037 zone->kasan_redzone = 0;
4038 }
4039 #endif
4040 #if CONFIG_GZALLOC
4041 gzalloc_reconfigure(zone);
4042 #endif
4043 break;
4044 case Z_KASAN_QUARANTINE:
4045 zone->kasan_quarantine = value;
4046 break;
4047 case Z_CACHING_ENABLED:
4048 #if CONFIG_ZCACHE
4049 if (value == TRUE) {
4050 #if CONFIG_GZALLOC
4051 /*
4052 * Per cpu zone caching should be
4053 * disabled if gzalloc is enabled.
4054 */
4055 if (gzalloc_enabled()) {
4056 break;
4057 }
4058 #endif
4059 if (zcache_ready()) {
4060 zcache_init(zone);
4061 } else {
4062 zone->cpu_cache_enable_when_ready = TRUE;
4063 }
4064 }
4065 #endif
4066 break;
4067 case Z_CLEARMEMORY:
4068 zone->clear_memory = value;
4069 break;
4070 default:
4071 panic("Zone_change: Wrong Item Type!");
4072 /* break; */
4073 }
4074 }
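/*
 * Illustrative sketch (hypothetical zone, not part of this file): typical
 * zone_change() configuration immediately after zinit(), per the comment
 * above. "example.foo" and struct foo are made-up names.
 *
 *	zone_t z = zinit(sizeof(struct foo),
 *	    4096 * sizeof(struct foo), PAGE_SIZE, "example.foo");
 *	zone_change(z, Z_COLLECT, TRUE);	// let zone_gc() reclaim free pages
 *	zone_change(z, Z_EXPAND, TRUE);		// allow growth beyond max_size
 *	zone_change(z, Z_NOENCRYPT, TRUE);	// pages need not be encrypted
 *	zone_change(z, Z_CALLERACCT, FALSE);	// account to the kernel, not the caller
 */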
4075
4076 /*
4077 * Return the expected number of free elements in the zone.
4078 * This calculation will be incorrect if items are zfree'd that
4079 * were never zalloc'd/zget'd. The correct way to stuff memory
4080 * into a zone is by zcram.
4081 */
4082
4083 integer_t
4084 zone_free_count(zone_t zone)
4085 {
4086 integer_t free_count;
4087
4088 lock_zone(zone);
4089 free_count = zone->countfree;
4090 unlock_zone(zone);
4091
4092 assert(free_count >= 0);
4093
4094 return free_count;
4095 }
4096
4097 /*
4098 * Drops (i.e. frees) the elements in the all free pages queue of a zone.
4099 * Called by zone_gc() on each zone and when a zone is zdestroy()ed.
4100 */
4101 void
4102 drop_free_elements(zone_t z)
4103 {
4104 vm_size_t elt_size;
4105 unsigned int total_freed_pages = 0;
4106 struct zone_page_metadata *page_meta;
4107 vm_address_t free_page_address;
4108 vm_size_t size_to_free;
4109
4110 lock_zone(z);
4111
4112 elt_size = z->elem_size;
4113
4114 while (!queue_empty(&z->pages.all_free)) {
4115 page_meta = (struct zone_page_metadata *)queue_first(&z->pages.all_free);
4116 assert(from_zone_map((vm_address_t)page_meta, sizeof(*page_meta))); /* foreign elements should be in any_free_foreign */
4117 /*
4118 * Don't drain zones with async refill to below the refill threshold,
4119 * as they need some reserve to function properly.
4120 */
4121 if (!z->zone_destruction &&
4122 z->async_prio_refill && z->zone_replenish_thread &&
4123 (vm_size_t)(page_meta->free_count - z->countfree) < z->prio_refill_watermark) {
4124 break;
4125 }
4126
4127 (void)dequeue_head(&z->pages.all_free);
4128
4129 assert(z->countfree >= page_meta->free_count);
4130 z->countfree -= page_meta->free_count;
4131
4132 assert(z->count_all_free_pages >= page_meta->page_count);
4133 z->count_all_free_pages -= page_meta->page_count;
4134
4135 assert(z->cur_size >= page_meta->free_count * elt_size);
4136 z->cur_size -= page_meta->free_count * elt_size;
4137
4138 ZONE_PAGE_COUNT_DECR(z, page_meta->page_count);
4139 unlock_zone(z);
4140
4141 /* Free the pages for metadata and account for them */
4142 free_page_address = get_zone_page(page_meta);
4143 total_freed_pages += page_meta->page_count;
4144 size_to_free = page_meta->page_count * PAGE_SIZE;
4145 #if KASAN_ZALLOC
4146 kasan_poison_range(free_page_address, size_to_free, ASAN_VALID);
4147 #endif
4148 #if VM_MAX_TAG_ZONES
4149 if (z->tags) {
4150 ztMemoryRemove(z, free_page_address, size_to_free);
4151 }
4152 #endif /* VM_MAX_TAG_ZONES */
4153 kmem_free(zone_map, free_page_address, size_to_free);
4154 if (current_thread()->options & TH_OPT_ZONE_GC) {
4155 thread_yield_to_preemption();
4156 }
4157 lock_zone(z);
4158 }
4159 if (z->zone_destruction) {
4160 assert(queue_empty(&z->pages.all_free));
4161 assert(z->count_all_free_pages == 0);
4162 }
4163 unlock_zone(z);
4164
4165
4166 #if DEBUG || DEVELOPMENT
4167 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
4168 kprintf("zone_gc() of zone %s freed %lu elements, %d pages\n", z->zone_name,
4169 (unsigned long)((total_freed_pages * PAGE_SIZE) / elt_size), total_freed_pages);
4170 }
4171 #endif /* DEBUG || DEVELOPMENT */
4172 }
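/*
 * Worked example of the accounting above (illustrative numbers only): for a
 * zone with elem_size = 128 bytes and 4 KB pages, dropping an all-free
 * metadata run of 4 pages (128 elements) subtracts 128 from countfree,
 * 16 KB from cur_size, 4 from count_all_free_pages, and hands the 4 pages
 * back to the zone_map via kmem_free().
 */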
4173
4174 /* Zone garbage collection
4175 *
4176 * zone_gc will walk through all the free elements in all the
4177 * zones that are marked collectable looking for reclaimable
4178 * pages. zone_gc is called by consider_zone_gc when the system
4179 * begins to run out of memory.
4180 *
4181 * We should ensure that zone_gc never blocks.
4182 */
4183 void
4184 zone_gc(boolean_t consider_jetsams)
4185 {
4186 unsigned int max_zones;
4187 zone_t z;
4188 unsigned int i;
4189
4190 if (consider_jetsams) {
4191 kill_process_in_largest_zone();
4192 /*
4193 * If we do end up jetsamming something, we need to do a zone_gc so that
4194 * we can reclaim free zone elements and update the zone map size.
4195 * Fall through.
4196 */
4197 }
4198
4199 lck_mtx_lock(&zone_gc_lock);
4200
4201 current_thread()->options |= TH_OPT_ZONE_GC;
4202
4203 simple_lock(&all_zones_lock, &zone_locks_grp);
4204 max_zones = num_zones;
4205 simple_unlock(&all_zones_lock);
4206
4207 #if DEBUG || DEVELOPMENT
4208 if (zalloc_debug & ZALLOC_DEBUG_ZONEGC) {
4209 kprintf("zone_gc() starting...\n");
4210 }
4211 #endif /* DEBUG || DEVELOPMENT */
4212
4213 for (i = 0; i < max_zones; i++) {
4214 z = &(zone_array[i]);
4215 assert(z != ZONE_NULL);
4216
4217 if (!z->collectable) {
4218 continue;
4219 }
4220 #if CONFIG_ZCACHE
4221 if (zone_caching_enabled(z)) {
4222 zcache_drain_depot(z);
4223 }
4224 #endif /* CONFIG_ZCACHE */
4225 if (queue_empty(&z->pages.all_free)) {
4226 continue;
4227 }
4228
4229 drop_free_elements(z);
4230 }
4231
4232 current_thread()->options &= ~TH_OPT_ZONE_GC;
4233
4234 lck_mtx_unlock(&zone_gc_lock);
4235 }
4236
4237 extern vm_offset_t kmapoff_kaddr;
4238 extern unsigned int kmapoff_pgcnt;
4239
4240 /*
4241 * consider_zone_gc:
4242 *
4243 * Called by the pageout daemon when the system needs more free pages.
4244 */
4245
4246 void
4247 consider_zone_gc(boolean_t consider_jetsams)
4248 {
4249 if (kmapoff_kaddr != 0) {
4250 /*
4251 * One-time reclaim of kernel_map resources we allocated in
4252 * early boot.
4253 */
4254 (void) vm_deallocate(kernel_map,
4255 kmapoff_kaddr, kmapoff_pgcnt * PAGE_SIZE_64);
4256 kmapoff_kaddr = 0;
4257 }
4258
4259 if (zone_gc_allowed) {
4260 zone_gc(consider_jetsams);
4261 }
4262 }
4263
4264 /*
4265 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
4266 * requesting zone information.
4267 * Frees unused pages towards the end of the region, and zeroes out unused
4268 * space on the last page.
4269 */
4270 vm_map_copy_t
4271 create_vm_map_copy(
4272 vm_offset_t start_addr,
4273 vm_size_t total_size,
4274 vm_size_t used_size)
4275 {
4276 kern_return_t kr;
4277 vm_offset_t end_addr;
4278 vm_size_t free_size;
4279 vm_map_copy_t copy;
4280
4281 if (used_size != total_size) {
4282 end_addr = start_addr + used_size;
4283 free_size = total_size - (round_page(end_addr) - start_addr);
4284
4285 if (free_size >= PAGE_SIZE) {
4286 kmem_free(ipc_kernel_map,
4287 round_page(end_addr), free_size);
4288 }
4289 bzero((char *) end_addr, round_page(end_addr) - end_addr);
4290 }
4291
4292 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
4293 (vm_map_size_t)used_size, TRUE, &copy);
4294 assert(kr == KERN_SUCCESS);
4295
4296 return copy;
4297 }
4298
4299 boolean_t
4300 get_zone_info(
4301 zone_t z,
4302 mach_zone_name_t *zn,
4303 mach_zone_info_t *zi)
4304 {
4305 struct zone zcopy;
4306
4307 assert(z != ZONE_NULL);
4308 lock_zone(z);
4309 if (!z->zone_valid) {
4310 unlock_zone(z);
4311 return FALSE;
4312 }
4313 zcopy = *z;
4314 unlock_zone(z);
4315
4316 if (zn != NULL) {
4317 /* assuming here the name data is static */
4318 (void) __nosan_strlcpy(zn->mzn_name, zcopy.zone_name,
4319 strlen(zcopy.zone_name) + 1);
4320 }
4321
4322 if (zi != NULL) {
4323 zi->mzi_count = (uint64_t)zcopy.count;
4324 zi->mzi_cur_size = ptoa_64(zcopy.page_count);
4325 zi->mzi_max_size = (uint64_t)zcopy.max_size;
4326 zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
4327 zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
4328 zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
4329 zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
4330 zi->mzi_collectable = 0;
4331 if (zcopy.collectable) {
4332 SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable, ((uint64_t)zcopy.count_all_free_pages * PAGE_SIZE));
4333 SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
4334 }
4335 }
4336
4337 return TRUE;
4338 }
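/*
 * Note for consumers (illustrative): mzi_collectable packs both the byte count
 * and the collectable flag; it is meant to be decoded with the companion
 * GET_MZI_COLLECTABLE_* macros rather than read directly, e.g.:
 *
 *	uint64_t reclaimable = GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
 */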
4339
4340 kern_return_t
4341 task_zone_info(
4342 __unused task_t task,
4343 __unused mach_zone_name_array_t *namesp,
4344 __unused mach_msg_type_number_t *namesCntp,
4345 __unused task_zone_info_array_t *infop,
4346 __unused mach_msg_type_number_t *infoCntp)
4347 {
4348 return KERN_FAILURE;
4349 }
4350
4351 kern_return_t
4352 mach_zone_info(
4353 host_priv_t host,
4354 mach_zone_name_array_t *namesp,
4355 mach_msg_type_number_t *namesCntp,
4356 mach_zone_info_array_t *infop,
4357 mach_msg_type_number_t *infoCntp)
4358 {
4359 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
4360 }
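/*
 * Illustrative user-space sketch (assumes the mach_zone_info() MIG stub from
 * <mach/mach.h>; on kernels built with CONFIG_DEBUGGER_FOR_ZONE_INFO the call
 * is restricted, so treat this as best-effort):
 *
 *	#include <mach/mach.h>
 *	#include <stdio.h>
 *
 *	static void
 *	dump_zone_sizes(void)
 *	{
 *		mach_zone_name_t *names = NULL;
 *		mach_zone_info_t *info = NULL;
 *		mach_msg_type_number_t name_cnt = 0, info_cnt = 0;
 *
 *		if (mach_zone_info(mach_host_self(), &names, &name_cnt,
 *		    &info, &info_cnt) != KERN_SUCCESS) {
 *			return;
 *		}
 *		for (mach_msg_type_number_t i = 0; i < info_cnt; i++) {
 *			printf("%s: cur_size=%llu elem_size=%llu\n",
 *			    names[i].mzn_name, info[i].mzi_cur_size,
 *			    info[i].mzi_elem_size);
 *		}
 *		// The arrays arrive as out-of-line vm_map_copy memory and are
 *		// owned by the caller once received.
 *		vm_deallocate(mach_task_self(), (vm_address_t)names,
 *		    name_cnt * sizeof(*names));
 *		vm_deallocate(mach_task_self(), (vm_address_t)info,
 *		    info_cnt * sizeof(*info));
 *	}
 */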
4361
4362
4363 kern_return_t
4364 mach_memory_info(
4365 host_priv_t host,
4366 mach_zone_name_array_t *namesp,
4367 mach_msg_type_number_t *namesCntp,
4368 mach_zone_info_array_t *infop,
4369 mach_msg_type_number_t *infoCntp,
4370 mach_memory_info_array_t *memoryInfop,
4371 mach_msg_type_number_t *memoryInfoCntp)
4372 {
4373 mach_zone_name_t *names;
4374 vm_offset_t names_addr;
4375 vm_size_t names_size;
4376
4377 mach_zone_info_t *info;
4378 vm_offset_t info_addr;
4379 vm_size_t info_size;
4380
4381 mach_memory_info_t *memory_info;
4382 vm_offset_t memory_info_addr;
4383 vm_size_t memory_info_size;
4384 vm_size_t memory_info_vmsize;
4385 unsigned int num_info;
4386
4387 unsigned int max_zones, used_zones, i;
4388 mach_zone_name_t *zn;
4389 mach_zone_info_t *zi;
4390 kern_return_t kr;
4391
4392 uint64_t zones_collectable_bytes = 0;
4393
4394 if (host == HOST_NULL) {
4395 return KERN_INVALID_HOST;
4396 }
4397 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4398 if (!PE_i_can_has_debugger(NULL)) {
4399 return KERN_INVALID_HOST;
4400 }
4401 #endif
4402
4403 /*
4404 * We assume that zones aren't freed once allocated.
4405 * We won't pick up any zones that are allocated later.
4406 */
4407
4408 simple_lock(&all_zones_lock, &zone_locks_grp);
4409 max_zones = (unsigned int)(num_zones);
4410 simple_unlock(&all_zones_lock);
4411
4412 names_size = round_page(max_zones * sizeof *names);
4413 kr = kmem_alloc_pageable(ipc_kernel_map,
4414 &names_addr, names_size, VM_KERN_MEMORY_IPC);
4415 if (kr != KERN_SUCCESS) {
4416 return kr;
4417 }
4418 names = (mach_zone_name_t *) names_addr;
4419
4420 info_size = round_page(max_zones * sizeof *info);
4421 kr = kmem_alloc_pageable(ipc_kernel_map,
4422 &info_addr, info_size, VM_KERN_MEMORY_IPC);
4423 if (kr != KERN_SUCCESS) {
4424 kmem_free(ipc_kernel_map,
4425 names_addr, names_size);
4426 return kr;
4427 }
4428 info = (mach_zone_info_t *) info_addr;
4429
4430 zn = &names[0];
4431 zi = &info[0];
4432
4433 used_zones = max_zones;
4434 for (i = 0; i < max_zones; i++) {
4435 if (!get_zone_info(&(zone_array[i]), zn, zi)) {
4436 used_zones--;
4437 continue;
4438 }
4439 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
4440 zn++;
4441 zi++;
4442 }
4443
4444 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
4445 *namesCntp = used_zones;
4446
4447 *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
4448 *infoCntp = used_zones;
4449
4450 num_info = 0;
4451 memory_info_addr = 0;
4452
4453 if (memoryInfop && memoryInfoCntp) {
4454 vm_map_copy_t copy;
4455 num_info = vm_page_diagnose_estimate();
4456 memory_info_size = num_info * sizeof(*memory_info);
4457 memory_info_vmsize = round_page(memory_info_size);
4458 kr = kmem_alloc_pageable(ipc_kernel_map,
4459 &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_IPC);
4460 if (kr != KERN_SUCCESS) {
4461 return kr;
4462 }
4463
4464 kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
4465 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4466 assert(kr == KERN_SUCCESS);
4467
4468 memory_info = (mach_memory_info_t *) memory_info_addr;
4469 vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);
4470
4471 kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
4472 assert(kr == KERN_SUCCESS);
4473
4474 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
4475 (vm_map_size_t)memory_info_size, TRUE, &copy);
4476 assert(kr == KERN_SUCCESS);
4477
4478 *memoryInfop = (mach_memory_info_t *) copy;
4479 *memoryInfoCntp = num_info;
4480 }
4481
4482 return KERN_SUCCESS;
4483 }
4484
4485 kern_return_t
4486 mach_zone_info_for_zone(
4487 host_priv_t host,
4488 mach_zone_name_t name,
4489 mach_zone_info_t *infop)
4490 {
4491 unsigned int max_zones, i;
4492 zone_t zone_ptr;
4493
4494 if (host == HOST_NULL) {
4495 return KERN_INVALID_HOST;
4496 }
4497 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4498 if (!PE_i_can_has_debugger(NULL)) {
4499 return KERN_INVALID_HOST;
4500 }
4501 #endif
4502
4503 if (infop == NULL) {
4504 return KERN_INVALID_ARGUMENT;
4505 }
4506
4507 simple_lock(&all_zones_lock, &zone_locks_grp);
4508 max_zones = (unsigned int)(num_zones);
4509 simple_unlock(&all_zones_lock);
4510
4511 zone_ptr = ZONE_NULL;
4512 for (i = 0; i < max_zones; i++) {
4513 zone_t z = &(zone_array[i]);
4514 assert(z != ZONE_NULL);
4515
4516 /* Find the requested zone by name */
4517 if (track_this_zone(z->zone_name, name.mzn_name)) {
4518 zone_ptr = z;
4519 break;
4520 }
4521 }
4522
4523 /* No zones found with the requested zone name */
4524 if (zone_ptr == ZONE_NULL) {
4525 return KERN_INVALID_ARGUMENT;
4526 }
4527
4528 if (get_zone_info(zone_ptr, NULL, infop)) {
4529 return KERN_SUCCESS;
4530 }
4531 return KERN_FAILURE;
4532 }
4533
4534 kern_return_t
4535 mach_zone_info_for_largest_zone(
4536 host_priv_t host,
4537 mach_zone_name_t *namep,
4538 mach_zone_info_t *infop)
4539 {
4540 if (host == HOST_NULL) {
4541 return KERN_INVALID_HOST;
4542 }
4543 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
4544 if (!PE_i_can_has_debugger(NULL)) {
4545 return KERN_INVALID_HOST;
4546 }
4547 #endif
4548
4549 if (namep == NULL || infop == NULL) {
4550 return KERN_INVALID_ARGUMENT;
4551 }
4552
4553 if (get_zone_info(zone_find_largest(), namep, infop)) {
4554 return KERN_SUCCESS;
4555 }
4556 return KERN_FAILURE;
4557 }
4558
4559 uint64_t
4560 get_zones_collectable_bytes(void)
4561 {
4562 unsigned int i, max_zones;
4563 uint64_t zones_collectable_bytes = 0;
4564 mach_zone_info_t zi;
4565
4566 simple_lock(&all_zones_lock, &zone_locks_grp);
4567 max_zones = (unsigned int)(num_zones);
4568 simple_unlock(&all_zones_lock);
4569
4570 for (i = 0; i < max_zones; i++) {
4571 if (get_zone_info(&(zone_array[i]), NULL, &zi)) {
4572 zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
4573 }
4574 }
4575
4576 return zones_collectable_bytes;
4577 }
4578
4579 kern_return_t
4580 mach_zone_get_zlog_zones(
4581 host_priv_t host,
4582 mach_zone_name_array_t *namesp,
4583 mach_msg_type_number_t *namesCntp)
4584 {
4585 #if DEBUG || DEVELOPMENT
4586 unsigned int max_zones, logged_zones, i;
4587 kern_return_t kr;
4588 zone_t zone_ptr;
4589 mach_zone_name_t *names;
4590 vm_offset_t names_addr;
4591 vm_size_t names_size;
4592
4593 if (host == HOST_NULL) {
4594 return KERN_INVALID_HOST;
4595 }
4596
4597 if (namesp == NULL || namesCntp == NULL) {
4598 return KERN_INVALID_ARGUMENT;
4599 }
4600
4601 simple_lock(&all_zones_lock, &zone_locks_grp);
4602 max_zones = (unsigned int)(num_zones);
4603 simple_unlock(&all_zones_lock);
4604
4605 names_size = round_page(max_zones * sizeof *names);
4606 kr = kmem_alloc_pageable(ipc_kernel_map,
4607 &names_addr, names_size, VM_KERN_MEMORY_IPC);
4608 if (kr != KERN_SUCCESS) {
4609 return kr;
4610 }
4611 names = (mach_zone_name_t *) names_addr;
4612
4613 zone_ptr = ZONE_NULL;
4614 logged_zones = 0;
4615 for (i = 0; i < max_zones; i++) {
4616 zone_t z = &(zone_array[i]);
4617 assert(z != ZONE_NULL);
4618
4619 /* Copy out the zone name if zone logging is enabled */
4620 if (z->zlog_btlog) {
4621 get_zone_info(z, &names[logged_zones], NULL);
4622 logged_zones++;
4623 }
4624 }
4625
4626 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
4627 *namesCntp = logged_zones;
4628
4629 return KERN_SUCCESS;
4630
4631 #else /* DEBUG || DEVELOPMENT */
4632 #pragma unused(host, namesp, namesCntp)
4633 return KERN_FAILURE;
4634 #endif /* DEBUG || DEVELOPMENT */
4635 }
4636
4637 kern_return_t
4638 mach_zone_get_btlog_records(
4639 host_priv_t host,
4640 mach_zone_name_t name,
4641 zone_btrecord_array_t *recsp,
4642 mach_msg_type_number_t *recsCntp)
4643 {
4644 #if DEBUG || DEVELOPMENT
4645 unsigned int max_zones, i, numrecs = 0;
4646 zone_btrecord_t *recs;
4647 kern_return_t kr;
4648 zone_t zone_ptr;
4649 vm_offset_t recs_addr;
4650 vm_size_t recs_size;
4651
4652 if (host == HOST_NULL) {
4653 return KERN_INVALID_HOST;
4654 }
4655
4656 if (recsp == NULL || recsCntp == NULL) {
4657 return KERN_INVALID_ARGUMENT;
4658 }
4659
4660 simple_lock(&all_zones_lock, &zone_locks_grp);
4661 max_zones = (unsigned int)(num_zones);
4662 simple_unlock(&all_zones_lock);
4663
4664 zone_ptr = ZONE_NULL;
4665 for (i = 0; i < max_zones; i++) {
4666 zone_t z = &(zone_array[i]);
4667 assert(z != ZONE_NULL);
4668
4669 /* Find the requested zone by name */
4670 if (track_this_zone(z->zone_name, name.mzn_name)) {
4671 zone_ptr = z;
4672 break;
4673 }
4674 }
4675
4676 /* No zones found with the requested zone name */
4677 if (zone_ptr == ZONE_NULL) {
4678 return KERN_INVALID_ARGUMENT;
4679 }
4680
4681 /* Logging not turned on for the requested zone */
4682 if (!DO_LOGGING(zone_ptr)) {
4683 return KERN_FAILURE;
4684 }
4685
4686 /* Allocate memory for btlog records */
4687 numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
4688 recs_size = round_page(numrecs * sizeof *recs);
4689
4690 kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
4691 if (kr != KERN_SUCCESS) {
4692 return kr;
4693 }
4694
4695 /*
4696 * We will call get_btlog_records() below which populates this region while holding a spinlock
4697 * (the btlog lock). So these pages need to be wired.
4698 */
4699 kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
4700 VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
4701 assert(kr == KERN_SUCCESS);
4702
4703 recs = (zone_btrecord_t *)recs_addr;
4704 get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
4705
4706 kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
4707 assert(kr == KERN_SUCCESS);
4708
4709 *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
4710 *recsCntp = numrecs;
4711
4712 return KERN_SUCCESS;
4713
4714 #else /* DEBUG || DEVELOPMENT */
4715 #pragma unused(host, name, recsp, recsCntp)
4716 return KERN_FAILURE;
4717 #endif /* DEBUG || DEVELOPMENT */
4718 }
4719
4720
4721 #if DEBUG || DEVELOPMENT
4722
4723 kern_return_t
4724 mach_memory_info_check(void)
4725 {
4726 mach_memory_info_t * memory_info;
4727 mach_memory_info_t * info;
4728 zone_t zone;
4729 unsigned int idx, num_info, max_zones;
4730 vm_offset_t memory_info_addr;
4731 kern_return_t kr;
4732 size_t memory_info_size, memory_info_vmsize;
4733 uint64_t top_wired, zonestotal, total;
4734
4735 num_info = vm_page_diagnose_estimate();
4736 memory_info_size = num_info * sizeof(*memory_info);
4737 memory_info_vmsize = round_page(memory_info_size);
4738 kr = kmem_alloc(kernel_map, &memory_info_addr, memory_info_vmsize, VM_KERN_MEMORY_DIAG);
4739 assert(kr == KERN_SUCCESS);
4740
4741 memory_info = (mach_memory_info_t *) memory_info_addr;
4742 vm_page_diagnose(memory_info, num_info, 0);
4743
4744 simple_lock(&all_zones_lock, &zone_locks_grp);
4745 max_zones = num_zones;
4746 simple_unlock(&all_zones_lock);
4747
4748 top_wired = total = zonestotal = 0;
4749 for (idx = 0; idx < max_zones; idx++) {
4750 zone = &(zone_array[idx]);
4751 assert(zone != ZONE_NULL);
4752 lock_zone(zone);
4753 zonestotal += ptoa_64(zone->page_count);
4754 unlock_zone(zone);
4755 }
4756 for (idx = 0; idx < num_info; idx++) {
4757 info = &memory_info[idx];
4758 if (!info->size) {
4759 continue;
4760 }
4761 if (VM_KERN_COUNT_WIRED == info->site) {
4762 top_wired = info->size;
4763 }
4764 if (VM_KERN_SITE_HIDE & info->flags) {
4765 continue;
4766 }
4767 if (!(VM_KERN_SITE_WIRED & info->flags)) {
4768 continue;
4769 }
4770 total += info->size;
4771 }
4772 total += zonestotal;
4773
4774 printf("vm_page_diagnose_check %qd of %qd, zones %qd, short 0x%qx\n", total, top_wired, zonestotal, top_wired - total);
4775
4776 kmem_free(kernel_map, memory_info_addr, memory_info_vmsize);
4777
4778 return kr;
4779 }
4780
4781 extern boolean_t(*volatile consider_buffer_cache_collect)(int);
4782
4783 #endif /* DEBUG || DEVELOPMENT */
4784
4785 kern_return_t
4786 mach_zone_force_gc(
4787 host_t host)
4788 {
4789 if (host == HOST_NULL) {
4790 return KERN_INVALID_HOST;
4791 }
4792
4793 #if DEBUG || DEVELOPMENT
4794 /* Callout to buffer cache GC to drop elements in the apfs zones */
4795 if (consider_buffer_cache_collect != NULL) {
4796 (void)(*consider_buffer_cache_collect)(0);
4797 }
4798 consider_zone_gc(FALSE);
4799 #endif /* DEBUG || DEVELOPMENT */
4800 return KERN_SUCCESS;
4801 }
4802
4803 extern unsigned int stack_total;
4804 extern unsigned long long stack_allocs;
4805
4806 zone_t
4807 zone_find_largest(void)
4808 {
4809 unsigned int i;
4810 unsigned int max_zones;
4811 zone_t the_zone;
4812 zone_t zone_largest;
4813
4814 simple_lock(&all_zones_lock, &zone_locks_grp);
4815 max_zones = num_zones;
4816 simple_unlock(&all_zones_lock);
4817
4818 zone_largest = &(zone_array[0]);
4819 for (i = 0; i < max_zones; i++) {
4820 the_zone = &(zone_array[i]);
4821 if (the_zone->cur_size > zone_largest->cur_size) {
4822 zone_largest = the_zone;
4823 }
4824 }
4825 return zone_largest;
4826 }
4827
4828 #if ZONE_DEBUG
4829
4830 /* should we care about locks here ? */
4831
4832 #define zone_in_use(z) ( z->count || z->free_elements \
4833 || !queue_empty(&z->pages.all_free) \
4834 || !queue_empty(&z->pages.intermediate) \
4835 || (z->allows_foreign && !queue_empty(&z->pages.any_free_foreign)))
4836
4837
4838 #endif /* ZONE_DEBUG */
4839
4840
4841 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
4842
4843 #if DEBUG || DEVELOPMENT
4844
4845 static uintptr_t *
4846 zone_copy_all_allocations_inqueue(zone_t z, queue_head_t * queue, uintptr_t * elems)
4847 {
4848 struct zone_page_metadata *page_meta;
4849 vm_offset_t free, elements;
4850 vm_offset_t idx, numElements, freeCount, bytesAvail, metaSize;
4851
4852 queue_iterate(queue, page_meta, struct zone_page_metadata *, pages)
4853 {
4854 elements = get_zone_page(page_meta);
4855 bytesAvail = ptoa(page_meta->page_count);
4856 freeCount = 0;
4857 if (z->allows_foreign && !from_zone_map(elements, z->elem_size)) {
4858 metaSize = (sizeof(struct zone_page_metadata) + ZONE_ELEMENT_ALIGNMENT - 1) & ~(ZONE_ELEMENT_ALIGNMENT - 1);
4859 bytesAvail -= metaSize;
4860 elements += metaSize;
4861 }
4862 numElements = bytesAvail / z->elem_size;
4863 // construct array of all possible elements
4864 for (idx = 0; idx < numElements; idx++) {
4865 elems[idx] = INSTANCE_PUT(elements + idx * z->elem_size);
4866 }
4867 // remove from the array all free elements
4868 free = (vm_offset_t)page_metadata_get_freelist(page_meta);
4869 while (free) {
4870 // find idx of free element
4871 for (idx = 0; (idx < numElements) && (elems[idx] != INSTANCE_PUT(free)); idx++) {
4872 }
4873 assert(idx < numElements);
4874 // remove it
4875 bcopy(&elems[idx + 1], &elems[idx], (numElements - (idx + 1)) * sizeof(elems[0]));
4876 numElements--;
4877 freeCount++;
4878 // next free element
4879 vm_offset_t *primary = (vm_offset_t *) free;
4880 free = *primary ^ zp_nopoison_cookie;
4881 }
4882 elems += numElements;
4883 }
4884
4885 return elems;
4886 }
4887
4888 kern_return_t
4889 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc, void * refCon)
4890 {
4891 uintptr_t zbt[MAX_ZTRACE_DEPTH];
4892 zone_t zone;
4893 uintptr_t * array;
4894 uintptr_t * next;
4895 uintptr_t element, bt;
4896 uint32_t idx, count, found;
4897 uint32_t btidx, btcount, nobtcount, btfound;
4898 uint32_t elemSize;
4899 uint64_t maxElems;
4900 unsigned int max_zones;
4901 kern_return_t kr;
4902
4903 simple_lock(&all_zones_lock, &zone_locks_grp);
4904 max_zones = num_zones;
4905 simple_unlock(&all_zones_lock);
4906
4907 for (idx = 0; idx < max_zones; idx++) {
4908 if (!strncmp(zoneName, zone_array[idx].zone_name, nameLen)) {
4909 break;
4910 }
4911 }
4912 if (idx >= max_zones) {
4913 return KERN_INVALID_NAME;
4914 }
4915 zone = &zone_array[idx];
4916
4917 elemSize = (uint32_t) zone->elem_size;
4918 maxElems = ptoa(zone->page_count) / elemSize;
4919
4920 if ((zone->alloc_size % elemSize)
4921 && !leak_scan_debug_flag) {
4922 return KERN_INVALID_CAPABILITY;
4923 }
4924
4925 kr = kmem_alloc_kobject(kernel_map, (vm_offset_t *) &array,
4926 maxElems * sizeof(uintptr_t), VM_KERN_MEMORY_DIAG);
4927 if (KERN_SUCCESS != kr) {
4928 return kr;
4929 }
4930
4931 lock_zone(zone);
4932
4933 next = array;
4934 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.any_free_foreign, next);
4935 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.intermediate, next);
4936 next = zone_copy_all_allocations_inqueue(zone, &zone->pages.all_used, next);
4937 count = (uint32_t)(next - array);
4938
4939 unlock_zone(zone);
4940
4941 zone_leaks_scan(array, count, (uint32_t)zone->elem_size, &found);
4942 assert(found <= count);
4943
4944 for (idx = 0; idx < count; idx++) {
4945 element = array[idx];
4946 if (kInstanceFlagReferenced & element) {
4947 continue;
4948 }
4949 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4950 }
4951
4952 if (zone->zlog_btlog && !corruption_debug_flag) {
4953 // btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
4954 btlog_copy_backtraces_for_elements(zone->zlog_btlog, array, &count, elemSize, proc, refCon);
4955 }
4956
4957 for (nobtcount = idx = 0; idx < count; idx++) {
4958 element = array[idx];
4959 if (!element) {
4960 continue;
4961 }
4962 if (kInstanceFlagReferenced & element) {
4963 continue;
4964 }
4965 element = INSTANCE_PUT(element) & ~kInstanceFlags;
4966
4967 // see if we can find any backtrace left in the element
4968 btcount = (typeof(btcount))(zone->elem_size / sizeof(uintptr_t));
4969 if (btcount >= MAX_ZTRACE_DEPTH) {
4970 btcount = MAX_ZTRACE_DEPTH - 1;
4971 }
4972 for (btfound = btidx = 0; btidx < btcount; btidx++) {
4973 bt = ((uintptr_t *)element)[btcount - 1 - btidx];
4974 if (!VM_KERNEL_IS_SLID(bt)) {
4975 break;
4976 }
4977 zbt[btfound++] = bt;
4978 }
4979 if (btfound) {
4980 (*proc)(refCon, 1, elemSize, &zbt[0], btfound);
4981 } else {
4982 nobtcount++;
4983 }
4984 }
4985 if (nobtcount) {
4986 // fake backtrace when we found nothing
4987 zbt[0] = (uintptr_t) &zalloc;
4988 (*proc)(refCon, nobtcount, elemSize, &zbt[0], 1);
4989 }
4990
4991 kmem_free(kernel_map, (vm_offset_t) array, maxElems * sizeof(uintptr_t));
4992
4993 return KERN_SUCCESS;
4994 }
4995
4996 boolean_t
4997 kdp_is_in_zone(void *addr, const char *zone_name)
4998 {
4999 zone_t z;
5000 return zone_element_size(addr, &z) && !strcmp(z->zone_name, zone_name);
5001 }
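/*
 * Illustrative use from debugger/KDP context (the zone name below is just a
 * placeholder; pass whatever zone the pointer is expected to belong to):
 *
 *	if (kdp_is_in_zone(obj, "example.zone")) {
 *		// obj is an element of the "example.zone" zone
 *	}
 */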
5002
5003 boolean_t
5004 run_zone_test(void)
5005 {
5006 unsigned int i = 0, max_iter = 5;
5007 void * test_ptr;
5008 zone_t test_zone;
5009
5010 simple_lock(&zone_test_lock, &zone_locks_grp);
5011 if (!zone_test_running) {
5012 zone_test_running = TRUE;
5013 } else {
5014 simple_unlock(&zone_test_lock);
5015 printf("run_zone_test: Test already running.\n");
5016 return FALSE;
5017 }
5018 simple_unlock(&zone_test_lock);
5019
5020 printf("run_zone_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
5021
5022 /* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
5023 do {
5024 test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
5025 if (test_zone == NULL) {
5026 printf("run_zone_test: zinit() failed\n");
5027 return FALSE;
5028 }
5029
5030 #if KASAN_ZALLOC
5031 if (test_zone_ptr == NULL && zone_free_count(test_zone) != 0) {
5032 #else
5033 if (zone_free_count(test_zone) != 0) {
5034 #endif
5035 printf("run_zone_test: free count is not zero\n");
5036 return FALSE;
5037 }
5038
5039 if (test_zone_ptr == NULL) {
5040 /* Stash the zone pointer returned on the first zinit */
5041 printf("run_zone_test: zone created for the first time\n");
5042 test_zone_ptr = test_zone;
5043 } else if (test_zone != test_zone_ptr) {
5044 printf("run_zone_test: old zone pointer and new zone pointer don't match\n");
5045 return FALSE;
5046 }
5047
5048 test_ptr = zalloc(test_zone);
5049 if (test_ptr == NULL) {
5050 printf("run_zone_test: zalloc() failed\n");
5051 return FALSE;
5052 }
5053 zfree(test_zone, test_ptr);
5054
5055 zdestroy(test_zone);
5056 i++;
5057
5058 printf("run_zone_test: Iteration %d successful\n", i);
5059 } while (i < max_iter);
5060
5061 printf("run_zone_test: Test passed\n");
5062
5063 simple_lock(&zone_test_lock, &zone_locks_grp);
5064 zone_test_running = FALSE;
5065 simple_unlock(&zone_test_lock);
5066
5067 return TRUE;
5068 }
5069
5070 #endif /* DEBUG || DEVELOPMENT */