/*
 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	kern/zalloc.c
 *	Author:	Avadis Tevanian, Jr.
 *
 *	Zone-based memory allocator.  A zone is a collection of fixed size
 *	data blocks for which quick allocation/deallocation is possible.
 */
#include <zone_debug.h>
#include <zone_alias_addr.h>
#include <norma_vm.h>
#include <mach_kdb.h>

#include <mach/mach_types.h>
#include <mach/vm_param.h>
#include <mach/kern_return.h>
#include <mach/mach_host_server.h>
#include <mach/task_server.h>
#include <mach/machine/vm_types.h>
#include <mach_debug/zone_info.h>

#include <kern/kern_types.h>
#include <kern/assert.h>
#include <kern/host.h>
#include <kern/macro_help.h>
#include <kern/sched.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/thread_call.h>
#include <kern/zalloc.h>
#include <kern/kalloc.h>

#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>

#include <machine/machparam.h>

#include <libkern/OSDebug.h>
#include <libkern/OSAtomic.h>
#include <sys/kdebug.h>

/*
 * Zone Corruption Debugging
 *
 * We provide three methods to detect use of a zone element after it's been freed.  These
 * checks are enabled by specifying "-zc" and/or "-zp" in the boot-args:
 *
 * (1) Range-check the free-list "next" pointer for sanity.
 * (2) Store the pointer in two different words, and compare them against
 *     each other when re-using the zone element, to detect modifications.
 * (3) Poison the freed memory by overwriting it with 0xdeadbeef.
 *
 * The first two checks are fairly lightweight and are enabled by specifying "-zc"
 * in the boot-args.  If you want more aggressive checking for use-after-free bugs
 * and you don't mind the additional overhead, then turn on poisoning by adding
 * "-zp" to the boot-args in addition to "-zc".  If you specify -zp without -zc,
 * the memory is still poisoned when it's freed, but it is not checked for later
 * modification when it's reallocated.
 */

boolean_t check_freed_element = FALSE;		/* enabled by -zc in boot-args */
boolean_t zfree_clear = FALSE;			/* enabled by -zp in boot-args */

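/*
 * Illustrative sketch (not part of the original source): the idea behind
 * check (2) above is that a free element carries its free-list "next" pointer
 * both in its first word and in its last pointer-sized word, so the allocator
 * can compare the two copies when the element is handed out again.  A minimal
 * user-space model of that check, using a hypothetical element layout:
 *
 *	#include <stdint.h>
 *	#include <assert.h>
 *
 *	struct elem { uintptr_t words[8]; };		// hypothetical 64-byte element
 *
 *	static void mark_free(struct elem *e, uintptr_t next) {
 *		e->words[0] = next;			// primary free-list link
 *		e->words[7] = next;			// backup copy in the last word
 *	}
 *
 *	static void check_on_alloc(struct elem *e) {
 *		// A mismatch means someone wrote to the element after it was freed.
 *		assert(e->words[0] == e->words[7]);
 *	}
 */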
/*
 * Fake zones for things that want to report via zprint but are not actually zones.
 */
struct fake_zone_info {
	const char* name;
	void (*init)(int);
	void (*query)(int *,
		      vm_size_t *, vm_size_t *, vm_size_t *, vm_size_t *,
		      uint64_t *, int *, int *, int *);
};

static struct fake_zone_info fake_zones[] = {
	{
		.name = "kernel_stacks",
		.init = stack_fake_zone_init,
		.query = stack_fake_zone_info,
	},
#if defined(__i386__) || defined (__x86_64__)
	{
		.name = "page_tables",
		.init = pt_fake_zone_init,
		.query = pt_fake_zone_info,
	},
#endif /* i386 */
	{
		.name = "kalloc.large",
		.init = kalloc_fake_zone_init,
		.query = kalloc_fake_zone_info,
	},
};
unsigned int num_fake_zones = sizeof(fake_zones)/sizeof(fake_zones[0]);

/*
 * Zone info options
 */
boolean_t zinfo_per_task = FALSE;		/* enabled by -zinfop in boot-args */
#define ZINFO_SLOTS 200				/* for now */
#define ZONES_MAX (ZINFO_SLOTS - num_fake_zones - 1)

/*
 * Allocation helper macros
 */
#define is_kernel_data_addr(a)	(!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3)))

#define ADD_TO_ZONE(zone, element)					\
MACRO_BEGIN								\
	if (zfree_clear)						\
	{   unsigned int i;						\
	    for (i = 0;							\
		 i < zone->elem_size/sizeof(uint32_t);			\
		 i++)							\
		((uint32_t *)(element))[i] = 0xdeadbeef;		\
	}								\
	*((vm_offset_t *)(element)) = (zone)->free_elements;		\
	if (check_freed_element) {					\
		if ((zone)->elem_size >= (2 * sizeof(vm_offset_t)))	\
			((vm_offset_t *)(element))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \
				(zone)->free_elements;			\
	}								\
	(zone)->free_elements = (vm_offset_t) (element);		\
	(zone)->count--;						\
MACRO_END

#define REMOVE_FROM_ZONE(zone, ret, type)					\
MACRO_BEGIN									\
	(ret) = (type) (zone)->free_elements;					\
	if ((ret) != (type) 0) {						\
		if (check_freed_element) {					\
			if (!is_kernel_data_addr(((vm_offset_t *)(ret))[0]) ||	\
			    ((zone)->elem_size >= (2 * sizeof(vm_offset_t)) &&	\
			    ((vm_offset_t *)(ret))[((zone)->elem_size/sizeof(vm_offset_t))-1] != \
			    ((vm_offset_t *)(ret))[0]))				\
				panic("a freed zone element has been modified");\
			if (zfree_clear) {					\
				unsigned int ii;				\
				for (ii = sizeof(vm_offset_t) / sizeof(uint32_t); \
				     ii < (zone)->elem_size/sizeof(uint32_t) - sizeof(vm_offset_t) / sizeof(uint32_t); \
				     ii++)					\
					if (((uint32_t *)(ret))[ii] != (uint32_t)0xdeadbeef) \
						panic("a freed zone element has been modified");\
			}							\
		}								\
		(zone)->count++;						\
		(zone)->sum_count++;						\
		(zone)->free_elements = *((vm_offset_t *)(ret));		\
	}									\
MACRO_END

#if	ZONE_DEBUG
#define zone_debug_enabled(z) z->active_zones.next
#define	ROUNDUP(x,y)		((((x)+(y)-1)/(y))*(y))
#define ZONE_DEBUG_OFFSET	ROUNDUP(sizeof(queue_chain_t),16)
#endif	/* ZONE_DEBUG */

/*
 * Support for garbage collection of unused zone pages
 *
 * The kernel virtually allocates the "zone map" submap of the kernel
 * map.  When an individual zone needs more storage, memory is allocated
 * out of the zone map, and the two-level "zone_page_table" is
 * on-demand expanded so that it has entries for those pages.
 * zone_page_init()/zone_page_alloc() initialize "alloc_count"
 * to the number of zone elements that occupy the zone page (which may
 * be a minimum of 1, including if a zone element spans multiple
 * pages).
 *
 * Asynchronously, the zone_gc() logic attempts to walk zone free
 * lists to see if all the elements on a zone page are free.  If
 * "collect_count" (which it increments during the scan) matches
 * "alloc_count", the zone page is a candidate for collection and the
 * physical page is returned to the VM system.  During this process, the
 * first word of the zone page is re-used to maintain a linked list of
 * to-be-collected zone pages.
 */
typedef uint32_t zone_page_index_t;
#define ZONE_PAGE_INDEX_INVALID		((zone_page_index_t)0xFFFFFFFFU)

struct zone_page_table_entry {
	volatile	uint16_t	alloc_count;
	volatile	uint16_t	collect_count;
};

#define	ZONE_PAGE_USED  0
#define	ZONE_PAGE_UNUSED 0xffff

/* Forwards */
void		zone_page_init(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_alloc(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_free_element(
				zone_page_index_t	*free_page_list,
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_collect(
				vm_offset_t	addr,
				vm_size_t	size);

boolean_t	zone_page_collectable(
				vm_offset_t	addr,
				vm_size_t	size);

void		zone_page_keep(
				vm_offset_t	addr,
				vm_size_t	size);

void		zalloc_async(
				thread_call_param_t	p0,
				thread_call_param_t	p1);

void		zone_display_zprint( void );

#if	ZONE_DEBUG && MACH_KDB
int		zone_count(
				zone_t		z,
				int		tail);
#endif	/* ZONE_DEBUG && MACH_KDB */

vm_map_t	zone_map = VM_MAP_NULL;

zone_t		zone_zone = ZONE_NULL;	/* the zone containing other zones */

zone_t		zinfo_zone = ZONE_NULL; /* zone of per-task zone info */

/*
 *	The VM system gives us an initial chunk of memory.
 *	It has to be big enough to allocate the zone_zone
 *	all the way through the pmap zone.
 */

vm_offset_t	zdata;
vm_size_t	zdata_size;

#define lock_zone(zone)					\
MACRO_BEGIN						\
	lck_mtx_lock_spin(&(zone)->lock);		\
MACRO_END

#define unlock_zone(zone)				\
MACRO_BEGIN						\
	lck_mtx_unlock(&(zone)->lock);			\
MACRO_END

#define zone_wakeup(zone) thread_wakeup((event_t)(zone))
#define zone_sleep(zone)				\
	(void) lck_mtx_sleep(&(zone)->lock, LCK_SLEEP_SPIN, (event_t)(zone), THREAD_UNINT);


#define lock_zone_init(zone)				\
MACRO_BEGIN						\
	char _name[32];					\
	(void) snprintf(_name, sizeof (_name), "zone.%s", (zone)->zone_name); \
	lck_grp_attr_setdefault(&(zone)->lock_grp_attr);		\
	lck_grp_init(&(zone)->lock_grp, _name, &(zone)->lock_grp_attr); \
	lck_attr_setdefault(&(zone)->lock_attr);			\
	lck_mtx_init_ext(&(zone)->lock, &(zone)->lock_ext,		\
	    &(zone)->lock_grp, &(zone)->lock_attr);			\
MACRO_END

#define lock_try_zone(zone)	lck_mtx_try_lock_spin(&zone->lock)

/*
 *	Garbage collection map information
 */
#define ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE (32)
struct zone_page_table_entry * volatile zone_page_table[ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE];
vm_size_t	zone_page_table_used_size;
vm_offset_t	zone_map_min_address;
vm_offset_t	zone_map_max_address;
unsigned int	zone_pages;
unsigned int	zone_page_table_second_level_size;			/* power of 2 */
unsigned int	zone_page_table_second_level_shift_amount;

#define zone_page_table_first_level_slot(x)  ((x) >> zone_page_table_second_level_shift_amount)
#define zone_page_table_second_level_slot(x) ((x) & (zone_page_table_second_level_size - 1))

void	zone_page_table_expand(zone_page_index_t pindex);
struct zone_page_table_entry *zone_page_table_lookup(zone_page_index_t pindex);

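/*
 * Worked example (illustrative, not from the original source): with a
 * hypothetical zone_page_table_second_level_size of 2048 entries (so
 * zone_page_table_second_level_shift_amount == 11), a page index of 5000
 * resolves as
 *
 *	zone_page_table_first_level_slot(5000)  == 5000 >> 11        == 2
 *	zone_page_table_second_level_slot(5000) == 5000 & (2048 - 1) == 904
 *
 * i.e. the entry lives at zone_page_table[2][904] once that second-level
 * array has been populated by zone_page_table_expand().
 */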
/*
 *	Exclude more than one concurrent garbage collection
 */
decl_lck_mtx_data(,		zone_gc_lock)

lck_attr_t      zone_lck_attr;
lck_grp_t       zone_lck_grp;
lck_grp_attr_t  zone_lck_grp_attr;
lck_mtx_ext_t   zone_lck_ext;


#if	!ZONE_ALIAS_ADDR
#define from_zone_map(addr, size) \
	((vm_offset_t)(addr) >= zone_map_min_address && \
	 ((vm_offset_t)(addr) + size - 1) < zone_map_max_address)
#else
#define from_zone_map(addr, size) \
	((vm_offset_t)(zone_virtual_addr((vm_map_address_t)addr)) >= zone_map_min_address && \
	 ((vm_offset_t)(zone_virtual_addr((vm_map_address_t)addr)) + size - 1) < zone_map_max_address)
#endif

/*
 *	Protects first_zone, last_zone, num_zones,
 *	and the next_zone field of zones.
 */
decl_simple_lock_data(,	all_zones_lock)
zone_t			first_zone;
zone_t			*last_zone;
unsigned int		num_zones;

boolean_t zone_gc_allowed = TRUE;
boolean_t zone_gc_forced = FALSE;
boolean_t panic_include_zprint = FALSE;
boolean_t zone_gc_allowed_by_time_throttle = TRUE;

/*
 * Zone leak debugging code
 *
 * When enabled, this code keeps a log to track allocations to a particular zone that have not
 * yet been freed.  Examining this log will reveal the source of a zone leak.  The log is allocated
 * only when logging is enabled, so there is no effect on the system when it's turned off.  Logging is
 * off by default.
 *
 * Enable the logging via the boot-args.  Add the parameter "zlog=<zone>" to boot-args where <zone>
 * is the name of the zone you wish to log.
 *
 * This code only tracks one zone, so you need to identify which one is leaking first.
 * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone
 * garbage collector.  Note that the zone name printed in the panic message is not necessarily the one
 * containing the leak.  So do a zprint from gdb and locate the zone with the bloated size.  This
 * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test.  The
 * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs.
 * See the help in the kgmacros for usage info.
 *
 *
 * Zone corruption logging
 *
 * Logging can also be used to help identify the source of a zone corruption.  First, identify the zone
 * that is being corrupted, then add "-zc zlog=<zone name>" to the boot-args.  When -zc is used in conjunction
 * with zlog, it changes the logging style to track both allocations and frees to the zone.  So when the
 * corruption is detected, examining the log will show you the stack traces of the callers who last allocated
 * and freed any particular element in the zone.  Use the findelem kgmacro with the address of the element that's been
 * corrupted to examine its history.  This should lead to the source of the corruption.
 */

static int log_records;	/* size of the log, expressed in number of records */

#define MAX_ZONE_NAME	32	/* max length of a zone name we can take from the boot-args */

static char zone_name_to_log[MAX_ZONE_NAME] = "";	/* the zone name we're logging, if any */

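/*
 * Example (illustrative only; the zone name is hypothetical): to log the
 * outstanding allocations of a zone named "vm objects" with an 8000-record
 * log, one would boot with
 *
 *	zlog=vm.objects zrecs=8000
 *
 * using a '.' wherever the zone name contains a space, since spaces cannot
 * be passed in boot-args (see log_this_zone() below).
 */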
/*
 * The number of records in the log is configurable via the zrecs parameter in boot-args.  Set this to
 * the number of records you want in the log.  For example, "zrecs=1000" sets it to 1000 records.  Note
 * that the larger the size of the log, the slower the system will run due to linear searching in the log,
 * but one doesn't generally care about performance when tracking down a leak.  The log is capped at
 * ZRECORDS_MAX records since going much larger than this tends to make the system unresponsive and
 * unbootable on small memory configurations.  The default value is 4000 records.
 */
#if	defined(__LP64__)
#define ZRECORDS_MAX 		16000		/* Max records allowed in the log */
#else
#define ZRECORDS_MAX 		8000		/* Max records allowed in the log */
#endif
#define ZRECORDS_DEFAULT	4000		/* default records in log if zrecs is not specified in boot-args */

/*
 * Each record in the log contains a pointer to the zone element it refers to, a "time" number that allows
 * the records to be ordered chronologically, and a small array to hold the pc's from the stack trace.  A
 * record is added to the log each time a zalloc() is done in the zone_of_interest.  For leak debugging,
 * the record is cleared when a zfree() is done.  For corruption debugging, the log tracks both allocs and frees.
 * If the log fills, old records are replaced as if it were a circular buffer.
 */

struct zrecord {
	void		*z_element;		/* the element that was zalloc'ed or zfree'ed */
	uint32_t	z_opcode:1,		/* whether it was a zalloc or zfree */
			z_time:31;		/* time index when operation was done */
	void		*z_pc[MAX_ZTRACE_DEPTH];	/* stack trace of caller */
};

/*
 * Opcodes for the z_opcode field:
 */

#define ZOP_ALLOC	1
#define ZOP_FREE	0

/*
 * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest
 */

static struct zrecord *zrecords;		/* the log itself, dynamically allocated when logging is enabled */
static int zcurrent  = 0;			/* index of the next slot in the log to use */
static int zrecorded = 0;			/* number of allocations recorded in the log */
static unsigned int ztime = 0;			/* a timestamp of sorts */
static zone_t zone_of_interest = NULL;		/* the zone being watched; corresponds to zone_name_to_log */

/*
 * Decide if we want to log this zone by doing a string compare between a zone name and the name
 * of the zone to log.  Return true if the strings are equal, false otherwise.  Because it's not
 * possible to include spaces in strings passed in via the boot-args, a period in the logname will
 * match a space in the zone name.
 */

static int
log_this_zone(const char *zonename, const char *logname)
{
	int len;
	const char *zc = zonename;
	const char *lc = logname;

	/*
	 * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
	 */

	for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {

		/*
		 * If the current characters don't match, check for a space in
		 * the zone name and a corresponding period in the log name.
		 * If that's not there, then the strings don't match.
		 */

		if (*zc != *lc && !(*zc == ' ' && *lc == '.'))
			break;

		/*
		 * The strings are equal so far.  If we're at the end, then it's a match.
		 */

		if (*zc == '\0')
			return TRUE;
	}

	return FALSE;
}

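/*
 * Example (illustrative only): with the period-for-space convention above,
 *
 *	log_this_zone("vm objects", "vm.objects")	evaluates to TRUE
 *	log_this_zone("vm objects", "vm_objects")	evaluates to FALSE
 *
 * so a boot-arg of zlog=vm.objects selects a zone named "vm objects".
 */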

/*
 * Test if we want to log this zalloc/zfree event.  We log if this is the zone we're interested in and
 * the buffer for the records has been allocated.
 */

#define DO_LOGGING(z)		(zrecords && (z) == zone_of_interest)

extern boolean_t zlog_ready;

#if CONFIG_ZLEAKS
#pragma mark -
#pragma mark Zone Leak Detection

/*
 * The zone leak detector, abbreviated 'zleak', keeps track of a subset of the currently outstanding
 * allocations made by the zone allocator.  Every z_sample_factor allocations in each zone, we capture a
 * backtrace.  Every free, we examine the table and determine if the allocation was being tracked,
 * and stop tracking it if it was being tracked.
 *
 * We track the allocations in the zallocations hash table, which stores the address that was returned from
 * the zone allocator.  Each stored entry in the zallocations table points to an entry in the ztraces table, which
 * stores the backtrace associated with that allocation.  This provides uniquing for the relatively large
 * backtraces - we don't store them more than once.
 *
 * Data collection begins when the zone map is 50% full, and only occurs for zones that are taking up
 * a large amount of virtual space.
 */
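/*
 * Illustrative sketch (not from the original source) of how the two tables
 * relate: each sampled allocation occupies one zallocations bucket, keyed by
 * the returned address, and points at a ztraces bucket whose zt_size field
 * accumulates the bytes attributed to that backtrace.  Roughly:
 *
 *	zallocations[hashaddr(addr, zleak_alloc_buckets)]
 *	    .za_element     = addr
 *	    .za_size        = allocation_size
 *	    .za_trace_index = hashbacktrace(bt, depth, zleak_trace_buckets)
 *
 *	ztraces[za_trace_index].zt_size += allocation_size	(on alloc)
 *	ztraces[za_trace_index].zt_size -= allocation_size	(on free)
 *
 * top_ztrace tracks the trace with the largest zt_size so the panic path can
 * report the most likely leaker.
 */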
#define ZLEAK_STATE_ENABLED		0x01	/* Zone leak monitoring should be turned on if zone_map fills up. */
#define ZLEAK_STATE_ACTIVE		0x02	/* We are actively collecting traces. */
#define ZLEAK_STATE_ACTIVATING		0x04	/* Some thread is doing setup; others should move along. */
#define ZLEAK_STATE_FAILED		0x08	/* Attempt to allocate tables failed.  We will not try again. */
uint32_t	zleak_state = 0;		/* State of collection, as above */

boolean_t	panic_include_ztrace	= FALSE;	/* Enable zleak logging on panic */
vm_size_t	zleak_global_tracking_threshold;	/* Size of zone map at which to start collecting data */
vm_size_t	zleak_per_zone_tracking_threshold;	/* Size a zone will have before we will collect data on it */
unsigned int	z_sample_factor	= 1000;			/* Allocations per sample attempt */

/*
 * Counters for allocation statistics.
 */

/* Times two active records want to occupy the same spot */
unsigned int z_alloc_collisions = 0;
unsigned int z_trace_collisions = 0;

/* Times a new record lands on a spot previously occupied by a freed allocation */
unsigned int z_alloc_overwrites = 0;
unsigned int z_trace_overwrites = 0;

/* Times a new alloc or trace is put into the hash table */
unsigned int z_alloc_recorded	= 0;
unsigned int z_trace_recorded	= 0;

/* Times zleak_log returned false due to not being able to acquire the lock */
unsigned int z_total_conflicts	= 0;


#pragma mark struct zallocation
/*
 * Structure for keeping track of an allocation
 * An allocation bucket is in use if its element is not NULL
 */
struct zallocation {
	uintptr_t		za_element;		/* the element that was zalloc'ed or zfree'ed, NULL if bucket unused */
	vm_size_t		za_size;		/* how much memory did this allocation take up? */
	uint32_t		za_trace_index;		/* index into ztraces for backtrace associated with allocation */
	/* TODO: #if this out */
	uint32_t		za_hit_count;		/* for determining effectiveness of hash function */
};

/* Size must be a power of two for the zhash to be able to just mask off bits instead of mod */
#define ZLEAK_ALLOCATION_MAP_NUM	16384
#define ZLEAK_TRACE_MAP_NUM		8192

uint32_t zleak_alloc_buckets = ZLEAK_ALLOCATION_MAP_NUM;
uint32_t zleak_trace_buckets = ZLEAK_TRACE_MAP_NUM;

vm_size_t zleak_max_zonemap_size;

/* Hashmaps of allocations and their corresponding traces */
static struct zallocation*	zallocations;
static struct ztrace*		ztraces;

/* not static so that panic can see this, see kern/debug.c */
struct ztrace*			top_ztrace;

/* Lock to protect zallocations, ztraces, and top_ztrace from concurrent modification. */
static lck_mtx_t		zleak_lock;
static lck_attr_t		zleak_lock_attr;
static lck_grp_t		zleak_lock_grp;
static lck_grp_attr_t		zleak_lock_grp_attr;

/*
 * Initializes the zone leak monitor.  Called from zone_init()
 */
static void
zleak_init(vm_size_t max_zonemap_size)
{
	char			scratch_buf[16];
	boolean_t		zleak_enable_flag = FALSE;

	zleak_max_zonemap_size = max_zonemap_size;
	zleak_global_tracking_threshold = max_zonemap_size / 2;
	zleak_per_zone_tracking_threshold = zleak_global_tracking_threshold / 8;

	/* -zleakoff (flag to disable zone leak monitor) */
	if (PE_parse_boot_argn("-zleakoff", scratch_buf, sizeof(scratch_buf))) {
		zleak_enable_flag = FALSE;
		printf("zone leak detection disabled\n");
	} else {
		zleak_enable_flag = TRUE;
		printf("zone leak detection enabled\n");
	}

	/* zfactor=XXXX (override how often to sample the zone allocator) */
	if (PE_parse_boot_argn("zfactor", &z_sample_factor, sizeof(z_sample_factor))) {
		printf("Zone leak factor override:%u\n", z_sample_factor);
	}

	/* zleak-allocs=XXXX (override number of buckets in zallocations) */
	if (PE_parse_boot_argn("zleak-allocs", &zleak_alloc_buckets, sizeof(zleak_alloc_buckets))) {
		printf("Zone leak alloc buckets override:%u\n", zleak_alloc_buckets);
		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
		if (zleak_alloc_buckets == 0 || (zleak_alloc_buckets & (zleak_alloc_buckets-1))) {
			printf("Override isn't a power of two, bad things might happen!");
		}
	}

	/* zleak-traces=XXXX (override number of buckets in ztraces) */
	if (PE_parse_boot_argn("zleak-traces", &zleak_trace_buckets, sizeof(zleak_trace_buckets))) {
		printf("Zone leak trace buckets override:%u\n", zleak_trace_buckets);
		/* uses 'is power of 2' trick: (0x01000 & 0x00FFF == 0) */
		if (zleak_trace_buckets == 0 || (zleak_trace_buckets & (zleak_trace_buckets-1))) {
			printf("Override isn't a power of two, bad things might happen!");
		}
	}

	/* allocate the zleak_lock */
	lck_grp_attr_setdefault(&zleak_lock_grp_attr);
	lck_grp_init(&zleak_lock_grp, "zleak_lock", &zleak_lock_grp_attr);
	lck_attr_setdefault(&zleak_lock_attr);
	lck_mtx_init(&zleak_lock, &zleak_lock_grp, &zleak_lock_attr);

	if (zleak_enable_flag) {
		zleak_state = ZLEAK_STATE_ENABLED;
	}
}

#if CONFIG_ZLEAKS

/*
 * Support for kern.zleak.active sysctl - a simplified
 * version of the zleak_state variable.
 */
int
get_zleak_state(void)
{
	if (zleak_state & ZLEAK_STATE_FAILED)
		return (-1);
	if (zleak_state & ZLEAK_STATE_ACTIVE)
		return (1);
	return (0);
}

#endif


kern_return_t
zleak_activate(void)
{
	kern_return_t retval;
	vm_size_t z_alloc_size = zleak_alloc_buckets * sizeof(struct zallocation);
	vm_size_t z_trace_size = zleak_trace_buckets * sizeof(struct ztrace);
	void *allocations_ptr = NULL;
	void *traces_ptr = NULL;

	/* Only one thread attempts to activate at a time */
	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
		return KERN_SUCCESS;
	}

	/* Indicate that we're doing the setup */
	lck_mtx_lock_spin(&zleak_lock);
	if (zleak_state & (ZLEAK_STATE_ACTIVE | ZLEAK_STATE_ACTIVATING | ZLEAK_STATE_FAILED)) {
		lck_mtx_unlock(&zleak_lock);
		return KERN_SUCCESS;
	}

	zleak_state |= ZLEAK_STATE_ACTIVATING;
	lck_mtx_unlock(&zleak_lock);

	/* Allocate and zero tables */
	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&allocations_ptr, z_alloc_size);
	if (retval != KERN_SUCCESS) {
		goto fail;
	}

	retval = kmem_alloc_kobject(kernel_map, (vm_offset_t*)&traces_ptr, z_trace_size);
	if (retval != KERN_SUCCESS) {
		goto fail;
	}

	bzero(allocations_ptr, z_alloc_size);
	bzero(traces_ptr, z_trace_size);

	/* Everything's set.  Install tables, mark active. */
	zallocations = allocations_ptr;
	ztraces = traces_ptr;

	/*
	 * Initialize the top_ztrace to the first entry in ztraces,
	 * so we don't have to check for null in zleak_log
	 */
	top_ztrace = &ztraces[0];

	/*
	 * Note that we do need a barrier between installing
	 * the tables and setting the active flag, because the zfree()
	 * path accesses the table without a lock if we're active.
	 */
	lck_mtx_lock_spin(&zleak_lock);
	zleak_state |= ZLEAK_STATE_ACTIVE;
	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
	lck_mtx_unlock(&zleak_lock);

	return 0;

fail:
	/*
	 * If we fail to allocate memory, don't further tax
	 * the system by trying again.
	 */
	lck_mtx_lock_spin(&zleak_lock);
	zleak_state |= ZLEAK_STATE_FAILED;
	zleak_state &= ~ZLEAK_STATE_ACTIVATING;
	lck_mtx_unlock(&zleak_lock);

	if (allocations_ptr != NULL) {
		kmem_free(kernel_map, (vm_offset_t)allocations_ptr, z_alloc_size);
	}

	if (traces_ptr != NULL) {
		kmem_free(kernel_map, (vm_offset_t)traces_ptr, z_trace_size);
	}

	return retval;
}

/*
 * TODO: What about allocations that never get deallocated,
 * especially ones with unique backtraces?  Should we wait to record
 * until after boot has completed?
 * (How many persistent zallocs are there?)
 */

/*
 * This function records the allocation in the allocations table,
 * and stores the associated backtrace in the traces table
 * (or just increments the refcount if the trace is already recorded)
 * If the allocation slot is in use, the old allocation is replaced with the new allocation, and
 * the associated trace's refcount is decremented.
 * If the trace slot is in use, it returns.
 * The refcount is incremented by the amount of memory the allocation consumes.
 * The return value indicates whether to try again next time.
 */
static boolean_t
zleak_log(uintptr_t* bt,
		  uintptr_t addr,
		  uint32_t depth,
		  vm_size_t allocation_size)
{
	/* Quit if there's someone else modifying the hash tables */
	if (!lck_mtx_try_lock_spin(&zleak_lock)) {
		z_total_conflicts++;
		return FALSE;
	}

	struct zallocation* allocation	= &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	uint32_t trace_index = hashbacktrace(bt, depth, zleak_trace_buckets);
	struct ztrace* trace = &ztraces[trace_index];

	allocation->za_hit_count++;
	trace->zt_hit_count++;

	/*
	 * If the allocation bucket we want to be in is occupied, and if the occupier
	 * has the same trace as us, just bail.
	 */
	if (allocation->za_element != (uintptr_t) 0 && trace_index == allocation->za_trace_index) {
		z_alloc_collisions++;

		lck_mtx_unlock(&zleak_lock);
		return TRUE;
	}

	/* STEP 1: Store the backtrace in the traces array. */
	/* A size of zero indicates that the trace bucket is free. */

	if (trace->zt_size > 0 && bcmp(trace->zt_stack, bt, (depth * sizeof(uintptr_t))) != 0 ) {
		/*
		 * Different unique trace with same hash!
		 * Just bail - if we're trying to record the leaker, hopefully the other trace will be deallocated
		 * and get out of the way for later chances
		 */
		trace->zt_collisions++;
		z_trace_collisions++;

		lck_mtx_unlock(&zleak_lock);
		return TRUE;
	} else if (trace->zt_size > 0) {
		/* Same trace, already added, so increment refcount */
		trace->zt_size += allocation_size;
	} else {
		/* Found an unused trace bucket, record the trace here! */
		if (trace->zt_depth != 0) /* if this slot was previously used but not currently in use */
			z_trace_overwrites++;

		z_trace_recorded++;
		trace->zt_size			= allocation_size;
		memcpy(trace->zt_stack, bt, (depth * sizeof(uintptr_t)) );

		trace->zt_depth		= depth;
		trace->zt_collisions	= 0;
	}

	/* STEP 2: Store the allocation record in the allocations array. */

	if (allocation->za_element != (uintptr_t) 0) {
		/*
		 * Straight up replace any allocation record that was there.  We don't want to do the work
		 * to preserve the allocation entries that were there, because we only record a subset of the
		 * allocations anyways.
		 */

		z_alloc_collisions++;

		struct ztrace* associated_trace = &ztraces[allocation->za_trace_index];
		/* Knock off old allocation's size, not the new allocation */
		associated_trace->zt_size -= allocation->za_size;
	} else if (allocation->za_trace_index != 0) {
		/* Slot previously used but not currently in use */
		z_alloc_overwrites++;
	}

	allocation->za_element		= addr;
	allocation->za_trace_index	= trace_index;
	allocation->za_size		= allocation_size;

	z_alloc_recorded++;

	if (top_ztrace->zt_size < trace->zt_size)
		top_ztrace = trace;

	lck_mtx_unlock(&zleak_lock);
	return TRUE;
}

/*
 * Free the allocation record and release the stacktrace.
 * This should be as fast as possible because it will be called for every free.
 */
static void
zleak_free(uintptr_t addr,
		   vm_size_t allocation_size)
{
	if (addr == (uintptr_t) 0)
		return;

	struct zallocation* allocation = &zallocations[hashaddr(addr, zleak_alloc_buckets)];

	/* Double-checked locking: check to find out if we're interested, lock, check to make
	 * sure it hasn't changed, then modify it, and release the lock.
	 */

	if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
		/* if the allocation was the one, grab the lock, check again, then delete it */
		lck_mtx_lock_spin(&zleak_lock);

		if (allocation->za_element == addr && allocation->za_trace_index < zleak_trace_buckets) {
			struct ztrace *trace;

			/* allocation_size had better match what was passed into zleak_log - otherwise someone is freeing into the wrong zone! */
			if (allocation->za_size != allocation_size) {
				panic("Freeing as size %lu memory that was allocated with size %lu\n",
						(uintptr_t)allocation_size, (uintptr_t)allocation->za_size);
			}

			trace = &ztraces[allocation->za_trace_index];

			/* size of 0 indicates trace bucket is unused */
			if (trace->zt_size > 0) {
				trace->zt_size -= allocation_size;
			}

			/* A NULL element means the allocation bucket is unused */
			allocation->za_element = 0;
		}
		lck_mtx_unlock(&zleak_lock);
	}
}

#endif /* CONFIG_ZLEAKS */

/* These functions live outside of CONFIG_ZLEAKS because they are also used in
 * mbuf.c for mbuf leak-detection.  This is why they lack the z_ prefix.
 */

/*
 * This function captures a backtrace from the current stack and
 * returns the number of frames captured, limited by max_frames.
 * It's fast because it does no checking to make sure there isn't bad data.
 * Since it's only called from threads that we're going to keep executing,
 * if there's bad data we were going to die eventually anyway.
 * This seems to work for x86 and x86_64.
 * ARMTODO: Test it on ARM, I think it will work but I can't test it.  If it works, remove the ifdef.
 * If this function is inlined, it doesn't record the frame of the function it's inside.
 * (because there's no stack frame!)
 */
uint32_t
fastbacktrace(uintptr_t* bt, uint32_t max_frames)
{
#if defined(__x86_64__) || defined(__i386__)
	uintptr_t* frameptr = NULL, *frameptr_next = NULL;
	uintptr_t retaddr = 0;
	uint32_t frame_index = 0, frames = 0;
	uintptr_t kstackb, kstackt;

	kstackb = current_thread()->kernel_stack;
	kstackt = kstackb + kernel_stack_size;
	/* Load stack frame pointer (EBP on x86) into frameptr */
	frameptr = __builtin_frame_address(0);

	while (frameptr != NULL && frame_index < max_frames ) {
		/* Next frame pointer is pointed to by the previous one */
		frameptr_next = (uintptr_t*) *frameptr;

		/* Bail if we see a zero in the stack frame, that means we've reached the top of the stack */
		/* That also means the return address is worthless, so don't record it */
		if (frameptr_next == NULL)
			break;
		/* Verify thread stack bounds */
		if (((uintptr_t)frameptr_next > kstackt) || ((uintptr_t)frameptr_next < kstackb))
			break;
		/* Pull return address from one spot above the frame pointer */
		retaddr = *(frameptr + 1);

		/* Store it in the backtrace array */
		bt[frame_index++] = retaddr;

		frameptr = frameptr_next;
	}

	/* Save the number of frames captured for return value */
	frames = frame_index;

	/* Fill in the rest of the backtrace with zeros */
	while (frame_index < max_frames)
		bt[frame_index++] = 0;

	return frames;
#else
	return OSBacktrace((void*)bt, max_frames);
#endif
}

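/*
 * Usage sketch (illustrative only): a caller typically captures into a
 * fixed-size array and passes the returned frame count along, e.g.
 *
 *	uintptr_t bt[MAX_ZTRACE_DEPTH];
 *	uint32_t depth = fastbacktrace(bt, MAX_ZTRACE_DEPTH);
 *	// bt[0..depth-1] now hold return addresses; the rest are zeroed.
 *
 * This mirrors how zalloc_canblock() captures zbt[] for zone leak detection.
 */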
973/* "Thomas Wang's 32/64 bit mix functions." http://www.concentric.net/~Ttwang/tech/inthash.htm */
974uintptr_t
975hash_mix(uintptr_t x)
976{
977#ifndef __LP64__
978 x += ~(x << 15);
979 x ^= (x >> 10);
980 x += (x << 3 );
981 x ^= (x >> 6 );
982 x += ~(x << 11);
983 x ^= (x >> 16);
984#else
985 x += ~(x << 32);
986 x ^= (x >> 22);
987 x += ~(x << 13);
988 x ^= (x >> 8 );
989 x += (x << 3 );
990 x ^= (x >> 15);
991 x += ~(x << 27);
992 x ^= (x >> 31);
993#endif
994 return x;
995}

uint32_t
hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
{

	uintptr_t hash = 0;
	uintptr_t mask = max_size - 1;

	while (--depth) {
		hash += bt[depth];
	}

	hash = hash_mix(hash) & mask;

	assert(hash < max_size);

	return (uint32_t) hash;
}

/*
 *  TODO: Determine how well distributed this is
 *  max_size must be a power of 2, e.g. 0x10000, because 0x10000-1 is 0x0FFFF, which is a great bitmask
 */
uint32_t
hashaddr(uintptr_t pt, uint32_t max_size)
{
	uintptr_t hash = 0;
	uintptr_t mask = max_size - 1;

	hash = hash_mix(pt) & mask;

	assert(hash < max_size);

	return (uint32_t) hash;
}

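/*
 * Worked example (illustrative only): with the default
 * ZLEAK_ALLOCATION_MAP_NUM of 16384 buckets, mask == 16383 == 0x3FFF, so
 *
 *	hashaddr(addr, 16384) == hash_mix(addr) & 0x3FFF
 *
 * which always falls in [0, 16383] with no modulo needed.  This is why the
 * bucket counts (and any boot-arg overrides) must be powers of two.
 */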
/* End of all leak-detection code */
#pragma mark -

/*
 *	zinit initializes a new zone.  The zone data structures themselves
 *	are stored in a zone, which is initially a static structure that
 *	is initialized by zone_init.
 */
zone_t
zinit(
	vm_size_t	size,		/* the size of an element */
	vm_size_t	max,		/* maximum memory to use */
	vm_size_t	alloc,		/* allocation size */
	const char	*name)		/* a name for the zone */
{
	zone_t		z;

	if (zone_zone == ZONE_NULL) {

		z = (struct zone *)zdata;
		zdata += sizeof(*z);
		zdata_size -= sizeof(*z);
	} else
		z = (zone_t) zalloc(zone_zone);
	if (z == ZONE_NULL)
		return(ZONE_NULL);

	/*
	 * Round off all the parameters appropriately.
	 */
	if (size < sizeof(z->free_elements))
		size = sizeof(z->free_elements);
	size = ((size-1)  + sizeof(z->free_elements)) -
		((size-1) % sizeof(z->free_elements));
	if (alloc == 0)
		alloc = PAGE_SIZE;
	alloc = round_page(alloc);
	max   = round_page(max);
	/*
	 * we look for an allocation size with less than 1% waste
	 * up to 5 pages in size...
	 * otherwise, we look for an allocation size with least fragmentation
	 * in the range of 1 - 5 pages
	 * This size will be used unless
	 * the user suggestion is larger AND has less fragmentation
	 */
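	/*
	 * Worked example (illustrative only, assuming 4K pages): for a
	 * 192-byte element the loop below considers
	 *
	 *	1 page:  4096 % 192  == 64 waste   (not < 40, i.e. not < 1%)
	 *	2 pages: 8192 % 192  == 128 waste  (not < 81)
	 *	3 pages: 12288 % 192 == 0 waste    (< 122, accepted)
	 *
	 * so alloc becomes 3 pages, holding exactly 64 elements per chunk.
	 */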
#if	ZONE_ALIAS_ADDR
	if ((size < PAGE_SIZE) && (PAGE_SIZE % size <= PAGE_SIZE / 10))
		alloc = PAGE_SIZE;
	else
#endif
#if	defined(__LP64__)
		if (((alloc % size) != 0) || (alloc > PAGE_SIZE * 8))
#endif
		{
		vm_size_t best, waste; unsigned int i;
		best  = PAGE_SIZE;
		waste = best % size;

		for (i = 1; i <= 5; i++) {
			vm_size_t tsize, twaste;

			tsize = i * PAGE_SIZE;

			if ((tsize % size) < (tsize / 100)) {
				alloc = tsize;
				goto use_this_allocation;
			}
			twaste = tsize % size;
			if (twaste < waste)
				best = tsize, waste = twaste;
		}
		if (alloc <= best || (alloc % size >= waste))
			alloc = best;
	}
use_this_allocation:
	if (max && (max < alloc))
		max = alloc;

	z->free_elements = 0;
	z->cur_size = 0;
	z->max_size = max;
	z->elem_size = size;
	z->alloc_size = alloc;
	z->zone_name = name;
	z->count = 0;
	z->sum_count = 0LL;
	z->doing_alloc = FALSE;
	z->doing_gc = FALSE;
	z->exhaustible = FALSE;
	z->collectable = TRUE;
	z->allows_foreign = FALSE;
	z->expandable  = TRUE;
	z->waiting = FALSE;
	z->async_pending = FALSE;
	z->caller_acct = TRUE;
	z->noencrypt = FALSE;
	z->no_callout = FALSE;
	z->async_prio_refill = FALSE;
	z->prio_refill_watermark = 0;
	z->zone_replenish_thread = NULL;
#if CONFIG_ZLEAKS
	z->num_allocs = 0;
	z->num_frees = 0;
	z->zleak_capture = 0;
	z->zleak_on = FALSE;
#endif /* CONFIG_ZLEAKS */

#if	ZONE_DEBUG
	z->active_zones.next = z->active_zones.prev = NULL;
	zone_debug_enable(z);
#endif	/* ZONE_DEBUG */
	lock_zone_init(z);

	/*
	 *	Add the zone to the all-zones list.
	 *	If we are tracking zone info per task, and we have
	 *	already used all the available stat slots, then keep
	 *	using the overflow zone slot.
	 */
	z->next_zone = ZONE_NULL;
	thread_call_setup(&z->call_async_alloc, zalloc_async, z);
	simple_lock(&all_zones_lock);
	*last_zone = z;
	last_zone = &z->next_zone;
	z->index = num_zones;
	if (zinfo_per_task) {
		if (num_zones > ZONES_MAX)
			z->index = ZONES_MAX;
	}
	num_zones++;
	simple_unlock(&all_zones_lock);

	/*
	 * Check if we should be logging this zone.  If so, remember the zone pointer.
	 */

	if (log_this_zone(z->zone_name, zone_name_to_log)) {
		zone_of_interest = z;
	}

	/*
	 * If we want to log a zone, see if we need to allocate buffer space for the log.  Some vm related zones are
	 * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case.  zlog_ready is set to
	 * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work.  If we want to log one
	 * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again
	 * later on some other zone.  So note we may be allocating a buffer to log a zone other than the one being initialized
	 * right now.
	 */

	if (zone_of_interest != NULL && zrecords == NULL && zlog_ready) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&zrecords, log_records * sizeof(struct zrecord)) == KERN_SUCCESS) {

			/*
			 * We got the memory for the log.  Zero it out since the code needs this to identify unused records.
			 * At this point, everything is set up and we're ready to start logging this zone.
			 */

			bzero((void *)zrecords, log_records * sizeof(struct zrecord));
			printf("zone: logging started for zone %s (%p)\n", zone_of_interest->zone_name, zone_of_interest);

		} else {
			printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n");
			zone_of_interest = NULL;
		}
	}

	return(z);
}
unsigned	zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated;

static void zone_replenish_thread(zone_t);

/* High priority VM privileged thread used to asynchronously refill a designated
 * zone, such as the reserved VM map entry zone.
 */
static void zone_replenish_thread(zone_t z) {
	vm_size_t free_size;
	current_thread()->options |= TH_OPT_VMPRIV;

	for (;;) {
		lock_zone(z);
		assert(z->prio_refill_watermark != 0);
		while ((free_size = (z->cur_size - (z->count * z->elem_size))) < (z->prio_refill_watermark * z->elem_size)) {
			assert(z->doing_alloc == FALSE);
			assert(z->async_prio_refill == TRUE);

			unlock_zone(z);
			int	zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
			vm_offset_t space, alloc_size;
			kern_return_t kr;

			if (vm_pool_low())
				alloc_size = round_page(z->elem_size);
			else
				alloc_size = z->alloc_size;

			if (z->noencrypt)
				zflags |= KMA_NOENCRYPT;

			kr = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags);

			if (kr == KERN_SUCCESS) {
#if	ZONE_ALIAS_ADDR
				if (alloc_size == PAGE_SIZE)
					space = zone_alias_addr(space);
#endif
				zcram(z, space, alloc_size);
			} else if (kr == KERN_RESOURCE_SHORTAGE) {
				VM_PAGE_WAIT();
			} else if (kr == KERN_NO_SPACE) {
				kr = kernel_memory_allocate(kernel_map, &space, alloc_size, 0, zflags);
				if (kr == KERN_SUCCESS) {
#if	ZONE_ALIAS_ADDR
					if (alloc_size == PAGE_SIZE)
						space = zone_alias_addr(space);
#endif
					zcram(z, space, alloc_size);
				} else {
					assert_wait_timeout(&z->zone_replenish_thread, THREAD_UNINT, 1, 100 * NSEC_PER_USEC);
					thread_block(THREAD_CONTINUE_NULL);
				}
			}

			lock_zone(z);
			zone_replenish_loops++;
		}

		unlock_zone(z);
		assert_wait(&z->zone_replenish_thread, THREAD_UNINT);
		thread_block(THREAD_CONTINUE_NULL);
		zone_replenish_wakeups++;
	}
}

void
zone_prio_refill_configure(zone_t z, vm_size_t low_water_mark) {
	z->prio_refill_watermark = low_water_mark;

	z->async_prio_refill = TRUE;
	OSMemoryBarrier();
	kern_return_t tres = kernel_thread_start_priority((thread_continue_t)zone_replenish_thread, z, MAXPRI_KERNEL, &z->zone_replenish_thread);

	if (tres != KERN_SUCCESS) {
		panic("zone_prio_refill_configure, thread create: 0x%x", tres);
	}

	thread_deallocate(z->zone_replenish_thread);
}
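/*
 * Usage sketch (illustrative only; the zone, element type and watermark are
 * hypothetical): a subsystem that must never block on zone expansion can ask
 * for asynchronous refill once fewer than 64 free elements remain:
 *
 *	zone_t my_zone = zinit(sizeof(struct my_elem), 1024 * 1024,
 *			       16 * 1024, "my elements");
 *	zone_prio_refill_configure(my_zone, 64);
 *
 * The replenish thread above then keeps at least prio_refill_watermark
 * elements' worth of free memory in the zone.
 */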

/*
 *	Cram the given memory into the specified zone.
 */
void
zcram(
	zone_t		zone,
	vm_offset_t	newmem,
	vm_size_t	size)
{
	vm_size_t	elem_size;
	boolean_t	from_zm = FALSE;

	/* Basic sanity checks */
	assert(zone != ZONE_NULL && newmem != (vm_offset_t)0);
	assert(!zone->collectable || zone->allows_foreign
		|| (from_zone_map(newmem, size)));

	elem_size = zone->elem_size;

	if (from_zone_map(newmem, size))
		from_zm = TRUE;

	if (from_zm)
		zone_page_init(newmem, size);

	lock_zone(zone);
	while (size >= elem_size) {
		ADD_TO_ZONE(zone, newmem);
		if (from_zm)
			zone_page_alloc(newmem, elem_size);
		zone->count++;	/* compensate for ADD_TO_ZONE */
		size -= elem_size;
		newmem += elem_size;
		zone->cur_size += elem_size;
	}
	unlock_zone(zone);
}


/*
 *	Steal memory for the zone package.  Called from
 *	vm_page_bootstrap().
 */
void
zone_steal_memory(void)
{
	/* Request enough early memory to get to the pmap zone */
	zdata_size = 12 * sizeof(struct zone);
	zdata = (vm_offset_t)pmap_steal_memory(round_page(zdata_size));
}


/*
 *	Fill a zone with enough memory to contain at least nelem elements.
 *	Memory is obtained with kmem_alloc_kobject from the kernel_map.
 *	Return the number of elements actually put into the zone, which may
 *	be more than the caller asked for since the memory allocation is
 *	rounded up to a full page.
 */
int
zfill(
	zone_t	zone,
	int	nelem)
{
	kern_return_t	kr;
	vm_size_t	size;
	vm_offset_t	memory;
	int		nalloc;

	assert(nelem > 0);
	if (nelem <= 0)
		return 0;
	size = nelem * zone->elem_size;
	size = round_page(size);
	kr = kmem_alloc_kobject(kernel_map, &memory, size);
	if (kr != KERN_SUCCESS)
		return 0;

	zone_change(zone, Z_FOREIGN, TRUE);
	zcram(zone, memory, size);
	nalloc = (int)(size / zone->elem_size);
	assert(nalloc >= nelem);

	return nalloc;
}

/*
 *	Initialize the "zone of zones" which uses fixed memory allocated
 *	earlier in memory initialization.  zone_bootstrap is called
 *	before zone_init.
 */
void
zone_bootstrap(void)
{
	char temp_buf[16];

#if 6094439
	/* enable zone checks by default, to try and catch offenders... */
#if 0
	/* 7968354: turn "-zc" back off */
	check_freed_element = TRUE;
	/* 7995202: turn "-zp" back off */
	zfree_clear = TRUE;
#endif

	/* ... but allow them to be turned off explicitly */
	if (PE_parse_boot_argn("-no_zc", temp_buf, sizeof (temp_buf))) {
		check_freed_element = FALSE;
	}
	if (PE_parse_boot_argn("-no_zp", temp_buf, sizeof (temp_buf))) {
		zfree_clear = FALSE;
	}
#endif

	/* see if we want freed zone element checking and/or poisoning */
	if (PE_parse_boot_argn("-zc", temp_buf, sizeof (temp_buf))) {
		check_freed_element = TRUE;
	}

	if (PE_parse_boot_argn("-zp", temp_buf, sizeof (temp_buf))) {
		zfree_clear = TRUE;
	}

	if (PE_parse_boot_argn("-zinfop", temp_buf, sizeof (temp_buf))) {
		zinfo_per_task = TRUE;
	}

	/*
	 * Check for and set up zone leak detection if requested via boot-args.  We recognize two
	 * boot-args:
	 *
	 *	zlog=<zone_to_log>
	 *	zrecs=<num_records_in_log>
	 *
	 * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to
	 * control the size of the log.  If zrecs is not specified, a default value is used.
	 */

	if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) {
		if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) {

			/*
			 * Don't allow more than ZRECORDS_MAX records even if the user asked for more.
			 * This prevents accidentally hogging too much kernel memory and making the system
			 * unusable.
			 */

			log_records = MIN(ZRECORDS_MAX, log_records);

		} else {
			log_records = ZRECORDS_DEFAULT;
		}
	}

	simple_lock_init(&all_zones_lock, 0);

	first_zone = ZONE_NULL;
	last_zone = &first_zone;
	num_zones = 0;

	/* assertion: nobody else called zinit before us */
	assert(zone_zone == ZONE_NULL);
	zone_zone = zinit(sizeof(struct zone), 128 * sizeof(struct zone),
			  sizeof(struct zone), "zones");
	zone_change(zone_zone, Z_COLLECT, FALSE);
	zone_change(zone_zone, Z_CALLERACCT, FALSE);
	zone_change(zone_zone, Z_NOENCRYPT, TRUE);

	zcram(zone_zone, zdata, zdata_size);

	/* initialize fake zones and zone info if tracking by task */
	if (zinfo_per_task) {
		vm_size_t zisize = sizeof(zinfo_usage_store_t) * ZINFO_SLOTS;
		unsigned int i;

		for (i = 0; i < num_fake_zones; i++)
			fake_zones[i].init(ZINFO_SLOTS - num_fake_zones + i);
		zinfo_zone = zinit(zisize, zisize * CONFIG_TASK_MAX,
				   zisize, "per task zinfo");
		zone_change(zinfo_zone, Z_CALLERACCT, FALSE);
	}
}

void
zinfo_task_init(task_t task)
{
	if (zinfo_per_task) {
		task->tkm_zinfo = zalloc(zinfo_zone);
		memset(task->tkm_zinfo, 0, sizeof(zinfo_usage_store_t) * ZINFO_SLOTS);
	} else {
		task->tkm_zinfo = NULL;
	}
}

void
zinfo_task_free(task_t task)
{
	assert(task != kernel_task);
	if (task->tkm_zinfo != NULL) {
		zfree(zinfo_zone, task->tkm_zinfo);
		task->tkm_zinfo = NULL;
	}
}

void
zone_init(
	vm_size_t max_zonemap_size)
{
	kern_return_t	retval;
	vm_offset_t	zone_min;
	vm_offset_t	zone_max;

	retval = kmem_suballoc(kernel_map, &zone_min, max_zonemap_size,
			       FALSE, VM_FLAGS_ANYWHERE | VM_FLAGS_PERMANENT,
			       &zone_map);

	if (retval != KERN_SUCCESS)
		panic("zone_init: kmem_suballoc failed");
	zone_max = zone_min + round_page(max_zonemap_size);
	/*
	 * Setup garbage collection information:
	 */
	zone_map_min_address = zone_min;
	zone_map_max_address = zone_max;

	zone_pages = (unsigned int)atop_kernel(zone_max - zone_min);
	zone_page_table_used_size = sizeof(zone_page_table);

	zone_page_table_second_level_size = 1;
	zone_page_table_second_level_shift_amount = 0;

	/*
	 * Find the power of 2 for the second level that allows
	 * the first level to fit in ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE
	 * slots.
	 */
	while ((zone_page_table_first_level_slot(zone_pages-1)) >= ZONE_PAGE_TABLE_FIRST_LEVEL_SIZE) {
		zone_page_table_second_level_size <<= 1;
		zone_page_table_second_level_shift_amount++;
	}

	lck_grp_attr_setdefault(&zone_lck_grp_attr);
	lck_grp_init(&zone_lck_grp, "zones", &zone_lck_grp_attr);
	lck_attr_setdefault(&zone_lck_attr);
	lck_mtx_init_ext(&zone_gc_lock, &zone_lck_ext, &zone_lck_grp, &zone_lck_attr);

#if CONFIG_ZLEAKS
	/*
	 * Initialize the zone leak monitor
	 */
	zleak_init(max_zonemap_size);
#endif /* CONFIG_ZLEAKS */
}

void
zone_page_table_expand(zone_page_index_t pindex)
{
	unsigned int first_index;
	struct zone_page_table_entry * volatile * first_level_ptr;

	assert(pindex < zone_pages);

	first_index = zone_page_table_first_level_slot(pindex);
	first_level_ptr = &zone_page_table[first_index];

	if (*first_level_ptr == NULL) {
		/*
		 * We were able to verify the old first-level slot
		 * had NULL, so attempt to populate it.
		 */

		vm_offset_t second_level_array = 0;
		vm_size_t second_level_size = round_page(zone_page_table_second_level_size * sizeof(struct zone_page_table_entry));
		zone_page_index_t i;
		struct zone_page_table_entry *entry_array;

		if (kmem_alloc_kobject(zone_map, &second_level_array,
							   second_level_size) != KERN_SUCCESS) {
			panic("zone_page_table_expand");
		}

		/*
		 * zone_gc() may scan the "zone_page_table" directly,
		 * so make sure any slots have a valid unused state.
		 */
		entry_array = (struct zone_page_table_entry *)second_level_array;
		for (i=0; i < zone_page_table_second_level_size; i++) {
			entry_array[i].alloc_count = ZONE_PAGE_UNUSED;
			entry_array[i].collect_count = 0;
		}

		if (OSCompareAndSwapPtr(NULL, entry_array, first_level_ptr)) {
			/* Old slot was NULL, replaced with expanded level */
			OSAddAtomicLong(second_level_size, &zone_page_table_used_size);
		} else {
			/* Old slot was not NULL, someone else expanded first */
			kmem_free(zone_map, second_level_array, second_level_size);
		}
	} else {
		/* Old slot was not NULL, already been expanded */
	}
}

struct zone_page_table_entry *
zone_page_table_lookup(zone_page_index_t pindex)
{
	unsigned int first_index = zone_page_table_first_level_slot(pindex);
	struct zone_page_table_entry *second_level = zone_page_table[first_index];

	if (second_level) {
		return &second_level[zone_page_table_second_level_slot(pindex)];
	}

	return NULL;
}

extern volatile SInt32 kfree_nop_count;

#pragma mark -
#pragma mark zalloc_canblock

1603/*
1604 * zalloc returns an element from the specified zone.
1605 */
91447636 1606void *
1c79356b
A
1607zalloc_canblock(
1608 register zone_t zone,
1609 boolean_t canblock)
1610{
1611 vm_offset_t addr;
1612 kern_return_t retval;
6d2010ae 1613 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used in zone leak logging and zone leak detection */
c910b4d9 1614 int numsaved = 0;
6d2010ae 1615 int i;
7ddcb079 1616 boolean_t zone_replenish_wakeup = FALSE;
6d2010ae
A
1617
1618#if CONFIG_ZLEAKS
1619 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */
1620#endif /* CONFIG_ZLEAKS */
1c79356b
A
1621
1622 assert(zone != ZONE_NULL);
6d2010ae
A
1623
1624 lock_zone(zone);
1c79356b 1625
c910b4d9
A
1626 /*
1627 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
1628 */
6d2010ae 1629
c910b4d9 1630 if (DO_LOGGING(zone))
6d2010ae
A
1631 numsaved = OSBacktrace((void*) zbt, MAX_ZTRACE_DEPTH);
1632
1633#if CONFIG_ZLEAKS
1634 /*
1635 * Zone leak detection: capture a backtrace every z_sample_factor
1636 * allocations in this zone.
1637 */
1638 if (zone->zleak_on && (zone->zleak_capture++ % z_sample_factor == 0)) {
1639 zone->zleak_capture = 1;
1640
1641 /* Avoid backtracing twice if zone logging is on */
1642 if (numsaved == 0 )
1643 zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
1644 else
1645 zleak_tracedepth = numsaved;
1646 }
1647#endif /* CONFIG_ZLEAKS */
1c79356b
A
1648
1649 REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
0b4e3aa0 1650
7ddcb079
A
1651 if (zone->async_prio_refill &&
1652 ((zone->cur_size - (zone->count * zone->elem_size)) < (zone->prio_refill_watermark * zone->elem_size))) {
1653 zone_replenish_wakeup = TRUE;
1654 zone_replenish_wakeups_initiated++;
a3d08fcd
A
1655 }
1656
0b4e3aa0 1657 while ((addr == 0) && canblock) {
1c79356b
A
1658 /*
1659 * If nothing was there, try to get more
1660 */
1661 if (zone->doing_alloc) {
1c79356b
A
1662 /*
1663 * Someone is allocating memory for this zone.
1664 * Wait for it to show up, then try again.
1665 */
1c79356b 1666 zone->waiting = TRUE;
9bccf70c 1667 zone_sleep(zone);
7ddcb079
A
1668 } else if (zone->doing_gc) {
1669 /* zone_gc() is running. Since we need an element
1670 * from the free list that is currently being
1671 * collected, set the waiting bit and try to
1672 * interrupt the GC process, and try again
1673 * when we obtain the lock.
1674 */
1675 zone->waiting = TRUE;
1676 zone_sleep(zone);
1677 } else {
1678 vm_offset_t space;
1679 vm_size_t alloc_size;
1680 int retry = 0;
1681
1c79356b
A
1682 if ((zone->cur_size + zone->elem_size) >
1683 zone->max_size) {
1684 if (zone->exhaustible)
1685 break;
1686 if (zone->expandable) {
1687 /*
1688 * We're willing to overflow certain
1689 * zones, but not without complaining.
1690 *
1691 * This is best used in conjunction
1692 * with the collectable flag. What we
1693 * want is an assurance we can get the
1694 * memory back, assuming there's no
1695 * leak.
1696 */
1697 zone->max_size += (zone->max_size >> 1);
1698 } else {
1699 unlock_zone(zone);
1700
1c79356b
A
1701 panic("zalloc: zone \"%s\" empty.", zone->zone_name);
1702 }
1703 }
1704 zone->doing_alloc = TRUE;
1705 unlock_zone(zone);
1706
7ddcb079
A
1707 for (;;) {
1708 int zflags = KMA_KOBJECT|KMA_NOPAGEWAIT;
1709
1710 if (vm_pool_low() || retry >= 1)
1711 alloc_size =
1712 round_page(zone->elem_size);
1713 else
1714 alloc_size = zone->alloc_size;
1715
1716 if (zone->noencrypt)
1717 zflags |= KMA_NOENCRYPT;
1718
1719 retval = kernel_memory_allocate(zone_map, &space, alloc_size, 0, zflags);
1720 if (retval == KERN_SUCCESS) {
2d21ac55 1721#if ZONE_ALIAS_ADDR
7ddcb079
A
1722 if (alloc_size == PAGE_SIZE)
1723 space = zone_alias_addr(space);
2d21ac55 1724#endif
7ddcb079 1725
6d2010ae 1726#if CONFIG_ZLEAKS
7ddcb079
A
1727 if ((zleak_state & (ZLEAK_STATE_ENABLED | ZLEAK_STATE_ACTIVE)) == ZLEAK_STATE_ENABLED) {
1728 if (zone_map->size >= zleak_global_tracking_threshold) {
1729 kern_return_t kr;
1730
1731 kr = zleak_activate();
1732 if (kr != KERN_SUCCESS) {
1733 printf("Failed to activate live zone leak debugging (%d).\n", kr);
6d2010ae
A
1734 }
1735 }
55e303ae 1736 }
1c79356b 1737
7ddcb079
A
1738 if ((zleak_state & ZLEAK_STATE_ACTIVE) && !(zone->zleak_on)) {
1739 if (zone->cur_size > zleak_per_zone_tracking_threshold) {
1740 zone->zleak_on = TRUE;
1741 }
1c79356b 1742 }
7ddcb079 1743#endif /* CONFIG_ZLEAKS */
1c79356b 1744
7ddcb079
A
1745 zcram(zone, space, alloc_size);
1746
1747 break;
1748 } else if (retval != KERN_RESOURCE_SHORTAGE) {
1749 retry++;
1750
1751 if (retry == 2) {
1752 zone_gc();
1753 printf("zalloc did gc\n");
1754 zone_display_zprint();
1755 }
1756 if (retry == 3) {
6d2010ae
A
1757 panic_include_zprint = TRUE;
1758#if CONFIG_ZLEAKS
7ddcb079 1759 if ((zleak_state & ZLEAK_STATE_ACTIVE)) {
6d2010ae
A
1760 panic_include_ztrace = TRUE;
1761 }
7ddcb079
A
1762#endif /* CONFIG_ZLEAKS */
1763 /* TODO: Change this to something more descriptive, perhaps
1764 * 'zone_map exhausted' only if we get retval 3 (KERN_NO_SPACE).
1765 */
1766 panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count);
6d2010ae 1767 }
7ddcb079
A
1768 } else {
1769 break;
1c79356b
A
1770 }
1771 }
7ddcb079
A
1772 lock_zone(zone);
1773 zone->doing_alloc = FALSE;
1774 if (zone->waiting) {
1775 zone->waiting = FALSE;
1776 zone_wakeup(zone);
1777 }
1778 REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
1779 if (addr == 0 &&
1780 retval == KERN_RESOURCE_SHORTAGE) {
1781 unlock_zone(zone);
1782
1783 VM_PAGE_WAIT();
1784 lock_zone(zone);
1785 }
1c79356b
A
1786 }
1787 if (addr == 0)
1788 REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
1789 }
1790
6d2010ae
A
1791#if CONFIG_ZLEAKS
1792 /* Zone leak detection:
1793 * If we're sampling this allocation, add it to the zleaks hash table.
1794 */
1795 if (addr && zleak_tracedepth > 0) {
1796 /* Sampling can fail if another sample is happening at the same time in a different zone. */
1797 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
1798 /* If it failed, roll back the counter so we sample the next allocation instead. */
1799 zone->zleak_capture = z_sample_factor;
1800 }
1801 }
1802#endif /* CONFIG_ZLEAKS */
1803
1804
c910b4d9
A
1805 /*
1806 * See if we should be logging allocations in this zone. Logging is rarely done except when a leak is
1807 * suspected, so this code rarely executes. We need to run this code while still holding the zone lock
1808 * since it protects the various log related data structures.
1809 */
1810
1811 if (DO_LOGGING(zone) && addr) {
1812
1813 /*
1814 * Look for a place to record this new allocation. We implement two different logging strategies
1815 * depending on whether we're looking for the source of a zone leak or a zone corruption. When looking
1816 * for a leak, we want to log as many allocations as possible in order to clearly identify the leaker
1817 * among all the records. So we look for an unused slot in the log and fill that in before overwriting
1818 * an old entry. When looking for a corruption, however, it's better to have a chronological log of all
1819 * the allocations and frees done in the zone so that the history of operations for a specific zone
1820 * element can be inspected. So in this case, we treat the log as a circular buffer and overwrite the
1821 * oldest entry whenever a new one needs to be added.
1822 *
1823 * The check_freed_element flag tells us what style of logging to do. It's set if we're supposed to be
1824 * doing corruption style logging (indicated via -zc in the boot-args).
1825 */
1826
1827 if (!check_freed_element && zrecords[zcurrent].z_element && zrecorded < log_records) {
1828
1829 /*
1830 * If we get here, we're doing leak-style logging and there are still some unused entries in
1831 * the log (since zrecorded is smaller than the size of the log). Look for an unused slot
1832 * starting at zcurrent and wrap around if we reach the end of the buffer. If the buffer
1833 * is already full, we just fall through and overwrite the element indexed by zcurrent.
1834 */
1835
1836 for (i = zcurrent; i < log_records; i++) {
1837 if (zrecords[i].z_element == NULL) {
1838 zcurrent = i;
1839 goto empty_slot;
1840 }
1841 }
1842
1843 for (i = 0; i < zcurrent; i++) {
1844 if (zrecords[i].z_element == NULL) {
1845 zcurrent = i;
1846 goto empty_slot;
1847 }
1848 }
1849 }
1850
1851 /*
1852 * Save a record of this allocation
1853 */
1854
1855empty_slot:
1856 if (zrecords[zcurrent].z_element == NULL)
1857 zrecorded++;
1858
1859 zrecords[zcurrent].z_element = (void *)addr;
1860 zrecords[zcurrent].z_time = ztime++;
1861 zrecords[zcurrent].z_opcode = ZOP_ALLOC;
1862
1863 for (i = 0; i < numsaved; i++)
6d2010ae 1864 zrecords[zcurrent].z_pc[i] = (void*) zbt[i];
c910b4d9 1865
6d2010ae 1866 for (; i < MAX_ZTRACE_DEPTH; i++)
c910b4d9
A
1867 zrecords[zcurrent].z_pc[i] = 0;
1868
1869 zcurrent++;
1870
1871 if (zcurrent >= log_records)
1872 zcurrent = 0;
1873 }
1874
7ddcb079 1875 if ((addr == 0) && !canblock && (zone->async_pending == FALSE) && (zone->no_callout == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) {
0b4e3aa0
A
1876 zone->async_pending = TRUE;
1877 unlock_zone(zone);
1878 thread_call_enter(&zone->call_async_alloc);
1879 lock_zone(zone);
1880 REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
1881 }
1882
1c79356b
A
1883#if ZONE_DEBUG
1884 if (addr && zone_debug_enabled(zone)) {
1885 enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
55e303ae 1886 addr += ZONE_DEBUG_OFFSET;
1c79356b
A
1887 }
1888#endif
6d2010ae
A
1889
1890#if CONFIG_ZLEAKS
1891 if (addr != 0) {
1892 zone->num_allocs++;
1893 }
1894#endif /* CONFIG_ZLEAKS */
1c79356b
A
1895
1896 unlock_zone(zone);
0b4e3aa0 1897
7ddcb079
A
1898 if (zone_replenish_wakeup)
1899 thread_wakeup(&zone->zone_replenish_thread);
1900
2d21ac55
A
1901 TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
1902
6d2010ae
A
1903 if (addr) {
1904 thread_t thr = current_thread();
1905 task_t task;
1906 zinfo_usage_t zinfo;
1907
1908 if (zone->caller_acct)
1909 thr->tkm_private.alloc += zone->elem_size;
1910 else
1911 thr->tkm_shared.alloc += zone->elem_size;
1912
1913 if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
1914 OSAddAtomic64(zone->elem_size, (int64_t *)&zinfo[zone->index].alloc);
1915 }
91447636 1916 return((void *)addr);
1c79356b
A
1917}
1918
1919
91447636 1920void *
1c79356b
A
1921zalloc(
1922 register zone_t zone)
1923{
1924 return( zalloc_canblock(zone, TRUE) );
1925}
1926
91447636 1927void *
1c79356b
A
1928zalloc_noblock(
1929 register zone_t zone)
1930{
1931 return( zalloc_canblock(zone, FALSE) );
1932}
1933
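/*
 * Illustrative sketch (not compiled): the typical client pattern for this
 * allocator, assuming zinit()'s (size, max, alloc, name) interface declared
 * in kern/zalloc.h.  "struct my_widget" and "my_widget_zone" are hypothetical
 * names used only for the example.
 */
#if 0
static zone_t my_widget_zone;

struct my_widget {
	int	w_state;
};

void
my_widget_subsystem_init(void)
{
	my_widget_zone = zinit(sizeof(struct my_widget),		/* element size */
			       1024 * sizeof(struct my_widget),		/* max zone size */
			       PAGE_SIZE,				/* per-expansion allocation */
			       "my widgets");
}

struct my_widget *
my_widget_alloc(void)
{
	/* zalloc() may block to expand the zone; use zalloc_noblock() otherwise */
	return ((struct my_widget *) zalloc(my_widget_zone));
}

void
my_widget_free(struct my_widget *w)
{
	zfree(my_widget_zone, w);
}
#endif
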
0b4e3aa0
A
1934void
1935zalloc_async(
91447636
A
1936 thread_call_param_t p0,
1937 __unused thread_call_param_t p1)
0b4e3aa0 1938{
91447636 1939 void *elt;
0b4e3aa0
A
1940
1941 elt = zalloc_canblock((zone_t)p0, TRUE);
1942 zfree((zone_t)p0, elt);
1943 lock_zone(((zone_t)p0));
1944 ((zone_t)p0)->async_pending = FALSE;
1945 unlock_zone(((zone_t)p0));
1946}
1947
1c79356b
A
1948
1949/*
1950 * zget returns an element from the specified zone
1951 * and immediately returns nothing if there is nothing there.
1952 *
1953 * This form should be used when you cannot block (like when
1954 * processing an interrupt).
6d2010ae
A
1955 *
1956 * XXX: It seems like only vm_page_grab_fictitious_common uses this, and its
1957 * friend vm_page_more_fictitious can block, so it doesn't seem like
1958 * this is used for interrupts any more....
1c79356b 1959 */
91447636 1960void *
1c79356b
A
1961zget(
1962 register zone_t zone)
1963{
1964 register vm_offset_t addr;
6d2010ae
A
1965
1966#if CONFIG_ZLEAKS
1967 uintptr_t zbt[MAX_ZTRACE_DEPTH]; /* used for zone leak detection */
1968 uint32_t zleak_tracedepth = 0; /* log this allocation if nonzero */
1969#endif /* CONFIG_ZLEAKS */
1c79356b
A
1970
1971 assert( zone != ZONE_NULL );
1972
1973 if (!lock_try_zone(zone))
91447636 1974 return NULL;
6d2010ae
A
1975
1976#if CONFIG_ZLEAKS
1977 /*
1978 * Zone leak detection: capture a backtrace
1979 */
1980 if (zone->zleak_on && (zone->zleak_capture++ % z_sample_factor == 0)) {
1981 zone->zleak_capture = 1;
1982 zleak_tracedepth = fastbacktrace(zbt, MAX_ZTRACE_DEPTH);
1983 }
1984#endif /* CONFIG_ZLEAKS */
1c79356b
A
1985
1986 REMOVE_FROM_ZONE(zone, addr, vm_offset_t);
1987#if ZONE_DEBUG
1988 if (addr && zone_debug_enabled(zone)) {
1989 enqueue_tail(&zone->active_zones, (queue_entry_t)addr);
55e303ae 1990 addr += ZONE_DEBUG_OFFSET;
1c79356b
A
1991 }
1992#endif /* ZONE_DEBUG */
6d2010ae
A
1993
1994#if CONFIG_ZLEAKS
1995 /*
1996 * Zone leak detection: record the allocation
1997 */
1998 if (zone->zleak_on && zleak_tracedepth > 0 && addr) {
1999 /* Sampling can fail if another sample is happening at the same time in a different zone. */
2000 if (!zleak_log(zbt, addr, zleak_tracedepth, zone->elem_size)) {
2001 /* If it failed, roll back the counter so we sample the next allocation instead. */
2002 zone->zleak_capture = z_sample_factor;
2003 }
2004 }
2005
2006 if (addr != 0) {
2007 zone->num_allocs++;
2008 }
2009#endif /* CONFIG_ZLEAKS */
2010
1c79356b
A
2011 unlock_zone(zone);
2012
91447636 2013 return((void *) addr);
1c79356b
A
2014}
2015
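/*
 * Illustrative sketch (not compiled): a zget() caller must tolerate a NULL
 * return, both when the zone lock is contended and when the free list is
 * empty, since zget() never blocks or expands the zone.  The names are
 * hypothetical, carried over from the sketch after zalloc_noblock() above.
 */
#if 0
static struct my_widget *
my_widget_try_alloc(void)
{
	/* may return NULL; the caller must be prepared to retry or fail */
	return ((struct my_widget *) zget(my_widget_zone));
}
#endif
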
2016/* Keep this FALSE by default. Large memory machines run orders of magnitude
2017 slower in debug mode when true. Use the debugger to enable it if needed. */
55e303ae
A
2018/* static */ boolean_t zone_check = FALSE;
2019
2020static zone_t zone_last_bogus_zone = ZONE_NULL;
2021static vm_offset_t zone_last_bogus_elem = 0;
1c79356b
A
2022
2023void
2024zfree(
2025 register zone_t zone,
91447636 2026 void *addr)
1c79356b 2027{
91447636 2028 vm_offset_t elem = (vm_offset_t) addr;
6d2010ae 2029 void *zbt[MAX_ZTRACE_DEPTH]; /* only used if zone logging is enabled via boot-args */
c910b4d9
A
2030 int numsaved = 0;
2031
2032 assert(zone != ZONE_NULL);
2033
2034 /*
2035 * If zone logging is turned on and this is the zone we're tracking, grab a backtrace.
2036 */
2037
2038 if (DO_LOGGING(zone))
6d2010ae 2039 numsaved = OSBacktrace(&zbt[0], MAX_ZTRACE_DEPTH);
1c79356b
A
2040
2041#if MACH_ASSERT
2042 /* Basic sanity checks */
2043 if (zone == ZONE_NULL || elem == (vm_offset_t)0)
2044 panic("zfree: NULL");
2045 /* zone_gc assumes zones are never freed */
2046 if (zone == zone_zone)
2047 panic("zfree: freeing to zone_zone breaks zone_gc!");
55e303ae
A
2048#endif
2049
b0d623f7 2050 TRACE_MACHLEAKS(ZFREE_CODE, ZFREE_CODE_2, zone->elem_size, (uintptr_t)addr);
2d21ac55 2051
1c79356b 2052 if (zone->collectable && !zone->allows_foreign &&
55e303ae
A
2053 !from_zone_map(elem, zone->elem_size)) {
2054#if MACH_ASSERT
1c79356b 2055 panic("zfree: non-allocated memory in collectable zone!");
91447636 2056#endif
55e303ae
A
2057 zone_last_bogus_zone = zone;
2058 zone_last_bogus_elem = elem;
2059 return;
55e303ae 2060 }
1c79356b
A
2061
2062 lock_zone(zone);
c910b4d9
A
2063
2064 /*
2065 * See if we're doing logging on this zone. There are two styles of logging used depending on
2066 * whether we're trying to catch a leak or corruption. See comments above in zalloc for details.
2067 */
2068
2069 if (DO_LOGGING(zone)) {
2070 int i;
2071
2072 if (check_freed_element) {
2073
2074 /*
2075 * We're logging to catch a corruption. Add a record of this zfree operation
2076 * to log.
2077 */
2078
2079 if (zrecords[zcurrent].z_element == NULL)
2080 zrecorded++;
2081
2082 zrecords[zcurrent].z_element = (void *)addr;
2083 zrecords[zcurrent].z_time = ztime++;
2084 zrecords[zcurrent].z_opcode = ZOP_FREE;
2085
2086 for (i = 0; i < numsaved; i++)
6d2010ae 2087 zrecords[zcurrent].z_pc[i] = zbt[i];
c910b4d9 2088
6d2010ae 2089 for (; i < MAX_ZTRACE_DEPTH; i++)
c910b4d9
A
2090 zrecords[zcurrent].z_pc[i] = 0;
2091
2092 zcurrent++;
2093
2094 if (zcurrent >= log_records)
2095 zcurrent = 0;
2096
2097 } else {
2098
2099 /*
2100 * We're logging to catch a leak. Remove any record we might have for this
2101 * element since it's being freed. Note that we may not find it if the buffer
2102 * overflowed and that's OK. Since the log is of a limited size, old records
2103 * get overwritten if there are more zallocs than zfrees.
2104 */
2105
2106 for (i = 0; i < log_records; i++) {
2107 if (zrecords[i].z_element == addr) {
2108 zrecords[i].z_element = NULL;
2109 zcurrent = i;
2110 zrecorded--;
2111 break;
2112 }
2113 }
2114 }
2115 }
2116
2117
1c79356b
A
2118#if ZONE_DEBUG
2119 if (zone_debug_enabled(zone)) {
2120 queue_t tmp_elem;
2121
55e303ae 2122 elem -= ZONE_DEBUG_OFFSET;
1c79356b
A
2123 if (zone_check) {
2124 /* check the zone's consistency */
2125
2126 for (tmp_elem = queue_first(&zone->active_zones);
2127 !queue_end(tmp_elem, &zone->active_zones);
2128 tmp_elem = queue_next(tmp_elem))
2129 if (elem == (vm_offset_t)tmp_elem)
2130 break;
2131 if (elem != (vm_offset_t)tmp_elem)
2132 panic("zfree()ing element from wrong zone");
2133 }
6d2010ae 2134 remqueue((queue_t) elem);
1c79356b
A
2135 }
2136#endif /* ZONE_DEBUG */
2137 if (zone_check) {
2138 vm_offset_t this;
2139
2140 /* check the zone's consistency */
2141
2142 for (this = zone->free_elements;
2143 this != 0;
2144 this = * (vm_offset_t *) this)
2145 if (!pmap_kernel_va(this) || this == elem)
2146 panic("zfree");
2147 }
0b4e3aa0 2148 ADD_TO_ZONE(zone, elem);
b0d623f7
A
2149#if MACH_ASSERT
2150 if (zone->count < 0)
2151 panic("zfree: count < 0!");
2152#endif
6d2010ae 2153
0b4e3aa0 2154
6d2010ae
A
2155#if CONFIG_ZLEAKS
2156 zone->num_frees++;
2157
2158 /*
2159 * Zone leak detection: un-track the allocation
2160 */
2161 if (zone->zleak_on) {
2162 zleak_free(elem, zone->elem_size);
2163 }
2164#endif /* CONFIG_ZLEAKS */
2165
1c79356b
A
2166 /*
2167 * If elements are a page or more in size, and memory is low,
0b4e3aa0
A
2168 * request to run the garbage collection in the zone the next
2169 * time the pageout thread runs.
1c79356b
A
2170 */
2171 if (zone->elem_size >= PAGE_SIZE &&
2172 vm_pool_low()){
0b4e3aa0 2173 zone_gc_forced = TRUE;
1c79356b 2174 }
1c79356b 2175 unlock_zone(zone);
6d2010ae
A
2176
2177 {
2178 thread_t thr = current_thread();
2179 task_t task;
2180 zinfo_usage_t zinfo;
2181
2182 if (zone->caller_acct)
2183 thr->tkm_private.free += zone->elem_size;
2184 else
2185 thr->tkm_shared.free += zone->elem_size;
2186 if ((task = thr->task) != NULL && (zinfo = task->tkm_zinfo) != NULL)
2187 OSAddAtomic64(zone->elem_size,
2188 (int64_t *)&zinfo[zone->index].free);
2189 }
1c79356b
A
2190}
2191
2192
2193/* Change a zone's flags.
2194 * This routine must be called immediately after zinit.
2195 */
2196void
2197zone_change(
2198 zone_t zone,
2199 unsigned int item,
2200 boolean_t value)
2201{
2202 assert( zone != ZONE_NULL );
2203 assert( value == TRUE || value == FALSE );
2204
2205 switch(item){
0b4c1975
A
2206 case Z_NOENCRYPT:
2207 zone->noencrypt = value;
2208 break;
1c79356b
A
2209 case Z_EXHAUST:
2210 zone->exhaustible = value;
2211 break;
2212 case Z_COLLECT:
2213 zone->collectable = value;
2214 break;
2215 case Z_EXPAND:
2216 zone->expandable = value;
2217 break;
2218 case Z_FOREIGN:
2219 zone->allows_foreign = value;
2220 break;
6d2010ae
A
2221 case Z_CALLERACCT:
2222 zone->caller_acct = value;
2223 break;
7ddcb079
A
2224 case Z_NOCALLOUT:
2225 zone->no_callout = value;
2226 break;
1c79356b
A
2227#if MACH_ASSERT
2228 default:
2229 panic("Zone_change: Wrong Item Type!");
2230 /* break; */
2231#endif
2232 }
1c79356b
A
2233}
2234
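/*
 * Illustrative sketch (not compiled): zone_change() is applied immediately
 * after zinit(), before any element is handed out.  The zone name and the
 * particular flag choices are hypothetical; the comments restate what the
 * corresponding zone fields do elsewhere in this file.
 */
#if 0
static zone_t
my_widget_zone_create(void)
{
	zone_t z = zinit(sizeof(struct my_widget),
			 1024 * sizeof(struct my_widget),
			 PAGE_SIZE, "my widgets");

	zone_change(z, Z_NOENCRYPT, TRUE);	/* request KMA_NOENCRYPT backing memory */
	zone_change(z, Z_CALLERACCT, FALSE);	/* account to the shared bucket, not the caller */
	zone_change(z, Z_EXPAND, TRUE);		/* allow growth past max_size (with a complaint) */
	return (z);
}
#endif
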
2235/*
2236 * Return the expected number of free elements in the zone.
2237 * This calculation will be incorrect if items are zfree'd that
2238 * were never zalloc'd/zget'd. The correct way to stuff memory
2239 * into a zone is by zcram.
2240 */
2241
2242integer_t
2243zone_free_count(zone_t zone)
2244{
2245 integer_t free_count;
2246
2247 lock_zone(zone);
b0d623f7 2248 free_count = (integer_t)(zone->cur_size/zone->elem_size - zone->count);
1c79356b
A
2249 unlock_zone(zone);
2250
2251 assert(free_count >= 0);
2252
2253 return(free_count);
2254}
2255
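/*
 * Worked example with hypothetical numbers: for cur_size = 16384 bytes,
 * elem_size = 256 and count = 40 outstanding elements, the zone holds
 * 16384/256 = 64 element slots, so zone_free_count() reports 64 - 40 = 24.
 */
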
2256/*
2257 * zprealloc preallocates wired memory, expanding the specified
2258 * zone to the specified size
2259 */
2260void
2261zprealloc(
2262 zone_t zone,
2263 vm_size_t size)
2264{
2265 vm_offset_t addr;
2266
2267 if (size != 0) {
b0d623f7 2268 if (kmem_alloc_kobject(zone_map, &addr, size) != KERN_SUCCESS)
1c79356b 2269 panic("zprealloc");
7ddcb079 2270 zcram(zone, addr, size);
1c79356b
A
2271 }
2272}
2273
2274/*
2275 * Zone garbage collection subroutines
1c79356b 2276 */
55e303ae 2277
1c79356b
A
2278boolean_t
2279zone_page_collectable(
2280 vm_offset_t addr,
2281 vm_size_t size)
2282{
55e303ae 2283 struct zone_page_table_entry *zp;
7ddcb079 2284 zone_page_index_t i, j;
1c79356b 2285
2d21ac55
A
2286#if ZONE_ALIAS_ADDR
2287 addr = zone_virtual_addr(addr);
2288#endif
1c79356b 2289#if MACH_ASSERT
55e303ae 2290 if (!from_zone_map(addr, size))
1c79356b
A
2291 panic("zone_page_collectable");
2292#endif
2293
7ddcb079
A
2294 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
2295 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
55e303ae 2296
7ddcb079
A
2297 for (; i <= j; i++) {
2298 zp = zone_page_table_lookup(i);
55e303ae 2299 if (zp->collect_count == zp->alloc_count)
1c79356b 2300 return (TRUE);
7ddcb079 2301 }
55e303ae 2302
1c79356b
A
2303 return (FALSE);
2304}
2305
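/*
 * Worked example with hypothetical numbers for the index arithmetic used by
 * the zone_page_* routines above and below: with 4K pages, an element at
 * offset 0x5200 from zone_map_min_address with size 0x200 spans byte offsets
 * 0x5200..0x53ff, so i = atop_kernel(0x5200) = 5 and j = atop_kernel(0x53ff)
 * = 5; only a single zone_page_table entry is visited.
 */
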
2306void
2307zone_page_keep(
2308 vm_offset_t addr,
2309 vm_size_t size)
2310{
55e303ae 2311 struct zone_page_table_entry *zp;
7ddcb079 2312 zone_page_index_t i, j;
1c79356b 2313
2d21ac55
A
2314#if ZONE_ALIAS_ADDR
2315 addr = zone_virtual_addr(addr);
2316#endif
1c79356b 2317#if MACH_ASSERT
55e303ae 2318 if (!from_zone_map(addr, size))
1c79356b
A
2319 panic("zone_page_keep");
2320#endif
2321
7ddcb079
A
2322 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
2323 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
1c79356b 2324
7ddcb079
A
2325 for (; i <= j; i++) {
2326 zp = zone_page_table_lookup(i);
55e303ae 2327 zp->collect_count = 0;
7ddcb079 2328 }
1c79356b
A
2329}
2330
2331void
55e303ae 2332zone_page_collect(
1c79356b
A
2333 vm_offset_t addr,
2334 vm_size_t size)
2335{
55e303ae 2336 struct zone_page_table_entry *zp;
7ddcb079 2337 zone_page_index_t i, j;
1c79356b 2338
2d21ac55
A
2339#if ZONE_ALIAS_ADDR
2340 addr = zone_virtual_addr(addr);
2341#endif
1c79356b 2342#if MACH_ASSERT
55e303ae
A
2343 if (!from_zone_map(addr, size))
2344 panic("zone_page_collect");
1c79356b
A
2345#endif
2346
7ddcb079
A
2347 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
2348 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
55e303ae 2349
7ddcb079
A
2350 for (; i <= j; i++) {
2351 zp = zone_page_table_lookup(i);
55e303ae 2352 ++zp->collect_count;
7ddcb079 2353 }
1c79356b
A
2354}
2355
2356void
2357zone_page_init(
2358 vm_offset_t addr,
7ddcb079 2359 vm_size_t size)
1c79356b 2360{
55e303ae 2361 struct zone_page_table_entry *zp;
7ddcb079 2362 zone_page_index_t i, j;
1c79356b 2363
2d21ac55
A
2364#if ZONE_ALIAS_ADDR
2365 addr = zone_virtual_addr(addr);
2366#endif
1c79356b 2367#if MACH_ASSERT
55e303ae 2368 if (!from_zone_map(addr, size))
1c79356b
A
2369 panic("zone_page_init");
2370#endif
2371
7ddcb079
A
2372 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
2373 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
2374
2375 for (; i <= j; i++) {
2376 /* make sure entry exists before marking unused */
2377 zone_page_table_expand(i);
55e303ae 2378
7ddcb079
A
2379 zp = zone_page_table_lookup(i);
2380 assert(zp);
2381 zp->alloc_count = ZONE_PAGE_UNUSED;
55e303ae 2382 zp->collect_count = 0;
1c79356b 2383 }
1c79356b
A
2384}
2385
2386void
2387zone_page_alloc(
2388 vm_offset_t addr,
2389 vm_size_t size)
2390{
55e303ae 2391 struct zone_page_table_entry *zp;
7ddcb079 2392 zone_page_index_t i, j;
1c79356b 2393
2d21ac55
A
2394#if ZONE_ALIAS_ADDR
2395 addr = zone_virtual_addr(addr);
2396#endif
1c79356b 2397#if MACH_ASSERT
55e303ae 2398 if (!from_zone_map(addr, size))
1c79356b
A
2399 panic("zone_page_alloc");
2400#endif
2401
7ddcb079
A
2402 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
2403 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
2404
2405 for (; i <= j; i++) {
2406 zp = zone_page_table_lookup(i);
2407 assert(zp);
55e303ae 2408
55e303ae 2409 /*
7ddcb079 2410 * Set alloc_count to ZONE_PAGE_USED if
1c79356b
A
2411 * it was previously set to ZONE_PAGE_UNUSED.
2412 */
55e303ae 2413 if (zp->alloc_count == ZONE_PAGE_UNUSED)
7ddcb079
A
2414 zp->alloc_count = ZONE_PAGE_USED;
2415
2416 ++zp->alloc_count;
1c79356b 2417 }
1c79356b
A
2418}
2419
2420void
55e303ae 2421zone_page_free_element(
7ddcb079 2422 zone_page_index_t *free_page_list,
1c79356b
A
2423 vm_offset_t addr,
2424 vm_size_t size)
2425{
55e303ae 2426 struct zone_page_table_entry *zp;
7ddcb079 2427 zone_page_index_t i, j;
1c79356b 2428
2d21ac55
A
2429#if ZONE_ALIAS_ADDR
2430 addr = zone_virtual_addr(addr);
2431#endif
1c79356b 2432#if MACH_ASSERT
55e303ae
A
2433 if (!from_zone_map(addr, size))
2434 panic("zone_page_free_element");
1c79356b
A
2435#endif
2436
7ddcb079
A
2437 i = (zone_page_index_t)atop_kernel(addr-zone_map_min_address);
2438 j = (zone_page_index_t)atop_kernel((addr+size-1) - zone_map_min_address);
2439
2440 for (; i <= j; i++) {
2441 zp = zone_page_table_lookup(i);
1c79356b 2442
55e303ae
A
2443 if (zp->collect_count > 0)
2444 --zp->collect_count;
2445 if (--zp->alloc_count == 0) {
7ddcb079
A
2446 vm_address_t free_page_address;
2447
55e303ae
A
2448 zp->alloc_count = ZONE_PAGE_UNUSED;
2449 zp->collect_count = 0;
1c79356b 2450
7ddcb079
A
2451
2452 /*
2453 * This element was the last one on this page, re-use the page's
2454 * storage for a page freelist
2455 */
2456 free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)i);
2457 *(zone_page_index_t *)free_page_address = *free_page_list;
2458 *free_page_list = i;
1c79356b
A
2459 }
2460 }
1c79356b
A
2461}
2462
2463
2464/* This is used for walking through a zone's free element list.
2465 */
55e303ae
A
2466struct zone_free_element {
2467 struct zone_free_element * next;
1c79356b
A
2468};
2469
2d21ac55
A
2470/*
2471 * Add a linked list of elements starting at base back into the zone
2472 * free list. Tail points to the last element on the list.
2473 */
2474
2475#define ADD_LIST_TO_ZONE(zone, base, tail) \
2476MACRO_BEGIN \
2477 (tail)->next = (void *)((zone)->free_elements); \
2478 if (check_freed_element) { \
2479 if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \
2480 ((vm_offset_t *)(tail))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \
2481 (zone)->free_elements; \
2482 } \
2483 (zone)->free_elements = (unsigned long)(base); \
2484MACRO_END
2485
2486/*
2487 * Add an element to the chain pointed to by prev.
2488 */
2489
2490#define ADD_ELEMENT(zone, prev, elem) \
2491MACRO_BEGIN \
2492 (prev)->next = (elem); \
2493 if (check_freed_element) { \
2494 if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \
2495 ((vm_offset_t *)(prev))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \
2496 (vm_offset_t)(elem); \
2497 } \
2498MACRO_END
2499
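/*
 * Illustrative sketch (not compiled): the layout ADD_LIST_TO_ZONE/ADD_ELEMENT
 * produce when check_freed_element is set and the element holds at least two
 * pointers.  The first word of a free element is its next pointer and the
 * last word is a backup copy of the same value, so a checker can detect a
 * scribbled-on free element.  The helper name is hypothetical.
 */
#if 0
static boolean_t
example_free_element_intact(zone_t z, vm_offset_t elem)
{
	vm_offset_t next   = ((vm_offset_t *) elem)[0];
	vm_offset_t backup = ((vm_offset_t *) elem)[(z->elem_size / sizeof(vm_offset_t)) - 1];

	return (next == backup);
}
#endif
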
55e303ae
A
2500struct {
2501 uint32_t pgs_freed;
2502
2503 uint32_t elems_collected,
2504 elems_freed,
2505 elems_kept;
2506} zgc_stats;
1c79356b
A
2507
2508/* Zone garbage collection
2509 *
2510 * zone_gc will walk through all the free elements in all the
2511 * zones that are marked collectable looking for reclaimable
2512 * pages. zone_gc is called by consider_zone_gc when the system
2513 * begins to run out of memory.
2514 */
2515void
2516zone_gc(void)
2517{
2518 unsigned int max_zones;
55e303ae 2519 zone_t z;
1c79356b 2520 unsigned int i;
7ddcb079 2521 zone_page_index_t zone_free_page_head;
1c79356b 2522
b0d623f7 2523 lck_mtx_lock(&zone_gc_lock);
1c79356b 2524
1c79356b
A
2525 simple_lock(&all_zones_lock);
2526 max_zones = num_zones;
2527 z = first_zone;
2528 simple_unlock(&all_zones_lock);
2529
2530#if MACH_ASSERT
7ddcb079
A
2531 for (i = 0; i < zone_pages; i++) {
2532 struct zone_page_table_entry *zp;
2533
2534 zp = zone_page_table_lookup(i);
2535 assert(!zp || (zp->collect_count == 0));
2536 }
1c79356b
A
2537#endif /* MACH_ASSERT */
2538
7ddcb079 2539 zone_free_page_head = ZONE_PAGE_INDEX_INVALID;
1c79356b
A
2540
2541 for (i = 0; i < max_zones; i++, z = z->next_zone) {
a3d08fcd 2542 unsigned int n, m;
55e303ae 2543 vm_size_t elt_size, size_freed;
a3d08fcd 2544 struct zone_free_element *elt, *base_elt, *base_prev, *prev, *scan, *keep, *tail;
1c79356b
A
2545
2546 assert(z != ZONE_NULL);
2547
2548 if (!z->collectable)
2549 continue;
2550
2551 lock_zone(z);
2552
55e303ae
A
2553 elt_size = z->elem_size;
2554
1c79356b
A
2555 /*
2556 * Do a quick feasibility check before we scan the zone:
91447636
A
2557 * skip unless there is likelihood of getting pages back
2558 * (i.e. we need a whole allocation block's worth of free
2559 * elements before we can garbage collect) and
2560 * the zone has more than 10 percent of its elements free
2d21ac55 2561 * or the element size is a multiple of the PAGE_SIZE
1c79356b 2562 */
2d21ac55
A
2563 if ((elt_size & PAGE_MASK) &&
2564 (((z->cur_size - z->count * elt_size) <= (2 * z->alloc_size)) ||
2565 ((z->cur_size - z->count * elt_size) <= (z->cur_size / 10)))) {
1c79356b
A
2566 unlock_zone(z);
2567 continue;
2568 }
2569
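		/*
		 * Worked example with hypothetical numbers: for elt_size = 256
		 * (not a page multiple), cur_size = 1048576, count = 3900 and
		 * alloc_size = 4096, the free space is 1048576 - 3900*256 =
		 * 50176 bytes.  That clears the 2 * alloc_size = 8192 hurdle
		 * but is below cur_size/10 = 104857, so the zone is skipped:
		 * under 10 percent of it is free and a scan is unlikely to
		 * recover whole pages.
		 */
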
a3d08fcd
A
2570 z->doing_gc = TRUE;
2571
55e303ae
A
2572 /*
2573 * Snatch all of the free elements away from the zone.
1c79356b 2574 */
1c79356b 2575
55e303ae 2576 scan = (void *)z->free_elements;
0c530ab8 2577 z->free_elements = 0;
55e303ae
A
2578
2579 unlock_zone(z);
2580
2581 /*
2582 * Pass 1:
2583 *
2584 * Determine which elements we can attempt to collect
2585 * and count them up in the page table. Foreign elements
2586 * are returned to the zone.
1c79356b 2587 */
55e303ae
A
2588
2589 prev = (void *)&scan;
2590 elt = scan;
2591 n = 0; tail = keep = NULL;
2592 while (elt != NULL) {
2593 if (from_zone_map(elt, elt_size)) {
2594 zone_page_collect((vm_offset_t)elt, elt_size);
2595
1c79356b
A
2596 prev = elt;
2597 elt = elt->next;
55e303ae
A
2598
2599 ++zgc_stats.elems_collected;
1c79356b 2600 }
55e303ae
A
2601 else {
2602 if (keep == NULL)
2603 keep = tail = elt;
2d21ac55
A
2604 else {
2605 ADD_ELEMENT(z, tail, elt);
2606 tail = elt;
2607 }
55e303ae 2608
2d21ac55
A
2609 ADD_ELEMENT(z, prev, elt->next);
2610 elt = elt->next;
2611 ADD_ELEMENT(z, tail, NULL);
1c79356b 2612 }
1c79356b 2613
55e303ae
A
2614 /*
2615 * Dribble back the elements we are keeping.
2616 */
2617
a3d08fcd
A
2618 if (++n >= 50) {
2619 if (z->waiting == TRUE) {
7ddcb079 2620 /* z->waiting checked without lock held, rechecked below after locking */
a3d08fcd 2621 lock_zone(z);
55e303ae 2622
a3d08fcd 2623 if (keep != NULL) {
2d21ac55 2624 ADD_LIST_TO_ZONE(z, keep, tail);
a3d08fcd
A
2625 tail = keep = NULL;
2626 } else {
2627 m =0;
2628 base_elt = elt;
2629 base_prev = prev;
2630 while ((elt != NULL) && (++m < 50)) {
2631 prev = elt;
2632 elt = elt->next;
2633 }
2634 if (m !=0 ) {
2d21ac55
A
2635 ADD_LIST_TO_ZONE(z, base_elt, prev);
2636 ADD_ELEMENT(z, base_prev, elt);
a3d08fcd
A
2637 prev = base_prev;
2638 }
2639 }
55e303ae 2640
a3d08fcd
A
2641 if (z->waiting) {
2642 z->waiting = FALSE;
2643 zone_wakeup(z);
2644 }
55e303ae 2645
a3d08fcd
A
2646 unlock_zone(z);
2647 }
2648 n =0;
55e303ae
A
2649 }
2650 }
2651
2652 /*
2653 * Return any remaining elements.
2654 */
2655
2656 if (keep != NULL) {
2657 lock_zone(z);
2658
2d21ac55 2659 ADD_LIST_TO_ZONE(z, keep, tail);
55e303ae 2660
7ddcb079
A
2661 if (z->waiting) {
2662 z->waiting = FALSE;
2663 zone_wakeup(z);
2664 }
2665
55e303ae
A
2666 unlock_zone(z);
2667 }
2668
2669 /*
2670 * Pass 2:
2671 *
2672 * Determine which pages we can reclaim and
2673 * free those elements.
2674 */
2675
2676 size_freed = 0;
55e303ae
A
2677 elt = scan;
2678 n = 0; tail = keep = NULL;
2679 while (elt != NULL) {
2680 if (zone_page_collectable((vm_offset_t)elt, elt_size)) {
7ddcb079
A
2681 struct zone_free_element *next_elt = elt->next;
2682
55e303ae 2683 size_freed += elt_size;
7ddcb079
A
2684
2685 /*
2686 * If this is the last allocation on the page(s),
2687 * we may use their storage to maintain the linked
2688 * list of free-able pages. So store elt->next because
2689 * "elt" may be scribbled over.
2690 */
2691 zone_page_free_element(&zone_free_page_head,
55e303ae
A
2692 (vm_offset_t)elt, elt_size);
2693
7ddcb079 2694 elt = next_elt;
55e303ae
A
2695
2696 ++zgc_stats.elems_freed;
2697 }
2698 else {
2699 zone_page_keep((vm_offset_t)elt, elt_size);
2700
2701 if (keep == NULL)
2702 keep = tail = elt;
2d21ac55
A
2703 else {
2704 ADD_ELEMENT(z, tail, elt);
2705 tail = elt;
2706 }
55e303ae 2707
2d21ac55
A
2708 elt = elt->next;
2709 ADD_ELEMENT(z, tail, NULL);
55e303ae
A
2710
2711 ++zgc_stats.elems_kept;
2712 }
2713
2714 /*
2715 * Dribble back the elements we are keeping,
2716 * and update the zone size info.
2717 */
2718
a3d08fcd 2719 if (++n >= 50) {
55e303ae
A
2720 lock_zone(z);
2721
2722 z->cur_size -= size_freed;
2723 size_freed = 0;
2724
a3d08fcd 2725 if (keep != NULL) {
2d21ac55 2726 ADD_LIST_TO_ZONE(z, keep, tail);
a3d08fcd
A
2727 }
2728
2729 if (z->waiting) {
2730 z->waiting = FALSE;
2731 zone_wakeup(z);
2732 }
55e303ae
A
2733
2734 unlock_zone(z);
2735
2736 n = 0; tail = keep = NULL;
2737 }
2738 }
2739
2740 /*
2741 * Return any remaining elements, and update
2742 * the zone size info.
2743 */
2744
a3d08fcd
A
2745 lock_zone(z);
2746
55e303ae 2747 if (size_freed > 0 || keep != NULL) {
55e303ae
A
2748
2749 z->cur_size -= size_freed;
2750
2751 if (keep != NULL) {
2d21ac55 2752 ADD_LIST_TO_ZONE(z, keep, tail);
55e303ae
A
2753 }
2754
55e303ae 2755 }
a3d08fcd
A
2756
2757 z->doing_gc = FALSE;
2758 if (z->waiting) {
2759 z->waiting = FALSE;
2760 zone_wakeup(z);
2761 }
2762 unlock_zone(z);
1c79356b
A
2763 }
2764
55e303ae
A
2765 /*
2766 * Reclaim the pages we are freeing.
2767 */
1c79356b 2768
7ddcb079
A
2769 while (zone_free_page_head != ZONE_PAGE_INDEX_INVALID) {
2770 zone_page_index_t zind = zone_free_page_head;
2771 vm_address_t free_page_address;
2d21ac55 2772#if ZONE_ALIAS_ADDR
6d2010ae 2773 z = (zone_t)zone_virtual_addr((vm_map_address_t)z);
2d21ac55 2774#endif
7ddcb079
A
2775 /* Use the first word of the page about to be freed to find the next free page */
2776 free_page_address = zone_map_min_address + PAGE_SIZE * ((vm_size_t)zind);
2777 zone_free_page_head = *(zone_page_index_t *)free_page_address;
2778
2779 kmem_free(zone_map, free_page_address, PAGE_SIZE);
55e303ae 2780 ++zgc_stats.pgs_freed;
1c79356b 2781 }
55e303ae 2782
b0d623f7 2783 lck_mtx_unlock(&zone_gc_lock);
1c79356b
A
2784}
2785
1c79356b
A
2786/*
2787 * consider_zone_gc:
2788 *
2789 * Called by the pageout daemon when the system needs more free pages.
2790 */
2791
2792void
b0d623f7 2793consider_zone_gc(boolean_t force)
1c79356b 2794{
1c79356b
A
2795
2796 if (zone_gc_allowed &&
6d2010ae 2797 (zone_gc_allowed_by_time_throttle ||
b0d623f7
A
2798 zone_gc_forced ||
2799 force)) {
0b4e3aa0 2800 zone_gc_forced = FALSE;
6d2010ae 2801 zone_gc_allowed_by_time_throttle = FALSE; /* reset periodically */
1c79356b
A
2802 zone_gc();
2803 }
2804}
2805
6d2010ae
A
2806/*
2807 * By default, don't attempt zone GC more frequently
2808 * than once per minute.
2809 */
2810void
2811compute_zone_gc_throttle(void *arg __unused)
2812{
2813 zone_gc_allowed_by_time_throttle = TRUE;
2814}
2d21ac55 2815
1c79356b 2816
6d2010ae
A
2817kern_return_t
2818task_zone_info(
2819 task_t task,
2820 mach_zone_name_array_t *namesp,
2821 mach_msg_type_number_t *namesCntp,
2822 task_zone_info_array_t *infop,
2823 mach_msg_type_number_t *infoCntp)
2824{
2825 mach_zone_name_t *names;
2826 vm_offset_t names_addr;
2827 vm_size_t names_size;
2828 task_zone_info_t *info;
2829 vm_offset_t info_addr;
2830 vm_size_t info_size;
2831 unsigned int max_zones, i;
2832 zone_t z;
2833 mach_zone_name_t *zn;
2834 task_zone_info_t *zi;
2835 kern_return_t kr;
2836
2837 vm_size_t used;
2838 vm_map_copy_t copy;
2839
2840
2841 if (task == TASK_NULL)
2842 return KERN_INVALID_TASK;
2843
2844 /*
2845 * We assume that zones aren't freed once allocated.
2846 * We won't pick up any zones that are allocated later.
2847 */
2848
2849 simple_lock(&all_zones_lock);
2850 max_zones = (unsigned int)(num_zones + num_fake_zones);
2851 z = first_zone;
2852 simple_unlock(&all_zones_lock);
2853
2854 names_size = round_page(max_zones * sizeof *names);
2855 kr = kmem_alloc_pageable(ipc_kernel_map,
2856 &names_addr, names_size);
2857 if (kr != KERN_SUCCESS)
2858 return kr;
2859 names = (mach_zone_name_t *) names_addr;
2860
2861 info_size = round_page(max_zones * sizeof *info);
2862 kr = kmem_alloc_pageable(ipc_kernel_map,
2863 &info_addr, info_size);
2864 if (kr != KERN_SUCCESS) {
2865 kmem_free(ipc_kernel_map,
2866 names_addr, names_size);
2867 return kr;
2868 }
2869
2870 info = (task_zone_info_t *) info_addr;
2871
2872 zn = &names[0];
2873 zi = &info[0];
2874
2875 for (i = 0; i < max_zones - num_fake_zones; i++) {
2876 struct zone zcopy;
2877
2878 assert(z != ZONE_NULL);
2879
2880 lock_zone(z);
2881 zcopy = *z;
2882 unlock_zone(z);
2883
2884 simple_lock(&all_zones_lock);
2885 z = z->next_zone;
2886 simple_unlock(&all_zones_lock);
2887
2888 /* assuming here the name data is static */
2889 (void) strncpy(zn->mzn_name, zcopy.zone_name,
2890 sizeof zn->mzn_name);
2891 zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
2892
2893 zi->tzi_count = (uint64_t)zcopy.count;
2894 zi->tzi_cur_size = (uint64_t)zcopy.cur_size;
2895 zi->tzi_max_size = (uint64_t)zcopy.max_size;
2896 zi->tzi_elem_size = (uint64_t)zcopy.elem_size;
2897 zi->tzi_alloc_size = (uint64_t)zcopy.alloc_size;
2898 zi->tzi_sum_size = zcopy.sum_count * zcopy.elem_size;
2899 zi->tzi_exhaustible = (uint64_t)zcopy.exhaustible;
2900 zi->tzi_collectable = (uint64_t)zcopy.collectable;
2901 zi->tzi_caller_acct = (uint64_t)zcopy.caller_acct;
2902 if (task->tkm_zinfo != NULL) {
2903 zi->tzi_task_alloc = task->tkm_zinfo[zcopy.index].alloc;
2904 zi->tzi_task_free = task->tkm_zinfo[zcopy.index].free;
2905 } else {
2906 zi->tzi_task_alloc = 0;
2907 zi->tzi_task_free = 0;
2908 }
2909 zn++;
2910 zi++;
2911 }
2912
2913 /*
2914 * loop through the fake zones and fill them using the specialized
2915 * functions
2916 */
2917 for (i = 0; i < num_fake_zones; i++) {
2918 int count, collectable, exhaustible, caller_acct, index;
2919 vm_size_t cur_size, max_size, elem_size, alloc_size;
2920 uint64_t sum_size;
2921
2922 strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
2923 zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
2924 fake_zones[i].query(&count, &cur_size,
2925 &max_size, &elem_size,
2926 &alloc_size, &sum_size,
2927 &collectable, &exhaustible, &caller_acct);
2928 zi->tzi_count = (uint64_t)count;
2929 zi->tzi_cur_size = (uint64_t)cur_size;
2930 zi->tzi_max_size = (uint64_t)max_size;
2931 zi->tzi_elem_size = (uint64_t)elem_size;
2932 zi->tzi_alloc_size = (uint64_t)alloc_size;
2933 zi->tzi_sum_size = sum_size;
2934 zi->tzi_collectable = (uint64_t)collectable;
2935 zi->tzi_exhaustible = (uint64_t)exhaustible;
2936 zi->tzi_caller_acct = (uint64_t)caller_acct;
2937 if (task->tkm_zinfo != NULL) {
2938 index = ZINFO_SLOTS - num_fake_zones + i;
2939 zi->tzi_task_alloc = task->tkm_zinfo[index].alloc;
2940 zi->tzi_task_free = task->tkm_zinfo[index].free;
2941 } else {
2942 zi->tzi_task_alloc = 0;
2943 zi->tzi_task_free = 0;
2944 }
2945 zn++;
2946 zi++;
2947 }
2948
2949 used = max_zones * sizeof *names;
2950 if (used != names_size)
2951 bzero((char *) (names_addr + used), names_size - used);
2952
2953 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
2954 (vm_map_size_t)names_size, TRUE, &copy);
2955 assert(kr == KERN_SUCCESS);
2956
2957 *namesp = (mach_zone_name_t *) copy;
2958 *namesCntp = max_zones;
2959
2960 used = max_zones * sizeof *info;
2961
2962 if (used != info_size)
2963 bzero((char *) (info_addr + used), info_size - used);
2964
2965 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
2966 (vm_map_size_t)info_size, TRUE, &copy);
2967 assert(kr == KERN_SUCCESS);
2968
2969 *infop = (task_zone_info_t *) copy;
2970 *infoCntp = max_zones;
2971
2972 return KERN_SUCCESS;
2973}
2974
2975kern_return_t
2976mach_zone_info(
2977 host_t host,
2978 mach_zone_name_array_t *namesp,
2979 mach_msg_type_number_t *namesCntp,
2980 mach_zone_info_array_t *infop,
2981 mach_msg_type_number_t *infoCntp)
2982{
2983 mach_zone_name_t *names;
2984 vm_offset_t names_addr;
2985 vm_size_t names_size;
2986 mach_zone_info_t *info;
2987 vm_offset_t info_addr;
2988 vm_size_t info_size;
2989 unsigned int max_zones, i;
2990 zone_t z;
2991 mach_zone_name_t *zn;
2992 mach_zone_info_t *zi;
2993 kern_return_t kr;
2994
2995 vm_size_t used;
2996 vm_map_copy_t copy;
2997
2998
2999 if (host == HOST_NULL)
3000 return KERN_INVALID_HOST;
3001
3002 num_fake_zones = sizeof fake_zones / sizeof fake_zones[0];
3003
3004 /*
3005 * We assume that zones aren't freed once allocated.
3006 * We won't pick up any zones that are allocated later.
3007 */
3008
3009 simple_lock(&all_zones_lock);
3010 max_zones = (unsigned int)(num_zones + num_fake_zones);
3011 z = first_zone;
3012 simple_unlock(&all_zones_lock);
3013
3014 names_size = round_page(max_zones * sizeof *names);
3015 kr = kmem_alloc_pageable(ipc_kernel_map,
3016 &names_addr, names_size);
3017 if (kr != KERN_SUCCESS)
3018 return kr;
3019 names = (mach_zone_name_t *) names_addr;
3020
3021 info_size = round_page(max_zones * sizeof *info);
3022 kr = kmem_alloc_pageable(ipc_kernel_map,
3023 &info_addr, info_size);
3024 if (kr != KERN_SUCCESS) {
3025 kmem_free(ipc_kernel_map,
3026 names_addr, names_size);
3027 return kr;
3028 }
3029
3030 info = (mach_zone_info_t *) info_addr;
3031
3032 zn = &names[0];
3033 zi = &info[0];
3034
3035 for (i = 0; i < max_zones - num_fake_zones; i++) {
3036 struct zone zcopy;
3037
3038 assert(z != ZONE_NULL);
3039
3040 lock_zone(z);
3041 zcopy = *z;
3042 unlock_zone(z);
3043
3044 simple_lock(&all_zones_lock);
3045 z = z->next_zone;
3046 simple_unlock(&all_zones_lock);
3047
3048 /* assuming here the name data is static */
3049 (void) strncpy(zn->mzn_name, zcopy.zone_name,
3050 sizeof zn->mzn_name);
3051 zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
3052
3053 zi->mzi_count = (uint64_t)zcopy.count;
3054 zi->mzi_cur_size = (uint64_t)zcopy.cur_size;
3055 zi->mzi_max_size = (uint64_t)zcopy.max_size;
3056 zi->mzi_elem_size = (uint64_t)zcopy.elem_size;
3057 zi->mzi_alloc_size = (uint64_t)zcopy.alloc_size;
3058 zi->mzi_sum_size = zcopy.sum_count * zcopy.elem_size;
3059 zi->mzi_exhaustible = (uint64_t)zcopy.exhaustible;
3060 zi->mzi_collectable = (uint64_t)zcopy.collectable;
3061 zn++;
3062 zi++;
3063 }
3064
3065 /*
3066 * loop through the fake zones and fill them using the specialized
3067 * functions
3068 */
3069 for (i = 0; i < num_fake_zones; i++) {
3070 int count, collectable, exhaustible, caller_acct;
3071 vm_size_t cur_size, max_size, elem_size, alloc_size;
3072 uint64_t sum_size;
3073
3074 strncpy(zn->mzn_name, fake_zones[i].name, sizeof zn->mzn_name);
3075 zn->mzn_name[sizeof zn->mzn_name - 1] = '\0';
3076 fake_zones[i].query(&count, &cur_size,
3077 &max_size, &elem_size,
3078 &alloc_size, &sum_size,
3079 &collectable, &exhaustible, &caller_acct);
3080 zi->mzi_count = (uint64_t)count;
3081 zi->mzi_cur_size = (uint64_t)cur_size;
3082 zi->mzi_max_size = (uint64_t)max_size;
3083 zi->mzi_elem_size = (uint64_t)elem_size;
3084 zi->mzi_alloc_size = (uint64_t)alloc_size;
3085 zi->mzi_sum_size = sum_size;
3086 zi->mzi_collectable = (uint64_t)collectable;
3087 zi->mzi_exhaustible = (uint64_t)exhaustible;
3088
3089 zn++;
3090 zi++;
3091 }
3092
3093 used = max_zones * sizeof *names;
3094 if (used != names_size)
3095 bzero((char *) (names_addr + used), names_size - used);
3096
3097 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
3098 (vm_map_size_t)names_size, TRUE, &copy);
3099 assert(kr == KERN_SUCCESS);
3100
3101 *namesp = (mach_zone_name_t *) copy;
3102 *namesCntp = max_zones;
3103
3104 used = max_zones * sizeof *info;
3105
3106 if (used != info_size)
3107 bzero((char *) (info_addr + used), info_size - used);
3108
3109 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
3110 (vm_map_size_t)info_size, TRUE, &copy);
3111 assert(kr == KERN_SUCCESS);
3112
3113 *infop = (mach_zone_info_t *) copy;
3114 *infoCntp = max_zones;
3115
3116 return KERN_SUCCESS;
3117}
3118
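/*
 * Illustrative sketch (not compiled): how a user-space tool such as zprint
 * would typically consume this interface through the MIG-generated stub.
 * The stub name and argument order are assumed to mirror the kernel routine
 * above; error handling and vm_deallocate() of the out-of-line arrays are
 * omitted for brevity.
 */
#if 0
static void
example_dump_zone_sizes(void)
{
	mach_zone_name_array_t	names;
	mach_zone_info_array_t	info;
	mach_msg_type_number_t	nameCnt, infoCnt;
	unsigned int		i;

	if (mach_zone_info(mach_host_self(), &names, &nameCnt,
			   &info, &infoCnt) != KERN_SUCCESS)
		return;

	for (i = 0; i < nameCnt; i++)
		printf("%s: %llu bytes\n", names[i].mzn_name,
		    (unsigned long long) info[i].mzi_cur_size);
}
#endif
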
3119/*
3120 * host_zone_info - LEGACY user interface for Mach zone information
3121 * Should use mach_zone_info() instead!
3122 */
1c79356b
A
3123kern_return_t
3124host_zone_info(
3125 host_t host,
3126 zone_name_array_t *namesp,
3127 mach_msg_type_number_t *namesCntp,
3128 zone_info_array_t *infop,
3129 mach_msg_type_number_t *infoCntp)
3130{
3131 zone_name_t *names;
3132 vm_offset_t names_addr;
3133 vm_size_t names_size;
3134 zone_info_t *info;
3135 vm_offset_t info_addr;
3136 vm_size_t info_size;
3137 unsigned int max_zones, i;
3138 zone_t z;
3139 zone_name_t *zn;
3140 zone_info_t *zi;
3141 kern_return_t kr;
6d2010ae
A
3142
3143 vm_size_t used;
3144 vm_map_copy_t copy;
1c79356b 3145
b0d623f7 3146
1c79356b
A
3147 if (host == HOST_NULL)
3148 return KERN_INVALID_HOST;
3149
b0d623f7
A
3150#if defined(__LP64__)
3151 if (!thread_is_64bit(current_thread()))
3152 return KERN_NOT_SUPPORTED;
3153#else
3154 if (thread_is_64bit(current_thread()))
3155 return KERN_NOT_SUPPORTED;
3156#endif
3157
2d21ac55
A
3158 num_fake_zones = sizeof fake_zones / sizeof fake_zones[0];
3159
1c79356b
A
3160 /*
3161 * We assume that zones aren't freed once allocated.
3162 * We won't pick up any zones that are allocated later.
3163 */
3164
3165 simple_lock(&all_zones_lock);
b0d623f7 3166 max_zones = (unsigned int)(num_zones + num_fake_zones);
1c79356b
A
3167 z = first_zone;
3168 simple_unlock(&all_zones_lock);
3169
6d2010ae
A
3170 names_size = round_page(max_zones * sizeof *names);
3171 kr = kmem_alloc_pageable(ipc_kernel_map,
3172 &names_addr, names_size);
3173 if (kr != KERN_SUCCESS)
3174 return kr;
3175 names = (zone_name_t *) names_addr;
3176
3177 info_size = round_page(max_zones * sizeof *info);
3178 kr = kmem_alloc_pageable(ipc_kernel_map,
3179 &info_addr, info_size);
3180 if (kr != KERN_SUCCESS) {
3181 kmem_free(ipc_kernel_map,
3182 names_addr, names_size);
3183 return kr;
1c79356b 3184 }
6d2010ae
A
3185
3186 info = (zone_info_t *) info_addr;
3187
1c79356b
A
3188 zn = &names[0];
3189 zi = &info[0];
3190
6d2010ae 3191 for (i = 0; i < max_zones - num_fake_zones; i++) {
1c79356b
A
3192 struct zone zcopy;
3193
3194 assert(z != ZONE_NULL);
3195
3196 lock_zone(z);
3197 zcopy = *z;
3198 unlock_zone(z);
3199
3200 simple_lock(&all_zones_lock);
3201 z = z->next_zone;
3202 simple_unlock(&all_zones_lock);
3203
3204 /* assuming here the name data is static */
3205 (void) strncpy(zn->zn_name, zcopy.zone_name,
3206 sizeof zn->zn_name);
2d21ac55 3207 zn->zn_name[sizeof zn->zn_name - 1] = '\0';
1c79356b
A
3208
3209 zi->zi_count = zcopy.count;
3210 zi->zi_cur_size = zcopy.cur_size;
3211 zi->zi_max_size = zcopy.max_size;
3212 zi->zi_elem_size = zcopy.elem_size;
3213 zi->zi_alloc_size = zcopy.alloc_size;
3214 zi->zi_exhaustible = zcopy.exhaustible;
3215 zi->zi_collectable = zcopy.collectable;
3216
3217 zn++;
3218 zi++;
3219 }
0c530ab8 3220
2d21ac55
A
3221 /*
3222 * loop through the fake zones and fill them using the specialized
3223 * functions
3224 */
3225 for (i = 0; i < num_fake_zones; i++) {
6d2010ae
A
3226 int caller_acct;
3227 uint64_t sum_space;
2d21ac55
A
3228 strncpy(zn->zn_name, fake_zones[i].name, sizeof zn->zn_name);
3229 zn->zn_name[sizeof zn->zn_name - 1] = '\0';
6d2010ae
A
3230 fake_zones[i].query(&zi->zi_count, &zi->zi_cur_size,
3231 &zi->zi_max_size, &zi->zi_elem_size,
3232 &zi->zi_alloc_size, &sum_space,
3233 &zi->zi_collectable, &zi->zi_exhaustible, &caller_acct);
2d21ac55
A
3234 zn++;
3235 zi++;
3236 }
1c79356b 3237
6d2010ae
A
3238 used = max_zones * sizeof *names;
3239 if (used != names_size)
3240 bzero((char *) (names_addr + used), names_size - used);
1c79356b 3241
6d2010ae
A
3242 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
3243 (vm_map_size_t)names_size, TRUE, &copy);
3244 assert(kr == KERN_SUCCESS);
1c79356b 3245
6d2010ae 3246 *namesp = (zone_name_t *) copy;
1c79356b
A
3247 *namesCntp = max_zones;
3248
6d2010ae
A
3249 used = max_zones * sizeof *info;
3250 if (used != info_size)
3251 bzero((char *) (info_addr + used), info_size - used);
1c79356b 3252
6d2010ae
A
3253 kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
3254 (vm_map_size_t)info_size, TRUE, &copy);
3255 assert(kr == KERN_SUCCESS);
1c79356b 3256
6d2010ae 3257 *infop = (zone_info_t *) copy;
1c79356b
A
3258 *infoCntp = max_zones;
3259
3260 return KERN_SUCCESS;
3261}
3262
b0d623f7 3263extern unsigned int stack_total;
6d2010ae 3264extern unsigned long long stack_allocs;
b0d623f7
A
3265
3266#if defined(__i386__) || defined (__x86_64__)
3267extern unsigned int inuse_ptepages_count;
6d2010ae 3268extern long long alloc_ptepages_count;
b0d623f7
A
3269#endif
3270
3271void zone_display_zprint()
3272{
3273 unsigned int i;
3274 zone_t the_zone;
3275
3276 if(first_zone!=NULL) {
3277 the_zone = first_zone;
3278 for (i = 0; i < num_zones; i++) {
3279 if(the_zone->cur_size > (1024*1024)) {
3280 printf("%.20s:\t%lu\n",the_zone->zone_name,(uintptr_t)the_zone->cur_size);
3281 }
3282
3283 if(the_zone->next_zone == NULL) {
3284 break;
3285 }
3286
3287 the_zone = the_zone->next_zone;
3288 }
3289 }
3290
3291 printf("Kernel Stacks:\t%lu\n",(uintptr_t)(kernel_stack_size * stack_total));
3292
3293#if defined(__i386__) || defined (__x86_64__)
3294 printf("PageTables:\t%lu\n",(uintptr_t)(PAGE_SIZE * inuse_ptepages_count));
3295#endif
3296
3297 printf("Kalloc.Large:\t%lu\n",(uintptr_t)kalloc_large_total);
3298}
3299
3300
3301
1c79356b
A
3302#if MACH_KDB
3303#include <ddb/db_command.h>
3304#include <ddb/db_output.h>
3305#include <kern/kern_print.h>
3306
3307const char *zone_labels =
3308"ENTRY COUNT TOT_SZ MAX_SZ ELT_SZ ALLOC_SZ NAME";
3309
3310/* Forwards */
3311void db_print_zone(
3312 zone_t addr);
3313
3314#if ZONE_DEBUG
3315void db_zone_check_active(
3316 zone_t zone);
3317void db_zone_print_active(
3318 zone_t zone);
3319#endif /* ZONE_DEBUG */
3320void db_zone_print_free(
3321 zone_t zone);
3322void
3323db_print_zone(
3324 zone_t addr)
3325{
3326 struct zone zcopy;
3327
3328 zcopy = *addr;
3329
3330 db_printf("%8x %8x %8x %8x %6x %8x %s ",
3331 addr, zcopy.count, zcopy.cur_size,
3332 zcopy.max_size, zcopy.elem_size,
3333 zcopy.alloc_size, zcopy.zone_name);
3334 if (zcopy.exhaustible)
3335 db_printf("H");
3336 if (zcopy.collectable)
3337 db_printf("C");
3338 if (zcopy.expandable)
3339 db_printf("X");
6d2010ae
A
3340 if (zcopy.caller_acct)
3341 db_printf("A");
1c79356b
A
3342 db_printf("\n");
3343}
3344
3345/*ARGSUSED*/
3346void
2d21ac55
A
3347db_show_one_zone(db_expr_t addr, boolean_t have_addr,
3348 __unused db_expr_t count, __unused char *modif)
1c79356b 3349{
91447636 3350 struct zone *z = (zone_t)((char *)0 + addr);
1c79356b
A
3351
3352 if (z == ZONE_NULL || !have_addr){
3353 db_error("No Zone\n");
3354 /*NOTREACHED*/
3355 }
3356
3357 db_printf("%s\n", zone_labels);
3358 db_print_zone(z);
3359}
3360
3361/*ARGSUSED*/
3362void
2d21ac55
A
3363db_show_all_zones(__unused db_expr_t addr, boolean_t have_addr, db_expr_t count,
3364 __unused char *modif)
1c79356b
A
3365{
3366 zone_t z;
3367 unsigned total = 0;
3368
3369 /*
3370 * Don't risk hanging by unconditionally locking;
3371 * the risk of incoherent data is small (zones aren't freed).
3372 */
3373 have_addr = simple_lock_try(&all_zones_lock);
3374 count = num_zones;
3375 z = first_zone;
3376 if (have_addr) {
3377 simple_unlock(&all_zones_lock);
3378 }
3379
3380 db_printf("%s\n", zone_labels);
3381 for ( ; count > 0; count--) {
3382 if (!z) {
3383 db_error("Mangled Zone List\n");
3384 /*NOTREACHED*/
3385 }
3386 db_print_zone(z);
3387 total += z->cur_size;
3388
3389 have_addr = simple_lock_try(&all_zones_lock);
3390 z = z->next_zone;
3391 if (have_addr) {
3392 simple_unlock(&all_zones_lock);
3393 }
3394 }
3395 db_printf("\nTotal %8x", total);
55e303ae 3396 db_printf("\n\nzone_gc() has reclaimed %d pages\n", zgc_stats.pgs_freed);
1c79356b
A
3397}
3398
3399#if ZONE_DEBUG
3400void
3401db_zone_check_active(
3402 zone_t zone)
3403{
3404 int count = 0;
3405 queue_t tmp_elem;
3406
3407 if (!zone_debug_enabled(zone) || !zone_check)
3408 return;
3409 tmp_elem = queue_first(&zone->active_zones);
3410 while (count < zone->count) {
3411 count++;
3412 if (tmp_elem == 0) {
2d21ac55 3413 printf("unexpected zero element, zone=%p, count=%d\n",
1c79356b
A
3414 zone, count);
3415 assert(FALSE);
3416 break;
3417 }
3418 if (queue_end(tmp_elem, &zone->active_zones)) {
2d21ac55 3419 printf("unexpected queue_end, zone=%p, count=%d\n",
1c79356b
A
3420 zone, count);
3421 assert(FALSE);
3422 break;
3423 }
3424 tmp_elem = queue_next(tmp_elem);
3425 }
3426 if (!queue_end(tmp_elem, &zone->active_zones)) {
2d21ac55 3427 printf("not at queue_end, zone=%p, tmp_elem=%p\n",
1c79356b
A
3428 zone, tmp_elem);
3429 assert(FALSE);
3430 }
3431}
3432
3433void
3434db_zone_print_active(
3435 zone_t zone)
3436{
3437 int count = 0;
3438 queue_t tmp_elem;
3439
3440 if (!zone_debug_enabled(zone)) {
2d21ac55 3441 printf("zone %p debug not enabled\n", zone);
1c79356b
A
3442 return;
3443 }
3444 if (!zone_check) {
3445 printf("zone_check FALSE\n");
3446 return;
3447 }
3448
2d21ac55 3449 printf("zone %p, active elements %d\n", zone, zone->count);
1c79356b
A
3450 printf("active list:\n");
3451 tmp_elem = queue_first(&zone->active_zones);
3452 while (count < zone->count) {
2d21ac55 3453 printf(" %p", tmp_elem);
1c79356b
A
3454 count++;
3455 if ((count % 6) == 0)
3456 printf("\n");
3457 if (tmp_elem == 0) {
3458 printf("\nunexpected zero element, count=%d\n", count);
3459 break;
3460 }
3461 if (queue_end(tmp_elem, &zone->active_zones)) {
3462 printf("\nunexpected queue_end, count=%d\n", count);
3463 break;
3464 }
3465 tmp_elem = queue_next(tmp_elem);
3466 }
3467 if (!queue_end(tmp_elem, &zone->active_zones))
2d21ac55 3468 printf("\nnot at queue_end, tmp_elem=%p\n", tmp_elem);
1c79356b
A
3469 else
3470 printf("\n");
3471}
3472#endif /* ZONE_DEBUG */
3473
3474void
3475db_zone_print_free(
3476 zone_t zone)
3477{
3478 int count = 0;
3479 int freecount;
3480 vm_offset_t elem;
3481
3482 freecount = zone_free_count(zone);
2d21ac55 3483 printf("zone %p, free elements %d\n", zone, freecount);
1c79356b
A
3484 printf("free list:\n");
3485 elem = zone->free_elements;
3486 while (count < freecount) {
3487 printf(" 0x%x", elem);
3488 count++;
3489 if ((count % 6) == 0)
3490 printf("\n");
3491 if (elem == 0) {
3492 printf("\nunexpected zero element, count=%d\n", count);
3493 break;
3494 }
3495 elem = *((vm_offset_t *)elem);
3496 }
3497 if (elem != 0)
3498 printf("\nnot at end of free list, elem=0x%x\n", elem);
3499 else
3500 printf("\n");
3501}
3502
3503#endif /* MACH_KDB */
3504
3505
3506#if ZONE_DEBUG
3507
3508/* should we care about locks here ? */
3509
3510#if MACH_KDB
91447636 3511void *
1c79356b
A
3512next_element(
3513 zone_t z,
91447636 3514 void *prev)
1c79356b 3515{
91447636
A
3516 char *elt = (char *)prev;
3517
1c79356b 3518 if (!zone_debug_enabled(z))
2d21ac55 3519 return(NULL);
55e303ae 3520 elt -= ZONE_DEBUG_OFFSET;
91447636 3521 elt = (char *) queue_next((queue_t) elt);
1c79356b 3522 if ((queue_t) elt == &z->active_zones)
2d21ac55 3523 return(NULL);
55e303ae 3524 elt += ZONE_DEBUG_OFFSET;
1c79356b
A
3525 return(elt);
3526}
3527
91447636 3528void *
1c79356b
A
3529first_element(
3530 zone_t z)
3531{
91447636 3532 char *elt;
1c79356b
A
3533
3534 if (!zone_debug_enabled(z))
2d21ac55 3535 return(NULL);
1c79356b 3536 if (queue_empty(&z->active_zones))
2d21ac55 3537 return(NULL);
91447636 3538 elt = (char *)queue_first(&z->active_zones);
55e303ae 3539 elt += ZONE_DEBUG_OFFSET;
1c79356b
A
3540 return(elt);
3541}
3542
3543/*
3544 * Second arg controls how many zone elements are printed:
3545 * 0 => none
3546 * n, n < 0 => all
3547 * n, n > 0 => last n on active list
3548 */
3549int
3550zone_count(
3551 zone_t z,
3552 int tail)
3553{
91447636 3554 void *elt;
1c79356b
A
3555 int count = 0;
3556 boolean_t print = (tail != 0);
3557
3558 if (tail < 0)
3559 tail = z->count;
3560 if (z->count < tail)
3561 tail = 0;
3562 tail = z->count - tail;
3563 for (elt = first_element(z); elt; elt = next_element(z, elt)) {
3564 if (print && tail <= count)
3565 db_printf("%8x\n", elt);
3566 count++;
3567 }
3568 assert(count == z->count);
3569 return(count);
3570}
3571#endif /* MACH_KDB */
3572
3573#define zone_in_use(z) ( z->count || z->free_elements )
3574
3575void
3576zone_debug_enable(
3577 zone_t z)
3578{
3579 if (zone_debug_enabled(z) || zone_in_use(z) ||
55e303ae 3580 z->alloc_size < (z->elem_size + ZONE_DEBUG_OFFSET))
1c79356b
A
3581 return;
3582 queue_init(&z->active_zones);
55e303ae 3583 z->elem_size += ZONE_DEBUG_OFFSET;
1c79356b
A
3584}
3585
3586void
3587zone_debug_disable(
3588 zone_t z)
3589{
3590 if (!zone_debug_enabled(z) || zone_in_use(z))
3591 return;
55e303ae 3592 z->elem_size -= ZONE_DEBUG_OFFSET;
2d21ac55 3593 z->active_zones.next = z->active_zones.prev = NULL;
1c79356b 3594}
b0d623f7
A
3595
3596
1c79356b 3597#endif /* ZONE_DEBUG */