X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/04b8595b18b1b41ac7a206e4b3d51a635f8413d7..HEAD:/osfmk/vm/vm_map.c diff --git a/osfmk/vm/vm_map.c b/osfmk/vm/vm_map.c index ca11e1bae..abe95ed4a 100644 --- a/osfmk/vm/vm_map.c +++ b/osfmk/vm/vm_map.c @@ -1,8 +1,8 @@ /* - * Copyright (c) 2000-2012 Apple Inc. All rights reserved. + * Copyright (c) 2000-2020 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ - * + * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in @@ -11,10 +11,10 @@ * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. - * + * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. - * + * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, @@ -22,34 +22,34 @@ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. - * + * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* * @OSF_COPYRIGHT@ */ -/* +/* * Mach Operating System * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University * All Rights Reserved. - * + * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. - * + * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. - * + * * Carnegie Mellon requests users of this software to return to - * + * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 - * + * * any improvements or extensions that they make and grant Carnegie Mellon * the rights to redistribute these changes. */ @@ -63,7 +63,6 @@ * Virtual memory mapping module. 
*/ -#include #include #include @@ -82,11 +81,14 @@ #include #include -#include +#include +#include +#include #include -#include +#include #include +#include #include #include #include @@ -94,11 +96,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -109,194 +111,275 @@ #include #include -extern u_int32_t random(void); /* from */ +#include + +#include +#include + +#include +#if DEVELOPMENT || DEBUG +extern int proc_selfcsflags(void); +int panic_on_unsigned_execute = 0; +#endif /* DEVELOPMENT || DEBUG */ + +#if MACH_ASSERT +int debug4k_filter = 0; +char debug4k_proc_name[1024] = ""; +int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT); +int debug4k_panic_on_misaligned_sharing = 0; +const char *debug4k_category_name[] = { + "error", /* 0 */ + "life", /* 1 */ + "load", /* 2 */ + "fault", /* 3 */ + "copy", /* 4 */ + "share", /* 5 */ + "adjust", /* 6 */ + "pmap", /* 7 */ + "mementry", /* 8 */ + "iokit", /* 9 */ + "upl", /* 10 */ + "exc", /* 11 */ + "vfs" /* 12 */ +}; +#endif /* MACH_ASSERT */ +int debug4k_no_cow_copyin = 0; + + +#if __arm64__ +extern const int fourk_binary_compatibility_unsafe; +extern const int fourk_binary_compatibility_allow_wx; +#endif /* __arm64__ */ +extern int proc_selfpid(void); +extern char *proc_name_address(void *p); + +#if VM_MAP_DEBUG_APPLE_PROTECT +int vm_map_debug_apple_protect = 0; +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ +#if VM_MAP_DEBUG_FOURK +int vm_map_debug_fourk = 0; +#endif /* VM_MAP_DEBUG_FOURK */ + +SECURITY_READ_ONLY_LATE(int) vm_map_executable_immutable = 1; +int vm_map_executable_immutable_verbose = 0; + +os_refgrp_decl(static, map_refgrp, "vm_map", NULL); + +extern u_int32_t random(void); /* from */ /* Internal prototypes */ static void vm_map_simplify_range( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end); /* forward */ - -static boolean_t vm_map_range_check( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - vm_map_entry_t *entry); - -static vm_map_entry_t _vm_map_entry_create( - struct vm_map_header *map_header, boolean_t map_locked); - -static void _vm_map_entry_dispose( - struct vm_map_header *map_header, - vm_map_entry_t entry); - -static void vm_map_pmap_enter( - vm_map_t map, - vm_map_offset_t addr, - vm_map_offset_t end_addr, - vm_object_t object, - vm_object_offset_t offset, - vm_prot_t protection); - -static void _vm_map_clip_end( - struct vm_map_header *map_header, - vm_map_entry_t entry, - vm_map_offset_t end); - -static void _vm_map_clip_start( - struct vm_map_header *map_header, - vm_map_entry_t entry, - vm_map_offset_t start); - -static void vm_map_entry_delete( - vm_map_t map, - vm_map_entry_t entry); - -static kern_return_t vm_map_delete( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - int flags, - vm_map_t zap_map); - -static kern_return_t vm_map_copy_overwrite_unaligned( - vm_map_t dst_map, - vm_map_entry_t entry, - vm_map_copy_t copy, + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); /* forward */ + +static boolean_t vm_map_range_check( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_map_entry_t *entry); + +static vm_map_entry_t _vm_map_entry_create( + struct vm_map_header *map_header, boolean_t map_locked); + +static void _vm_map_entry_dispose( + struct vm_map_header *map_header, + vm_map_entry_t entry); + +static void vm_map_pmap_enter( + vm_map_t map, + vm_map_offset_t addr, + vm_map_offset_t end_addr, + vm_object_t object, + vm_object_offset_t offset, + vm_prot_t protection); 
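The debug4k globals introduced above (debug4k_filter, debug4k_proc_filter, debug4k_category_name) act as a per-category bitmask filter for 4K-page debugging output. A minimal sketch of how such a filter is typically consulted; the helper name and log format are assumptions, not part of this diff:

#if MACH_ASSERT
/* Illustrative only: gate a debug message on the debug4k category bitmask. */
static inline void
debug4k_log_example(int category, const char *msg)
{
	if (debug4k_filter & (1 << category)) {
		printf("DEBUG4K[%s]: %s\n", debug4k_category_name[category], msg);
	}
}
#endif /* MACH_ASSERT */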
+ +static void _vm_map_clip_end( + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_map_offset_t end); + +static void _vm_map_clip_start( + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_map_offset_t start); + +static void vm_map_entry_delete( + vm_map_t map, + vm_map_entry_t entry); + +static kern_return_t vm_map_delete( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + int flags, + vm_map_t zap_map); + +static void vm_map_copy_insert( + vm_map_t map, + vm_map_entry_t after_where, + vm_map_copy_t copy); + +static kern_return_t vm_map_copy_overwrite_unaligned( + vm_map_t dst_map, + vm_map_entry_t entry, + vm_map_copy_t copy, vm_map_address_t start, - boolean_t discard_on_success); + boolean_t discard_on_success); -static kern_return_t vm_map_copy_overwrite_aligned( - vm_map_t dst_map, - vm_map_entry_t tmp_entry, - vm_map_copy_t copy, +static kern_return_t vm_map_copy_overwrite_aligned( + vm_map_t dst_map, + vm_map_entry_t tmp_entry, + vm_map_copy_t copy, vm_map_offset_t start, - pmap_t pmap); + pmap_t pmap); -static kern_return_t vm_map_copyin_kernel_buffer( - vm_map_t src_map, +static kern_return_t vm_map_copyin_kernel_buffer( + vm_map_t src_map, vm_map_address_t src_addr, - vm_map_size_t len, - boolean_t src_destroy, - vm_map_copy_t *copy_result); /* OUT */ - -static kern_return_t vm_map_copyout_kernel_buffer( - vm_map_t map, - vm_map_address_t *addr, /* IN/OUT */ - vm_map_copy_t copy, - boolean_t overwrite, - boolean_t consume_on_success); - -static void vm_map_fork_share( - vm_map_t old_map, - vm_map_entry_t old_entry, - vm_map_t new_map); - -static boolean_t vm_map_fork_copy( - vm_map_t old_map, - vm_map_entry_t *old_entry_p, - vm_map_t new_map); - -void vm_map_region_top_walk( - vm_map_entry_t entry, - vm_region_top_info_t top); - -void vm_map_region_walk( - vm_map_t map, - vm_map_offset_t va, - vm_map_entry_t entry, - vm_object_offset_t offset, - vm_object_size_t range, - vm_region_extended_info_t extended, - boolean_t look_for_pages, - mach_msg_type_number_t count); - -static kern_return_t vm_map_wire_nested( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - vm_prot_t access_type, - boolean_t user_wire, - pmap_t map_pmap, - vm_map_offset_t pmap_addr, - ppnum_t *physpage_p); - -static kern_return_t vm_map_unwire_nested( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - boolean_t user_wire, - pmap_t map_pmap, - vm_map_offset_t pmap_addr); - -static kern_return_t vm_map_overwrite_submap_recurse( - vm_map_t dst_map, - vm_map_offset_t dst_addr, - vm_map_size_t dst_size); - -static kern_return_t vm_map_copy_overwrite_nested( - vm_map_t dst_map, - vm_map_offset_t dst_addr, - vm_map_copy_t copy, - boolean_t interruptible, - pmap_t pmap, - boolean_t discard_on_success); - -static kern_return_t vm_map_remap_extract( - vm_map_t map, - vm_map_offset_t addr, - vm_map_size_t size, - boolean_t copy, - struct vm_map_header *map_header, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, - vm_inherit_t inheritance, - boolean_t pageable); - -static kern_return_t vm_map_remap_range_allocate( - vm_map_t map, - vm_map_address_t *address, - vm_map_size_t size, - vm_map_offset_t mask, - int flags, - vm_map_entry_t *map_entry); - -static void vm_map_region_look_for_page( - vm_map_t map, + vm_map_size_t len, + boolean_t src_destroy, + vm_map_copy_t *copy_result); /* OUT */ + +static kern_return_t vm_map_copyout_kernel_buffer( + vm_map_t map, + vm_map_address_t *addr, /* IN/OUT */ + vm_map_copy_t copy, + vm_map_size_t 
copy_size, + boolean_t overwrite, + boolean_t consume_on_success); + +static void vm_map_fork_share( + vm_map_t old_map, + vm_map_entry_t old_entry, + vm_map_t new_map); + +static boolean_t vm_map_fork_copy( + vm_map_t old_map, + vm_map_entry_t *old_entry_p, + vm_map_t new_map, + int vm_map_copyin_flags); + +static kern_return_t vm_map_wire_nested( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, + vm_tag_t tag, + boolean_t user_wire, + pmap_t map_pmap, + vm_map_offset_t pmap_addr, + ppnum_t *physpage_p); + +static kern_return_t vm_map_unwire_nested( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t user_wire, + pmap_t map_pmap, + vm_map_offset_t pmap_addr); + +static kern_return_t vm_map_overwrite_submap_recurse( + vm_map_t dst_map, + vm_map_offset_t dst_addr, + vm_map_size_t dst_size); + +static kern_return_t vm_map_copy_overwrite_nested( + vm_map_t dst_map, + vm_map_offset_t dst_addr, + vm_map_copy_t copy, + boolean_t interruptible, + pmap_t pmap, + boolean_t discard_on_success); + +static kern_return_t vm_map_remap_extract( + vm_map_t map, + vm_map_offset_t addr, + vm_map_size_t size, + boolean_t copy, + struct vm_map_header *map_header, + vm_prot_t *cur_protection, + vm_prot_t *max_protection, + vm_inherit_t inheritance, + vm_map_kernel_flags_t vmk_flags); + +static kern_return_t vm_map_remap_range_allocate( + vm_map_t map, + vm_map_address_t *address, + vm_map_size_t size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + vm_map_entry_t *map_entry); + +static void vm_map_region_look_for_page( + vm_map_t map, vm_map_offset_t va, - vm_object_t object, - vm_object_offset_t offset, + vm_object_t object, + vm_object_offset_t offset, int max_refcnt, - int depth, + unsigned short depth, vm_region_extended_info_t extended, mach_msg_type_number_t count); -static int vm_map_region_count_obj_refs( - vm_map_entry_t entry, - vm_object_t object); +static int vm_map_region_count_obj_refs( + vm_map_entry_t entry, + vm_object_t object); -static kern_return_t vm_map_willneed( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end); +static kern_return_t vm_map_willneed( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); + +static kern_return_t vm_map_reuse_pages( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); -static kern_return_t vm_map_reuse_pages( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end); +static kern_return_t vm_map_reusable_pages( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); -static kern_return_t vm_map_reusable_pages( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end); +static kern_return_t vm_map_can_reuse( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); -static kern_return_t vm_map_can_reuse( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end); +#if MACH_ASSERT +static kern_return_t vm_map_pageout( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end); +#endif /* MACH_ASSERT */ +kern_return_t vm_map_corpse_footprint_collect( + vm_map_t old_map, + vm_map_entry_t old_entry, + vm_map_t new_map); +void vm_map_corpse_footprint_collect_done( + vm_map_t new_map); +void vm_map_corpse_footprint_destroy( + vm_map_t map); +kern_return_t vm_map_corpse_footprint_query_page_info( + vm_map_t map, + vm_map_offset_t va, + int *disposition_p); +void vm_map_footprint_query_page_info( + vm_map_t map, + vm_map_entry_t map_entry, + vm_map_offset_t curr_s_offset, + int 
*disposition_p); + +static const struct vm_map_entry vm_map_entry_template = { + .behavior = VM_BEHAVIOR_DEFAULT, + .inheritance = VM_INHERIT_DEFAULT, +}; + +pid_t find_largest_process_vm_map_entries(void); /* * Macros to copy a vm_map_entry. We must be careful to correctly @@ -308,38 +391,167 @@ static kern_return_t vm_map_can_reuse( * vm_map_copyout. */ -#define vm_map_entry_copy(NEW,OLD) \ -MACRO_BEGIN \ -boolean_t _vmec_reserved = (NEW)->from_reserved_zone; \ - *(NEW) = *(OLD); \ - (NEW)->is_shared = FALSE; \ - (NEW)->needs_wakeup = FALSE; \ - (NEW)->in_transition = FALSE; \ - (NEW)->wired_count = 0; \ - (NEW)->user_wired_count = 0; \ - (NEW)->permanent = FALSE; \ - (NEW)->used_for_jit = FALSE; \ - (NEW)->from_reserved_zone = _vmec_reserved; \ - (NEW)->iokit_acct = FALSE; \ -MACRO_END +static inline void +vm_map_entry_copy_pmap_cs_assoc( + vm_map_t map __unused, + vm_map_entry_t new __unused, + vm_map_entry_t old __unused) +{ + /* when pmap_cs is not enabled, assert as a sanity check */ + assert(new->pmap_cs_associated == FALSE); +} + +/* + * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy(). + * But for security reasons on some platforms, we don't want the + * new mapping to be "used for jit", so we reset the flag here. + */ +static inline void +vm_map_entry_copy_code_signing( + vm_map_t map, + vm_map_entry_t new, + vm_map_entry_t old __unused) +{ + if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) { + assert(new->used_for_jit == old->used_for_jit); + } else { + new->used_for_jit = FALSE; + } +} + +static inline void +vm_map_entry_copy( + vm_map_t map, + vm_map_entry_t new, + vm_map_entry_t old) +{ + boolean_t _vmec_reserved = new->from_reserved_zone; + *new = *old; + new->is_shared = FALSE; + new->needs_wakeup = FALSE; + new->in_transition = FALSE; + new->wired_count = 0; + new->user_wired_count = 0; + new->permanent = FALSE; + vm_map_entry_copy_code_signing(map, new, old); + vm_map_entry_copy_pmap_cs_assoc(map, new, old); + new->from_reserved_zone = _vmec_reserved; + if (new->iokit_acct) { + assertf(!new->use_pmap, "old %p new %p\n", old, new); + new->iokit_acct = FALSE; + new->use_pmap = TRUE; + } + new->vme_resilient_codesign = FALSE; + new->vme_resilient_media = FALSE; + new->vme_atomic = FALSE; + new->vme_no_copy_on_read = FALSE; +} + +static inline void +vm_map_entry_copy_full( + vm_map_entry_t new, + vm_map_entry_t old) +{ + boolean_t _vmecf_reserved = new->from_reserved_zone; + *new = *old; + new->from_reserved_zone = _vmecf_reserved; +} + +/* + * Normal lock_read_to_write() returns FALSE/0 on failure. + * These functions evaluate to zero on success and non-zero value on failure. + */ +__attribute__((always_inline)) +int +vm_map_lock_read_to_write(vm_map_t map) +{ + if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) { + DTRACE_VM(vm_map_lock_upgrade); + return 0; + } + return 1; +} + +__attribute__((always_inline)) +boolean_t +vm_map_try_lock(vm_map_t map) +{ + if (lck_rw_try_lock_exclusive(&(map)->lock)) { + DTRACE_VM(vm_map_lock_w); + return TRUE; + } + return FALSE; +} + +__attribute__((always_inline)) +boolean_t +vm_map_try_lock_read(vm_map_t map) +{ + if (lck_rw_try_lock_shared(&(map)->lock)) { + DTRACE_VM(vm_map_lock_r); + return TRUE; + } + return FALSE; +} + +/* + * Routines to get the page size the caller should + * use while inspecting the target address space. + * Use the "_safely" variant if the caller is dealing with a user-provided + * array whose size depends on the page size, to avoid any overflow or + * underflow of a user-allocated buffer. 
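The two routines defined just below return either a page shift that is safe to expose to the caller or -1. A hedged sketch of the intended calling pattern for the "_safely" variant; the surrounding function and variable names are illustrative, not from this file:

/* Illustrative only: size a caller-visible per-page array without trusting
 * a possibly mismatched page size. */
static kern_return_t
example_count_pages(vm_map_t target_map, vm_map_size_t size, int *num_pages)
{
	int shift = vm_self_region_page_shift_safely(target_map);

	if (shift == -1) {
		/* caller's page size does not match the target map: refuse */
		return KERN_INVALID_ARGUMENT;
	}
	*num_pages = (int)(size >> shift);  /* one array slot per page */
	return KERN_SUCCESS;
}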
+ */ +int +vm_self_region_page_shift_safely( + vm_map_t target_map) +{ + int effective_page_shift = 0; + + if (PAGE_SIZE == (4096)) { + /* x86_64 and 4k watches: always use 4k */ + return PAGE_SHIFT; + } + /* did caller provide an explicit page size for this thread to use? */ + effective_page_shift = thread_self_region_page_shift(); + if (effective_page_shift) { + /* use the explicitly-provided page size */ + return effective_page_shift; + } + /* no explicit page size: use the caller's page size... */ + effective_page_shift = VM_MAP_PAGE_SHIFT(current_map()); + if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) { + /* page size match: safe to use */ + return effective_page_shift; + } + /* page size mismatch */ + return -1; +} +int +vm_self_region_page_shift( + vm_map_t target_map) +{ + int effective_page_shift; + + effective_page_shift = vm_self_region_page_shift_safely(target_map); + if (effective_page_shift == -1) { + /* no safe value but OK to guess for caller */ + effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()), + VM_MAP_PAGE_SHIFT(target_map)); + } + return effective_page_shift; +} -#define vm_map_entry_copy_full(NEW,OLD) \ -MACRO_BEGIN \ -boolean_t _vmecf_reserved = (NEW)->from_reserved_zone; \ -(*(NEW) = *(OLD)); \ -(NEW)->from_reserved_zone = _vmecf_reserved; \ -MACRO_END /* * Decide if we want to allow processes to execute from their data or stack areas. - * override_nx() returns true if we do. Data/stack execution can be enabled independently + * override_nx() returns true if we do. Data/stack execution can be enabled independently * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec * or allow_stack_exec to enable data execution for that type of data area for that particular * ABI (or both by or'ing the flags together). These are initialized in the architecture - * specific pmap files since the default behavior varies according to architecture. The - * main reason it varies is because of the need to provide binary compatibility with old - * applications that were written before these restrictions came into being. In the old - * days, an app could execute anything it could read, but this has slowly been tightened + * specific pmap files since the default behavior varies according to architecture. The + * main reason it varies is because of the need to provide binary compatibility with old + * applications that were written before these restrictions came into being. In the old + * days, an app could execute anything it could read, but this has slowly been tightened * up over time. The default behavior is: * * 32-bit PPC apps may execute from both stack and data areas @@ -347,9 +559,9 @@ MACRO_END * 64-bit PPC/Intel apps may not execute from either data or stack * * An application on any architecture may override these defaults by explicitly - * adding PROT_EXEC permission to the page in question with the mprotect(2) + * adding PROT_EXEC permission to the page in question with the mprotect(2) * system call. This code here just determines what happens when an app tries to - * execute from a page that lacks execute permission. + * execute from a page that lacks execute permission. * * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the * default behavior for both 32 and 64 bit apps on a system-wide basis. 
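As the passage above notes, an application can explicitly request execute permission with mprotect(2). A minimal user-space illustration of that escape hatch; the region is a placeholder, and on current systems code-signing and W^X policy may still deny the request:

#include <sys/mman.h>

/* Illustrative only: add execute permission to a page-aligned region. */
static int
make_executable(void *page_aligned_addr, size_t len)
{
	return mprotect(page_aligned_addr, len, PROT_READ | PROT_EXEC);
}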
Furthermore, @@ -367,22 +579,28 @@ override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */ { int current_abi; + if (map->pmap == kernel_pmap) { + return FALSE; + } + /* * Determine if the app is running in 32 or 64 bit mode. */ - if (vm_map_is_64bit(map)) + if (vm_map_is_64bit(map)) { current_abi = VM_ABI_64; - else + } else { current_abi = VM_ABI_32; + } /* - * Determine if we should allow the execution based on whether it's a + * Determine if we should allow the execution based on whether it's a * stack or data area and the current architecture. */ - if (user_tag == VM_MEMORY_STACK) + if (user_tag == VM_MEMORY_STACK) { return allow_stack_exec & current_abi; + } return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE); } @@ -405,7 +623,7 @@ override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */ * the asymmetric (delayed) strategy is used for shared temporary * objects instead of the symmetric (shadow) strategy. All maps * are now "top level" maps (either task map, kernel map or submap - * of the kernel map). + * of the kernel map). * * Since portions of maps are specified by start/end addreses, * which may not align with existing map entries, all @@ -431,27 +649,71 @@ override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */ * vm_object_copy_strategically() in vm_object.c. */ -static zone_t vm_map_zone; /* zone for vm_map structures */ -static zone_t vm_map_entry_zone; /* zone for vm_map_entry structures */ -static zone_t vm_map_entry_reserved_zone; /* zone with reserve for non-blocking - * allocations */ -static zone_t vm_map_copy_zone; /* zone for vm_map_copy structures */ - +static SECURITY_READ_ONLY_LATE(zone_t) vm_map_zone; /* zone for vm_map structures */ +static SECURITY_READ_ONLY_LATE(zone_t) vm_map_entry_reserved_zone; /* zone with reserve for non-blocking allocations */ +static SECURITY_READ_ONLY_LATE(zone_t) vm_map_copy_zone; /* zone for vm_map_copy structures */ + +SECURITY_READ_ONLY_LATE(zone_t) vm_map_entry_zone; /* zone for vm_map_entry structures */ +SECURITY_READ_ONLY_LATE(zone_t) vm_map_holes_zone; /* zone for vm map holes (vm_map_links) structures */ + +#define VM_MAP_ZONE_NAME "maps" +#define VM_MAP_ZFLAGS ( \ + ZC_NOENCRYPT | \ + ZC_NOGC | \ + ZC_NOGZALLOC | \ + ZC_ALLOW_FOREIGN) + +#define VME_RESERVED_ZONE_NAME "Reserved VM map entries" +#define VM_MAP_RESERVED_ZFLAGS ( \ + ZC_NOENCRYPT | \ + ZC_ALLOW_FOREIGN | \ + ZC_NOCALLOUT | \ + ZC_NOGZALLOC | \ + ZC_KASAN_NOQUARANTINE | \ + ZC_NOGC) + +#define VM_MAP_HOLES_ZONE_NAME "VM map holes" +#define VM_MAP_HOLES_ZFLAGS ( \ + ZC_NOENCRYPT | \ + ZC_NOGC | \ + ZC_NOGZALLOC | \ + ZC_ALLOW_FOREIGN) /* - * Placeholder object for submap operations. This object is dropped - * into the range by a call to vm_map_find, and removed when - * vm_map_submap creates the submap. + * Asserts that a vm_map_copy object is coming from the + * vm_map_copy_zone to ensure that it isn't a fake constructed + * anywhere else. */ +static inline void +vm_map_copy_require(struct vm_map_copy *copy) +{ + zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy); +} -vm_object_t vm_submap_object; +/* + * vm_map_require: + * + * Ensures that the argument is memory allocated from the genuine + * vm map zone. (See zone_id_require_allow_foreign). 
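The zone-origin checks described here (vm_map_copy_require above, vm_map_require just below) are intended to run before trusting a map or copy object handed in from elsewhere. A minimal sketch under that assumption; the consuming routine is hypothetical:

/* Illustrative only: refuse a counterfeit vm_map_copy_t before using it. */
static kern_return_t
example_consume_copy(vm_map_copy_t copy)
{
	vm_map_copy_require(copy);  /* panics if not from vm_map_copy_zone */
	/* ...safe to inspect the copy object from here on... */
	return KERN_SUCCESS;
}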
+ */ +void +vm_map_require(vm_map_t map) +{ + zone_id_require_allow_foreign(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map); +} -static void *map_data; -static vm_size_t map_data_size; -static void *kentry_data; -static vm_size_t kentry_data_size; +static __startup_data vm_offset_t map_data; +static __startup_data vm_size_t map_data_size; +static __startup_data vm_offset_t kentry_data; +static __startup_data vm_size_t kentry_data_size; +static __startup_data vm_offset_t map_holes_data; +static __startup_data vm_size_t map_holes_data_size; +#if XNU_TARGET_OS_OSX #define NO_COALESCE_LIMIT ((1024 * 128) - 1) +#else /* XNU_TARGET_OS_OSX */ +#define NO_COALESCE_LIMIT 0 +#endif /* XNU_TARGET_OS_OSX */ /* Skip acquiring locks if we're in the midst of a kernel core dump */ unsigned int not_in_kdp = 1; @@ -460,12 +722,12 @@ unsigned int vm_map_set_cache_attr_count = 0; kern_return_t vm_map_set_cache_attr( - vm_map_t map, - vm_map_offset_t va) + vm_map_t map, + vm_map_offset_t va) { - vm_map_entry_t map_entry; - vm_object_t object; - kern_return_t kr = KERN_SUCCESS; + vm_map_entry_t map_entry; + vm_object_t object; + kern_return_t kr = KERN_SUCCESS; vm_map_lock_read(map); @@ -477,7 +739,7 @@ vm_map_set_cache_attr( kr = KERN_INVALID_ARGUMENT; goto done; } - object = map_entry->object.vm_object; + object = VME_OBJECT(map_entry); if (object == VM_OBJECT_NULL) { /* @@ -501,7 +763,7 @@ done: #if CONFIG_CODE_DECRYPTION /* * vm_map_apple_protected: - * This remaps the requested part of the object with an object backed by + * This remaps the requested part of the object with an object backed by * the decrypting pager. * crypt_info contains entry points and session data for the crypt module. * The crypt_info block will be copied by vm_map_apple_protected. The data structures @@ -509,98 +771,254 @@ done: */ kern_return_t vm_map_apple_protected( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - struct pager_crypt_info *crypt_info) -{ - boolean_t map_locked; - kern_return_t kr; - vm_map_entry_t map_entry; - memory_object_t protected_mem_obj; - vm_object_t protected_object; - vm_map_offset_t map_addr; - - vm_map_lock_read(map); - map_locked = TRUE; - - /* lookup the protected VM object */ - if (!vm_map_lookup_entry(map, - start, - &map_entry) || - map_entry->vme_end < end || - map_entry->is_sub_map || - !(map_entry->protection & VM_PROT_EXECUTE)) { - /* that memory is not properly mapped */ - kr = KERN_INVALID_ARGUMENT; - goto done; - } - protected_object = map_entry->object.vm_object; - if (protected_object == VM_OBJECT_NULL) { - /* there should be a VM object here at this point */ - kr = KERN_INVALID_ARGUMENT; - goto done; - } - - /* make sure protected object stays alive while map is unlocked */ - vm_object_reference(protected_object); + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_object_offset_t crypto_backing_offset, + struct pager_crypt_info *crypt_info, + uint32_t cryptid) +{ + boolean_t map_locked; + kern_return_t kr; + vm_map_entry_t map_entry; + struct vm_map_entry tmp_entry; + memory_object_t unprotected_mem_obj; + vm_object_t protected_object; + vm_map_offset_t map_addr; + vm_map_offset_t start_aligned, end_aligned; + vm_object_offset_t crypto_start, crypto_end; + int vm_flags; + vm_map_kernel_flags_t vmk_flags; + boolean_t cache_pager; + + vm_flags = 0; + vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; - vm_map_unlock_read(map); map_locked = FALSE; + unprotected_mem_obj = MEMORY_OBJECT_NULL; + + start_aligned = vm_map_trunc_page(start, PAGE_MASK_64); + end_aligned = 
vm_map_round_page(end, PAGE_MASK_64); + start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map)); + end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map)); +#if __arm64__ /* - * Lookup (and create if necessary) the protected memory object - * matching that VM object. - * If successful, this also grabs a reference on the memory object, - * to guarantee that it doesn't go away before we get a chance to map - * it. + * "start" and "end" might be 4K-aligned but not 16K-aligned, + * so we might have to loop and establish up to 3 mappings: + * + * + the first 16K-page, which might overlap with the previous + * 4K-aligned mapping, + * + the center, + * + the last 16K-page, which might overlap with the next + * 4K-aligned mapping. + * Each of these mapping might be backed by a vnode pager (if + * properly page-aligned) or a "fourk_pager", itself backed by a + * vnode pager (if 4K-aligned but not page-aligned). */ - protected_mem_obj = apple_protect_pager_setup(protected_object, crypt_info); +#endif /* __arm64__ */ + + map_addr = start_aligned; + for (map_addr = start_aligned; + map_addr < end; + map_addr = tmp_entry.vme_end) { + vm_map_lock(map); + map_locked = TRUE; - /* release extra ref on protected object */ - vm_object_deallocate(protected_object); + /* lookup the protected VM object */ + if (!vm_map_lookup_entry(map, + map_addr, + &map_entry) || + map_entry->is_sub_map || + VME_OBJECT(map_entry) == VM_OBJECT_NULL) { + /* that memory is not properly mapped */ + kr = KERN_INVALID_ARGUMENT; + goto done; + } - if (protected_mem_obj == NULL) { - kr = KERN_FAILURE; - goto done; - } + /* ensure mapped memory is mapped as executable except + * except for model decryption flow */ + if ((cryptid != CRYPTID_MODEL_ENCRYPTION) && + !(map_entry->protection & VM_PROT_EXECUTE)) { + kr = KERN_INVALID_ARGUMENT; + goto done; + } + + /* get the protected object to be decrypted */ + protected_object = VME_OBJECT(map_entry); + if (protected_object == VM_OBJECT_NULL) { + /* there should be a VM object here at this point */ + kr = KERN_INVALID_ARGUMENT; + goto done; + } + /* ensure protected object stays alive while map is unlocked */ + vm_object_reference(protected_object); + + /* limit the map entry to the area we want to cover */ + vm_map_clip_start(map, map_entry, start_aligned); + vm_map_clip_end(map, map_entry, end_aligned); + + tmp_entry = *map_entry; + map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */ + vm_map_unlock(map); + map_locked = FALSE; + + /* + * This map entry might be only partially encrypted + * (if not fully "page-aligned"). + */ + crypto_start = 0; + crypto_end = tmp_entry.vme_end - tmp_entry.vme_start; + if (tmp_entry.vme_start < start) { + if (tmp_entry.vme_start != start_aligned) { + kr = KERN_INVALID_ADDRESS; + } + crypto_start += (start - tmp_entry.vme_start); + } + if (tmp_entry.vme_end > end) { + if (tmp_entry.vme_end != end_aligned) { + kr = KERN_INVALID_ADDRESS; + } + crypto_end -= (tmp_entry.vme_end - end); + } + + /* + * This "extra backing offset" is needed to get the decryption + * routine to use the right key. It adjusts for the possibly + * relative offset of an interposed "4K" pager... + */ + if (crypto_backing_offset == (vm_object_offset_t) -1) { + crypto_backing_offset = VME_OFFSET(&tmp_entry); + } + + cache_pager = TRUE; +#if XNU_TARGET_OS_OSX + if (vm_map_is_alien(map)) { + cache_pager = FALSE; + } +#endif /* XNU_TARGET_OS_OSX */ + + /* + * Lookup (and create if necessary) the protected memory object + * matching that VM object. 
+ * If successful, this also grabs a reference on the memory object, + * to guarantee that it doesn't go away before we get a chance to map + * it. + */ + unprotected_mem_obj = apple_protect_pager_setup( + protected_object, + VME_OFFSET(&tmp_entry), + crypto_backing_offset, + crypt_info, + crypto_start, + crypto_end, + cache_pager); + + /* release extra ref on protected object */ + vm_object_deallocate(protected_object); + + if (unprotected_mem_obj == NULL) { + kr = KERN_FAILURE; + goto done; + } + + vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE; + /* can overwrite an immutable mapping */ + vmk_flags.vmkf_overwrite_immutable = TRUE; +#if __arm64__ + if (tmp_entry.used_for_jit && + (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT || + PAGE_SHIFT != FOURK_PAGE_SHIFT) && + fourk_binary_compatibility_unsafe && + fourk_binary_compatibility_allow_wx) { + printf("** FOURK_COMPAT [%d]: " + "allowing write+execute at 0x%llx\n", + proc_selfpid(), tmp_entry.vme_start); + vmk_flags.vmkf_map_jit = TRUE; + } +#endif /* __arm64__ */ + + /* map this memory object in place of the current one */ + map_addr = tmp_entry.vme_start; + kr = vm_map_enter_mem_object(map, + &map_addr, + (tmp_entry.vme_end - + tmp_entry.vme_start), + (mach_vm_offset_t) 0, + vm_flags, + vmk_flags, + VM_KERN_MEMORY_NONE, + (ipc_port_t)(uintptr_t) unprotected_mem_obj, + 0, + TRUE, + tmp_entry.protection, + tmp_entry.max_protection, + tmp_entry.inheritance); + assertf(kr == KERN_SUCCESS, + "kr = 0x%x\n", kr); + assertf(map_addr == tmp_entry.vme_start, + "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n", + (uint64_t)map_addr, + (uint64_t) tmp_entry.vme_start, + &tmp_entry); + +#if VM_MAP_DEBUG_APPLE_PROTECT + if (vm_map_debug_apple_protect) { + printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:" + " backing:[object:%p,offset:0x%llx," + "crypto_backing_offset:0x%llx," + "crypto_start:0x%llx,crypto_end:0x%llx]\n", + map, + (uint64_t) map_addr, + (uint64_t) (map_addr + (tmp_entry.vme_end - + tmp_entry.vme_start)), + unprotected_mem_obj, + protected_object, + VME_OFFSET(&tmp_entry), + crypto_backing_offset, + crypto_start, + crypto_end); + } +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ + + /* + * Release the reference obtained by + * apple_protect_pager_setup(). + * The mapping (if it succeeded) is now holding a reference on + * the memory object. + */ + memory_object_deallocate(unprotected_mem_obj); + unprotected_mem_obj = MEMORY_OBJECT_NULL; - /* map this memory object in place of the current one */ - map_addr = start; - kr = vm_map_enter_mem_object(map, - &map_addr, - end - start, - (mach_vm_offset_t) 0, - VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, - (ipc_port_t) protected_mem_obj, - (map_entry->offset + - (start - map_entry->vme_start)), - TRUE, - map_entry->protection, - map_entry->max_protection, - map_entry->inheritance); - assert(map_addr == start); - /* - * Release the reference obtained by apple_protect_pager_setup(). - * The mapping (if it succeeded) is now holding a reference on the - * memory object. 
- */ - memory_object_deallocate(protected_mem_obj); + /* continue with next map entry */ + crypto_backing_offset += (tmp_entry.vme_end - + tmp_entry.vme_start); + crypto_backing_offset -= crypto_start; + } + kr = KERN_SUCCESS; done: if (map_locked) { - vm_map_unlock_read(map); + vm_map_unlock(map); } return kr; } -#endif /* CONFIG_CODE_DECRYPTION */ +#endif /* CONFIG_CODE_DECRYPTION */ -lck_grp_t vm_map_lck_grp; -lck_grp_attr_t vm_map_lck_grp_attr; -lck_attr_t vm_map_lck_attr; -lck_attr_t vm_map_lck_rw_attr; +LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map"); +LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0); +LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG); +#if XNU_TARGET_OS_OSX +int malloc_no_cow = 0; +#else /* XNU_TARGET_OS_OSX */ +int malloc_no_cow = 1; +#endif /* XNU_TARGET_OS_OSX */ +uint64_t vm_memory_malloc_no_cow_mask = 0ULL; +#if DEBUG +int vm_check_map_sanity = 0; +#endif /* * vm_map_init: @@ -623,74 +1041,113 @@ lck_attr_t vm_map_lck_rw_attr; * empty since the very act of allocating memory implies the creation * of a new entry. */ +__startup_func void -vm_map_init( - void) +vm_map_init(void) { - vm_size_t entry_zone_alloc_size; const char *mez_name = "VM map entries"; - vm_map_zone = zinit((vm_map_size_t) sizeof(struct _vm_map), 40*1024, - PAGE_SIZE, "maps"); - zone_change(vm_map_zone, Z_NOENCRYPT, TRUE); -#if defined(__LP64__) - entry_zone_alloc_size = PAGE_SIZE * 5; -#else - entry_zone_alloc_size = PAGE_SIZE * 6; -#endif - vm_map_entry_zone = zinit((vm_map_size_t) sizeof(struct vm_map_entry), - 1024*1024, entry_zone_alloc_size, - mez_name); - zone_change(vm_map_entry_zone, Z_NOENCRYPT, TRUE); - zone_change(vm_map_entry_zone, Z_NOCALLOUT, TRUE); - zone_change(vm_map_entry_zone, Z_GZALLOC_EXEMPT, TRUE); - - vm_map_entry_reserved_zone = zinit((vm_map_size_t) sizeof(struct vm_map_entry), - kentry_data_size * 64, kentry_data_size, - "Reserved VM map entries"); - zone_change(vm_map_entry_reserved_zone, Z_NOENCRYPT, TRUE); - - vm_map_copy_zone = zinit((vm_map_size_t) sizeof(struct vm_map_copy), - 16*1024, PAGE_SIZE, "VM map copies"); - zone_change(vm_map_copy_zone, Z_NOENCRYPT, TRUE); - - /* - * Cram the map and kentry zones with initial data. - * Set reserved_zone non-collectible to aid zone_gc(). 
- */ - zone_change(vm_map_zone, Z_COLLECT, FALSE); - - zone_change(vm_map_entry_reserved_zone, Z_COLLECT, FALSE); - zone_change(vm_map_entry_reserved_zone, Z_EXPAND, FALSE); - zone_change(vm_map_entry_reserved_zone, Z_FOREIGN, TRUE); - zone_change(vm_map_entry_reserved_zone, Z_NOCALLOUT, TRUE); - zone_change(vm_map_entry_reserved_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ - zone_change(vm_map_copy_zone, Z_CALLERACCT, FALSE); /* don't charge caller */ - zone_change(vm_map_entry_reserved_zone, Z_GZALLOC_EXEMPT, TRUE); - - zcram(vm_map_zone, (vm_offset_t)map_data, map_data_size); - zcram(vm_map_entry_reserved_zone, (vm_offset_t)kentry_data, kentry_data_size); - - lck_grp_attr_setdefault(&vm_map_lck_grp_attr); - lck_grp_init(&vm_map_lck_grp, "vm_map", &vm_map_lck_grp_attr); - lck_attr_setdefault(&vm_map_lck_attr); - - lck_attr_setdefault(&vm_map_lck_rw_attr); - lck_attr_cleardebug(&vm_map_lck_rw_attr); -#if CONFIG_FREEZE - default_freezer_init(); -#endif /* CONFIG_FREEZE */ +#if MACH_ASSERT + PE_parse_boot_argn("debug4k_filter", &debug4k_filter, + sizeof(debug4k_filter)); +#endif /* MACH_ASSERT */ + + vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map), + VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL); + + vm_map_entry_zone = zone_create(mez_name, sizeof(struct vm_map_entry), + ZC_NOENCRYPT | ZC_NOGZALLOC | ZC_NOCALLOUT); + + /* + * Don't quarantine because we always need elements available + * Disallow GC on this zone... to aid the GC. + */ + vm_map_entry_reserved_zone = zone_create_ext(VME_RESERVED_ZONE_NAME, + sizeof(struct vm_map_entry), VM_MAP_RESERVED_ZFLAGS, + ZONE_ID_ANY, NULL); + + vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy), + ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL); + + vm_map_holes_zone = zone_create(VM_MAP_HOLES_ZONE_NAME, + sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS); + + /* + * Add the stolen memory to zones, adjust zone size and stolen counts. + */ + zone_cram_foreign(vm_map_zone, map_data, map_data_size); + zone_cram_foreign(vm_map_entry_reserved_zone, kentry_data, kentry_data_size); + zone_cram_foreign(vm_map_holes_zone, map_holes_data, map_holes_data_size); + + /* + * Since these are covered by zones, remove them from stolen page accounting. 
+ */ + VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size)); + +#if VM_MAP_DEBUG_APPLE_PROTECT + PE_parse_boot_argn("vm_map_debug_apple_protect", + &vm_map_debug_apple_protect, + sizeof(vm_map_debug_apple_protect)); +#endif /* VM_MAP_DEBUG_APPLE_PROTECT */ +#if VM_MAP_DEBUG_APPLE_FOURK + PE_parse_boot_argn("vm_map_debug_fourk", + &vm_map_debug_fourk, + sizeof(vm_map_debug_fourk)); +#endif /* VM_MAP_DEBUG_FOURK */ + PE_parse_boot_argn("vm_map_executable_immutable", + &vm_map_executable_immutable, + sizeof(vm_map_executable_immutable)); + PE_parse_boot_argn("vm_map_executable_immutable_verbose", + &vm_map_executable_immutable_verbose, + sizeof(vm_map_executable_immutable_verbose)); + + PE_parse_boot_argn("malloc_no_cow", + &malloc_no_cow, + sizeof(malloc_no_cow)); + if (malloc_no_cow) { + vm_memory_malloc_no_cow_mask = 0ULL; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE; +// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE; +// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED; + vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO; +// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC; + PE_parse_boot_argn("vm_memory_malloc_no_cow_mask", + &vm_memory_malloc_no_cow_mask, + sizeof(vm_memory_malloc_no_cow_mask)); + } + +#if DEBUG + PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity)); + if (vm_check_map_sanity) { + kprintf("VM sanity checking enabled\n"); + } else { + kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n"); + } +#endif /* DEBUG */ + +#if DEVELOPMENT || DEBUG + PE_parse_boot_argn("panic_on_unsigned_execute", + &panic_on_unsigned_execute, + sizeof(panic_on_unsigned_execute)); +#endif /* DEVELOPMENT || DEBUG */ } -void -vm_map_steal_memory( - void) +__startup_func +static void +vm_map_steal_memory(void) { - uint32_t kentry_initial_pages; + uint16_t kentry_initial_pages; + uint16_t zone_foreign_pages; - map_data_size = round_page(10 * sizeof(struct _vm_map)); - map_data = pmap_steal_memory(map_data_size); + map_data_size = zone_get_foreign_alloc_size(VM_MAP_ZONE_NAME, + sizeof(struct _vm_map), VM_MAP_ZFLAGS, 1); /* * kentry_initial_pages corresponds to the number of kernel map entries @@ -698,8 +1155,8 @@ vm_map_steal_memory( * scheme is activated and/or entries are available from the general * map entry pool. */ -#if defined(__LP64__) - kentry_initial_pages = 10; +#if defined(__LP64__) + kentry_initial_pages = (uint16_t)atop(16 * 4096); #else kentry_initial_pages = 6; #endif @@ -707,17 +1164,88 @@ vm_map_steal_memory( #if CONFIG_GZALLOC /* If using the guard allocator, reserve more memory for the kernel * reserved map entry pool. 
- */ - if (gzalloc_enabled()) + */ + if (gzalloc_enabled()) { kentry_initial_pages *= 1024; + } #endif + if (PE_parse_boot_argn("zone_foreign_pages", &zone_foreign_pages, + sizeof(zone_foreign_pages))) { + kentry_initial_pages = zone_foreign_pages; + } + + kentry_data_size = zone_get_foreign_alloc_size(VME_RESERVED_ZONE_NAME, + sizeof(struct vm_map_entry), VM_MAP_RESERVED_ZFLAGS, + kentry_initial_pages); + + map_holes_data_size = zone_get_foreign_alloc_size(VM_MAP_HOLES_ZONE_NAME, + sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS, + kentry_initial_pages); + + /* + * Steal a contiguous range of memory so that a simple range check + * can validate foreign addresses being freed/crammed to these + * zones + */ + vm_size_t total_size; + if (os_add3_overflow(map_data_size, kentry_data_size, + map_holes_data_size, &total_size)) { + panic("vm_map_steal_memory: overflow in amount of memory requested"); + } + map_data = zone_foreign_mem_init(total_size); + kentry_data = map_data + map_data_size; + map_holes_data = kentry_data + kentry_data_size; +} +STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory); + +boolean_t vm_map_supports_hole_optimization = FALSE; + +void +vm_kernel_reserved_entry_init(void) +{ + zone_replenish_configure(vm_map_entry_reserved_zone); + + /* + * Once we have our replenish thread set up, we can start using the vm_map_holes zone. + */ + zone_replenish_configure(vm_map_holes_zone); + vm_map_supports_hole_optimization = TRUE; +} + +void +vm_map_disable_hole_optimization(vm_map_t map) +{ + vm_map_entry_t head_entry, hole_entry, next_hole_entry; + + if (map->holelistenabled) { + head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); + + while (hole_entry != NULL) { + next_hole_entry = hole_entry->vme_next; + + hole_entry->vme_next = NULL; + hole_entry->vme_prev = NULL; + zfree(vm_map_holes_zone, hole_entry); + + if (next_hole_entry == head_entry) { + hole_entry = NULL; + } else { + hole_entry = next_hole_entry; + } + } + + map->holes_list = NULL; + map->holelistenabled = FALSE; - kentry_data_size = kentry_initial_pages * PAGE_SIZE; - kentry_data = pmap_steal_memory(kentry_data_size); + map->first_free = vm_map_first_entry(map); + SAVE_HINT_HOLE_WRITE(map, NULL); + } } -void vm_kernel_reserved_entry_init(void) { - zone_prio_refill_configure(vm_map_entry_reserved_zone, (6*PAGE_SIZE)/sizeof(struct vm_map_entry)); +boolean_t +vm_kernel_map_is_kernel(vm_map_t map) +{ + return map->pmap == kernel_pmap; } /* @@ -727,37 +1255,63 @@ void vm_kernel_reserved_entry_init(void) { * the given physical map structure, and having * the given lower and upper address bounds. 
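vm_map_create() and vm_map_create_options(), defined a little further below, build the empty map described just above. A hedged sketch of a call site; the pmap argument and the address bounds are placeholders, not taken from this diff:

/* Illustrative only: create an empty, pageable 64-bit user map.
 * Returns VM_MAP_NULL only if unknown option bits are passed. */
static vm_map_t
example_create_user_map(pmap_t new_pmap)
{
	return vm_map_create_options(new_pmap,
	    (vm_map_offset_t)MACH_VM_MIN_ADDRESS,
	    (vm_map_offset_t)MACH_VM_MAX_ADDRESS,
	    VM_MAP_CREATE_PAGEABLE);
}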
*/ + vm_map_t vm_map_create( - pmap_t pmap, - vm_map_offset_t min, - vm_map_offset_t max, - boolean_t pageable) + pmap_t pmap, + vm_map_offset_t min, + vm_map_offset_t max, + boolean_t pageable) +{ + int options; + + options = 0; + if (pageable) { + options |= VM_MAP_CREATE_PAGEABLE; + } + return vm_map_create_options(pmap, min, max, options); +} + +vm_map_t +vm_map_create_options( + pmap_t pmap, + vm_map_offset_t min, + vm_map_offset_t max, + int options) { - static int color_seed = 0; - register vm_map_t result; + vm_map_t result; + struct vm_map_links *hole_entry = NULL; + + if (options & ~(VM_MAP_CREATE_ALL_OPTIONS)) { + /* unknown option */ + return VM_MAP_NULL; + } result = (vm_map_t) zalloc(vm_map_zone); - if (result == VM_MAP_NULL) + if (result == VM_MAP_NULL) { panic("vm_map_create"); + } vm_map_first_entry(result) = vm_map_to_entry(result); vm_map_last_entry(result) = vm_map_to_entry(result); result->hdr.nentries = 0; - result->hdr.entries_pageable = pageable; + if (options & VM_MAP_CREATE_PAGEABLE) { + result->hdr.entries_pageable = TRUE; + } else { + result->hdr.entries_pageable = FALSE; + } + + vm_map_store_init( &(result->hdr)); - vm_map_store_init( &(result->hdr) ); - result->hdr.page_shift = PAGE_SHIFT; result->size = 0; - result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */ + result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */ result->user_wire_size = 0; - result->ref_count = 1; -#if TASK_SWAPPER - result->res_count = 1; - result->sw_state = MAP_SW_IN; -#endif /* TASK_SWAPPER */ +#if XNU_TARGET_OS_OSX + result->vmmap_high_start = 0; +#endif + os_ref_init_count(&result->map_refcnt, &map_refgrp, 1); result->pmap = pmap; result->min_offset = min; result->max_offset = max; @@ -768,38 +1322,109 @@ vm_map_create( result->switch_protect = FALSE; result->disable_vmentry_reuse = FALSE; result->map_disallow_data_exec = FALSE; + result->is_nested_map = FALSE; + result->map_disallow_new_exec = FALSE; + result->terminated = FALSE; + result->cs_enforcement = FALSE; result->highest_entry_end = 0; result->first_free = vm_map_to_entry(result); result->hint = vm_map_to_entry(result); - result->color_rr = (color_seed++) & vm_color_mask; - result->jit_entry_exists = FALSE; -#if CONFIG_FREEZE - result->default_freezer_handle = NULL; + result->jit_entry_exists = FALSE; + result->is_alien = FALSE; + result->reserved_regions = FALSE; + result->single_jit = FALSE; + + /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */ + if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) { + result->has_corpse_footprint = TRUE; + result->holelistenabled = FALSE; + result->vmmap_corpse_footprint = NULL; + } else { + result->has_corpse_footprint = FALSE; + if (vm_map_supports_hole_optimization) { + hole_entry = zalloc(vm_map_holes_zone); + + hole_entry->start = min; +#if defined(__arm__) || defined(__arm64__) + hole_entry->end = result->max_offset; +#else + hole_entry->end = (max > (vm_map_offset_t)MACH_VM_MAX_ADDRESS) ? 
max : (vm_map_offset_t)MACH_VM_MAX_ADDRESS; #endif + result->holes_list = result->hole_hint = hole_entry; + hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry); + result->holelistenabled = TRUE; + } else { + result->holelistenabled = FALSE; + } + } + vm_map_lock_init(result); lck_mtx_init_ext(&result->s_lock, &result->s_lock_ext, &vm_map_lck_grp, &vm_map_lck_attr); - - return(result); + + return result; } -/* - * vm_map_entry_create: [ internal use only ] - * - * Allocates a VM map entry for insertion in the - * given map (or map copy). No fields are filled. - */ -#define vm_map_entry_create(map, map_locked) _vm_map_entry_create(&(map)->hdr, map_locked) +vm_map_size_t +vm_map_adjusted_size(vm_map_t map) +{ + struct vm_reserved_region *regions = NULL; + size_t num_regions = 0; + mach_vm_size_t reserved_size = 0, map_size = 0; -#define vm_map_copy_entry_create(copy, map_locked) \ - _vm_map_entry_create(&(copy)->cpy_hdr, map_locked) -unsigned reserved_zalloc_count, nonreserved_zalloc_count; + if (map == NULL || (map->size == 0)) { + return 0; + } + + map_size = map->size; + + if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) { + /* + * No special reserved regions or not an exotic map or the task + * is terminating and these special regions might have already + * been deallocated. + */ + return map_size; + } + + num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions); + assert((num_regions == 0) || (num_regions > 0 && regions != NULL)); + + while (num_regions) { + reserved_size += regions[--num_regions].vmrr_size; + } + + /* + * There are a few places where the map is being switched out due to + * 'termination' without that bit being set (e.g. exec and corpse purging). + * In those cases, we could have the map's regions being deallocated on + * a core while some accounting process is trying to get the map's size. + * So this assert can't be enabled till all those places are uniform in + * their use of the 'map->terminated' bit. + * + * assert(map_size >= reserved_size); + */ + + return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size; +} + +/* + * vm_map_entry_create: [ internal use only ] + * + * Allocates a VM map entry for insertion in the + * given map (or map copy). No fields are filled. 
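The vm_map_entry_create() constructor described just above only allocates; the caller fills the fields before the entry is linked into a map. A hedged sketch of that pattern with placeholder values; the store-layer linking step is omitted:

/* Illustrative only: allocate and partially initialize a new map entry. */
static void
example_fill_entry(vm_map_t map, vm_map_offset_t start, vm_map_offset_t end)
{
	vm_map_entry_t new_entry;

	new_entry = vm_map_entry_create(map, TRUE /* map is locked */);
	new_entry->vme_start = start;
	new_entry->vme_end = end;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;
	new_entry->inheritance = VM_INHERIT_DEFAULT;
	/* object/offset setup and the store-layer link step would follow */
}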
+ */ +#define vm_map_entry_create(map, map_locked) _vm_map_entry_create(&(map)->hdr, map_locked) + +#define vm_map_copy_entry_create(copy, map_locked) \ + _vm_map_entry_create(&(copy)->cpy_hdr, map_locked) +unsigned reserved_zalloc_count, nonreserved_zalloc_count; static vm_map_entry_t _vm_map_entry_create( - struct vm_map_header *map_header, boolean_t __unused map_locked) + struct vm_map_header *map_header, boolean_t __unused map_locked) { - zone_t zone; - vm_map_entry_t entry; + zone_t zone; + vm_map_entry_t entry; zone = vm_map_entry_zone; @@ -807,29 +1432,31 @@ _vm_map_entry_create( if (map_header->entries_pageable) { entry = (vm_map_entry_t) zalloc(zone); - } - else { - entry = (vm_map_entry_t) zalloc_canblock(zone, FALSE); + } else { + entry = (vm_map_entry_t) zalloc_noblock(zone); if (entry == VM_MAP_ENTRY_NULL) { zone = vm_map_entry_reserved_zone; entry = (vm_map_entry_t) zalloc(zone); OSAddAtomic(1, &reserved_zalloc_count); - } else + } else { OSAddAtomic(1, &nonreserved_zalloc_count); + } } - if (entry == VM_MAP_ENTRY_NULL) + if (entry == VM_MAP_ENTRY_NULL) { panic("vm_map_entry_create"); + } + *entry = vm_map_entry_template; entry->from_reserved_zone = (zone == vm_map_entry_reserved_zone); - vm_map_store_update( (vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE); -#if MAP_ENTRY_CREATION_DEBUG + vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE); +#if MAP_ENTRY_CREATION_DEBUG entry->vme_creation_maphdr = map_header; - fastbacktrace(&entry->vme_creation_bt[0], - (sizeof(entry->vme_creation_bt)/sizeof(uintptr_t))); + backtrace(&entry->vme_creation_bt[0], + (sizeof(entry->vme_creation_bt) / sizeof(uintptr_t)), NULL); #endif - return(entry); + return entry; } /* @@ -837,33 +1464,35 @@ _vm_map_entry_create( * * Inverse of vm_map_entry_create. 
* - * write map lock held so no need to + * write map lock held so no need to * do anything special to insure correctness - * of the stores + * of the stores */ -#define vm_map_entry_dispose(map, entry) \ +#define vm_map_entry_dispose(map, entry) \ _vm_map_entry_dispose(&(map)->hdr, (entry)) -#define vm_map_copy_entry_dispose(map, entry) \ +#define vm_map_copy_entry_dispose(copy, entry) \ _vm_map_entry_dispose(&(copy)->cpy_hdr, (entry)) static void _vm_map_entry_dispose( - register struct vm_map_header *map_header, - register vm_map_entry_t entry) + struct vm_map_header *map_header, + vm_map_entry_t entry) { - register zone_t zone; + zone_t zone; - if (map_header->entries_pageable || !(entry->from_reserved_zone)) + if (map_header->entries_pageable || !(entry->from_reserved_zone)) { zone = vm_map_entry_zone; - else + } else { zone = vm_map_entry_reserved_zone; + } if (!map_header->entries_pageable) { - if (zone == vm_map_entry_zone) + if (zone == vm_map_entry_zone) { OSAddAtomic(-1, &nonreserved_zalloc_count); - else + } else { OSAddAtomic(-1, &reserved_zalloc_count); + } } zfree(zone, entry); @@ -873,91 +1502,23 @@ _vm_map_entry_dispose( static boolean_t first_free_check = FALSE; boolean_t first_free_is_valid( - vm_map_t map) + vm_map_t map) { - if (!first_free_check) + if (!first_free_check) { return TRUE; - - return( first_free_is_valid_store( map )); + } + + return first_free_is_valid_store( map ); } #endif /* MACH_ASSERT */ -#define vm_map_copy_entry_link(copy, after_where, entry) \ +#define vm_map_copy_entry_link(copy, after_where, entry) \ _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry)) -#define vm_map_copy_entry_unlink(copy, entry) \ +#define vm_map_copy_entry_unlink(copy, entry) \ _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry)) -#if MACH_ASSERT && TASK_SWAPPER -/* - * vm_map_res_reference: - * - * Adds another valid residence count to the given map. - * - * Map is locked so this function can be called from - * vm_map_swapin. - * - */ -void vm_map_res_reference(register vm_map_t map) -{ - /* assert map is locked */ - assert(map->res_count >= 0); - assert(map->ref_count >= map->res_count); - if (map->res_count == 0) { - lck_mtx_unlock(&map->s_lock); - vm_map_lock(map); - vm_map_swapin(map); - lck_mtx_lock(&map->s_lock); - ++map->res_count; - vm_map_unlock(map); - } else - ++map->res_count; -} - -/* - * vm_map_reference_swap: - * - * Adds valid reference and residence counts to the given map. - * - * The map may not be in memory (i.e. zero residence count). - * - */ -void vm_map_reference_swap(register vm_map_t map) -{ - assert(map != VM_MAP_NULL); - lck_mtx_lock(&map->s_lock); - assert(map->res_count >= 0); - assert(map->ref_count >= map->res_count); - map->ref_count++; - vm_map_res_reference(map); - lck_mtx_unlock(&map->s_lock); -} - -/* - * vm_map_res_deallocate: - * - * Decrement residence count on a map; possibly causing swapout. - * - * The map must be in memory (i.e. non-zero residence count). - * - * The map is locked, so this function is callable from vm_map_deallocate. 
- * - */ -void vm_map_res_deallocate(register vm_map_t map) -{ - assert(map->res_count > 0); - if (--map->res_count == 0) { - lck_mtx_unlock(&map->s_lock); - vm_map_lock(map); - vm_map_swapout(map); - vm_map_unlock(map); - lck_mtx_lock(&map->s_lock); - } - assert(map->ref_count >= map->res_count); -} -#endif /* MACH_ASSERT && TASK_SWAPPER */ - /* * vm_map_destroy: * @@ -965,222 +1526,102 @@ void vm_map_res_deallocate(register vm_map_t map) */ void vm_map_destroy( - vm_map_t map, - int flags) -{ + vm_map_t map, + int flags) +{ vm_map_lock(map); + /* final cleanup: no need to unnest shared region */ + flags |= VM_MAP_REMOVE_NO_UNNESTING; + /* final cleanup: ok to remove immutable mappings */ + flags |= VM_MAP_REMOVE_IMMUTABLE; + /* final cleanup: allow gaps in range */ + flags |= VM_MAP_REMOVE_GAPS_OK; + /* clean up regular map entries */ (void) vm_map_delete(map, map->min_offset, map->max_offset, - flags, VM_MAP_NULL); - /* clean up leftover special mappings (commpage, etc...) */ + flags, VM_MAP_NULL); + /* clean up leftover special mappings (commpage, GPU carveout, etc...) */ +#if !defined(__arm__) (void) vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, - flags, VM_MAP_NULL); + flags, VM_MAP_NULL); +#endif /* !__arm__ */ + + vm_map_disable_hole_optimization(map); + vm_map_corpse_footprint_destroy(map); -#if CONFIG_FREEZE - if (map->default_freezer_handle) { - default_freezer_handle_deallocate(map->default_freezer_handle); - map->default_freezer_handle = NULL; - } -#endif vm_map_unlock(map); assert(map->hdr.nentries == 0); - - if(map->pmap) + + if (map->pmap) { pmap_destroy(map->pmap); + } + + if (vm_map_lck_attr.lck_attr_val & LCK_ATTR_DEBUG) { + /* + * If lock debugging is enabled the mutexes get tagged as LCK_MTX_TAG_INDIRECT. + * And this is regardless of whether the lck_mtx_ext_t is embedded in the + * structure or kalloc'ed via lck_mtx_init. + * An example is s_lock_ext within struct _vm_map. + * + * A lck_mtx_destroy on such a mutex will attempt a kfree and panic. We + * can add another tag to detect embedded vs alloc'ed indirect external + * mutexes but that'll be additional checks in the lock path and require + * updating dependencies for the old vs new tag. + * + * Since the kfree() is for LCK_MTX_TAG_INDIRECT mutexes and that tag is applied + * just when lock debugging is ON, we choose to forego explicitly destroying + * the vm_map mutex and rw lock and, as a consequence, will overflow the reference + * count on vm_map_lck_grp, which has no serious side-effect. + */ + } else { + lck_rw_destroy(&(map)->lock, &vm_map_lck_grp); + lck_mtx_destroy(&(map)->s_lock, &vm_map_lck_grp); + } zfree(vm_map_zone, map); } -#if TASK_SWAPPER /* - * vm_map_swapin/vm_map_swapout - * - * Swap a map in and out, either referencing or releasing its resources. - * These functions are internal use only; however, they must be exported - * because they may be called from macros, which are exported. - * - * In the case of swapout, there could be races on the residence count, - * so if the residence count is up, we return, assuming that a - * vm_map_deallocate() call in the near future will bring us back. - * - * Locking: - * -- We use the map write lock for synchronization among races. - * -- The map write lock, and not the simple s_lock, protects the - * swap state of the map. - * -- If a map entry is a share map, then we hold both locks, in - * hierarchical order. 
- * - * Synchronization Notes: - * 1) If a vm_map_swapin() call happens while swapout in progress, it - * will block on the map lock and proceed when swapout is through. - * 2) A vm_map_reference() call at this time is illegal, and will - * cause a panic. vm_map_reference() is only allowed on resident - * maps, since it refuses to block. - * 3) A vm_map_swapin() call during a swapin will block, and - * proceeed when the first swapin is done, turning into a nop. - * This is the reason the res_count is not incremented until - * after the swapin is complete. - * 4) There is a timing hole after the checks of the res_count, before - * the map lock is taken, during which a swapin may get the lock - * before a swapout about to happen. If this happens, the swapin - * will detect the state and increment the reference count, causing - * the swapout to be a nop, thereby delaying it until a later - * vm_map_deallocate. If the swapout gets the lock first, then - * the swapin will simply block until the swapout is done, and - * then proceed. - * - * Because vm_map_swapin() is potentially an expensive operation, it - * should be used with caution. - * - * Invariants: - * 1) A map with a residence count of zero is either swapped, or - * being swapped. - * 2) A map with a non-zero residence count is either resident, - * or being swapped in. + * Returns pid of the task with the largest number of VM map entries. + * Used in the zone-map-exhaustion jetsam path. */ - -int vm_map_swap_enable = 1; - -void vm_map_swapin (vm_map_t map) +pid_t +find_largest_process_vm_map_entries(void) { - register vm_map_entry_t entry; - - if (!vm_map_swap_enable) /* debug */ - return; - - /* - * Map is locked - * First deal with various races. - */ - if (map->sw_state == MAP_SW_IN) - /* - * we raced with swapout and won. Returning will incr. - * the res_count, turning the swapout into a nop. - */ - return; - - /* - * The residence count must be zero. If we raced with another - * swapin, the state would have been IN; if we raced with a - * swapout (after another competing swapin), we must have lost - * the race to get here (see above comment), in which case - * res_count is still 0. - */ - assert(map->res_count == 0); - - /* - * There are no intermediate states of a map going out or - * coming in, since the map is locked during the transition. - */ - assert(map->sw_state == MAP_SW_OUT); - - /* - * We now operate upon each map entry. If the entry is a sub- - * or share-map, we call vm_map_res_reference upon it. - * If the entry is an object, we call vm_object_res_reference - * (this may iterate through the shadow chain). - * Note that we hold the map locked the entire time, - * even if we get back here via a recursive call in - * vm_map_res_reference. - */ - entry = vm_map_first_entry(map); - - while (entry != vm_map_to_entry(map)) { - if (entry->object.vm_object != VM_OBJECT_NULL) { - if (entry->is_sub_map) { - vm_map_t lmap = entry->object.sub_map; - lck_mtx_lock(&lmap->s_lock); - vm_map_res_reference(lmap); - lck_mtx_unlock(&lmap->s_lock); - } else { - vm_object_t object = entry->object.vm_object; - vm_object_lock(object); - /* - * This call may iterate through the - * shadow chain. 
- */ - vm_object_res_reference(object); - vm_object_unlock(object); - } + pid_t victim_pid = -1; + int max_vm_map_entries = 0; + task_t task = TASK_NULL; + queue_head_t *task_list = &tasks; + + lck_mtx_lock(&tasks_threads_lock); + queue_iterate(task_list, task, task_t, tasks) { + if (task == kernel_task || !task->active) { + continue; } - entry = entry->vme_next; - } - assert(map->sw_state == MAP_SW_OUT); - map->sw_state = MAP_SW_IN; -} - -void vm_map_swapout(vm_map_t map) -{ - register vm_map_entry_t entry; - - /* - * Map is locked - * First deal with various races. - * If we raced with a swapin and lost, the residence count - * will have been incremented to 1, and we simply return. - */ - lck_mtx_lock(&map->s_lock); - if (map->res_count != 0) { - lck_mtx_unlock(&map->s_lock); - return; - } - lck_mtx_unlock(&map->s_lock); - - /* - * There are no intermediate states of a map going out or - * coming in, since the map is locked during the transition. - */ - assert(map->sw_state == MAP_SW_IN); - - if (!vm_map_swap_enable) - return; - /* - * We now operate upon each map entry. If the entry is a sub- - * or share-map, we call vm_map_res_deallocate upon it. - * If the entry is an object, we call vm_object_res_deallocate - * (this may iterate through the shadow chain). - * Note that we hold the map locked the entire time, - * even if we get back here via a recursive call in - * vm_map_res_deallocate. - */ - entry = vm_map_first_entry(map); - - while (entry != vm_map_to_entry(map)) { - if (entry->object.vm_object != VM_OBJECT_NULL) { - if (entry->is_sub_map) { - vm_map_t lmap = entry->object.sub_map; - lck_mtx_lock(&lmap->s_lock); - vm_map_res_deallocate(lmap); - lck_mtx_unlock(&lmap->s_lock); - } else { - vm_object_t object = entry->object.vm_object; - vm_object_lock(object); - /* - * This call may take a long time, - * since it could actively push - * out pages (if we implement it - * that way). - */ - vm_object_res_deallocate(object); - vm_object_unlock(object); + vm_map_t task_map = task->map; + if (task_map != VM_MAP_NULL) { + int task_vm_map_entries = task_map->hdr.nentries; + if (task_vm_map_entries > max_vm_map_entries) { + max_vm_map_entries = task_vm_map_entries; + victim_pid = pid_from_task(task); } } - entry = entry->vme_next; } - assert(map->sw_state == MAP_SW_IN); - map->sw_state = MAP_SW_OUT; + lck_mtx_unlock(&tasks_threads_lock); + + printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries); + return victim_pid; } -#endif /* TASK_SWAPPER */ /* * vm_map_lookup_entry: [ internal use only ] * - * Calls into the vm map store layer to find the map - * entry containing (or immediately preceding) the + * Calls into the vm map store layer to find the map + * entry containing (or immediately preceding) the * specified address in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is @@ -1188,11 +1629,11 @@ void vm_map_swapout(vm_map_t map) */ boolean_t vm_map_lookup_entry( - register vm_map_t map, - register vm_map_offset_t address, - vm_map_entry_t *entry) /* OUT */ + vm_map_t map, + vm_map_offset_t address, + vm_map_entry_t *entry) /* OUT */ { - return ( vm_map_store_lookup_entry( map, address, entry )); + return vm_map_store_lookup_entry( map, address, entry ); } /* @@ -1207,105 +1648,201 @@ vm_map_lookup_entry( * * If an entry is allocated, the object/offset fields * are initialized to zero. + * + * If VM_MAP_FIND_LAST_FREE flag is set, allocate from end of map. 
This + * is currently only used for allocating memory for zones backing + * one of the kalloc heaps.(rdar://65832263) */ kern_return_t vm_map_find_space( - register vm_map_t map, - vm_map_offset_t *address, /* OUT */ - vm_map_size_t size, - vm_map_offset_t mask, - int flags, - vm_map_entry_t *o_entry) /* OUT */ + vm_map_t map, + vm_map_offset_t *address, /* OUT */ + vm_map_size_t size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + vm_map_entry_t *o_entry) /* OUT */ { - register vm_map_entry_t entry, new_entry; - register vm_map_offset_t start; - register vm_map_offset_t end; + vm_map_entry_t entry, new_entry, hole_entry; + vm_map_offset_t start; + vm_map_offset_t end; if (size == 0) { *address = 0; return KERN_INVALID_ARGUMENT; } - if (flags & VM_FLAGS_GUARD_AFTER) { - /* account for the back guard page in the size */ - size += VM_MAP_PAGE_SIZE(map); - } - new_entry = vm_map_entry_create(map, FALSE); + vm_map_lock(map); - /* - * Look for the first possible address; if there's already - * something at this address, we have to start after it. - */ + if (flags & VM_MAP_FIND_LAST_FREE) { + assert(!map->disable_vmentry_reuse); + /* TODO: Make backward lookup generic and support guard pages */ + assert(!vmk_flags.vmkf_guard_after && !vmk_flags.vmkf_guard_before); + assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))); - vm_map_lock(map); + /* Allocate space from end of map */ + vm_map_store_find_last_free(map, &entry); - if( map->disable_vmentry_reuse == TRUE) { - VM_MAP_HIGHEST_ENTRY(map, entry, start); - } else { - assert(first_free_is_valid(map)); - if ((entry = map->first_free) == vm_map_to_entry(map)) - start = map->min_offset; - else - start = entry->vme_end; - } + if (!entry) { + goto noSpace; + } - /* - * In any case, the "entry" always precedes - * the proposed new region throughout the loop: - */ + if (entry == vm_map_to_entry(map)) { + end = map->max_offset; + } else { + end = entry->vme_start; + } - while (TRUE) { - register vm_map_entry_t next; + while (TRUE) { + vm_map_entry_t prev; - /* - * Find the end of the proposed new region. - * Be sure we didn't go beyond the end, or - * wrap around the address. - */ + start = end - size; - if (flags & VM_FLAGS_GUARD_BEFORE) { - /* reserve space for the front guard page */ - start += VM_MAP_PAGE_SIZE(map); - } - end = ((start + mask) & ~mask); - - if (end < start) { - vm_map_entry_dispose(map, new_entry); - vm_map_unlock(map); - return(KERN_NO_SPACE); - } - start = end; - end += size; + if ((start < map->min_offset) || end < start) { + goto noSpace; + } - if ((end > map->max_offset) || (end < start)) { - vm_map_entry_dispose(map, new_entry); - vm_map_unlock(map); - return(KERN_NO_SPACE); + prev = entry->vme_prev; + entry = prev; + + if (prev == vm_map_to_entry(map)) { + break; + } + + if (prev->vme_end <= start) { + break; + } + + /* + * Didn't fit -- move to the next entry. + */ + + end = entry->vme_start; + } + } else { + if (vmk_flags.vmkf_guard_after) { + /* account for the back guard page in the size */ + size += VM_MAP_PAGE_SIZE(map); } /* - * If there are no more entries, we must win. + * Look for the first possible address; if there's already + * something at this address, we have to start after it. 
*/ - next = entry->vme_next; - if (next == vm_map_to_entry(map)) - break; + if (map->disable_vmentry_reuse == TRUE) { + VM_MAP_HIGHEST_ENTRY(map, entry, start); + } else { + if (map->holelistenabled) { + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); - /* - * If there is another entry, it must be - * after the end of the potential new region. - */ + if (hole_entry == NULL) { + /* + * No more space in the map? + */ + goto noSpace; + } - if (next->vme_start >= end) - break; + entry = hole_entry; + start = entry->vme_start; + } else { + assert(first_free_is_valid(map)); + if ((entry = map->first_free) == vm_map_to_entry(map)) { + start = map->min_offset; + } else { + start = entry->vme_end; + } + } + } /* - * Didn't fit -- move to the next entry. + * In any case, the "entry" always precedes + * the proposed new region throughout the loop: */ - entry = next; - start = entry->vme_end; + while (TRUE) { + vm_map_entry_t next; + + /* + * Find the end of the proposed new region. + * Be sure we didn't go beyond the end, or + * wrap around the address. + */ + + if (vmk_flags.vmkf_guard_before) { + /* reserve space for the front guard page */ + start += VM_MAP_PAGE_SIZE(map); + } + end = ((start + mask) & ~mask); + + if (end < start) { + goto noSpace; + } + start = end; + assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); + end += size; + assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); + + if ((end > map->max_offset) || (end < start)) { + goto noSpace; + } + + next = entry->vme_next; + + if (map->holelistenabled) { + if (entry->vme_end >= end) { + break; + } + } else { + /* + * If there are no more entries, we must win. + * + * OR + * + * If there is another entry, it must be + * after the end of the potential new region. + */ + + if (next == vm_map_to_entry(map)) { + break; + } + + if (next->vme_start >= end) { + break; + } + } + + /* + * Didn't fit -- move to the next entry. + */ + + entry = next; + + if (map->holelistenabled) { + if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { + /* + * Wrapped around + */ + goto noSpace; + } + start = entry->vme_start; + } else { + start = entry->vme_end; + } + } + + if (vmk_flags.vmkf_guard_before) { + /* go back for the front guard page */ + start -= VM_MAP_PAGE_SIZE(map); + } + } + + if (map->holelistenabled) { + if (vm_map_lookup_entry(map, entry->vme_start, &entry)) { + panic("Found an existing entry (%p) instead of potential hole at address: 0x%llx.\n", entry, (unsigned long long)entry->vme_start); + } } /* @@ -1318,10 +1855,6 @@ vm_map_find_space( * the map should be locked. 
*/ - if (flags & VM_FLAGS_GUARD_BEFORE) { - /* go back for the front guard page */ - start -= VM_MAP_PAGE_SIZE(map); - } *address = start; assert(start < end); @@ -1330,15 +1863,15 @@ vm_map_find_space( assert(page_aligned(new_entry->vme_start)); assert(page_aligned(new_entry->vme_end)); assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, - VM_MAP_PAGE_MASK(map))); + VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, - VM_MAP_PAGE_MASK(map))); + VM_MAP_PAGE_MASK(map))); new_entry->is_shared = FALSE; new_entry->is_sub_map = FALSE; new_entry->use_pmap = TRUE; - new_entry->object.vm_object = VM_OBJECT_NULL; - new_entry->offset = (vm_object_offset_t) 0; + VME_OBJECT_SET(new_entry, VM_OBJECT_NULL); + VME_OFFSET_SET(new_entry, (vm_object_offset_t) 0); new_entry->needs_copy = FALSE; @@ -1360,19 +1893,25 @@ vm_map_find_space( new_entry->map_aligned = FALSE; } - new_entry->used_for_jit = 0; - - new_entry->alias = 0; + new_entry->used_for_jit = FALSE; + new_entry->pmap_cs_associated = FALSE; new_entry->zero_wired_pages = FALSE; new_entry->iokit_acct = FALSE; + new_entry->vme_resilient_codesign = FALSE; + new_entry->vme_resilient_media = FALSE; + if (vmk_flags.vmkf_atomic_entry) { + new_entry->vme_atomic = TRUE; + } else { + new_entry->vme_atomic = FALSE; + } - VM_GET_FLAGS_ALIAS(flags, new_entry->alias); + VME_ALIAS_SET(new_entry, tag); /* * Insert the new entry into the list */ - vm_map_store_entry_link(map, entry, new_entry); + vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE); map->size += size; @@ -1382,7 +1921,13 @@ vm_map_find_space( SAVE_HINT_MAP_WRITE(map, new_entry); *o_entry = new_entry; - return(KERN_SUCCESS); + return KERN_SUCCESS; + +noSpace: + + vm_map_entry_dispose(map, new_entry); + vm_map_unlock(map); + return KERN_NO_SPACE; } int vm_map_pmap_enter_print = FALSE; @@ -1397,32 +1942,36 @@ int vm_map_pmap_enter_enable = FALSE; * As soon as a page not found in the object the scan ends. * * Returns: - * Nothing. + * Nothing. * * In/out conditions: * The source map should not be locked on entry. */ __unused static void vm_map_pmap_enter( - vm_map_t map, - register vm_map_offset_t addr, - register vm_map_offset_t end_addr, - register vm_object_t object, - vm_object_offset_t offset, - vm_prot_t protection) + vm_map_t map, + vm_map_offset_t addr, + vm_map_offset_t end_addr, + vm_object_t object, + vm_object_offset_t offset, + vm_prot_t protection) { - int type_of_fault; - kern_return_t kr; + int type_of_fault; + kern_return_t kr; + struct vm_object_fault_info fault_info = {}; - if(map->pmap == 0) + if (map->pmap == 0) { return; + } + + assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT); while (addr < end_addr) { - register vm_page_t m; + vm_page_t m; /* - * TODO: + * TODO: * From vm_map_enter(), we come into this function without the map * lock held or the object lock held. * We haven't taken a reference on the object either. @@ -1434,14 +1983,9 @@ vm_map_pmap_enter( vm_object_lock(object); m = vm_page_lookup(object, offset); - /* - * ENCRYPTED SWAP: - * The user should never see encrypted data, so do not - * enter an encrypted page in the page table. 
- */ - if (m == VM_PAGE_NULL || m->busy || m->encrypted || - m->fictitious || - (m->unusual && ( m->error || m->restart || m->absent))) { + + if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious || + (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) { vm_object_unlock(object); return; } @@ -1449,15 +1993,19 @@ vm_map_pmap_enter( if (vm_map_pmap_enter_print) { printf("vm_map_pmap_enter:"); printf("map: %p, addr: %llx, object: %p, offset: %llx\n", - map, (unsigned long long)addr, object, (unsigned long long)offset); + map, (unsigned long long)addr, object, (unsigned long long)offset); } type_of_fault = DBG_CACHE_HIT_FAULT; - kr = vm_fault_enter(m, map->pmap, addr, protection, protection, - VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, - 0, /* XXX need user tag / alias? */ - 0, /* alternate accounting? */ - NULL, - &type_of_fault); + kr = vm_fault_enter(m, map->pmap, + addr, + PAGE_SIZE, 0, + protection, protection, + VM_PAGE_WIRED(m), + FALSE, /* change_wiring */ + VM_KERN_MEMORY_NONE, /* tag - not wiring */ + &fault_info, + NULL, /* need_retry */ + &type_of_fault); vm_object_unlock(object); @@ -1467,65 +2015,71 @@ vm_map_pmap_enter( } boolean_t vm_map_pmap_is_empty( - vm_map_t map, - vm_map_offset_t start, + vm_map_t map, + vm_map_offset_t start, vm_map_offset_t end); -boolean_t vm_map_pmap_is_empty( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end) +boolean_t +vm_map_pmap_is_empty( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) { #ifdef MACHINE_PMAP_IS_EMPTY return pmap_is_empty(map->pmap, start, end); -#else /* MACHINE_PMAP_IS_EMPTY */ - vm_map_offset_t offset; - ppnum_t phys_page; +#else /* MACHINE_PMAP_IS_EMPTY */ + vm_map_offset_t offset; + ppnum_t phys_page; if (map->pmap == NULL) { return TRUE; } for (offset = start; - offset < end; - offset += PAGE_SIZE) { + offset < end; + offset += PAGE_SIZE) { phys_page = pmap_find_phys(map->pmap, offset); if (phys_page) { kprintf("vm_map_pmap_is_empty(%p,0x%llx,0x%llx): " - "page %d at 0x%llx\n", - map, (long long)start, (long long)end, - phys_page, (long long)offset); + "page %d at 0x%llx\n", + map, (long long)start, (long long)end, + phys_page, (long long)offset); return FALSE; } } return TRUE; -#endif /* MACHINE_PMAP_IS_EMPTY */ +#endif /* MACHINE_PMAP_IS_EMPTY */ } -#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000 +#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000 kern_return_t vm_map_random_address_for_size( - vm_map_t map, - vm_map_offset_t *address, - vm_map_size_t size) + vm_map_t map, + vm_map_offset_t *address, + vm_map_size_t size) { - kern_return_t kr = KERN_SUCCESS; - int tries = 0; - vm_map_offset_t random_addr = 0; + kern_return_t kr = KERN_SUCCESS; + int tries = 0; + vm_map_offset_t random_addr = 0; vm_map_offset_t hole_end; - vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL; - vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL; - vm_map_size_t vm_hole_size = 0; - vm_map_size_t addr_space_size; + vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL; + vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL; + vm_map_size_t vm_hole_size = 0; + vm_map_size_t addr_space_size; addr_space_size = vm_map_max(map) - vm_map_min(map); - assert(page_aligned(size)); + assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))); while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) { - random_addr = ((vm_map_offset_t)random()) << PAGE_SHIFT; + if (startup_phase < STARTUP_SUB_ZALLOC) { + random_addr = (vm_map_offset_t)early_random(); + } else { + random_addr = (vm_map_offset_t)random(); + } + random_addr <<= VM_MAP_PAGE_SHIFT(map); 
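/*
 * [Editorial illustration -- not part of the diff above or of the xnu
 * sources. A minimal user-space sketch of the address derivation this
 * hunk performs in vm_map_random_address_for_size(): widen a random value
 * by the map's page shift, wrap it into the usable address-space size,
 * then truncate to a page boundary. All names below
 * (pick_candidate_address, PAGE_SHIFT_ASSUMED) are illustrative stand-ins,
 * not kernel APIs, and 4K pages are assumed.]
 */
#include <stdint.h>

#define PAGE_SHIFT_ASSUMED 12ULL
#define PAGE_MASK_ASSUMED  ((1ULL << PAGE_SHIFT_ASSUMED) - 1)

static uint64_t
pick_candidate_address(uint64_t random_value, uint64_t map_min, uint64_t map_max)
{
	uint64_t space = map_max - map_min;             /* addr_space_size */
	uint64_t r     = random_value << PAGE_SHIFT_ASSUMED;
	uint64_t addr  = map_min + (r % space);         /* wrap into the map */
	return addr & ~PAGE_MASK_ASSUMED;               /* cf. vm_map_trunc_page */
}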
random_addr = vm_map_trunc_page( - vm_map_min(map) +(random_addr % addr_space_size), + vm_map_min(map) + (random_addr % addr_space_size), VM_MAP_PAGE_MASK(map)); if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) { @@ -1554,6 +2108,23 @@ vm_map_random_address_for_size( return kr; } +static boolean_t +vm_memory_malloc_no_cow( + int alias) +{ + uint64_t alias_mask; + + if (alias > 63) { + return FALSE; + } + + alias_mask = 1ULL << alias; + if (alias_mask & vm_memory_malloc_no_cow_mask) { + return TRUE; + } + return FALSE; +} + /* * Routine: vm_map_enter * @@ -1564,48 +2135,69 @@ vm_map_random_address_for_size( * * Arguments are as defined in the vm_map call. */ -int _map_enter_debug = 0; static unsigned int vm_map_enter_restore_successes = 0; static unsigned int vm_map_enter_restore_failures = 0; kern_return_t vm_map_enter( - vm_map_t map, - vm_map_offset_t *address, /* IN/OUT */ - vm_map_size_t size, - vm_map_offset_t mask, - int flags, - vm_object_t object, - vm_object_offset_t offset, - boolean_t needs_copy, - vm_prot_t cur_protection, - vm_prot_t max_protection, - vm_inherit_t inheritance) -{ - vm_map_entry_t entry, new_entry; - vm_map_offset_t start, tmp_start, tmp_offset; - vm_map_offset_t end, tmp_end; - vm_map_offset_t tmp2_start, tmp2_end; - vm_map_offset_t step; - kern_return_t result = KERN_SUCCESS; - vm_map_t zap_old_map = VM_MAP_NULL; - vm_map_t zap_new_map = VM_MAP_NULL; - boolean_t map_locked = FALSE; - boolean_t pmap_empty = TRUE; - boolean_t new_mapping_established = FALSE; - boolean_t keep_map_locked = ((flags & VM_FLAGS_KEEP_MAP_LOCKED) != 0); - boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0); - boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0); - boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0); - boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); - boolean_t is_submap = ((flags & VM_FLAGS_SUBMAP) != 0); - boolean_t permanent = ((flags & VM_FLAGS_PERMANENT) != 0); - boolean_t entry_for_jit = ((flags & VM_FLAGS_MAP_JIT) != 0); - boolean_t iokit_acct = ((flags & VM_FLAGS_IOKIT_ACCT) != 0); - unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT); - char alias; - vm_map_offset_t effective_min_offset, effective_max_offset; - kern_return_t kr; - boolean_t clear_map_aligned = FALSE; + vm_map_t map, + vm_map_offset_t *address, /* IN/OUT */ + vm_map_size_t size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t alias, + vm_object_t object, + vm_object_offset_t offset, + boolean_t needs_copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) +{ + vm_map_entry_t entry, new_entry; + vm_map_offset_t start, tmp_start, tmp_offset; + vm_map_offset_t end, tmp_end; + vm_map_offset_t tmp2_start, tmp2_end; + vm_map_offset_t desired_empty_end; + vm_map_offset_t step; + kern_return_t result = KERN_SUCCESS; + vm_map_t zap_old_map = VM_MAP_NULL; + vm_map_t zap_new_map = VM_MAP_NULL; + boolean_t map_locked = FALSE; + boolean_t pmap_empty = TRUE; + boolean_t new_mapping_established = FALSE; + boolean_t keep_map_locked = vmk_flags.vmkf_keep_map_locked; + boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0); + boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0); + boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0); + boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); + boolean_t is_submap = vmk_flags.vmkf_submap; + boolean_t permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent); + boolean_t no_copy_on_read = 
vmk_flags.vmkf_no_copy_on_read; + boolean_t entry_for_jit = vmk_flags.vmkf_map_jit; + boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct; + boolean_t translated_allow_execute = vmk_flags.vmkf_translated_allow_execute; + boolean_t resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0); + boolean_t resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0); + boolean_t random_address = ((flags & VM_FLAGS_RANDOM_ADDR) != 0); + unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT); + vm_tag_t user_alias; + vm_map_offset_t effective_min_offset, effective_max_offset; + kern_return_t kr; + boolean_t clear_map_aligned = FALSE; + vm_map_entry_t hole_entry; + vm_map_size_t chunk_size = 0; + + assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused); + + if (flags & VM_FLAGS_4GB_CHUNK) { +#if defined(__LP64__) + chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */ +#else /* __LP64__ */ + chunk_size = ANON_CHUNK_SIZE; +#endif /* __LP64__ */ + } else { + chunk_size = ANON_CHUNK_SIZE; + } if (superpage_size) { switch (superpage_size) { @@ -1617,22 +2209,94 @@ vm_map_enter( * with a lookup of the size depending on superpage_size. */ #ifdef __x86_64__ - case SUPERPAGE_SIZE_ANY: - /* handle it like 2 MB and round up to page size */ - size = (size + 2*1024*1024 - 1) & ~(2*1024*1024 - 1); - case SUPERPAGE_SIZE_2MB: - break; + case SUPERPAGE_SIZE_ANY: + /* handle it like 2 MB and round up to page size */ + size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1); + OS_FALLTHROUGH; + case SUPERPAGE_SIZE_2MB: + break; #endif - default: - return KERN_INVALID_ARGUMENT; + default: + return KERN_INVALID_ARGUMENT; } - mask = SUPERPAGE_SIZE-1; - if (size & (SUPERPAGE_SIZE-1)) + mask = SUPERPAGE_SIZE - 1; + if (size & (SUPERPAGE_SIZE - 1)) { return KERN_INVALID_ARGUMENT; - inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */ + } + inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */ + } + + + if ((cur_protection & VM_PROT_WRITE) && + (cur_protection & VM_PROT_EXECUTE) && +#if XNU_TARGET_OS_OSX + map->pmap != kernel_pmap && + (cs_process_global_enforcement() || + (vmk_flags.vmkf_cs_enforcement_override + ? vmk_flags.vmkf_cs_enforcement + : (vm_map_cs_enforcement(map) +#if __arm64__ + || !VM_MAP_IS_EXOTIC(map) +#endif /* __arm64__ */ + ))) && +#endif /* XNU_TARGET_OS_OSX */ + (VM_MAP_POLICY_WX_FAIL(map) || + VM_MAP_POLICY_WX_STRIP_X(map)) && + !entry_for_jit) { + boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map); + + DTRACE_VM3(cs_wx, + uint64_t, 0, + uint64_t, 0, + vm_prot_t, cur_protection); + printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__, + (vm_protect_wx_fail ? "failing" : "turning off execute")); + cur_protection &= ~VM_PROT_EXECUTE; + if (vm_protect_wx_fail) { + return KERN_PROTECTION_FAILURE; + } + } + + /* + * If the task has requested executable lockdown, + * deny any new executable mapping. + */ + if (map->map_disallow_new_exec == TRUE) { + if (cur_protection & VM_PROT_EXECUTE) { + return KERN_PROTECTION_FAILURE; + } } + if (resilient_codesign) { + assert(!is_submap); + int reject_prot = (needs_copy ? 
VM_PROT_EXECUTE : (VM_PROT_WRITE | VM_PROT_EXECUTE)); + if ((cur_protection | max_protection) & reject_prot) { + return KERN_PROTECTION_FAILURE; + } + } + if (resilient_media) { + assert(!is_submap); +// assert(!needs_copy); + if (object != VM_OBJECT_NULL && + !object->internal) { + /* + * This mapping is directly backed by an external + * memory manager (e.g. a vnode pager for a file): + * we would not have any safe place to inject + * a zero-filled page if an actual page is not + * available, without possibly impacting the actual + * contents of the mapped object (e.g. the file), + * so we can't provide any media resiliency here. + */ + return KERN_INVALID_ARGUMENT; + } + } if (is_submap) { if (purgable) { @@ -1644,7 +2308,7 @@ vm_map_enter( return KERN_INVALID_ARGUMENT; } } - if (flags & VM_FLAGS_ALREADY) { + if (vmk_flags.vmkf_already) { /* * VM_FLAGS_ALREADY says that it's OK if the same mapping * is already present. For it to be meaningul, the requested @@ -1660,32 +2324,55 @@ vm_map_enter( effective_min_offset = map->min_offset; - if (flags & VM_FLAGS_BEYOND_MAX) { + if (vmk_flags.vmkf_beyond_max) { /* * Allow an insertion beyond the map's max offset. */ - if (vm_map_is_64bit(map)) +#if !defined(__arm__) + if (vm_map_is_64bit(map)) { effective_max_offset = 0xFFFFFFFFFFFFF000ULL; - else - effective_max_offset = 0x00000000FFFFF000ULL; + } else +#endif /* __arm__ */ + effective_max_offset = 0x00000000FFFFF000ULL; } else { +#if XNU_TARGET_OS_OSX + if (__improbable(vmk_flags.vmkf_32bit_map_va)) { + effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL); + } else { + effective_max_offset = map->max_offset; + } +#else /* XNU_TARGET_OS_OSX */ effective_max_offset = map->max_offset; +#endif /* XNU_TARGET_OS_OSX */ } if (size == 0 || - (offset & PAGE_MASK_64) != 0) { + (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) { *address = 0; return KERN_INVALID_ARGUMENT; } - VM_GET_FLAGS_ALIAS(flags, alias); + if (map->pmap == kernel_pmap) { + user_alias = VM_KERN_MEMORY_NONE; + } else { + user_alias = alias; + } + + if (user_alias == VM_MEMORY_MALLOC_MEDIUM) { + chunk_size = MALLOC_MEDIUM_CHUNK_SIZE; + } -#define RETURN(value) { result = value; goto BailOut; } +#define RETURN(value) { result = value; goto BailOut; } - assert(page_aligned(*address)); - assert(page_aligned(size)); + assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address); + assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size); + if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) { + assertf(page_aligned(*address), "0x%llx", (uint64_t)*address); + assertf(page_aligned(size), "0x%llx", (uint64_t)size); + } - if (!VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) { + if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK && + !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) { /* * In most cases, the caller rounds the size up to the * map's page size. 
@@ -1696,7 +2383,8 @@ vm_map_enter( */ clear_map_aligned = TRUE; } - if (!anywhere && + if (!anywhere && + VM_MAP_PAGE_MASK(map) >= PAGE_MASK && !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) { /* * We've been asked to map at a fixed address and that @@ -1715,11 +2403,12 @@ vm_map_enter( */ if (purgable && (offset != 0 || - (object != VM_OBJECT_NULL && - (object->vo_size != size || - object->purgable == VM_PURGABLE_DENY)) - || size > ANON_MAX_SIZE)) /* LP64todo: remove when dp capable */ + (object != VM_OBJECT_NULL && + (object->vo_size != size || + object->purgable == VM_PURGABLE_DENY)) + || size > ANON_MAX_SIZE)) { /* LP64todo: remove when dp capable */ return KERN_INVALID_ARGUMENT; + } if (!anywhere && overwrite) { /* @@ -1732,25 +2421,33 @@ vm_map_enter( * new mapping fails. */ zap_old_map = vm_map_create(PMAP_NULL, - *address, - *address + size, - map->hdr.entries_pageable); + *address, + *address + size, + map->hdr.entries_pageable); vm_map_set_page_shift(zap_old_map, VM_MAP_PAGE_SHIFT(map)); + vm_map_disable_hole_optimization(zap_old_map); } -StartAgain: ; +StartAgain:; start = *address; if (anywhere) { vm_map_lock(map); map_locked = TRUE; - + if (entry_for_jit) { - if (map->jit_entry_exists) { + if (map->jit_entry_exists && + !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) { result = KERN_INVALID_ARGUMENT; goto BailOut; } + if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) { + random_address = TRUE; + } + } + + if (random_address) { /* * Get a random start address. */ @@ -1760,16 +2457,25 @@ StartAgain: ; } start = *address; } +#if XNU_TARGET_OS_OSX + else if ((start == 0 || start == vm_map_min(map)) && + !map->disable_vmentry_reuse && + map->vmmap_high_start != 0) { + start = map->vmmap_high_start; + } +#endif /* XNU_TARGET_OS_OSX */ /* * Calculate the first possible address. */ - if (start < effective_min_offset) + if (start < effective_min_offset) { start = effective_min_offset; - if (start > effective_max_offset) + } + if (start > effective_max_offset) { RETURN(KERN_NO_SPACE); + } /* * Look for the first possible address; @@ -1777,44 +2483,83 @@ StartAgain: ; * address, we have to start after it. */ - if( map->disable_vmentry_reuse == TRUE) { + if (map->disable_vmentry_reuse == TRUE) { VM_MAP_HIGHEST_ENTRY(map, entry, start); } else { - assert(first_free_is_valid(map)); + if (map->holelistenabled) { + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); - entry = map->first_free; + if (hole_entry == NULL) { + /* + * No more space in the map? + */ + result = KERN_NO_SPACE; + goto BailOut; + } else { + boolean_t found_hole = FALSE; - if (entry == vm_map_to_entry(map)) { - entry = NULL; + do { + if (hole_entry->vme_start >= start) { + start = hole_entry->vme_start; + found_hole = TRUE; + break; + } + + if (hole_entry->vme_end > start) { + found_hole = TRUE; + break; + } + hole_entry = hole_entry->vme_next; + } while (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)); + + if (found_hole == FALSE) { + result = KERN_NO_SPACE; + goto BailOut; + } + + entry = hole_entry; + + if (start == 0) { + start += PAGE_SIZE_64; + } + } } else { - if (entry->vme_next == vm_map_to_entry(map)){ - /* - * Hole at the end of the map. - */ + assert(first_free_is_valid(map)); + + entry = map->first_free; + + if (entry == vm_map_to_entry(map)) { entry = NULL; - } else { - if (start < (entry->vme_next)->vme_start ) { - start = entry->vme_end; - start = vm_map_round_page(start, - VM_MAP_PAGE_MASK(map)); - } else { + } else { + if (entry->vme_next == vm_map_to_entry(map)) { /* - * Need to do a lookup. 
+ * Hole at the end of the map. */ entry = NULL; + } else { + if (start < (entry->vme_next)->vme_start) { + start = entry->vme_end; + start = vm_map_round_page(start, + VM_MAP_PAGE_MASK(map)); + } else { + /* + * Need to do a lookup. + */ + entry = NULL; + } } - } - } + } - if (entry == NULL) { - vm_map_entry_t tmp_entry; - if (vm_map_lookup_entry(map, start, &tmp_entry)) { - assert(!entry_for_jit); - start = tmp_entry->vme_end; - start = vm_map_round_page(start, - VM_MAP_PAGE_MASK(map)); + if (entry == NULL) { + vm_map_entry_t tmp_entry; + if (vm_map_lookup_entry(map, start, &tmp_entry)) { + assert(!entry_for_jit); + start = tmp_entry->vme_end; + start = vm_map_round_page(start, + VM_MAP_PAGE_MASK(map)); + } + entry = tmp_entry; } - entry = tmp_entry; } } @@ -1825,7 +2570,7 @@ StartAgain: ; */ while (TRUE) { - register vm_map_entry_t next; + vm_map_entry_t next; /* * Find the end of the proposed new region. @@ -1835,21 +2580,25 @@ StartAgain: ; end = ((start + mask) & ~mask); end = vm_map_round_page(end, - VM_MAP_PAGE_MASK(map)); - if (end < start) + VM_MAP_PAGE_MASK(map)); + if (end < start) { RETURN(KERN_NO_SPACE); + } start = end; assert(VM_MAP_PAGE_ALIGNED(start, - VM_MAP_PAGE_MASK(map))); + VM_MAP_PAGE_MASK(map))); end += size; - if ((end > effective_max_offset) || (end < start)) { + /* We want an entire page of empty space, but don't increase the allocation size. */ + desired_empty_end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map)); + + if ((desired_empty_end > effective_max_offset) || (desired_empty_end < start)) { if (map->wait_for_space) { assert(!keep_map_locked); if (size <= (effective_max_offset - - effective_min_offset)) { + effective_min_offset)) { assert_wait((event_t)map, - THREAD_ABORTSAFE); + THREAD_ABORTSAFE); vm_map_unlock(map); map_locked = FALSE; thread_block(THREAD_CONTINUE_NULL); @@ -1859,35 +2608,75 @@ StartAgain: ; RETURN(KERN_NO_SPACE); } - /* - * If there are no more entries, we must win. - */ - next = entry->vme_next; - if (next == vm_map_to_entry(map)) - break; - /* - * If there is another entry, it must be - * after the end of the potential new region. - */ + if (map->holelistenabled) { + if (entry->vme_end >= desired_empty_end) { + break; + } + } else { + /* + * If there are no more entries, we must win. + * + * OR + * + * If there is another entry, it must be + * after the end of the potential new region. + */ - if (next->vme_start >= end) - break; + if (next == vm_map_to_entry(map)) { + break; + } + + if (next->vme_start >= desired_empty_end) { + break; + } + } /* * Didn't fit -- move to the next entry. */ entry = next; - start = entry->vme_end; + + if (map->holelistenabled) { + if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { + /* + * Wrapped around + */ + result = KERN_NO_SPACE; + goto BailOut; + } + start = entry->vme_start; + } else { + start = entry->vme_end; + } + start = vm_map_round_page(start, - VM_MAP_PAGE_MASK(map)); + VM_MAP_PAGE_MASK(map)); + } + + if (map->holelistenabled) { + if (vm_map_lookup_entry(map, entry->vme_start, &entry)) { + panic("Found an existing entry (%p) instead of potential hole at address: 0x%llx.\n", entry, (unsigned long long)entry->vme_start); + } } + *address = start; assert(VM_MAP_PAGE_ALIGNED(*address, - VM_MAP_PAGE_MASK(map))); + VM_MAP_PAGE_MASK(map))); } else { + if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT && + !overwrite && + user_alias == VM_MEMORY_REALLOC) { + /* + * Force realloc() to switch to a new allocation, + * to prevent 4k-fragmented virtual ranges. 
+ */ +// DEBUG4K_ERROR("no realloc in place"); + return KERN_NO_SPACE; + } + /* * Verify that: * the address doesn't itself violate @@ -1896,8 +2685,9 @@ StartAgain: ; vm_map_lock(map); map_locked = TRUE; - if ((start & mask) != 0) + if ((start & mask) != 0) { RETURN(KERN_NO_SPACE); + } /* * ... the address is within bounds @@ -1912,15 +2702,21 @@ StartAgain: ; } if (overwrite && zap_old_map != VM_MAP_NULL) { + int remove_flags; /* * Fixed mapping and "overwrite" flag: attempt to * remove all existing mappings in the specified * address range, saving them in our "zap_old_map". */ + remove_flags = VM_MAP_REMOVE_SAVE_ENTRIES; + remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN; + if (vmk_flags.vmkf_overwrite_immutable) { + /* we can overwrite immutable mappings */ + remove_flags |= VM_MAP_REMOVE_IMMUTABLE; + } (void) vm_map_delete(map, start, end, - (VM_MAP_REMOVE_SAVE_ENTRIES | - VM_MAP_REMOVE_NO_MAP_ALIGN), - zap_old_map); + remove_flags, + zap_old_map); } /* @@ -1928,7 +2724,7 @@ StartAgain: ; */ if (vm_map_lookup_entry(map, start, &entry)) { - if (! (flags & VM_FLAGS_ALREADY)) { + if (!(vmk_flags.vmkf_already)) { RETURN(KERN_NO_SPACE); } /* @@ -1939,10 +2735,9 @@ StartAgain: ; if (entry->vme_start < start) { tmp_start -= start - entry->vme_start; tmp_offset -= start - entry->vme_start; - } for (; entry->vme_start < end; - entry = entry->vme_next) { + entry = entry->vme_next) { /* * Check if the mapping's attributes * match the existing map entry. @@ -1950,13 +2745,13 @@ StartAgain: ; if (entry == vm_map_to_entry(map) || entry->vme_start != tmp_start || entry->is_sub_map != is_submap || - entry->offset != tmp_offset || + VME_OFFSET(entry) != tmp_offset || entry->needs_copy != needs_copy || entry->protection != cur_protection || entry->max_protection != max_protection || entry->inheritance != inheritance || entry->iokit_acct != iokit_acct || - entry->alias != alias) { + VME_ALIAS(entry) != alias) { /* not the same mapping ! */ RETURN(KERN_NO_SPACE); } @@ -1964,21 +2759,21 @@ StartAgain: ; * Check if the same object is being mapped. */ if (is_submap) { - if (entry->object.sub_map != + if (VME_SUBMAP(entry) != (vm_map_t) object) { /* not the same submap */ RETURN(KERN_NO_SPACE); } } else { - if (entry->object.vm_object != object) { + if (VME_OBJECT(entry) != object) { /* not the same VM object... */ vm_object_t obj2; - obj2 = entry->object.vm_object; + obj2 = VME_OBJECT(entry); if ((obj2 == VM_OBJECT_NULL || - obj2->internal) && + obj2->internal) && (object == VM_OBJECT_NULL || - object->internal)) { + object->internal)) { /* * ... but both are * anonymous memory, @@ -2007,8 +2802,9 @@ StartAgain: ; */ if ((entry->vme_next != vm_map_to_entry(map)) && - (entry->vme_next->vme_start < end)) + (entry->vme_next->vme_start < end)) { RETURN(KERN_NO_SPACE); + } } /* @@ -2030,11 +2826,13 @@ StartAgain: ; * semantics. 
*/ - if (purgable || entry_for_jit) { + if (purgable || + entry_for_jit || + vm_memory_malloc_no_cow(user_alias)) { if (object == VM_OBJECT_NULL) { object = vm_object_allocate(size); object->copy_strategy = MEMORY_OBJECT_COPY_NONE; - object->true_share = TRUE; + object->true_share = FALSE; if (purgable) { task_t owner; object->purgable = VM_PURGABLE_NONVOLATILE; @@ -2052,7 +2850,7 @@ StartAgain: ; } else { owner = current_task(); } - assert(object->vo_purgeable_owner == NULL); + assert(object->vo_owner == NULL); assert(object->resident_page_count == 0); assert(object->wired_page_count == 0); vm_object_lock(object); @@ -2061,44 +2859,54 @@ StartAgain: ; } offset = (vm_object_offset_t)0; } + } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { + /* no coalescing if address space uses sub-pages */ } else if ((is_submap == FALSE) && - (object == VM_OBJECT_NULL) && - (entry != vm_map_to_entry(map)) && - (entry->vme_end == start) && - (!entry->is_shared) && - (!entry->is_sub_map) && - (!entry->in_transition) && - (!entry->needs_wakeup) && - (entry->behavior == VM_BEHAVIOR_DEFAULT) && - (entry->protection == cur_protection) && - (entry->max_protection == max_protection) && - (entry->inheritance == inheritance) && - ((alias == VM_MEMORY_REALLOC) || (entry->alias == alias)) && - (entry->no_cache == no_cache) && - (entry->permanent == permanent) && - (!entry->superpage_size && !superpage_size) && - /* - * No coalescing if not map-aligned, to avoid propagating - * that condition any further than needed: - */ - (!entry->map_aligned || !clear_map_aligned) && - (!entry->zero_wired_pages) && - (!entry->used_for_jit && !entry_for_jit) && - (entry->iokit_acct == iokit_acct) && - - ((entry->vme_end - entry->vme_start) + size <= - (alias == VM_MEMORY_REALLOC ? - ANON_CHUNK_SIZE : - NO_COALESCE_LIMIT)) && - - (entry->wired_count == 0)) { /* implies user_wired_count == 0 */ - if (vm_object_coalesce(entry->object.vm_object, - VM_OBJECT_NULL, - entry->offset, - (vm_object_offset_t) 0, - (vm_map_size_t)(entry->vme_end - entry->vme_start), - (vm_map_size_t)(end - entry->vme_end))) { - + (object == VM_OBJECT_NULL) && + (entry != vm_map_to_entry(map)) && + (entry->vme_end == start) && + (!entry->is_shared) && + (!entry->is_sub_map) && + (!entry->in_transition) && + (!entry->needs_wakeup) && + (entry->behavior == VM_BEHAVIOR_DEFAULT) && + (entry->protection == cur_protection) && + (entry->max_protection == max_protection) && + (entry->inheritance == inheritance) && + ((user_alias == VM_MEMORY_REALLOC) || + (VME_ALIAS(entry) == alias)) && + (entry->no_cache == no_cache) && + (entry->permanent == permanent) && + /* no coalescing for immutable executable mappings */ + !((entry->protection & VM_PROT_EXECUTE) && + entry->permanent) && + (!entry->superpage_size && !superpage_size) && + /* + * No coalescing if not map-aligned, to avoid propagating + * that condition any further than needed: + */ + (!entry->map_aligned || !clear_map_aligned) && + (!entry->zero_wired_pages) && + (!entry->used_for_jit && !entry_for_jit) && + (!entry->pmap_cs_associated) && + (entry->iokit_acct == iokit_acct) && + (!entry->vme_resilient_codesign) && + (!entry->vme_resilient_media) && + (!entry->vme_atomic) && + (entry->vme_no_copy_on_read == no_copy_on_read) && + + ((entry->vme_end - entry->vme_start) + size <= + (user_alias == VM_MEMORY_REALLOC ? 
+ ANON_CHUNK_SIZE : + NO_COALESCE_LIMIT)) && + + (entry->wired_count == 0)) { /* implies user_wired_count == 0 */ + if (vm_object_coalesce(VME_OBJECT(entry), + VM_OBJECT_NULL, + VME_OFFSET(entry), + (vm_object_offset_t) 0, + (vm_map_size_t)(entry->vme_end - entry->vme_start), + (vm_map_size_t)(end - entry->vme_end))) { /* * Coalesced the two objects - can extend * the previous map entry to include the @@ -2107,9 +2915,16 @@ StartAgain: ; map->size += (end - entry->vme_end); assert(entry->vme_start < end); assert(VM_MAP_PAGE_ALIGNED(end, - VM_MAP_PAGE_MASK(map))); + VM_MAP_PAGE_MASK(map))); + if (__improbable(vm_debug_events)) { + DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end); + } entry->vme_end = end; - vm_map_store_update_first_free(map, map->first_free); + if (map->holelistenabled) { + vm_map_store_update_first_free(map, entry, TRUE); + } else { + vm_map_store_update_first_free(map, map->first_free, TRUE); + } new_mapping_established = TRUE; RETURN(KERN_SUCCESS); } @@ -2118,113 +2933,144 @@ StartAgain: ; step = superpage_size ? SUPERPAGE_SIZE : (end - start); new_entry = NULL; - for (tmp2_start = start; tmp2_start (vm_map_size_t)ANON_CHUNK_SIZE && + size > chunk_size && max_protection != VM_PROT_NONE && - superpage_size == 0) - tmp_end = tmp_start + (vm_map_size_t)ANON_CHUNK_SIZE; - else + superpage_size == 0) { + tmp_end = tmp_start + chunk_size; + } else { tmp_end = tmp2_end; + } do { - new_entry = vm_map_entry_insert(map, entry, tmp_start, tmp_end, - object, offset, needs_copy, - FALSE, FALSE, - cur_protection, max_protection, - VM_BEHAVIOR_DEFAULT, - (entry_for_jit)? VM_INHERIT_NONE: inheritance, - 0, no_cache, - permanent, - superpage_size, - clear_map_aligned, - is_submap); - new_entry->alias = alias; - if (entry_for_jit){ - if (!(map->jit_entry_exists)){ - new_entry->used_for_jit = TRUE; - map->jit_entry_exists = TRUE; + new_entry = vm_map_entry_insert(map, + entry, tmp_start, tmp_end, + object, offset, vmk_flags, + needs_copy, FALSE, FALSE, + cur_protection, max_protection, + VM_BEHAVIOR_DEFAULT, + (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ? + VM_INHERIT_NONE : inheritance), + 0, + no_cache, + permanent, + no_copy_on_read, + superpage_size, + clear_map_aligned, + is_submap, + entry_for_jit, + alias, + translated_allow_execute); + + assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias)); + + if (resilient_codesign) { + int reject_prot = (needs_copy ? VM_PROT_EXECUTE : (VM_PROT_WRITE | VM_PROT_EXECUTE)); + if (!((cur_protection | max_protection) & reject_prot)) { + new_entry->vme_resilient_codesign = TRUE; } } + if (resilient_media && + (object == VM_OBJECT_NULL || + object->internal)) { + new_entry->vme_resilient_media = TRUE; + } + assert(!new_entry->iokit_acct); if (!is_submap && object != VM_OBJECT_NULL && - object->purgable != VM_PURGABLE_DENY) { + (object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag)) { assert(new_entry->use_pmap); assert(!new_entry->iokit_acct); /* * Turn off pmap accounting since - * purgeable objects have their + * purgeable (or tagged) objects have their * own ledgers. 
*/ new_entry->use_pmap = FALSE; } else if (!is_submap && - iokit_acct) { + iokit_acct && + object != VM_OBJECT_NULL && + object->internal) { /* alternate accounting */ assert(!new_entry->iokit_acct); assert(new_entry->use_pmap); new_entry->iokit_acct = TRUE; new_entry->use_pmap = FALSE; + DTRACE_VM4( + vm_map_iokit_mapped_region, + vm_map_t, map, + vm_map_offset_t, new_entry->vme_start, + vm_map_offset_t, new_entry->vme_end, + int, VME_ALIAS(new_entry)); vm_map_iokit_mapped_region( map, (new_entry->vme_end - - new_entry->vme_start)); + new_entry->vme_start)); } else if (!is_submap) { assert(!new_entry->iokit_acct); assert(new_entry->use_pmap); } if (is_submap) { - vm_map_t submap; - boolean_t submap_is_64bit; - boolean_t use_pmap; + vm_map_t submap; + boolean_t submap_is_64bit; + boolean_t use_pmap; assert(new_entry->is_sub_map); assert(!new_entry->use_pmap); assert(!new_entry->iokit_acct); submap = (vm_map_t) object; submap_is_64bit = vm_map_is_64bit(submap); - use_pmap = (alias == VM_MEMORY_SHARED_PMAP); + use_pmap = vmk_flags.vmkf_nested_pmap; #ifndef NO_NESTED_PMAP if (use_pmap && submap->pmap == NULL) { ledger_t ledger = map->pmap->ledger; /* we need a sub pmap to nest... */ - submap->pmap = pmap_create(ledger, 0, - submap_is_64bit); + submap->pmap = pmap_create_options(ledger, 0, + submap_is_64bit ? PMAP_CREATE_64BIT : 0); if (submap->pmap == NULL) { /* let's proceed without nesting... */ } +#if defined(__arm__) || defined(__arm64__) + else { + pmap_set_nested(submap->pmap); + } +#endif } if (use_pmap && submap->pmap != NULL) { - kr = pmap_nest(map->pmap, - submap->pmap, - tmp_start, - tmp_start, - tmp_end - tmp_start); + if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) { + DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap)); + kr = KERN_FAILURE; + } else { + kr = pmap_nest(map->pmap, + submap->pmap, + tmp_start, + tmp_end - tmp_start); + } if (kr != KERN_SUCCESS) { printf("vm_map_enter: " - "pmap_nest(0x%llx,0x%llx) " - "error 0x%x\n", - (long long)tmp_start, - (long long)tmp_end, - kr); + "pmap_nest(0x%llx,0x%llx) " + "error 0x%x\n", + (long long)tmp_start, + (long long)tmp_end, + kr); } else { /* we're now nested ! */ new_entry->use_pmap = TRUE; @@ -2238,38 +3084,44 @@ StartAgain: ; if (superpage_size) { vm_page_t pages, m; vm_object_t sp_object; + vm_object_offset_t sp_offset; - entry->offset = 0; + VME_OFFSET_SET(entry, 0); /* allocate one superpage */ - kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES-1, TRUE, 0); + kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0); if (kr != KERN_SUCCESS) { - new_mapping_established = TRUE; /* will cause deallocation of whole range */ + /* deallocate whole range... */ + new_mapping_established = TRUE; + /* ... 
but only up to "tmp_end" */ + size -= end - tmp_end; RETURN(kr); } /* create one vm_object per superpage */ sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start)); sp_object->phys_contiguous = TRUE; - sp_object->vo_shadow_offset = (vm_object_offset_t)pages->phys_page*PAGE_SIZE; - entry->object.vm_object = sp_object; + sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE; + VME_OBJECT_SET(entry, sp_object); assert(entry->use_pmap); /* enter the base pages into the object */ vm_object_lock(sp_object); - for (offset = 0; offset < SUPERPAGE_SIZE; offset += PAGE_SIZE) { + for (sp_offset = 0; + sp_offset < SUPERPAGE_SIZE; + sp_offset += PAGE_SIZE) { m = pages; - pmap_zero_page(m->phys_page); + pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m)); pages = NEXT_PAGE(m); *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; - vm_page_insert(m, sp_object, offset); + vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK); } vm_object_unlock(sp_object); } - } while (tmp_end != tmp2_end && - (tmp_start = tmp_end) && - (tmp_end = (tmp2_end - tmp_end > (vm_map_size_t)ANON_CHUNK_SIZE) ? - tmp_end + (vm_map_size_t)ANON_CHUNK_SIZE : tmp2_end)); + } while (tmp_end != tmp2_end && + (tmp_start = tmp_end) && + (tmp_end = (tmp2_end - tmp_end > chunk_size) ? + tmp_end + chunk_size : tmp2_end)); } new_mapping_established = TRUE; @@ -2283,10 +3135,10 @@ BailOut: #if DEBUG if (pmap_empty && - !(flags & VM_FLAGS_NO_PMAP_CHECK)) { + !(vmk_flags.vmkf_no_pmap_check)) { assert(vm_map_pmap_is_empty(map, - *address, - *address+size)); + *address, + *address + size)); } #endif /* DEBUG */ @@ -2342,15 +3194,15 @@ BailOut: */ if (result == KERN_SUCCESS) { - /* Wire down the new entry if the user * requested all new map entries be wired. */ - if ((map->wiring_required)||(superpage_size)) { + if ((map->wiring_required) || (superpage_size)) { assert(!keep_map_locked); pmap_empty = FALSE; /* pmap won't be empty */ - kr = vm_map_wire(map, start, end, - new_entry->protection, TRUE); + kr = vm_map_wire_kernel(map, start, end, + new_entry->protection, VM_KERN_MEMORY_MLOCK, + TRUE); result = kr; } @@ -2365,23 +3217,25 @@ BailOut: * that someone else create new mappings that range. */ zap_new_map = vm_map_create(PMAP_NULL, - *address, - *address + size, - map->hdr.entries_pageable); + *address, + *address + size, + map->hdr.entries_pageable); vm_map_set_page_shift(zap_new_map, - VM_MAP_PAGE_SHIFT(map)); + VM_MAP_PAGE_SHIFT(map)); + vm_map_disable_hole_optimization(zap_new_map); + if (!map_locked) { vm_map_lock(map); map_locked = TRUE; } - (void) vm_map_delete(map, *address, *address+size, - (VM_MAP_REMOVE_SAVE_ENTRIES | - VM_MAP_REMOVE_NO_MAP_ALIGN), - zap_new_map); + (void) vm_map_delete(map, *address, *address + size, + (VM_MAP_REMOVE_SAVE_ENTRIES | + VM_MAP_REMOVE_NO_MAP_ALIGN), + zap_new_map); } if (zap_old_map != VM_MAP_NULL && zap_old_map->hdr.nentries != 0) { - vm_map_entry_t entry1, entry2; + vm_map_entry_t entry1, entry2; /* * The new mapping failed. Attempt to restore @@ -2411,16 +3265,17 @@ BailOut: * inserting them all after "entry1". 
*/ for (entry2 = vm_map_first_entry(zap_old_map); - entry2 != vm_map_to_entry(zap_old_map); - entry2 = vm_map_first_entry(zap_old_map)) { + entry2 != vm_map_to_entry(zap_old_map); + entry2 = vm_map_first_entry(zap_old_map)) { vm_map_size_t entry_size; entry_size = (entry2->vme_end - - entry2->vme_start); + entry2->vme_start); vm_map_store_entry_unlink(zap_old_map, - entry2); + entry2); zap_old_map->size -= entry_size; - vm_map_store_entry_link(map, entry1, entry2); + vm_map_store_entry_link(map, entry1, entry2, + VM_MAP_KERNEL_FLAGS_NONE); map->size += entry_size; entry1 = entry2; } @@ -2458,509 +3313,444 @@ BailOut: return result; -#undef RETURN +#undef RETURN } -/* - * Counters for the prefault optimization. - */ -int64_t vm_prefault_nb_pages = 0; -int64_t vm_prefault_nb_bailout = 0; - -static kern_return_t -vm_map_enter_mem_object_helper( - vm_map_t target_map, - vm_map_offset_t *address, - vm_map_size_t initial_size, - vm_map_offset_t mask, - int flags, - ipc_port_t port, - vm_object_offset_t offset, - boolean_t copy, - vm_prot_t cur_protection, - vm_prot_t max_protection, - vm_inherit_t inheritance, - upl_page_list_ptr_t page_list, - unsigned int page_list_count) -{ - vm_map_address_t map_addr; - vm_map_size_t map_size; - vm_object_t object; - vm_object_size_t size; - kern_return_t result; - boolean_t mask_cur_protection, mask_max_protection; - boolean_t try_prefault = (page_list_count != 0); - vm_map_offset_t offset_in_mapping; - - mask_cur_protection = cur_protection & VM_PROT_IS_MASK; - mask_max_protection = max_protection & VM_PROT_IS_MASK; - cur_protection &= ~VM_PROT_IS_MASK; - max_protection &= ~VM_PROT_IS_MASK; +#if __arm64__ +extern const struct memory_object_pager_ops fourk_pager_ops; +kern_return_t +vm_map_enter_fourk( + vm_map_t map, + vm_map_offset_t *address, /* IN/OUT */ + vm_map_size_t size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t alias, + vm_object_t object, + vm_object_offset_t offset, + boolean_t needs_copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) +{ + vm_map_entry_t entry, new_entry; + vm_map_offset_t start, fourk_start; + vm_map_offset_t end, fourk_end; + vm_map_size_t fourk_size; + kern_return_t result = KERN_SUCCESS; + vm_map_t zap_old_map = VM_MAP_NULL; + vm_map_t zap_new_map = VM_MAP_NULL; + boolean_t map_locked = FALSE; + boolean_t pmap_empty = TRUE; + boolean_t new_mapping_established = FALSE; + boolean_t keep_map_locked = vmk_flags.vmkf_keep_map_locked; + boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0); + boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0); + boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0); + boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0); + boolean_t is_submap = vmk_flags.vmkf_submap; + boolean_t permanent = vmk_flags.vmkf_permanent; + boolean_t no_copy_on_read = vmk_flags.vmkf_permanent; + boolean_t entry_for_jit = vmk_flags.vmkf_map_jit; +// boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct; + boolean_t translated_allow_execute = vmk_flags.vmkf_translated_allow_execute; + unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT); + vm_map_offset_t effective_min_offset, effective_max_offset; + kern_return_t kr; + boolean_t clear_map_aligned = FALSE; + memory_object_t fourk_mem_obj; + vm_object_t fourk_object; + vm_map_offset_t fourk_pager_offset; + int fourk_pager_index_start, fourk_pager_index_num; + int cur_idx; + boolean_t fourk_copy; + vm_object_t copy_object; + vm_object_offset_t 
copy_offset; + + if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { + panic("%s:%d\n", __FUNCTION__, __LINE__); + } + fourk_mem_obj = MEMORY_OBJECT_NULL; + fourk_object = VM_OBJECT_NULL; - /* - * Check arguments for validity - */ - if ((target_map == VM_MAP_NULL) || - (cur_protection & ~VM_PROT_ALL) || - (max_protection & ~VM_PROT_ALL) || - (inheritance > VM_INHERIT_LAST_VALID) || - (try_prefault && (copy || !page_list)) || - initial_size == 0) - return KERN_INVALID_ARGUMENT; - - map_addr = vm_map_trunc_page(*address, - VM_MAP_PAGE_MASK(target_map)); - map_size = vm_map_round_page(initial_size, - VM_MAP_PAGE_MASK(target_map)); - size = vm_object_round_page(initial_size); + if (superpage_size) { + return KERN_NOT_SUPPORTED; + } + + if ((cur_protection & VM_PROT_WRITE) && + (cur_protection & VM_PROT_EXECUTE) && +#if XNU_TARGET_OS_OSX + map->pmap != kernel_pmap && + (vm_map_cs_enforcement(map) +#if __arm64__ + || !VM_MAP_IS_EXOTIC(map) +#endif /* __arm64__ */ + ) && +#endif /* XNU_TARGET_OS_OSX */ + !entry_for_jit) { + DTRACE_VM3(cs_wx, + uint64_t, 0, + uint64_t, 0, + vm_prot_t, cur_protection); + printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. " + "turning off execute\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + cur_protection &= ~VM_PROT_EXECUTE; + } /* - * Find the vm object (if any) corresponding to this port. + * If the task has requested executable lockdown, + * deny any new executable mapping. */ - if (!IP_VALID(port)) { - object = VM_OBJECT_NULL; - offset = 0; - copy = FALSE; - } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) { - vm_named_entry_t named_entry; - - named_entry = (vm_named_entry_t) port->ip_kobject; - - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - offset += named_entry->data_offset; - } - - /* a few checks to make sure user is obeying rules */ - if (size == 0) { - if (offset >= named_entry->size) - return KERN_INVALID_RIGHT; - size = named_entry->size - offset; + if (map->map_disallow_new_exec == TRUE) { + if (cur_protection & VM_PROT_EXECUTE) { + return KERN_PROTECTION_FAILURE; } - if (mask_max_protection) { - max_protection &= named_entry->protection; - } - if (mask_cur_protection) { - cur_protection &= named_entry->protection; - } - if ((named_entry->protection & max_protection) != - max_protection) - return KERN_INVALID_RIGHT; - if ((named_entry->protection & cur_protection) != - cur_protection) - return KERN_INVALID_RIGHT; - if (offset + size < offset) { - /* overflow */ - return KERN_INVALID_ARGUMENT; - } - if (named_entry->size < (offset + size)) - return KERN_INVALID_ARGUMENT; + } - if (named_entry->is_copy) { - /* for a vm_map_copy, we can only map it whole */ - if ((size != named_entry->size) && - (vm_map_round_page(size, - VM_MAP_PAGE_MASK(target_map)) == - named_entry->size)) { - /* XXX FBDP use the rounded size... */ - size = vm_map_round_page( - size, - VM_MAP_PAGE_MASK(target_map)); - } - - if (!(flags & VM_FLAGS_ANYWHERE) && - (offset != 0 || - size != named_entry->size)) { - /* - * XXX for a mapping at a "fixed" address, - * we can't trim after mapping the whole - * memory entry, so reject a request for a - * partial mapping. 
- */ - return KERN_INVALID_ARGUMENT; - } - } + if (is_submap) { + return KERN_NOT_SUPPORTED; + } + if (vmk_flags.vmkf_already) { + return KERN_NOT_SUPPORTED; + } + if (purgable || entry_for_jit) { + return KERN_NOT_SUPPORTED; + } - /* the callers parameter offset is defined to be the */ - /* offset from beginning of named entry offset in object */ - offset = offset + named_entry->offset; - - if (! VM_MAP_PAGE_ALIGNED(size, - VM_MAP_PAGE_MASK(target_map))) { - /* - * Let's not map more than requested; - * vm_map_enter() will handle this "not map-aligned" - * case. - */ - map_size = size; - } + effective_min_offset = map->min_offset; - named_entry_lock(named_entry); - if (named_entry->is_sub_map) { - vm_map_t submap; + if (vmk_flags.vmkf_beyond_max) { + return KERN_NOT_SUPPORTED; + } else { + effective_max_offset = map->max_offset; + } - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap."); - } + if (size == 0 || + (offset & FOURK_PAGE_MASK) != 0) { + *address = 0; + return KERN_INVALID_ARGUMENT; + } - submap = named_entry->backing.map; - vm_map_lock(submap); - vm_map_reference(submap); - vm_map_unlock(submap); - named_entry_unlock(named_entry); +#define RETURN(value) { result = value; goto BailOut; } - result = vm_map_enter(target_map, - &map_addr, - map_size, - mask, - flags | VM_FLAGS_SUBMAP, - (vm_object_t) submap, - offset, - copy, - cur_protection, - max_protection, - inheritance); - if (result != KERN_SUCCESS) { - vm_map_deallocate(submap); - } else { - /* - * No need to lock "submap" just to check its - * "mapped" flag: that flag is never reset - * once it's been set and if we race, we'll - * just end up setting it twice, which is OK. - */ - if (submap->mapped_in_other_pmaps == FALSE && - vm_map_pmap(submap) != PMAP_NULL && - vm_map_pmap(submap) != - vm_map_pmap(target_map)) { - /* - * This submap is being mapped in a map - * that uses a different pmap. - * Set its "mapped_in_other_pmaps" flag - * to indicate that we now need to - * remove mappings from all pmaps rather - * than just the submap's pmap. - */ - vm_map_lock(submap); - submap->mapped_in_other_pmaps = TRUE; - vm_map_unlock(submap); - } - *address = map_addr; - } - return result; + assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK)); + assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK)); - } else if (named_entry->is_pager) { - unsigned int access; - vm_prot_t protections; - unsigned int wimg_mode; + if (!anywhere && overwrite) { + return KERN_NOT_SUPPORTED; + } + if (!anywhere && overwrite) { + /* + * Create a temporary VM map to hold the old mappings in the + * affected area while we create the new one. + * This avoids releasing the VM map lock in + * vm_map_entry_delete() and allows atomicity + * when we want to replace some mappings with a new one. + * It also allows us to restore the old VM mappings if the + * new mapping fails. 
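 *
 * The "zap map" dance on this fixed+overwrite path, roughly (every
 * call named here appears below in this function):
 *
 *	zap_old_map = vm_map_create(PMAP_NULL, *address,
 *	    *address + size, map->hdr.entries_pageable);
 *	(void) vm_map_delete(map, start, end,
 *	    VM_MAP_REMOVE_SAVE_ENTRIES | VM_MAP_REMOVE_NO_MAP_ALIGN,
 *	    zap_old_map);
 *	... establish the new mapping ...
 *	on failure: unlink each saved entry from zap_old_map and
 *	    vm_map_store_entry_link() it back into "map";
 *	finally:    vm_map_destroy(zap_old_map,
 *	                VM_MAP_REMOVE_NO_PMAP_CLEANUP);
 *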
+ */ + zap_old_map = vm_map_create(PMAP_NULL, + *address, + *address + size, + map->hdr.entries_pageable); + vm_map_set_page_shift(zap_old_map, VM_MAP_PAGE_SHIFT(map)); + vm_map_disable_hole_optimization(zap_old_map); + } - protections = named_entry->protection & VM_PROT_ALL; - access = GET_MAP_MEM(named_entry->protection); + fourk_start = *address; + fourk_size = size; + fourk_end = fourk_start + fourk_size; - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap."); - } + start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map)); + end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map)); + size = end - start; - object = vm_object_enter(named_entry->backing.pager, - named_entry->size, - named_entry->internal, - FALSE, - FALSE); - if (object == VM_OBJECT_NULL) { - named_entry_unlock(named_entry); - return KERN_INVALID_OBJECT; - } + if (anywhere) { + return KERN_NOT_SUPPORTED; + } else { + /* + * Verify that: + * the address doesn't itself violate + * the mask requirement. + */ - /* JMM - drop reference on pager here */ + vm_map_lock(map); + map_locked = TRUE; + if ((start & mask) != 0) { + RETURN(KERN_NO_SPACE); + } - /* create an extra ref for the named entry */ - vm_object_lock(object); - vm_object_reference_locked(object); - named_entry->backing.object = object; - named_entry->is_pager = FALSE; - named_entry_unlock(named_entry); + /* + * ... the address is within bounds + */ - wimg_mode = object->wimg_bits; + end = start + size; - if (access == MAP_MEM_IO) { - wimg_mode = VM_WIMG_IO; - } else if (access == MAP_MEM_COPYBACK) { - wimg_mode = VM_WIMG_USE_DEFAULT; - } else if (access == MAP_MEM_INNERWBACK) { - wimg_mode = VM_WIMG_INNERWBACK; - } else if (access == MAP_MEM_WTHRU) { - wimg_mode = VM_WIMG_WTHRU; - } else if (access == MAP_MEM_WCOMB) { - wimg_mode = VM_WIMG_WCOMB; - } + if ((start < effective_min_offset) || + (end > effective_max_offset) || + (start >= end)) { + RETURN(KERN_INVALID_ADDRESS); + } - /* wait for object (if any) to be ready */ - if (!named_entry->internal) { - while (!object->pager_ready) { - vm_object_wait( - object, - VM_OBJECT_EVENT_PAGER_READY, - THREAD_UNINT); - vm_object_lock(object); - } - } + if (overwrite && zap_old_map != VM_MAP_NULL) { + /* + * Fixed mapping and "overwrite" flag: attempt to + * remove all existing mappings in the specified + * address range, saving them in our "zap_old_map". + */ + (void) vm_map_delete(map, start, end, + (VM_MAP_REMOVE_SAVE_ENTRIES | + VM_MAP_REMOVE_NO_MAP_ALIGN), + zap_old_map); + } - if (object->wimg_bits != wimg_mode) - vm_object_change_wimg_mode(object, wimg_mode); + /* + * ... the starting address isn't allocated + */ + if (vm_map_lookup_entry(map, start, &entry)) { + vm_object_t cur_object, shadow_object; -#if VM_OBJECT_TRACKING_OP_TRUESHARE - if (!object->true_share && - vm_object_tracking_inited) { - void *bt[VM_OBJECT_TRACKING_BTDEPTH]; - int num = 0; + /* + * We might already some 4K mappings + * in a 16K page here. 
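 *
 * (that is, an earlier mapping may already have installed a "fourk"
 * pager for this 16K page; if so, the existing entry and its pager
 * are reused: the protections are merged and the caller's object is
 * slotted into the same pager below.)
 *
 * The sub-page bookkeeping used further down amounts to, roughly:
 *
 *	fourk_pager_offset      = fourk_start & SIXTEENK_PAGE_MASK;
 *	fourk_pager_index_start = fourk_pager_offset / FOURK_PAGE_SIZE;
 *	fourk_pager_index_num   = MIN(fourk_size / FOURK_PAGE_SIZE,
 *	                              4 - fourk_pager_index_start);
 *
 * i.e. each 16K VM page is backed by up to four 4K slots that
 * fourk_pager_populate() later fills with the caller's object/offset
 * pairs.
 *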
+ */ - num = OSBacktrace(bt, - VM_OBJECT_TRACKING_BTDEPTH); - btlog_add_entry(vm_object_tracking_btlog, - object, - VM_OBJECT_TRACKING_OP_TRUESHARE, - bt, - num); + if (entry->vme_end - entry->vme_start + != SIXTEENK_PAGE_SIZE) { + RETURN(KERN_NO_SPACE); } -#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */ - - object->true_share = TRUE; - - if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) - object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; - vm_object_unlock(object); - - } else if (named_entry->is_copy) { - kern_return_t kr; - vm_map_copy_t copy_map; - vm_map_entry_t copy_entry; - vm_map_offset_t copy_addr; - - if (flags & ~(VM_FLAGS_FIXED | - VM_FLAGS_ANYWHERE | - VM_FLAGS_OVERWRITE | - VM_FLAGS_RETURN_DATA_ADDR)) { - named_entry_unlock(named_entry); - return KERN_INVALID_ARGUMENT; + if (entry->is_sub_map) { + RETURN(KERN_NO_SPACE); } - - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - offset_in_mapping = offset - vm_object_trunc_page(offset); - offset = vm_object_trunc_page(offset); - map_size = vm_object_round_page(offset + offset_in_mapping + initial_size) - offset; + if (VME_OBJECT(entry) == VM_OBJECT_NULL) { + RETURN(KERN_NO_SPACE); } - copy_map = named_entry->backing.copy; - assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST); - if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) { - /* unsupported type; should not happen */ - printf("vm_map_enter_mem_object: " - "memory_entry->backing.copy " - "unsupported type 0x%x\n", - copy_map->type); - named_entry_unlock(named_entry); - return KERN_INVALID_ARGUMENT; + /* go all the way down the shadow chain */ + cur_object = VME_OBJECT(entry); + vm_object_lock(cur_object); + while (cur_object->shadow != VM_OBJECT_NULL) { + shadow_object = cur_object->shadow; + vm_object_lock(shadow_object); + vm_object_unlock(cur_object); + cur_object = shadow_object; + shadow_object = VM_OBJECT_NULL; } - - /* reserve a contiguous range */ - kr = vm_map_enter(target_map, - &map_addr, - /* map whole mem entry, trim later: */ - named_entry->size, - mask, - flags & (VM_FLAGS_ANYWHERE | - VM_FLAGS_OVERWRITE | - VM_FLAGS_RETURN_DATA_ADDR), - VM_OBJECT_NULL, - 0, - FALSE, /* copy */ - cur_protection, - max_protection, - inheritance); - if (kr != KERN_SUCCESS) { - named_entry_unlock(named_entry); - return kr; + if (cur_object->internal || + cur_object->pager == NULL) { + vm_object_unlock(cur_object); + RETURN(KERN_NO_SPACE); } + if (cur_object->pager->mo_pager_ops + != &fourk_pager_ops) { + vm_object_unlock(cur_object); + RETURN(KERN_NO_SPACE); + } + fourk_object = cur_object; + fourk_mem_obj = fourk_object->pager; + + /* keep the "4K" object alive */ + vm_object_reference_locked(fourk_object); + memory_object_reference(fourk_mem_obj); + vm_object_unlock(fourk_object); + + /* merge permissions */ + entry->protection |= cur_protection; + entry->max_protection |= max_protection; + if ((entry->protection & (VM_PROT_WRITE | + VM_PROT_EXECUTE)) == + (VM_PROT_WRITE | VM_PROT_EXECUTE) && + fourk_binary_compatibility_unsafe && + fourk_binary_compatibility_allow_wx) { + /* write+execute: need to be "jit" */ + entry->used_for_jit = TRUE; + } + goto map_in_fourk_pager; + } - copy_addr = map_addr; - - for (copy_entry = vm_map_copy_first_entry(copy_map); - copy_entry != vm_map_copy_to_entry(copy_map); - copy_entry = copy_entry->vme_next) { - int remap_flags = 0; - vm_map_t copy_submap; - vm_object_t copy_object; - vm_map_size_t copy_size; - vm_object_offset_t copy_offset; - - copy_offset = copy_entry->offset; - copy_size = (copy_entry->vme_end - - copy_entry->vme_start); - - /* 
sanity check */ - if ((copy_addr + copy_size) > - (map_addr + - named_entry->size /* XXX full size */ )) { - /* over-mapping too much !? */ - kr = KERN_INVALID_ARGUMENT; - /* abort */ - break; - } + /* + * ... the next region doesn't overlap the + * end point. + */ - /* take a reference on the object */ - if (copy_entry->is_sub_map) { - remap_flags |= VM_FLAGS_SUBMAP; - copy_submap = - copy_entry->object.sub_map; - vm_map_lock(copy_submap); - vm_map_reference(copy_submap); - vm_map_unlock(copy_submap); - copy_object = (vm_object_t) copy_submap; - } else { - copy_object = - copy_entry->object.vm_object; - vm_object_reference(copy_object); - } + if ((entry->vme_next != vm_map_to_entry(map)) && + (entry->vme_next->vme_start < end)) { + RETURN(KERN_NO_SPACE); + } + } - /* over-map the object into destination */ - remap_flags |= flags; - remap_flags |= VM_FLAGS_FIXED; - remap_flags |= VM_FLAGS_OVERWRITE; - remap_flags &= ~VM_FLAGS_ANYWHERE; - kr = vm_map_enter(target_map, - ©_addr, - copy_size, - (vm_map_offset_t) 0, - remap_flags, - copy_object, - copy_offset, - copy, - cur_protection, - max_protection, - inheritance); - if (kr != KERN_SUCCESS) { - if (copy_entry->is_sub_map) { - vm_map_deallocate(copy_submap); - } else { - vm_object_deallocate(copy_object); - } - /* abort */ - break; - } + /* + * At this point, + * "start" and "end" should define the endpoints of the + * available new range, and + * "entry" should refer to the region before the new + * range, and + * + * the map should be locked. + */ - /* next mapping */ - copy_addr += copy_size; - } - - if (kr == KERN_SUCCESS) { - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - *address = map_addr + offset_in_mapping; - } else { - *address = map_addr; - } + /* create a new "4K" pager */ + fourk_mem_obj = fourk_pager_create(); + fourk_object = fourk_pager_to_vm_object(fourk_mem_obj); + assert(fourk_object); + + /* keep the "4" object alive */ + vm_object_reference(fourk_object); + + /* create a "copy" object, to map the "4K" object copy-on-write */ + fourk_copy = TRUE; + result = vm_object_copy_strategically(fourk_object, + 0, + end - start, + ©_object, + ©_offset, + &fourk_copy); + assert(result == KERN_SUCCESS); + assert(copy_object != VM_OBJECT_NULL); + assert(copy_offset == 0); + + /* map the "4K" pager's copy object */ + new_entry = + vm_map_entry_insert(map, entry, + vm_map_trunc_page(start, + VM_MAP_PAGE_MASK(map)), + vm_map_round_page(end, + VM_MAP_PAGE_MASK(map)), + copy_object, + 0, /* offset */ + vmk_flags, + FALSE, /* needs_copy */ + FALSE, + FALSE, + cur_protection, max_protection, + VM_BEHAVIOR_DEFAULT, + (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ? + VM_INHERIT_NONE : inheritance), + 0, + no_cache, + permanent, + no_copy_on_read, + superpage_size, + clear_map_aligned, + is_submap, + FALSE, /* jit */ + alias, + translated_allow_execute); + entry = new_entry; + +#if VM_MAP_DEBUG_FOURK + if (vm_map_debug_fourk) { + printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n", + map, + (uint64_t) entry->vme_start, + (uint64_t) entry->vme_end, + fourk_mem_obj); + } +#endif /* VM_MAP_DEBUG_FOURK */ - if (offset) { - /* - * Trim in front, from 0 to "offset". - */ - vm_map_remove(target_map, - map_addr, - map_addr + offset, - 0); - *address += offset; - } - if (offset + map_size < named_entry->size) { - /* - * Trim in back, from - * "offset + map_size" to - * "named_entry->size". 
- */ - vm_map_remove(target_map, - (map_addr + - offset + map_size), - (map_addr + - named_entry->size), - 0); - } - } - named_entry_unlock(named_entry); + new_mapping_established = TRUE; - if (kr != KERN_SUCCESS) { - if (! (flags & VM_FLAGS_OVERWRITE)) { - /* deallocate the contiguous range */ - (void) vm_deallocate(target_map, - map_addr, - map_size); - } +map_in_fourk_pager: + /* "map" the original "object" where it belongs in the "4K" pager */ + fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK); + fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE); + if (fourk_size > SIXTEENK_PAGE_SIZE) { + fourk_pager_index_num = 4; + } else { + fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE); + } + if (fourk_pager_index_start + fourk_pager_index_num > 4) { + fourk_pager_index_num = 4 - fourk_pager_index_start; + } + for (cur_idx = 0; + cur_idx < fourk_pager_index_num; + cur_idx++) { + vm_object_t old_object; + vm_object_offset_t old_offset; + + kr = fourk_pager_populate(fourk_mem_obj, + TRUE, /* overwrite */ + fourk_pager_index_start + cur_idx, + object, + (object + ? (offset + + (cur_idx * FOURK_PAGE_SIZE)) + : 0), + &old_object, + &old_offset); +#if VM_MAP_DEBUG_FOURK + if (vm_map_debug_fourk) { + if (old_object == (vm_object_t) -1 && + old_offset == (vm_object_offset_t) -1) { + printf("FOURK_PAGER: map %p [0x%llx:0x%llx] " + "pager [%p:0x%llx] " + "populate[%d] " + "[object:%p,offset:0x%llx]\n", + map, + (uint64_t) entry->vme_start, + (uint64_t) entry->vme_end, + fourk_mem_obj, + VME_OFFSET(entry), + fourk_pager_index_start + cur_idx, + object, + (object + ? (offset + (cur_idx * FOURK_PAGE_SIZE)) + : 0)); + } else { + printf("FOURK_PAGER: map %p [0x%llx:0x%llx] " + "pager [%p:0x%llx] " + "populate[%d] [object:%p,offset:0x%llx] " + "old [%p:0x%llx]\n", + map, + (uint64_t) entry->vme_start, + (uint64_t) entry->vme_end, + fourk_mem_obj, + VME_OFFSET(entry), + fourk_pager_index_start + cur_idx, + object, + (object + ? (offset + (cur_idx * FOURK_PAGE_SIZE)) + : 0), + old_object, + old_offset); } + } +#endif /* VM_MAP_DEBUG_FOURK */ - return kr; - - } else { - /* This is the case where we are going to map */ - /* an already mapped object. If the object is */ - /* not ready it is internal. An external */ - /* object cannot be mapped until it is ready */ - /* we can therefore avoid the ready check */ - /* in this case. */ - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - offset_in_mapping = offset - vm_object_trunc_page(offset); - offset = vm_object_trunc_page(offset); - map_size = vm_object_round_page(offset + offset_in_mapping + initial_size) - offset; - } - - object = named_entry->backing.object; - assert(object != VM_OBJECT_NULL); - named_entry_unlock(named_entry); + assert(kr == KERN_SUCCESS); + if (object != old_object && + object != VM_OBJECT_NULL && + object != (vm_object_t) -1) { vm_object_reference(object); } - } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) { - /* - * JMM - This is temporary until we unify named entries - * and raw memory objects. - * - * Detected fake ip_kotype for a memory object. In - * this case, the port isn't really a port at all, but - * instead is just a raw memory object. 
- */ - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object."); + if (object != old_object && + old_object != VM_OBJECT_NULL && + old_object != (vm_object_t) -1) { + vm_object_deallocate(old_object); } + } - object = vm_object_enter((memory_object_t)port, - size, FALSE, FALSE, FALSE); - if (object == VM_OBJECT_NULL) - return KERN_INVALID_OBJECT; +BailOut: + assert(map_locked == TRUE); - /* wait for object (if any) to be ready */ - if (object != VM_OBJECT_NULL) { - if (object == kernel_object) { - printf("Warning: Attempt to map kernel object" - " by a non-private kernel entity\n"); - return KERN_INVALID_OBJECT; - } - if (!object->pager_ready) { - vm_object_lock(object); + if (result == KERN_SUCCESS) { + vm_prot_t pager_prot; + memory_object_t pager; - while (!object->pager_ready) { - vm_object_wait(object, - VM_OBJECT_EVENT_PAGER_READY, - THREAD_UNINT); - vm_object_lock(object); - } - vm_object_unlock(object); - } +#if DEBUG + if (pmap_empty && + !(vmk_flags.vmkf_no_pmap_check)) { + assert(vm_map_pmap_is_empty(map, + *address, + *address + size)); } - } else { - return KERN_INVALID_OBJECT; - } - - if (object != VM_OBJECT_NULL && - object->named && - object->pager != MEMORY_OBJECT_NULL && - object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { - memory_object_t pager; - vm_prot_t pager_prot; - kern_return_t kr; +#endif /* DEBUG */ /* * For "named" VM objects, let the pager know that the @@ -2972,204 +3762,247 @@ vm_map_enter_mem_object_helper( * memory_object_last_unmap() when all the mappings are gone. */ pager_prot = max_protection; - if (copy) { + if (needs_copy) { /* - * Copy-On-Write mapping: won't modify the - * memory object. - */ + * Copy-On-Write mapping: won't modify + * the memory object. 
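 *
 * (the pager is only told about protections this mapping can apply
 *  to its memory object: a copy-on-write mapping never writes the
 *  pager's pages, since modified pages land in a shadow object, so
 *  VM_PROT_WRITE is stripped from what memory_object_map() will see.)
 *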
+ */ pager_prot &= ~VM_PROT_WRITE; } - vm_object_lock(object); - pager = object->pager; - if (object->named && - pager != MEMORY_OBJECT_NULL && - object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { - assert(object->pager_ready); - vm_object_mapping_wait(object, THREAD_UNINT); - vm_object_mapping_begin(object); + if (!is_submap && + object != VM_OBJECT_NULL && + object->named && + object->pager != MEMORY_OBJECT_NULL) { + vm_object_lock(object); + pager = object->pager; + if (object->named && + pager != MEMORY_OBJECT_NULL) { + assert(object->pager_ready); + vm_object_mapping_wait(object, THREAD_UNINT); + vm_object_mapping_begin(object); + vm_object_unlock(object); + + kr = memory_object_map(pager, pager_prot); + assert(kr == KERN_SUCCESS); + + vm_object_lock(object); + vm_object_mapping_end(object); + } vm_object_unlock(object); + } + if (!is_submap && + fourk_object != VM_OBJECT_NULL && + fourk_object->named && + fourk_object->pager != MEMORY_OBJECT_NULL) { + vm_object_lock(fourk_object); + pager = fourk_object->pager; + if (fourk_object->named && + pager != MEMORY_OBJECT_NULL) { + assert(fourk_object->pager_ready); + vm_object_mapping_wait(fourk_object, + THREAD_UNINT); + vm_object_mapping_begin(fourk_object); + vm_object_unlock(fourk_object); - kr = memory_object_map(pager, pager_prot); - assert(kr == KERN_SUCCESS); + kr = memory_object_map(pager, VM_PROT_READ); + assert(kr == KERN_SUCCESS); - vm_object_lock(object); - vm_object_mapping_end(object); + vm_object_lock(fourk_object); + vm_object_mapping_end(fourk_object); + } + vm_object_unlock(fourk_object); } - vm_object_unlock(object); } - /* - * Perform the copy if requested - */ + if (fourk_object != VM_OBJECT_NULL) { + vm_object_deallocate(fourk_object); + fourk_object = VM_OBJECT_NULL; + memory_object_deallocate(fourk_mem_obj); + fourk_mem_obj = MEMORY_OBJECT_NULL; + } - if (copy) { - vm_object_t new_object; - vm_object_offset_t new_offset; + assert(map_locked == TRUE); - result = vm_object_copy_strategically(object, offset, size, - &new_object, &new_offset, - ©); + if (!keep_map_locked) { + vm_map_unlock(map); + map_locked = FALSE; + } + /* + * We can't hold the map lock if we enter this block. + */ - if (result == KERN_MEMORY_RESTART_COPY) { - boolean_t success; - boolean_t src_needs_copy; + if (result == KERN_SUCCESS) { + /* Wire down the new entry if the user + * requested all new map entries be wired. + */ + if ((map->wiring_required) || (superpage_size)) { + assert(!keep_map_locked); + pmap_empty = FALSE; /* pmap won't be empty */ + kr = vm_map_wire_kernel(map, start, end, + new_entry->protection, VM_KERN_MEMORY_MLOCK, + TRUE); + result = kr; + } + + } + if (result != KERN_SUCCESS) { + if (new_mapping_established) { /* - * XXX - * We currently ignore src_needs_copy. - * This really is the issue of how to make - * MEMORY_OBJECT_COPY_SYMMETRIC safe for - * non-kernel users to use. Solution forthcoming. - * In the meantime, since we don't allow non-kernel - * memory managers to specify symmetric copy, - * we won't run into problems here. + * We have to get rid of the new mappings since we + * won't make them available to the user. + * Try and do that atomically, to minimize the risk + * that someone else create new mappings that range. 
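 *
 * ("that range" being [*address, *address + size): the half-built
 *  entries are deleted into a throwaway zap_new_map with the map
 *  lock held, so no other thread can slip a new mapping into the
 *  hole before the cleanup finishes.)
 *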
*/ - new_object = object; - new_offset = offset; - success = vm_object_copy_quickly(&new_object, - new_offset, size, - &src_needs_copy, - ©); - assert(success); - result = KERN_SUCCESS; + zap_new_map = vm_map_create(PMAP_NULL, + *address, + *address + size, + map->hdr.entries_pageable); + vm_map_set_page_shift(zap_new_map, + VM_MAP_PAGE_SHIFT(map)); + vm_map_disable_hole_optimization(zap_new_map); + + if (!map_locked) { + vm_map_lock(map); + map_locked = TRUE; + } + (void) vm_map_delete(map, *address, *address + size, + (VM_MAP_REMOVE_SAVE_ENTRIES | + VM_MAP_REMOVE_NO_MAP_ALIGN), + zap_new_map); } - /* - * Throw away the reference to the - * original object, as it won't be mapped. - */ + if (zap_old_map != VM_MAP_NULL && + zap_old_map->hdr.nentries != 0) { + vm_map_entry_t entry1, entry2; - vm_object_deallocate(object); + /* + * The new mapping failed. Attempt to restore + * the old mappings, saved in the "zap_old_map". + */ + if (!map_locked) { + vm_map_lock(map); + map_locked = TRUE; + } - if (result != KERN_SUCCESS) - return result; + /* first check if the coast is still clear */ + start = vm_map_first_entry(zap_old_map)->vme_start; + end = vm_map_last_entry(zap_old_map)->vme_end; + if (vm_map_lookup_entry(map, start, &entry1) || + vm_map_lookup_entry(map, end, &entry2) || + entry1 != entry2) { + /* + * Part of that range has already been + * re-mapped: we can't restore the old + * mappings... + */ + vm_map_enter_restore_failures++; + } else { + /* + * Transfer the saved map entries from + * "zap_old_map" to the original "map", + * inserting them all after "entry1". + */ + for (entry2 = vm_map_first_entry(zap_old_map); + entry2 != vm_map_to_entry(zap_old_map); + entry2 = vm_map_first_entry(zap_old_map)) { + vm_map_size_t entry_size; - object = new_object; - offset = new_offset; + entry_size = (entry2->vme_end - + entry2->vme_start); + vm_map_store_entry_unlink(zap_old_map, + entry2); + zap_old_map->size -= entry_size; + vm_map_store_entry_link(map, entry1, entry2, + VM_MAP_KERNEL_FLAGS_NONE); + map->size += entry_size; + entry1 = entry2; + } + if (map->wiring_required) { + /* + * XXX TODO: we should rewire the + * old pages here... + */ + } + vm_map_enter_restore_successes++; + } + } } /* - * If users want to try to prefault pages, the mapping and prefault - * needs to be atomic. + * The caller is responsible for releasing the lock if it requested to + * keep the map locked. */ - if (try_prefault) - flags |= VM_FLAGS_KEEP_MAP_LOCKED; - result = vm_map_enter(target_map, - &map_addr, map_size, - (vm_map_offset_t)mask, - flags, - object, offset, - copy, - cur_protection, max_protection, inheritance); - if (result != KERN_SUCCESS) - vm_object_deallocate(object); + if (map_locked && !keep_map_locked) { + vm_map_unlock(map); + } /* - * Try to prefault, and do not forget to release the vm map lock. + * Get rid of the "zap_maps" and all the map entries that + * they may still contain. */ - if (result == KERN_SUCCESS && try_prefault) { - mach_vm_address_t va = map_addr; - kern_return_t kr = KERN_SUCCESS; - unsigned int i = 0; - - for (i = 0; i < page_list_count; ++i) { - if (UPL_VALID_PAGE(page_list, i)) { - /* - * If this function call failed, we should stop - * trying to optimize, other calls are likely - * going to fail too. - * - * We are not gonna report an error for such - * failure though. That's an optimization, not - * something critical. 
- */ - kr = pmap_enter_options(target_map->pmap, - va, UPL_PHYS_PAGE(page_list, i), - cur_protection, VM_PROT_NONE, - 0, TRUE, PMAP_OPTIONS_NOWAIT, NULL); - if (kr != KERN_SUCCESS) { - OSIncrementAtomic64(&vm_prefault_nb_bailout); - goto BailOut; - } - OSIncrementAtomic64(&vm_prefault_nb_pages); - } - - /* Next virtual address */ - va += PAGE_SIZE; - } -BailOut: - vm_map_unlock(target_map); + if (zap_old_map != VM_MAP_NULL) { + vm_map_destroy(zap_old_map, VM_MAP_REMOVE_NO_PMAP_CLEANUP); + zap_old_map = VM_MAP_NULL; } - - if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - *address = map_addr + offset_in_mapping; - } else { - *address = map_addr; + if (zap_new_map != VM_MAP_NULL) { + vm_map_destroy(zap_new_map, VM_MAP_REMOVE_NO_PMAP_CLEANUP); + zap_new_map = VM_MAP_NULL; } + return result; -} -kern_return_t -vm_map_enter_mem_object( - vm_map_t target_map, - vm_map_offset_t *address, - vm_map_size_t initial_size, - vm_map_offset_t mask, - int flags, - ipc_port_t port, - vm_object_offset_t offset, - boolean_t copy, - vm_prot_t cur_protection, - vm_prot_t max_protection, - vm_inherit_t inheritance) -{ - return vm_map_enter_mem_object_helper(target_map, address, initial_size, mask, flags, - port, offset, copy, cur_protection, max_protection, - inheritance, NULL, 0); +#undef RETURN } +#endif /* __arm64__ */ -kern_return_t -vm_map_enter_mem_object_prefault( - vm_map_t target_map, - vm_map_offset_t *address, - vm_map_size_t initial_size, - vm_map_offset_t mask, - int flags, - ipc_port_t port, - vm_object_offset_t offset, - vm_prot_t cur_protection, - vm_prot_t max_protection, - upl_page_list_ptr_t page_list, - unsigned int page_list_count) +/* + * Counters for the prefault optimization. + */ +int64_t vm_prefault_nb_pages = 0; +int64_t vm_prefault_nb_bailout = 0; + +static kern_return_t +vm_map_enter_mem_object_helper( + vm_map_t target_map, + vm_map_offset_t *address, + vm_map_size_t initial_size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + ipc_port_t port, + vm_object_offset_t offset, + boolean_t copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance, + upl_page_list_ptr_t page_list, + unsigned int page_list_count) { - return vm_map_enter_mem_object_helper(target_map, address, initial_size, mask, flags, - port, offset, FALSE, cur_protection, max_protection, - VM_INHERIT_DEFAULT, page_list, page_list_count); -} + vm_map_address_t map_addr; + vm_map_size_t map_size; + vm_object_t object; + vm_object_size_t size; + kern_return_t result; + boolean_t mask_cur_protection, mask_max_protection; + boolean_t kernel_prefault, try_prefault = (page_list_count != 0); + vm_map_offset_t offset_in_mapping = 0; +#if __arm64__ + boolean_t fourk = vmk_flags.vmkf_fourk; +#endif /* __arm64__ */ + if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) { + /* XXX TODO4K prefaulting depends on page size... 
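 *
 * (the UPL page list below is expressed in PAGE_SIZE frames, so the
 *  best-effort prefault loop is simply skipped when the target map
 *  uses a smaller page size.)
 *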
*/ + try_prefault = FALSE; + } -kern_return_t -vm_map_enter_mem_object_control( - vm_map_t target_map, - vm_map_offset_t *address, - vm_map_size_t initial_size, - vm_map_offset_t mask, - int flags, - memory_object_control_t control, - vm_object_offset_t offset, - boolean_t copy, - vm_prot_t cur_protection, - vm_prot_t max_protection, - vm_inherit_t inheritance) -{ - vm_map_address_t map_addr; - vm_map_size_t map_size; - vm_object_t object; - vm_object_size_t size; - kern_return_t result; - memory_object_t pager; - vm_prot_t pager_prot; - kern_return_t kr; + assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused); + + mask_cur_protection = cur_protection & VM_PROT_IS_MASK; + mask_max_protection = max_protection & VM_PROT_IS_MASK; + cur_protection &= ~VM_PROT_IS_MASK; + max_protection &= ~VM_PROT_IS_MASK; /* * Check arguments for validity @@ -3178,9702 +4011,13748 @@ vm_map_enter_mem_object_control( (cur_protection & ~VM_PROT_ALL) || (max_protection & ~VM_PROT_ALL) || (inheritance > VM_INHERIT_LAST_VALID) || - initial_size == 0) + (try_prefault && (copy || !page_list)) || + initial_size == 0) { return KERN_INVALID_ARGUMENT; - - map_addr = vm_map_trunc_page(*address, - VM_MAP_PAGE_MASK(target_map)); - map_size = vm_map_round_page(initial_size, - VM_MAP_PAGE_MASK(target_map)); - size = vm_object_round_page(initial_size); - - object = memory_object_control_to_vm_object(control); - - if (object == VM_OBJECT_NULL) - return KERN_INVALID_OBJECT; - - if (object == kernel_object) { - printf("Warning: Attempt to map kernel object" - " by a non-private kernel entity\n"); - return KERN_INVALID_OBJECT; } - vm_object_lock(object); - object->ref_count++; - vm_object_res_reference(object); +#if __arm64__ + if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) { + /* no "fourk" if map is using a sub-page page size */ + fourk = FALSE; + } + if (fourk) { + map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK); + map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK); + } else +#endif /* __arm64__ */ + { + map_addr = vm_map_trunc_page(*address, + VM_MAP_PAGE_MASK(target_map)); + map_size = vm_map_round_page(initial_size, + VM_MAP_PAGE_MASK(target_map)); + } + size = vm_object_round_page(initial_size); /* - * For "named" VM objects, let the pager know that the - * memory object is being mapped. Some pagers need to keep - * track of this, to know when they can reclaim the memory - * object, for example. - * VM calls memory_object_map() for each mapping (specifying - * the protection of each mapping) and calls - * memory_object_last_unmap() when all the mappings are gone. + * Find the vm object (if any) corresponding to this port. 
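 *
 * Such a port normally reaches this helper through mach_vm_map().
 * A minimal user-space sketch of that producer side (illustrative
 * only; standard Mach calls, error handling omitted):
 *
 *	memory_object_size_t sz = 0x4000;
 *	mach_port_t          mementry = MACH_PORT_NULL;
 *	mach_vm_address_t    addr = 0;
 *
 *	mach_make_memory_entry_64(mach_task_self(), &sz, 0,
 *	    MAP_MEM_NAMED_CREATE | VM_PROT_READ | VM_PROT_WRITE,
 *	    &mementry, MACH_PORT_NULL);
 *	mach_vm_map(mach_task_self(), &addr, sz, 0, VM_FLAGS_ANYWHERE,
 *	    mementry, 0, FALSE, VM_PROT_DEFAULT, VM_PROT_ALL,
 *	    VM_INHERIT_DEFAULT);
 *
 * which arrives here as an IKOT_NAMED_ENTRY port; IKOT_MEMORY_OBJECT
 * covers the legacy case of a raw memory-object port.
 *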
*/ - pager_prot = max_protection; - if (copy) { - pager_prot &= ~VM_PROT_WRITE; - } - pager = object->pager; - if (object->named && - pager != MEMORY_OBJECT_NULL && - object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { - assert(object->pager_ready); - vm_object_mapping_wait(object, THREAD_UNINT); - vm_object_mapping_begin(object); - vm_object_unlock(object); - - kr = memory_object_map(pager, pager_prot); - assert(kr == KERN_SUCCESS); - - vm_object_lock(object); - vm_object_mapping_end(object); - } - vm_object_unlock(object); + if (!IP_VALID(port)) { + object = VM_OBJECT_NULL; + offset = 0; + copy = FALSE; + } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) { + vm_named_entry_t named_entry; + vm_object_offset_t data_offset; - /* - * Perform the copy if requested - */ + named_entry = (vm_named_entry_t) ip_get_kobject(port); - if (copy) { - vm_object_t new_object; - vm_object_offset_t new_offset; + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + data_offset = named_entry->data_offset; + offset += named_entry->data_offset; + } else { + data_offset = 0; + } - result = vm_object_copy_strategically(object, offset, size, - &new_object, &new_offset, - ©); + /* a few checks to make sure user is obeying rules */ + if (size == 0) { + if (offset >= named_entry->size) { + return KERN_INVALID_RIGHT; + } + size = named_entry->size - offset; + } + if (mask_max_protection) { + max_protection &= named_entry->protection; + } + if (mask_cur_protection) { + cur_protection &= named_entry->protection; + } + if ((named_entry->protection & max_protection) != + max_protection) { + return KERN_INVALID_RIGHT; + } + if ((named_entry->protection & cur_protection) != + cur_protection) { + return KERN_INVALID_RIGHT; + } + if (offset + size < offset) { + /* overflow */ + return KERN_INVALID_ARGUMENT; + } + if (named_entry->size < (offset + initial_size)) { + return KERN_INVALID_ARGUMENT; + } + if (named_entry->is_copy) { + /* for a vm_map_copy, we can only map it whole */ + if ((size != named_entry->size) && + (vm_map_round_page(size, + VM_MAP_PAGE_MASK(target_map)) == + named_entry->size)) { + /* XXX FBDP use the rounded size... */ + size = vm_map_round_page( + size, + VM_MAP_PAGE_MASK(target_map)); + } + } - if (result == KERN_MEMORY_RESTART_COPY) { - boolean_t success; - boolean_t src_needs_copy; + /* the callers parameter offset is defined to be the */ + /* offset from beginning of named entry offset in object */ + offset = offset + named_entry->offset; + if (!VM_MAP_PAGE_ALIGNED(size, + VM_MAP_PAGE_MASK(target_map))) { /* - * XXX - * We currently ignore src_needs_copy. - * This really is the issue of how to make - * MEMORY_OBJECT_COPY_SYMMETRIC safe for - * non-kernel users to use. Solution forthcoming. - * In the meantime, since we don't allow non-kernel - * memory managers to specify symmetric copy, - * we won't run into problems here. + * Let's not map more than requested; + * vm_map_enter() will handle this "not map-aligned" + * case. */ - new_object = object; - new_offset = offset; - success = vm_object_copy_quickly(&new_object, - new_offset, size, - &src_needs_copy, - ©); - assert(success); - result = KERN_SUCCESS; + map_size = size; } - /* - * Throw away the reference to the - * original object, as it won't be mapped. 
- */ - vm_object_deallocate(object); + named_entry_lock(named_entry); + if (named_entry->is_sub_map) { + vm_map_t submap; - if (result != KERN_SUCCESS) - return result; + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap."); + } - object = new_object; - offset = new_offset; - } + submap = named_entry->backing.map; + vm_map_reference(submap); + named_entry_unlock(named_entry); - result = vm_map_enter(target_map, - &map_addr, map_size, - (vm_map_offset_t)mask, - flags, - object, offset, - copy, - cur_protection, max_protection, inheritance); - if (result != KERN_SUCCESS) - vm_object_deallocate(object); - *address = map_addr; + vmk_flags.vmkf_submap = TRUE; - return result; -} + result = vm_map_enter(target_map, + &map_addr, + map_size, + mask, + flags, + vmk_flags, + tag, + (vm_object_t)(uintptr_t) submap, + offset, + copy, + cur_protection, + max_protection, + inheritance); + if (result != KERN_SUCCESS) { + vm_map_deallocate(submap); + } else { + /* + * No need to lock "submap" just to check its + * "mapped" flag: that flag is never reset + * once it's been set and if we race, we'll + * just end up setting it twice, which is OK. + */ + if (submap->mapped_in_other_pmaps == FALSE && + vm_map_pmap(submap) != PMAP_NULL && + vm_map_pmap(submap) != + vm_map_pmap(target_map)) { + /* + * This submap is being mapped in a map + * that uses a different pmap. + * Set its "mapped_in_other_pmaps" flag + * to indicate that we now need to + * remove mappings from all pmaps rather + * than just the submap's pmap. + */ + vm_map_lock(submap); + submap->mapped_in_other_pmaps = TRUE; + vm_map_unlock(submap); + } + *address = map_addr; + } + return result; + } else if (named_entry->is_copy) { + kern_return_t kr; + vm_map_copy_t copy_map; + vm_map_entry_t copy_entry; + vm_map_offset_t copy_addr; + vm_map_copy_t target_copy_map; + vm_map_offset_t overmap_start, overmap_end; + vm_map_offset_t trimmed_start; + vm_map_size_t target_size; + if (flags & ~(VM_FLAGS_FIXED | + VM_FLAGS_ANYWHERE | + VM_FLAGS_OVERWRITE | + VM_FLAGS_RETURN_4K_DATA_ADDR | + VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_ALIAS_MASK)) { + named_entry_unlock(named_entry); + return KERN_INVALID_ARGUMENT; + } -#if VM_CPM + copy_map = named_entry->backing.copy; + assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST); + if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) { + /* unsupported type; should not happen */ + printf("vm_map_enter_mem_object: " + "memory_entry->backing.copy " + "unsupported type 0x%x\n", + copy_map->type); + named_entry_unlock(named_entry); + return KERN_INVALID_ARGUMENT; + } -#ifdef MACH_ASSERT -extern pmap_paddr_t avail_start, avail_end; -#endif + if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) { + DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map)); + } -/* - * Allocate memory in the specified map, with the caveat that - * the memory is physically contiguous. This call may fail - * if the system can't find sufficient contiguous memory. - * This call may cause or lead to heart-stopping amounts of - * paging activity. - * - * Memory obtained from this call should be freed in the - * normal way, viz., via vm_deallocate. 
- */ -kern_return_t -vm_map_enter_cpm( - vm_map_t map, - vm_map_offset_t *addr, - vm_map_size_t size, - int flags) -{ - vm_object_t cpm_obj; - pmap_t pmap; - vm_page_t m, pages; - kern_return_t kr; - vm_map_offset_t va, start, end, offset; -#if MACH_ASSERT - vm_map_offset_t prev_addr = 0; -#endif /* MACH_ASSERT */ - - boolean_t anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0); + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map); + if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) { + offset_in_mapping &= ~((signed)(0xFFF)); + } + } - if (size == 0) { - *addr = 0; - return KERN_SUCCESS; - } - if (anywhere) - *addr = vm_map_min(map); - else - *addr = vm_map_trunc_page(*addr, - VM_MAP_PAGE_MASK(map)); - size = vm_map_round_page(size, - VM_MAP_PAGE_MASK(map)); + target_copy_map = VM_MAP_COPY_NULL; + target_size = copy_map->size; + overmap_start = 0; + overmap_end = 0; + trimmed_start = 0; + if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) { + DEBUG4K_ADJUST("adjusting...\n"); + kr = vm_map_copy_adjust_to_target( + copy_map, + offset /* includes data_offset */, + initial_size, + target_map, + copy, + &target_copy_map, + &overmap_start, + &overmap_end, + &trimmed_start); + if (kr != KERN_SUCCESS) { + named_entry_unlock(named_entry); + return kr; + } + target_size = target_copy_map->size; + if (trimmed_start >= data_offset) { + data_offset = offset & VM_MAP_PAGE_MASK(target_map); + } else { + data_offset -= trimmed_start; + } + } else { + target_copy_map = copy_map; + } - /* - * LP64todo - cpm_allocate should probably allow - * allocations of >4GB, but not with the current - * algorithm, so just cast down the size for now. - */ - if (size > VM_MAX_ADDRESS) - return KERN_RESOURCE_SHORTAGE; - if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size), - &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) - return kr; + /* reserve a contiguous range */ + kr = vm_map_enter(target_map, + &map_addr, + vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)), + mask, + flags & (VM_FLAGS_ANYWHERE | + VM_FLAGS_OVERWRITE | + VM_FLAGS_RETURN_4K_DATA_ADDR | + VM_FLAGS_RETURN_DATA_ADDR), + vmk_flags, + tag, + VM_OBJECT_NULL, + 0, + FALSE, /* copy */ + cur_protection, + max_protection, + inheritance); + if (kr != KERN_SUCCESS) { + DEBUG4K_ERROR("kr 0x%x\n", kr); + if (target_copy_map != copy_map) { + vm_map_copy_discard(target_copy_map); + target_copy_map = VM_MAP_COPY_NULL; + } + named_entry_unlock(named_entry); + return kr; + } - cpm_obj = vm_object_allocate((vm_object_size_t)size); - assert(cpm_obj != VM_OBJECT_NULL); - assert(cpm_obj->internal); - assert(cpm_obj->vo_size == (vm_object_size_t)size); - assert(cpm_obj->can_persist == FALSE); - assert(cpm_obj->pager_created == FALSE); - assert(cpm_obj->pageout == FALSE); - assert(cpm_obj->shadow == VM_OBJECT_NULL); + copy_addr = map_addr; - /* - * Insert pages into object. 
- */ + for (copy_entry = vm_map_copy_first_entry(target_copy_map); + copy_entry != vm_map_copy_to_entry(target_copy_map); + copy_entry = copy_entry->vme_next) { + int remap_flags; + vm_map_kernel_flags_t vmk_remap_flags; + vm_map_t copy_submap; + vm_object_t copy_object; + vm_map_size_t copy_size; + vm_object_offset_t copy_offset; + int copy_vm_alias; + + remap_flags = 0; + vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE; + + copy_object = VME_OBJECT(copy_entry); + copy_offset = VME_OFFSET(copy_entry); + copy_size = (copy_entry->vme_end - + copy_entry->vme_start); + VM_GET_FLAGS_ALIAS(flags, copy_vm_alias); + if (copy_vm_alias == 0) { + /* + * Caller does not want a specific + * alias for this new mapping: use + * the alias of the original mapping. + */ + copy_vm_alias = VME_ALIAS(copy_entry); + } - vm_object_lock(cpm_obj); - for (offset = 0; offset < size; offset += PAGE_SIZE) { - m = pages; - pages = NEXT_PAGE(m); - *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; + /* sanity check */ + if ((copy_addr + copy_size) > + (map_addr + + overmap_start + overmap_end + + named_entry->size /* XXX full size */)) { + /* over-mapping too much !? */ + kr = KERN_INVALID_ARGUMENT; + DEBUG4K_ERROR("kr 0x%x\n", kr); + /* abort */ + break; + } - assert(!m->gobbled); - assert(!m->wanted); - assert(!m->pageout); - assert(!m->tabled); - assert(VM_PAGE_WIRED(m)); - /* - * ENCRYPTED SWAP: - * "m" is not supposed to be pageable, so it - * should not be encrypted. It wouldn't be safe - * to enter it in a new VM object while encrypted. - */ - ASSERT_PAGE_DECRYPTED(m); - assert(m->busy); - assert(m->phys_page>=(avail_start>>PAGE_SHIFT) && m->phys_page<=(avail_end>>PAGE_SHIFT)); + /* take a reference on the object */ + if (copy_entry->is_sub_map) { + vmk_remap_flags.vmkf_submap = TRUE; + copy_submap = VME_SUBMAP(copy_entry); + vm_map_lock(copy_submap); + vm_map_reference(copy_submap); + vm_map_unlock(copy_submap); + copy_object = (vm_object_t)(uintptr_t) copy_submap; + } else if (!copy && + copy_object != VM_OBJECT_NULL && + (copy_entry->needs_copy || + copy_object->shadowed || + (!copy_object->true_share && + !copy_entry->is_shared && + copy_object->vo_size > copy_size))) { + /* + * We need to resolve our side of this + * "symmetric" copy-on-write now; we + * need a new object to map and share, + * instead of the current one which + * might still be shared with the + * original mapping. + * + * Note: A "vm_map_copy_t" does not + * have a lock but we're protected by + * the named entry's lock here. 
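 *
 * "Resolving our side" means: push a shadow object onto the copy
 * entry (VME_OBJECT_SHADOW), strip write access from any existing
 * physical mappings of the original pages (vm_object_pmap_protect),
 * then map the entry shared with its object switched to
 * MEMORY_OBJECT_COPY_DELAY so later mappings do not trigger another
 * symmetric copy.
 *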
+ */ + // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); + VME_OBJECT_SHADOW(copy_entry, copy_size); + if (!copy_entry->needs_copy && + copy_entry->protection & VM_PROT_WRITE) { + vm_prot_t prot; + + prot = copy_entry->protection & ~VM_PROT_WRITE; + vm_object_pmap_protect(copy_object, + copy_offset, + copy_size, + PMAP_NULL, + PAGE_SIZE, + 0, + prot); + } - m->busy = FALSE; - vm_page_insert(m, cpm_obj, offset); - } - assert(cpm_obj->resident_page_count == size / PAGE_SIZE); - vm_object_unlock(cpm_obj); + copy_entry->needs_copy = FALSE; + copy_entry->is_shared = TRUE; + copy_object = VME_OBJECT(copy_entry); + copy_offset = VME_OFFSET(copy_entry); + vm_object_lock(copy_object); + vm_object_reference_locked(copy_object); + if (copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { + /* we're about to make a shared mapping of this object */ + copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + copy_object->true_share = TRUE; + } + vm_object_unlock(copy_object); + } else { + /* + * We already have the right object + * to map. + */ + copy_object = VME_OBJECT(copy_entry); + vm_object_reference(copy_object); + } - /* - * Hang onto a reference on the object in case a - * multi-threaded application for some reason decides - * to deallocate the portion of the address space into - * which we will insert this object. - * - * Unfortunately, we must insert the object now before - * we can talk to the pmap module about which addresses - * must be wired down. Hence, the race with a multi- - * threaded app. - */ - vm_object_reference(cpm_obj); + /* over-map the object into destination */ + remap_flags |= flags; + remap_flags |= VM_FLAGS_FIXED; + remap_flags |= VM_FLAGS_OVERWRITE; + remap_flags &= ~VM_FLAGS_ANYWHERE; + if (!copy && !copy_entry->is_sub_map) { + /* + * copy-on-write should have been + * resolved at this point, or we would + * end up sharing instead of copying. + */ + assert(!copy_entry->needs_copy); + } +#if XNU_TARGET_OS_OSX + if (copy_entry->used_for_jit) { + vmk_remap_flags.vmkf_map_jit = TRUE; + } +#endif /* XNU_TARGET_OS_OSX */ - /* - * Insert object into map. - */ + assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias, + "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK)); + kr = vm_map_enter(target_map, + ©_addr, + copy_size, + (vm_map_offset_t) 0, + remap_flags, + vmk_remap_flags, + (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/ + copy_object, + copy_offset, + ((copy_object == NULL) + ? FALSE + : (copy || copy_entry->needs_copy)), + cur_protection, + max_protection, + inheritance); + if (kr != KERN_SUCCESS) { + DEBUG4K_SHARE("failed kr 0x%x\n", kr); + if (copy_entry->is_sub_map) { + vm_map_deallocate(copy_submap); + } else { + vm_object_deallocate(copy_object); + } + /* abort */ + break; + } - kr = vm_map_enter( - map, - addr, - size, - (vm_map_offset_t)0, - flags, - cpm_obj, - (vm_object_offset_t)0, - FALSE, - VM_PROT_ALL, - VM_PROT_ALL, - VM_INHERIT_DEFAULT); + /* next mapping */ + copy_addr += copy_size; + } - if (kr != KERN_SUCCESS) { - /* - * A CPM object doesn't have can_persist set, - * so all we have to do is deallocate it to - * free up these pages. 
- */ - assert(cpm_obj->pager_created == FALSE); - assert(cpm_obj->can_persist == FALSE); - assert(cpm_obj->pageout == FALSE); - assert(cpm_obj->shadow == VM_OBJECT_NULL); - vm_object_deallocate(cpm_obj); /* kill acquired ref */ - vm_object_deallocate(cpm_obj); /* kill creation ref */ - } + if (kr == KERN_SUCCESS) { + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + *address = map_addr + offset_in_mapping; + } else { + *address = map_addr; + } + if (overmap_start) { + *address += overmap_start; + DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address); + } + } + named_entry_unlock(named_entry); + if (target_copy_map != copy_map) { + vm_map_copy_discard(target_copy_map); + target_copy_map = VM_MAP_COPY_NULL; + } - /* - * Inform the physical mapping system that the - * range of addresses may not fault, so that - * page tables and such can be locked down as well. - */ - start = *addr; - end = start + size; - pmap = vm_map_pmap(map); - pmap_pageable(pmap, start, end, FALSE); + if (kr != KERN_SUCCESS) { + if (!(flags & VM_FLAGS_OVERWRITE)) { + /* deallocate the contiguous range */ + (void) vm_deallocate(target_map, + map_addr, + map_size); + } + } - /* - * Enter each page into the pmap, to avoid faults. - * Note that this loop could be coded more efficiently, - * if the need arose, rather than looking up each page - * again. - */ - for (offset = 0, va = start; offset < size; - va += PAGE_SIZE, offset += PAGE_SIZE) { - int type_of_fault; + return kr; + } - vm_object_lock(cpm_obj); - m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset); - assert(m != VM_PAGE_NULL); + if (named_entry->is_object) { + unsigned int access; + vm_prot_t protections; + unsigned int wimg_mode; - vm_page_zero_fill(m); + /* we are mapping a VM object */ - type_of_fault = DBG_ZERO_FILL_FAULT; + protections = named_entry->protection & VM_PROT_ALL; + access = GET_MAP_MEM(named_entry->protection); - vm_fault_enter(m, pmap, va, VM_PROT_ALL, VM_PROT_WRITE, - VM_PAGE_WIRED(m), FALSE, FALSE, FALSE, 0, NULL, - &type_of_fault); + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map)); + if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) { + offset_in_mapping &= ~((signed)(0xFFF)); + } + offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map)); + map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map)); + } - vm_object_unlock(cpm_obj); - } + object = vm_named_entry_to_vm_object(named_entry); + assert(object != VM_OBJECT_NULL); + vm_object_lock(object); + named_entry_unlock(named_entry); -#if MACH_ASSERT - /* - * Verify ordering in address space. 
- */ - for (offset = 0; offset < size; offset += PAGE_SIZE) { - vm_object_lock(cpm_obj); - m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset); - vm_object_unlock(cpm_obj); - if (m == VM_PAGE_NULL) - panic("vm_allocate_cpm: obj %p off 0x%llx no page", - cpm_obj, (uint64_t)offset); - assert(m->tabled); - assert(!m->busy); - assert(!m->wanted); - assert(!m->fictitious); - assert(!m->private); - assert(!m->absent); - assert(!m->error); - assert(!m->cleaning); - assert(!m->laundry); - assert(!m->precious); - assert(!m->clustered); - if (offset != 0) { - if (m->phys_page != prev_addr + 1) { - printf("start 0x%llx end 0x%llx va 0x%llx\n", - (uint64_t)start, (uint64_t)end, (uint64_t)va); - printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset); - printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr); - panic("vm_allocate_cpm: pages not contig!"); + vm_object_reference_locked(object); + + wimg_mode = object->wimg_bits; + vm_prot_to_wimg(access, &wimg_mode); + if (object->wimg_bits != wimg_mode) { + vm_object_change_wimg_mode(object, wimg_mode); + } + + vm_object_unlock(object); + } else { + panic("invalid VM named entry %p", named_entry); + } + } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) { + /* + * JMM - This is temporary until we unify named entries + * and raw memory objects. + * + * Detected fake ip_kotype for a memory object. In + * this case, the port isn't really a port at all, but + * instead is just a raw memory object. + */ + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object."); + } + + object = memory_object_to_vm_object((memory_object_t)port); + if (object == VM_OBJECT_NULL) { + return KERN_INVALID_OBJECT; + } + vm_object_reference(object); + + /* wait for object (if any) to be ready */ + if (object != VM_OBJECT_NULL) { + if (object == kernel_object) { + printf("Warning: Attempt to map kernel object" + " by a non-private kernel entity\n"); + return KERN_INVALID_OBJECT; + } + if (!object->pager_ready) { + vm_object_lock(object); + + while (!object->pager_ready) { + vm_object_wait(object, + VM_OBJECT_EVENT_PAGER_READY, + THREAD_UNINT); + vm_object_lock(object); + } + vm_object_unlock(object); } } - prev_addr = m->phys_page; + } else { + return KERN_INVALID_OBJECT; } -#endif /* MACH_ASSERT */ - vm_object_deallocate(cpm_obj); /* kill extra ref */ + if (object != VM_OBJECT_NULL && + object->named && + object->pager != MEMORY_OBJECT_NULL && + object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { + memory_object_t pager; + vm_prot_t pager_prot; + kern_return_t kr; - return kr; -} + /* + * For "named" VM objects, let the pager know that the + * memory object is being mapped. Some pagers need to keep + * track of this, to know when they can reclaim the memory + * object, for example. + * VM calls memory_object_map() for each mapping (specifying + * the protection of each mapping) and calls + * memory_object_last_unmap() when all the mappings are gone. + */ + pager_prot = max_protection; + if (copy) { + /* + * Copy-On-Write mapping: won't modify the + * memory object. 
+ */ + pager_prot &= ~VM_PROT_WRITE; + } + vm_object_lock(object); + pager = object->pager; + if (object->named && + pager != MEMORY_OBJECT_NULL && + object->copy_strategy != MEMORY_OBJECT_COPY_NONE) { + assert(object->pager_ready); + vm_object_mapping_wait(object, THREAD_UNINT); + vm_object_mapping_begin(object); + vm_object_unlock(object); + kr = memory_object_map(pager, pager_prot); + assert(kr == KERN_SUCCESS); -#else /* VM_CPM */ + vm_object_lock(object); + vm_object_mapping_end(object); + } + vm_object_unlock(object); + } -/* - * Interface is defined in all cases, but unless the kernel - * is built explicitly for this option, the interface does - * nothing. - */ + /* + * Perform the copy if requested + */ -kern_return_t -vm_map_enter_cpm( - __unused vm_map_t map, - __unused vm_map_offset_t *addr, - __unused vm_map_size_t size, - __unused int flags) -{ - return KERN_FAILURE; -} -#endif /* VM_CPM */ + if (copy) { + vm_object_t new_object; + vm_object_offset_t new_offset; -/* Not used without nested pmaps */ -#ifndef NO_NESTED_PMAP -/* - * Clip and unnest a portion of a nested submap mapping. - */ + result = vm_object_copy_strategically(object, offset, + map_size, + &new_object, &new_offset, + ©); -static void -vm_map_clip_unnest( - vm_map_t map, - vm_map_entry_t entry, - vm_map_offset_t start_unnest, - vm_map_offset_t end_unnest) -{ - vm_map_offset_t old_start_unnest = start_unnest; - vm_map_offset_t old_end_unnest = end_unnest; + if (result == KERN_MEMORY_RESTART_COPY) { + boolean_t success; + boolean_t src_needs_copy; - assert(entry->is_sub_map); - assert(entry->object.sub_map != NULL); - assert(entry->use_pmap); + /* + * XXX + * We currently ignore src_needs_copy. + * This really is the issue of how to make + * MEMORY_OBJECT_COPY_SYMMETRIC safe for + * non-kernel users to use. Solution forthcoming. + * In the meantime, since we don't allow non-kernel + * memory managers to specify symmetric copy, + * we won't run into problems here. + */ + new_object = object; + new_offset = offset; + success = vm_object_copy_quickly(&new_object, + new_offset, + map_size, + &src_needs_copy, + ©); + assert(success); + result = KERN_SUCCESS; + } + /* + * Throw away the reference to the + * original object, as it won't be mapped. + */ + + vm_object_deallocate(object); + + if (result != KERN_SUCCESS) { + return result; + } + + object = new_object; + offset = new_offset; + } /* - * Query the platform for the optimal unnest range. - * DRK: There's some duplication of effort here, since - * callers may have adjusted the range to some extent. This - * routine was introduced to support 1GiB subtree nesting - * for x86 platforms, which can also nest on 2MiB boundaries - * depending on size/alignment. + * If non-kernel users want to try to prefault pages, the mapping and prefault + * needs to be atomic. 
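 *
 * (vmkf_keep_map_locked makes vm_map_enter() return with the map
 *  lock still held, so the prefault loop below populates exactly
 *  the mapping that was just created.  Non-kernel prefaults use
 *  PMAP_OPTIONS_NOWAIT and are purely best-effort, only bumping
 *  vm_prefault_nb_pages / vm_prefault_nb_bailout; kernel prefaults
 *  treat any failure as an error for the whole call.)
 *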
*/ - if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) { - log_unnest_badness(map, old_start_unnest, old_end_unnest); + kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map)); + vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault); + +#if __arm64__ + if (fourk) { + /* map this object in a "4K" pager */ + result = vm_map_enter_fourk(target_map, + &map_addr, + map_size, + (vm_map_offset_t) mask, + flags, + vmk_flags, + tag, + object, + offset, + copy, + cur_protection, + max_protection, + inheritance); + } else +#endif /* __arm64__ */ + { + result = vm_map_enter(target_map, + &map_addr, map_size, + (vm_map_offset_t)mask, + flags, + vmk_flags, + tag, + object, offset, + copy, + cur_protection, max_protection, + inheritance); } - - if (entry->vme_start > start_unnest || - entry->vme_end < end_unnest) { - panic("vm_map_clip_unnest(0x%llx,0x%llx): " - "bad nested entry: start=0x%llx end=0x%llx\n", - (long long)start_unnest, (long long)end_unnest, - (long long)entry->vme_start, (long long)entry->vme_end); + if (result != KERN_SUCCESS) { + vm_object_deallocate(object); } - if (start_unnest > entry->vme_start) { - _vm_map_clip_start(&map->hdr, - entry, - start_unnest); - vm_map_store_update_first_free(map, map->first_free); + /* + * Try to prefault, and do not forget to release the vm map lock. + */ + if (result == KERN_SUCCESS && try_prefault) { + mach_vm_address_t va = map_addr; + kern_return_t kr = KERN_SUCCESS; + unsigned int i = 0; + int pmap_options; + + pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT; + if (object->internal) { + pmap_options |= PMAP_OPTIONS_INTERNAL; + } + + for (i = 0; i < page_list_count; ++i) { + if (!UPL_VALID_PAGE(page_list, i)) { + if (kernel_prefault) { + assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE"); + result = KERN_MEMORY_ERROR; + break; + } + } else { + /* + * If this function call failed, we should stop + * trying to optimize, other calls are likely + * going to fail too. + * + * We are not gonna report an error for such + * failure though. That's an optimization, not + * something critical. 
+ */ + kr = pmap_enter_options(target_map->pmap, + va, UPL_PHYS_PAGE(page_list, i), + cur_protection, VM_PROT_NONE, + 0, TRUE, pmap_options, NULL); + if (kr != KERN_SUCCESS) { + OSIncrementAtomic64(&vm_prefault_nb_bailout); + if (kernel_prefault) { + result = kr; + } + break; + } + OSIncrementAtomic64(&vm_prefault_nb_pages); + } + + /* Next virtual address */ + va += PAGE_SIZE; + } + if (vmk_flags.vmkf_keep_map_locked) { + vm_map_unlock(target_map); + } } - if (entry->vme_end > end_unnest) { - _vm_map_clip_end(&map->hdr, - entry, - end_unnest); - vm_map_store_update_first_free(map, map->first_free); + + if (flags & (VM_FLAGS_RETURN_DATA_ADDR | + VM_FLAGS_RETURN_4K_DATA_ADDR)) { + *address = map_addr + offset_in_mapping; + } else { + *address = map_addr; } + return result; +} - pmap_unnest(map->pmap, - entry->vme_start, - entry->vme_end - entry->vme_start); - if ((map->mapped_in_other_pmaps) && (map->ref_count)) { - /* clean up parent map/maps */ - vm_map_submap_pmap_clean( - map, entry->vme_start, - entry->vme_end, - entry->object.sub_map, - entry->offset); +kern_return_t +vm_map_enter_mem_object( + vm_map_t target_map, + vm_map_offset_t *address, + vm_map_size_t initial_size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + ipc_port_t port, + vm_object_offset_t offset, + boolean_t copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) +{ + kern_return_t ret; + + ret = vm_map_enter_mem_object_helper(target_map, + address, + initial_size, + mask, + flags, + vmk_flags, + tag, + port, + offset, + copy, + cur_protection, + max_protection, + inheritance, + NULL, + 0); + +#if KASAN + if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) { + kasan_notify_address(*address, initial_size); } - entry->use_pmap = FALSE; - if (entry->alias == VM_MEMORY_SHARED_PMAP) { - entry->alias = VM_MEMORY_UNSHARED_PMAP; +#endif + + return ret; +} + +kern_return_t +vm_map_enter_mem_object_prefault( + vm_map_t target_map, + vm_map_offset_t *address, + vm_map_size_t initial_size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + ipc_port_t port, + vm_object_offset_t offset, + vm_prot_t cur_protection, + vm_prot_t max_protection, + upl_page_list_ptr_t page_list, + unsigned int page_list_count) +{ + kern_return_t ret; + + ret = vm_map_enter_mem_object_helper(target_map, + address, + initial_size, + mask, + flags, + vmk_flags, + tag, + port, + offset, + FALSE, + cur_protection, + max_protection, + VM_INHERIT_DEFAULT, + page_list, + page_list_count); + +#if KASAN + if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) { + kasan_notify_address(*address, initial_size); } +#endif + + return ret; } -#endif /* NO_NESTED_PMAP */ -/* - * vm_map_clip_start: [ internal use only ] - * - * Asserts that the given entry begins at or after - * the specified address; if necessary, - * it splits the entry into two. 
- */ -void -vm_map_clip_start( - vm_map_t map, - vm_map_entry_t entry, - vm_map_offset_t startaddr) + +kern_return_t +vm_map_enter_mem_object_control( + vm_map_t target_map, + vm_map_offset_t *address, + vm_map_size_t initial_size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + memory_object_control_t control, + vm_object_offset_t offset, + boolean_t copy, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) { -#ifndef NO_NESTED_PMAP - if (entry->is_sub_map && - entry->use_pmap && - startaddr >= entry->vme_start) { - vm_map_offset_t start_unnest, end_unnest; + vm_map_address_t map_addr; + vm_map_size_t map_size; + vm_object_t object; + vm_object_size_t size; + kern_return_t result; + memory_object_t pager; + vm_prot_t pager_prot; + kern_return_t kr; +#if __arm64__ + boolean_t fourk = vmk_flags.vmkf_fourk; +#endif /* __arm64__ */ - /* - * Make sure "startaddr" is no longer in a nested range - * before we clip. Unnest only the minimum range the platform - * can handle. - * vm_map_clip_unnest may perform additional adjustments to - * the unnest range. - */ - start_unnest = startaddr & ~(pmap_nesting_size_min - 1); - end_unnest = start_unnest + pmap_nesting_size_min; - vm_map_clip_unnest(map, entry, start_unnest, end_unnest); + /* + * Check arguments for validity + */ + if ((target_map == VM_MAP_NULL) || + (cur_protection & ~VM_PROT_ALL) || + (max_protection & ~VM_PROT_ALL) || + (inheritance > VM_INHERIT_LAST_VALID) || + initial_size == 0) { + return KERN_INVALID_ARGUMENT; } -#endif /* NO_NESTED_PMAP */ - if (startaddr > entry->vme_start) { - if (entry->object.vm_object && - !entry->is_sub_map && - entry->object.vm_object->phys_contiguous) { - pmap_remove(map->pmap, - (addr64_t)(entry->vme_start), - (addr64_t)(entry->vme_end)); - } - _vm_map_clip_start(&map->hdr, entry, startaddr); - vm_map_store_update_first_free(map, map->first_free); + +#if __arm64__ + if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) { + fourk = FALSE; } -} + if (fourk) { + map_addr = vm_map_trunc_page(*address, + FOURK_PAGE_MASK); + map_size = vm_map_round_page(initial_size, + FOURK_PAGE_MASK); + } else +#endif /* __arm64__ */ + { + map_addr = vm_map_trunc_page(*address, + VM_MAP_PAGE_MASK(target_map)); + map_size = vm_map_round_page(initial_size, + VM_MAP_PAGE_MASK(target_map)); + } + size = vm_object_round_page(initial_size); -#define vm_map_copy_clip_start(copy, entry, startaddr) \ - MACRO_BEGIN \ - if ((startaddr) > (entry)->vme_start) \ - _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \ - MACRO_END + object = memory_object_control_to_vm_object(control); -/* - * This routine is called only when it is known that - * the entry must be split. - */ -static void -_vm_map_clip_start( - register struct vm_map_header *map_header, - register vm_map_entry_t entry, - register vm_map_offset_t start) -{ - register vm_map_entry_t new_entry; + if (object == VM_OBJECT_NULL) { + return KERN_INVALID_OBJECT; + } + + if (object == kernel_object) { + printf("Warning: Attempt to map kernel object" + " by a non-private kernel entity\n"); + return KERN_INVALID_OBJECT; + } + + vm_object_lock(object); + object->ref_count++; /* - * Split off the front portion -- - * note that we must insert the new - * entry BEFORE this one, so that - * this entry has the specified starting - * address. + * For "named" VM objects, let the pager know that the + * memory object is being mapped. 
Some pagers need to keep
+	 * track of this, to know when they can reclaim the memory
+	 * object, for example.
+	 * VM calls memory_object_map() for each mapping (specifying
+	 * the protection of each mapping) and calls
+	 * memory_object_last_unmap() when all the mappings are gone.
 	 */
+	pager_prot = max_protection;
+	if (copy) {
+		pager_prot &= ~VM_PROT_WRITE;
+	}
+	pager = object->pager;
+	if (object->named &&
+	    pager != MEMORY_OBJECT_NULL &&
+	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
+		assert(object->pager_ready);
+		vm_object_mapping_wait(object, THREAD_UNINT);
+		vm_object_mapping_begin(object);
+		vm_object_unlock(object);
-	if (entry->map_aligned) {
-		assert(VM_MAP_PAGE_ALIGNED(start,
-		    VM_MAP_HDR_PAGE_MASK(map_header)));
+		kr = memory_object_map(pager, pager_prot);
+		assert(kr == KERN_SUCCESS);
+
+		vm_object_lock(object);
+		vm_object_mapping_end(object);
 	}
+	vm_object_unlock(object);
-	new_entry = _vm_map_entry_create(map_header, !map_header->entries_pageable);
-	vm_map_entry_copy_full(new_entry, entry);
+	/*
+	 *	Perform the copy if requested
+	 */
-	new_entry->vme_end = start;
-	assert(new_entry->vme_start < new_entry->vme_end);
-	entry->offset += (start - entry->vme_start);
-	assert(start < entry->vme_end);
-	entry->vme_start = start;
+	if (copy) {
+		vm_object_t new_object;
+		vm_object_offset_t new_offset;
-	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
+		result = vm_object_copy_strategically(object, offset, size,
+		    &new_object, &new_offset,
+		    &copy);
-	if (entry->is_sub_map)
-		vm_map_reference(new_entry->object.sub_map);
-	else
-		vm_object_reference(new_entry->object.vm_object);
-}
+		if (result == KERN_MEMORY_RESTART_COPY) {
+			boolean_t success;
+			boolean_t src_needs_copy;
-/*
- *	vm_map_clip_end:	[ internal use only ]
- *
- *	Asserts that the given entry ends at or before
- *	the specified address; if necessary,
- *	it splits the entry into two.
- */
-void
-vm_map_clip_end(
-	vm_map_t	map,
-	vm_map_entry_t	entry,
-	vm_map_offset_t	endaddr)
-{
-	if (endaddr > entry->vme_end) {
+			/*
+			 * XXX
+			 * We currently ignore src_needs_copy.
+			 * This really is the issue of how to make
+			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
+			 * non-kernel users to use. Solution forthcoming.
+			 * In the meantime, since we don't allow non-kernel
+			 * memory managers to specify symmetric copy,
+			 * we won't run into problems here.
+			 */
+			new_object = object;
+			new_offset = offset;
+			success = vm_object_copy_quickly(&new_object,
+			    new_offset, size,
+			    &src_needs_copy,
+			    &copy);
+			assert(success);
+			result = KERN_SUCCESS;
+		}
 		/*
-		 * Within the scope of this clipping, limit "endaddr" to
-		 * the end of this map entry...
+		 * Throw away the reference to the
+		 * original object, as it won't be mapped.
 		 */
-		endaddr = entry->vme_end;
+
+		vm_object_deallocate(object);
+
+		if (result != KERN_SUCCESS) {
+			return result;
+		}
+
+		object = new_object;
+		offset = new_offset;
 	}
-#ifndef NO_NESTED_PMAP
-	if (entry->is_sub_map && entry->use_pmap) {
-		vm_map_offset_t	start_unnest, end_unnest;
-		/*
-		 * Make sure the range between the start of this entry and
-		 * the new "endaddr" is no longer nested before we clip.
-		 * Unnest only the minimum range the platform can handle.
-		 * vm_map_clip_unnest may perform additional adjustments to
-		 * the unnest range.
- */ - start_unnest = entry->vme_start; - end_unnest = - (endaddr + pmap_nesting_size_min - 1) & - ~(pmap_nesting_size_min - 1); - vm_map_clip_unnest(map, entry, start_unnest, end_unnest); +#if __arm64__ + if (fourk) { + result = vm_map_enter_fourk(target_map, + &map_addr, + map_size, + (vm_map_offset_t)mask, + flags, + vmk_flags, + tag, + object, offset, + copy, + cur_protection, max_protection, + inheritance); + } else +#endif /* __arm64__ */ + { + result = vm_map_enter(target_map, + &map_addr, map_size, + (vm_map_offset_t)mask, + flags, + vmk_flags, + tag, + object, offset, + copy, + cur_protection, max_protection, + inheritance); } -#endif /* NO_NESTED_PMAP */ - if (endaddr < entry->vme_end) { - if (entry->object.vm_object && - !entry->is_sub_map && - entry->object.vm_object->phys_contiguous) { - pmap_remove(map->pmap, - (addr64_t)(entry->vme_start), - (addr64_t)(entry->vme_end)); - } - _vm_map_clip_end(&map->hdr, entry, endaddr); - vm_map_store_update_first_free(map, map->first_free); + if (result != KERN_SUCCESS) { + vm_object_deallocate(object); } + *address = map_addr; + + return result; } -#define vm_map_copy_clip_end(copy, entry, endaddr) \ - MACRO_BEGIN \ - if ((endaddr) < (entry)->vme_end) \ - _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \ - MACRO_END +#if VM_CPM + +#ifdef MACH_ASSERT +extern pmap_paddr_t avail_start, avail_end; +#endif /* - * This routine is called only when it is known that - * the entry must be split. + * Allocate memory in the specified map, with the caveat that + * the memory is physically contiguous. This call may fail + * if the system can't find sufficient contiguous memory. + * This call may cause or lead to heart-stopping amounts of + * paging activity. + * + * Memory obtained from this call should be freed in the + * normal way, viz., via vm_deallocate. */ -static void -_vm_map_clip_end( - register struct vm_map_header *map_header, - register vm_map_entry_t entry, - register vm_map_offset_t end) +kern_return_t +vm_map_enter_cpm( + vm_map_t map, + vm_map_offset_t *addr, + vm_map_size_t size, + int flags) { - register vm_map_entry_t new_entry; + vm_object_t cpm_obj; + pmap_t pmap; + vm_page_t m, pages; + kern_return_t kr; + vm_map_offset_t va, start, end, offset; +#if MACH_ASSERT + vm_map_offset_t prev_addr = 0; +#endif /* MACH_ASSERT */ + + boolean_t anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0); + vm_tag_t tag; - /* - * Create a new entry and insert it - * AFTER the specified entry - */ + if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) { + /* XXX TODO4K do we need to support this? */ + *addr = 0; + return KERN_NOT_SUPPORTED; + } - if (entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(end, - VM_MAP_HDR_PAGE_MASK(map_header))); + VM_GET_FLAGS_ALIAS(flags, tag); + + if (size == 0) { + *addr = 0; + return KERN_SUCCESS; + } + if (anywhere) { + *addr = vm_map_min(map); + } else { + *addr = vm_map_trunc_page(*addr, + VM_MAP_PAGE_MASK(map)); } + size = vm_map_round_page(size, + VM_MAP_PAGE_MASK(map)); - new_entry = _vm_map_entry_create(map_header, !map_header->entries_pageable); - vm_map_entry_copy_full(new_entry, entry); + /* + * LP64todo - cpm_allocate should probably allow + * allocations of >4GB, but not with the current + * algorithm, so just cast down the size for now. 
+ */ + if (size > VM_MAX_ADDRESS) { + return KERN_RESOURCE_SHORTAGE; + } + if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size), + &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) { + return kr; + } - assert(entry->vme_start < end); - new_entry->vme_start = entry->vme_end = end; - new_entry->offset += (end - entry->vme_start); - assert(new_entry->vme_start < new_entry->vme_end); + cpm_obj = vm_object_allocate((vm_object_size_t)size); + assert(cpm_obj != VM_OBJECT_NULL); + assert(cpm_obj->internal); + assert(cpm_obj->vo_size == (vm_object_size_t)size); + assert(cpm_obj->can_persist == FALSE); + assert(cpm_obj->pager_created == FALSE); + assert(cpm_obj->pageout == FALSE); + assert(cpm_obj->shadow == VM_OBJECT_NULL); - _vm_map_store_entry_link(map_header, entry, new_entry); + /* + * Insert pages into object. + */ - if (entry->is_sub_map) - vm_map_reference(new_entry->object.sub_map); - else - vm_object_reference(new_entry->object.vm_object); -} + vm_object_lock(cpm_obj); + for (offset = 0; offset < size; offset += PAGE_SIZE) { + m = pages; + pages = NEXT_PAGE(m); + *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL; + assert(!m->vmp_gobbled); + assert(!m->vmp_wanted); + assert(!m->vmp_pageout); + assert(!m->vmp_tabled); + assert(VM_PAGE_WIRED(m)); + assert(m->vmp_busy); + assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT)); -/* - * VM_MAP_RANGE_CHECK: [ internal use only ] - * - * Asserts that the starting and ending region - * addresses fall within the valid range of the map. - */ -#define VM_MAP_RANGE_CHECK(map, start, end) \ - MACRO_BEGIN \ - if (start < vm_map_min(map)) \ - start = vm_map_min(map); \ - if (end > vm_map_max(map)) \ - end = vm_map_max(map); \ - if (start > end) \ - start = end; \ - MACRO_END + m->vmp_busy = FALSE; + vm_page_insert(m, cpm_obj, offset); + } + assert(cpm_obj->resident_page_count == size / PAGE_SIZE); + vm_object_unlock(cpm_obj); -/* - * vm_map_range_check: [ internal use only ] - * - * Check that the region defined by the specified start and - * end addresses are wholly contained within a single map - * entry or set of adjacent map entries of the spacified map, - * i.e. the specified region contains no unmapped space. - * If any or all of the region is unmapped, FALSE is returned. - * Otherwise, TRUE is returned and if the output argument 'entry' - * is not NULL it points to the map entry containing the start - * of the region. - * - * The map is locked for reading on entry and is left locked. - */ -static boolean_t -vm_map_range_check( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - vm_map_entry_t *entry) -{ - vm_map_entry_t cur; - register vm_map_offset_t prev; + /* + * Hang onto a reference on the object in case a + * multi-threaded application for some reason decides + * to deallocate the portion of the address space into + * which we will insert this object. + * + * Unfortunately, we must insert the object now before + * we can talk to the pmap module about which addresses + * must be wired down. Hence, the race with a multi- + * threaded app. + */ + vm_object_reference(cpm_obj); /* - * Basic sanity checks first + * Insert object into map. 
*/ - if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) - return (FALSE); + + kr = vm_map_enter( + map, + addr, + size, + (vm_map_offset_t)0, + flags, + VM_MAP_KERNEL_FLAGS_NONE, + cpm_obj, + (vm_object_offset_t)0, + FALSE, + VM_PROT_ALL, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); + + if (kr != KERN_SUCCESS) { + /* + * A CPM object doesn't have can_persist set, + * so all we have to do is deallocate it to + * free up these pages. + */ + assert(cpm_obj->pager_created == FALSE); + assert(cpm_obj->can_persist == FALSE); + assert(cpm_obj->pageout == FALSE); + assert(cpm_obj->shadow == VM_OBJECT_NULL); + vm_object_deallocate(cpm_obj); /* kill acquired ref */ + vm_object_deallocate(cpm_obj); /* kill creation ref */ + } /* - * Check first if the region starts within a valid - * mapping for the map. + * Inform the physical mapping system that the + * range of addresses may not fault, so that + * page tables and such can be locked down as well. */ - if (!vm_map_lookup_entry(map, start, &cur)) - return (FALSE); + start = *addr; + end = start + size; + pmap = vm_map_pmap(map); + pmap_pageable(pmap, start, end, FALSE); /* - * Optimize for the case that the region is contained - * in a single map entry. + * Enter each page into the pmap, to avoid faults. + * Note that this loop could be coded more efficiently, + * if the need arose, rather than looking up each page + * again. */ - if (entry != (vm_map_entry_t *) NULL) - *entry = cur; - if (end <= cur->vme_end) - return (TRUE); + for (offset = 0, va = start; offset < size; + va += PAGE_SIZE, offset += PAGE_SIZE) { + int type_of_fault; + + vm_object_lock(cpm_obj); + m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset); + assert(m != VM_PAGE_NULL); + + vm_page_zero_fill(m); + + type_of_fault = DBG_ZERO_FILL_FAULT; + + vm_fault_enter(m, pmap, va, + PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_WRITE, + VM_PAGE_WIRED(m), + FALSE, /* change_wiring */ + VM_KERN_MEMORY_NONE, /* tag - not wiring */ + FALSE, /* no_cache */ + FALSE, /* cs_bypass */ + 0, /* user_tag */ + 0, /* pmap_options */ + NULL, /* need_retry */ + &type_of_fault); + + vm_object_unlock(cpm_obj); + } +#if MACH_ASSERT /* - * If the region is not wholly contained within a - * single entry, walk the entries looking for holes. + * Verify ordering in address space. 
*/ - prev = cur->vme_end; - cur = cur->vme_next; - while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) { - if (end <= cur->vme_end) - return (TRUE); - prev = cur->vme_end; - cur = cur->vme_next; + for (offset = 0; offset < size; offset += PAGE_SIZE) { + vm_object_lock(cpm_obj); + m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset); + vm_object_unlock(cpm_obj); + if (m == VM_PAGE_NULL) { + panic("vm_allocate_cpm: obj %p off 0x%llx no page", + cpm_obj, (uint64_t)offset); + } + assert(m->vmp_tabled); + assert(!m->vmp_busy); + assert(!m->vmp_wanted); + assert(!m->vmp_fictitious); + assert(!m->vmp_private); + assert(!m->vmp_absent); + assert(!m->vmp_error); + assert(!m->vmp_cleaning); + assert(!m->vmp_laundry); + assert(!m->vmp_precious); + assert(!m->vmp_clustered); + if (offset != 0) { + if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) { + printf("start 0x%llx end 0x%llx va 0x%llx\n", + (uint64_t)start, (uint64_t)end, (uint64_t)va); + printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset); + printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr); + panic("vm_allocate_cpm: pages not contig!"); + } + } + prev_addr = VM_PAGE_GET_PHYS_PAGE(m); } - return (FALSE); +#endif /* MACH_ASSERT */ + + vm_object_deallocate(cpm_obj); /* kill extra ref */ + + return kr; } + +#else /* VM_CPM */ + /* - * vm_map_submap: [ kernel use only ] - * - * Mark the given range as handled by a subordinate map. - * - * This range must have been created with vm_map_find using - * the vm_submap_object, and no other operations may have been - * performed on this range prior to calling vm_map_submap. - * - * Only a limited number of operations can be performed - * within this rage after calling vm_map_submap: - * vm_fault - * [Don't try vm_map_copyin!] - * - * To remove a submapping, one must first remove the - * range from the superior map, and then destroy the - * submap (if desired). [Better yet, don't try it.] + * Interface is defined in all cases, but unless the kernel + * is built explicitly for this option, the interface does + * nothing. */ + kern_return_t -vm_map_submap( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - vm_map_t submap, - vm_map_offset_t offset, -#ifdef NO_NESTED_PMAP - __unused -#endif /* NO_NESTED_PMAP */ - boolean_t use_pmap) +vm_map_enter_cpm( + __unused vm_map_t map, + __unused vm_map_offset_t *addr, + __unused vm_map_size_t size, + __unused int flags) { - vm_map_entry_t entry; - register kern_return_t result = KERN_INVALID_ARGUMENT; - register vm_object_t object; + return KERN_FAILURE; +} +#endif /* VM_CPM */ - vm_map_lock(map); +/* Not used without nested pmaps */ +#ifndef NO_NESTED_PMAP +/* + * Clip and unnest a portion of a nested submap mapping. + */ - if (! vm_map_lookup_entry(map, start, &entry)) { - entry = entry->vme_next; + +static void +vm_map_clip_unnest( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t start_unnest, + vm_map_offset_t end_unnest) +{ + vm_map_offset_t old_start_unnest = start_unnest; + vm_map_offset_t old_end_unnest = end_unnest; + + assert(entry->is_sub_map); + assert(VME_SUBMAP(entry) != NULL); + assert(entry->use_pmap); + + /* + * Query the platform for the optimal unnest range. + * DRK: There's some duplication of effort here, since + * callers may have adjusted the range to some extent. This + * routine was introduced to support 1GiB subtree nesting + * for x86 platforms, which can also nest on 2MiB boundaries + * depending on size/alignment. 
+ */ + if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) { + assert(VME_SUBMAP(entry)->is_nested_map); + assert(!VME_SUBMAP(entry)->disable_vmentry_reuse); + log_unnest_badness(map, + old_start_unnest, + old_end_unnest, + VME_SUBMAP(entry)->is_nested_map, + (entry->vme_start + + VME_SUBMAP(entry)->lowest_unnestable_start - + VME_OFFSET(entry))); } - if (entry == vm_map_to_entry(map) || - entry->is_sub_map) { - vm_map_unlock(map); - return KERN_INVALID_ARGUMENT; + if (entry->vme_start > start_unnest || + entry->vme_end < end_unnest) { + panic("vm_map_clip_unnest(0x%llx,0x%llx): " + "bad nested entry: start=0x%llx end=0x%llx\n", + (long long)start_unnest, (long long)end_unnest, + (long long)entry->vme_start, (long long)entry->vme_end); } - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + if (start_unnest > entry->vme_start) { + _vm_map_clip_start(&map->hdr, + entry, + start_unnest); + if (map->holelistenabled) { + vm_map_store_update_first_free(map, NULL, FALSE); + } else { + vm_map_store_update_first_free(map, map->first_free, FALSE); + } + } + if (entry->vme_end > end_unnest) { + _vm_map_clip_end(&map->hdr, + entry, + end_unnest); + if (map->holelistenabled) { + vm_map_store_update_first_free(map, NULL, FALSE); + } else { + vm_map_store_update_first_free(map, map->first_free, FALSE); + } + } - if ((entry->vme_start == start) && (entry->vme_end == end) && - (!entry->is_sub_map) && - ((object = entry->object.vm_object) == vm_submap_object) && - (object->resident_page_count == 0) && - (object->copy == VM_OBJECT_NULL) && - (object->shadow == VM_OBJECT_NULL) && - (!object->pager_created)) { - entry->offset = (vm_object_offset_t)offset; - entry->object.vm_object = VM_OBJECT_NULL; - vm_object_deallocate(object); - entry->is_sub_map = TRUE; - entry->use_pmap = FALSE; - entry->object.sub_map = submap; - vm_map_reference(submap); - if (submap->mapped_in_other_pmaps == FALSE && - vm_map_pmap(submap) != PMAP_NULL && - vm_map_pmap(submap) != vm_map_pmap(map)) { - /* - * This submap is being mapped in a map - * that uses a different pmap. - * Set its "mapped_in_other_pmaps" flag - * to indicate that we now need to - * remove mappings from all pmaps rather - * than just the submap's pmap. 
- */ - submap->mapped_in_other_pmaps = TRUE; - } - -#ifndef NO_NESTED_PMAP - if (use_pmap) { - /* nest if platform code will allow */ - if(submap->pmap == NULL) { - ledger_t ledger = map->pmap->ledger; - submap->pmap = pmap_create(ledger, - (vm_map_size_t) 0, FALSE); - if(submap->pmap == PMAP_NULL) { - vm_map_unlock(map); - return(KERN_NO_SPACE); - } - } - result = pmap_nest(map->pmap, - (entry->object.sub_map)->pmap, - (addr64_t)start, - (addr64_t)start, - (uint64_t)(end - start)); - if(result) - panic("vm_map_submap: pmap_nest failed, rc = %08X\n", result); - entry->use_pmap = TRUE; - } -#else /* NO_NESTED_PMAP */ - pmap_remove(map->pmap, (addr64_t)start, (addr64_t)end); -#endif /* NO_NESTED_PMAP */ - result = KERN_SUCCESS; + pmap_unnest(map->pmap, + entry->vme_start, + entry->vme_end - entry->vme_start); + if ((map->mapped_in_other_pmaps) && os_ref_get_count(&map->map_refcnt) != 0) { + /* clean up parent map/maps */ + vm_map_submap_pmap_clean( + map, entry->vme_start, + entry->vme_end, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } + entry->use_pmap = FALSE; + if ((map->pmap != kernel_pmap) && + (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) { + VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP); } - vm_map_unlock(map); - - return(result); } +#endif /* NO_NESTED_PMAP */ /* - * vm_map_protect: + * vm_map_clip_start: [ internal use only ] * - * Sets the protection of the specified address - * region in the target map. If "set_max" is - * specified, the maximum protection is to be set; - * otherwise, only the current protection is affected. + * Asserts that the given entry begins at or after + * the specified address; if necessary, + * it splits the entry into two. */ -kern_return_t -vm_map_protect( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t new_prot, - register boolean_t set_max) +void +vm_map_clip_start( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t startaddr) { - register vm_map_entry_t current; - register vm_map_offset_t prev; - vm_map_entry_t entry; - vm_prot_t new_max; - - XPR(XPR_VM_MAP, - "vm_map_protect, 0x%X start 0x%X end 0x%X, new 0x%X %d", - map, start, end, new_prot, set_max); - - vm_map_lock(map); - - /* LP64todo - remove this check when vm_map_commpage64() - * no longer has to stuff in a map_entry for the commpage - * above the map's max_offset. - */ - if (start >= map->max_offset) { - vm_map_unlock(map); - return(KERN_INVALID_ADDRESS); - } +#ifndef NO_NESTED_PMAP + if (entry->is_sub_map && + entry->use_pmap && + startaddr >= entry->vme_start) { + vm_map_offset_t start_unnest, end_unnest; - while(1) { /* - * Lookup the entry. If it doesn't start in a valid - * entry, return an error. + * Make sure "startaddr" is no longer in a nested range + * before we clip. Unnest only the minimum range the platform + * can handle. + * vm_map_clip_unnest may perform additional adjustments to + * the unnest range. */ - if (! 
vm_map_lookup_entry(map, start, &entry)) { - vm_map_unlock(map); - return(KERN_INVALID_ADDRESS); + start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1); + end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap); + vm_map_clip_unnest(map, entry, start_unnest, end_unnest); + } +#endif /* NO_NESTED_PMAP */ + if (startaddr > entry->vme_start) { + if (VME_OBJECT(entry) && + !entry->is_sub_map && + VME_OBJECT(entry)->phys_contiguous) { + pmap_remove(map->pmap, + (addr64_t)(entry->vme_start), + (addr64_t)(entry->vme_end)); } - - if (entry->superpage_size && (start & (SUPERPAGE_SIZE-1))) { /* extend request to whole entry */ - start = SUPERPAGE_ROUND_DOWN(start); - continue; + if (entry->vme_atomic) { + panic("Attempting to clip an atomic VM entry! (map: %p, entry: %p)\n", map, entry); } - break; - } - if (entry->superpage_size) - end = SUPERPAGE_ROUND_UP(end); - - /* - * Make a first pass to check for protection and address - * violations. - */ - - current = entry; - prev = current->vme_start; - while ((current != vm_map_to_entry(map)) && - (current->vme_start < end)) { - /* - * If there is a hole, return an error. - */ - if (current->vme_start != prev) { - vm_map_unlock(map); - return(KERN_INVALID_ADDRESS); - } + DTRACE_VM5( + vm_map_clip_start, + vm_map_t, map, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + vm_map_offset_t, startaddr, + int, VME_ALIAS(entry)); - new_max = current->max_protection; - if(new_prot & VM_PROT_COPY) { - new_max |= VM_PROT_WRITE; - if ((new_prot & (new_max | VM_PROT_COPY)) != new_prot) { - vm_map_unlock(map); - return(KERN_PROTECTION_FAILURE); - } + _vm_map_clip_start(&map->hdr, entry, startaddr); + if (map->holelistenabled) { + vm_map_store_update_first_free(map, NULL, FALSE); } else { - if ((new_prot & new_max) != new_prot) { - vm_map_unlock(map); - return(KERN_PROTECTION_FAILURE); - } + vm_map_store_update_first_free(map, map->first_free, FALSE); } + } +} - prev = current->vme_end; - current = current->vme_next; - } - if (end > prev) { - vm_map_unlock(map); - return(KERN_INVALID_ADDRESS); - } +#define vm_map_copy_clip_start(copy, entry, startaddr) \ + MACRO_BEGIN \ + if ((startaddr) > (entry)->vme_start) \ + _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \ + MACRO_END + +/* + * This routine is called only when it is known that + * the entry must be split. + */ +static void +_vm_map_clip_start( + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_map_offset_t start) +{ + vm_map_entry_t new_entry; /* - * Go back and fix up protections. - * Clip to start here if the range starts within - * the entry. + * Split off the front portion -- + * note that we must insert the new + * entry BEFORE this one, so that + * this entry has the specified starting + * address. 
*/ - current = entry; - if (current != vm_map_to_entry(map)) { - /* clip and unnest if necessary */ - vm_map_clip_start(map, current, start); + if (entry->map_aligned) { + assert(VM_MAP_PAGE_ALIGNED(start, + VM_MAP_HDR_PAGE_MASK(map_header))); } - while ((current != vm_map_to_entry(map)) && - (current->vme_start < end)) { - - vm_prot_t old_prot; - - vm_map_clip_end(map, current, end); - - if (current->is_sub_map) { - /* clipping did unnest if needed */ - assert(!current->use_pmap); - } + new_entry = _vm_map_entry_create(map_header, !map_header->entries_pageable); + vm_map_entry_copy_full(new_entry, entry); - old_prot = current->protection; + new_entry->vme_end = start; + assert(new_entry->vme_start < new_entry->vme_end); + VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start)); + assert(start < entry->vme_end); + entry->vme_start = start; - if(new_prot & VM_PROT_COPY) { - /* caller is asking specifically to copy the */ - /* mapped data, this implies that max protection */ - /* will include write. Caller must be prepared */ - /* for loss of shared memory communication in the */ - /* target area after taking this step */ + _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry); - if (current->is_sub_map == FALSE && current->object.vm_object == VM_OBJECT_NULL){ - current->object.vm_object = vm_object_allocate((vm_map_size_t)(current->vme_end - current->vme_start)); - current->offset = 0; - assert(current->use_pmap); - } - current->needs_copy = TRUE; - current->max_protection |= VM_PROT_WRITE; - } + if (entry->is_sub_map) { + vm_map_reference(VME_SUBMAP(new_entry)); + } else { + vm_object_reference(VME_OBJECT(new_entry)); + } +} - if (set_max) - current->protection = - (current->max_protection = - new_prot & ~VM_PROT_COPY) & - old_prot; - else - current->protection = new_prot & ~VM_PROT_COPY; +/* + * vm_map_clip_end: [ internal use only ] + * + * Asserts that the given entry ends at or before + * the specified address; if necessary, + * it splits the entry into two. + */ +void +vm_map_clip_end( + vm_map_t map, + vm_map_entry_t entry, + vm_map_offset_t endaddr) +{ + if (endaddr > entry->vme_end) { /* - * Update physical map if necessary. - * If the request is to turn off write protection, - * we won't do it for real (in pmap). This is because - * it would cause copy-on-write to fail. We've already - * set, the new protection in the map, so if a - * write-protect fault occurred, it will be fixed up - * properly, COW or not. + * Within the scope of this clipping, limit "endaddr" to + * the end of this map entry... */ - if (current->protection != old_prot) { - /* Look one level in we support nested pmaps */ - /* from mapped submaps which are direct entries */ - /* in our map */ - - vm_prot_t prot; - - prot = current->protection & ~VM_PROT_WRITE; + endaddr = entry->vme_end; + } +#ifndef NO_NESTED_PMAP + if (entry->is_sub_map && entry->use_pmap) { + vm_map_offset_t start_unnest, end_unnest; - if (override_nx(map, current->alias) && prot) - prot |= VM_PROT_EXECUTE; + /* + * Make sure the range between the start of this entry and + * the new "endaddr" is no longer nested before we clip. + * Unnest only the minimum range the platform can handle. + * vm_map_clip_unnest may perform additional adjustments to + * the unnest range. 
+ */ + start_unnest = entry->vme_start; + end_unnest = + (endaddr + pmap_shared_region_size_min(map->pmap) - 1) & + ~(pmap_shared_region_size_min(map->pmap) - 1); + vm_map_clip_unnest(map, entry, start_unnest, end_unnest); + } +#endif /* NO_NESTED_PMAP */ + if (endaddr < entry->vme_end) { + if (VME_OBJECT(entry) && + !entry->is_sub_map && + VME_OBJECT(entry)->phys_contiguous) { + pmap_remove(map->pmap, + (addr64_t)(entry->vme_start), + (addr64_t)(entry->vme_end)); + } + if (entry->vme_atomic) { + panic("Attempting to clip an atomic VM entry! (map: %p, entry: %p)\n", map, entry); + } + DTRACE_VM5( + vm_map_clip_end, + vm_map_t, map, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + vm_map_offset_t, endaddr, + int, VME_ALIAS(entry)); - if (current->is_sub_map && current->use_pmap) { - pmap_protect(current->object.sub_map->pmap, - current->vme_start, - current->vme_end, - prot); - } else { - pmap_protect(map->pmap, - current->vme_start, - current->vme_end, - prot); - } + _vm_map_clip_end(&map->hdr, entry, endaddr); + if (map->holelistenabled) { + vm_map_store_update_first_free(map, NULL, FALSE); + } else { + vm_map_store_update_first_free(map, map->first_free, FALSE); } - current = current->vme_next; } +} - current = entry; - while ((current != vm_map_to_entry(map)) && - (current->vme_start <= end)) { - vm_map_simplify_entry(map, current); - current = current->vme_next; - } - vm_map_unlock(map); - return(KERN_SUCCESS); -} +#define vm_map_copy_clip_end(copy, entry, endaddr) \ + MACRO_BEGIN \ + if ((endaddr) < (entry)->vme_end) \ + _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \ + MACRO_END /* - * vm_map_inherit: - * - * Sets the inheritance of the specified address - * range in the target map. Inheritance - * affects how the map will be shared with - * child maps at the time of vm_map_fork. + * This routine is called only when it is known that + * the entry must be split. */ -kern_return_t -vm_map_inherit( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_inherit_t new_inheritance) +static void +_vm_map_clip_end( + struct vm_map_header *map_header, + vm_map_entry_t entry, + vm_map_offset_t end) { - register vm_map_entry_t entry; - vm_map_entry_t temp_entry; - - vm_map_lock(map); + vm_map_entry_t new_entry; - VM_MAP_RANGE_CHECK(map, start, end); + /* + * Create a new entry and insert it + * AFTER the specified entry + */ - if (vm_map_lookup_entry(map, start, &temp_entry)) { - entry = temp_entry; - } - else { - temp_entry = temp_entry->vme_next; - entry = temp_entry; + if (entry->map_aligned) { + assert(VM_MAP_PAGE_ALIGNED(end, + VM_MAP_HDR_PAGE_MASK(map_header))); } - /* first check entire range for submaps which can't support the */ - /* given inheritance. 
*/ - while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { - if(entry->is_sub_map) { - if(new_inheritance == VM_INHERIT_COPY) { - vm_map_unlock(map); - return(KERN_INVALID_ARGUMENT); - } - } - - entry = entry->vme_next; - } - - entry = temp_entry; - if (entry != vm_map_to_entry(map)) { - /* clip and unnest if necessary */ - vm_map_clip_start(map, entry, start); - } + new_entry = _vm_map_entry_create(map_header, !map_header->entries_pageable); + vm_map_entry_copy_full(new_entry, entry); - while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { - vm_map_clip_end(map, entry, end); - if (entry->is_sub_map) { - /* clip did unnest if needed */ - assert(!entry->use_pmap); - } + assert(entry->vme_start < end); + new_entry->vme_start = entry->vme_end = end; + VME_OFFSET_SET(new_entry, + VME_OFFSET(new_entry) + (end - entry->vme_start)); + assert(new_entry->vme_start < new_entry->vme_end); - entry->inheritance = new_inheritance; + _vm_map_store_entry_link(map_header, entry, new_entry); - entry = entry->vme_next; + if (entry->is_sub_map) { + vm_map_reference(VME_SUBMAP(new_entry)); + } else { + vm_object_reference(VME_OBJECT(new_entry)); } - - vm_map_unlock(map); - return(KERN_SUCCESS); } + /* - * Update the accounting for the amount of wired memory in this map. If the user has - * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails. + * VM_MAP_RANGE_CHECK: [ internal use only ] + * + * Asserts that the starting and ending region + * addresses fall within the valid range of the map. */ +#define VM_MAP_RANGE_CHECK(map, start, end) \ + MACRO_BEGIN \ + if (start < vm_map_min(map)) \ + start = vm_map_min(map); \ + if (end > vm_map_max(map)) \ + end = vm_map_max(map); \ + if (start > end) \ + start = end; \ + MACRO_END -static kern_return_t -add_wire_counts( - vm_map_t map, - vm_map_entry_t entry, - boolean_t user_wire) -{ - vm_map_size_t size; - - if (user_wire) { - unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count; - - /* - * We're wiring memory at the request of the user. Check if this is the first time the user is wiring - * this map entry. - */ - - if (entry->user_wired_count == 0) { - size = entry->vme_end - entry->vme_start; - - /* - * Since this is the first time the user is wiring this map entry, check to see if we're - * exceeding the user wire limits. There is a per map limit which is the smaller of either - * the process's rlimit or the global vm_user_wire_limit which caps this value. There is also - * a system-wide limit on the amount of memory all users can wire. If the user is over either - * limit, then we fail. - */ +/* + * vm_map_range_check: [ internal use only ] + * + * Check that the region defined by the specified start and + * end addresses are wholly contained within a single map + * entry or set of adjacent map entries of the spacified map, + * i.e. the specified region contains no unmapped space. + * If any or all of the region is unmapped, FALSE is returned. + * Otherwise, TRUE is returned and if the output argument 'entry' + * is not NULL it points to the map entry containing the start + * of the region. + * + * The map is locked for reading on entry and is left locked. 
+ */ +static boolean_t +vm_map_range_check( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_map_entry_t *entry) +{ + vm_map_entry_t cur; + vm_map_offset_t prev; - if(size + map->user_wire_size > MIN(map->user_wire_limit, vm_user_wire_limit) || - size + ptoa_64(total_wire_count) > vm_global_user_wire_limit || - size + ptoa_64(total_wire_count) > max_mem - vm_global_no_user_wire_amount) - return KERN_RESOURCE_SHORTAGE; + /* + * Basic sanity checks first + */ + if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) { + return FALSE; + } - /* - * The first time the user wires an entry, we also increment the wired_count and add this to - * the total that has been wired in the map. - */ + /* + * Check first if the region starts within a valid + * mapping for the map. + */ + if (!vm_map_lookup_entry(map, start, &cur)) { + return FALSE; + } - if (entry->wired_count >= MAX_WIRE_COUNT) - return KERN_FAILURE; + /* + * Optimize for the case that the region is contained + * in a single map entry. + */ + if (entry != (vm_map_entry_t *) NULL) { + *entry = cur; + } + if (end <= cur->vme_end) { + return TRUE; + } - entry->wired_count++; - map->user_wire_size += size; + /* + * If the region is not wholly contained within a + * single entry, walk the entries looking for holes. + */ + prev = cur->vme_end; + cur = cur->vme_next; + while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) { + if (end <= cur->vme_end) { + return TRUE; } - - if (entry->user_wired_count >= MAX_WIRE_COUNT) - return KERN_FAILURE; - - entry->user_wired_count++; - - } else { - - /* - * The kernel's wiring the memory. Just bump the count and continue. - */ - - if (entry->wired_count >= MAX_WIRE_COUNT) - panic("vm_map_wire: too many wirings"); - - entry->wired_count++; + prev = cur->vme_end; + cur = cur->vme_next; } - - return KERN_SUCCESS; + return FALSE; } /* - * Update the memory wiring accounting now that the given map entry is being unwired. + * vm_map_submap: [ kernel use only ] + * + * Mark the given range as handled by a subordinate map. + * + * This range must have been created with vm_map_find using + * the vm_submap_object, and no other operations may have been + * performed on this range prior to calling vm_map_submap. + * + * Only a limited number of operations can be performed + * within this rage after calling vm_map_submap: + * vm_fault + * [Don't try vm_map_copyin!] + * + * To remove a submapping, one must first remove the + * range from the superior map, and then destroy the + * submap (if desired). [Better yet, don't try it.] */ +kern_return_t +vm_map_submap( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_map_t submap, + vm_map_offset_t offset, +#ifdef NO_NESTED_PMAP + __unused +#endif /* NO_NESTED_PMAP */ + boolean_t use_pmap) +{ + vm_map_entry_t entry; + kern_return_t result = KERN_INVALID_ARGUMENT; + vm_object_t object; -static void -subtract_wire_counts( - vm_map_t map, - vm_map_entry_t entry, - boolean_t user_wire) -{ + vm_map_lock(map); - if (user_wire) { + if (!vm_map_lookup_entry(map, start, &entry)) { + entry = entry->vme_next; + } - /* - * We're unwiring memory at the request of the user. See if we're removing the last user wire reference. 
- */ + if (entry == vm_map_to_entry(map) || + entry->is_sub_map) { + vm_map_unlock(map); + return KERN_INVALID_ARGUMENT; + } - if (entry->user_wired_count == 1) { + vm_map_clip_start(map, entry, start); + vm_map_clip_end(map, entry, end); + if ((entry->vme_start == start) && (entry->vme_end == end) && + (!entry->is_sub_map) && + ((object = VME_OBJECT(entry)) == vm_submap_object) && + (object->resident_page_count == 0) && + (object->copy == VM_OBJECT_NULL) && + (object->shadow == VM_OBJECT_NULL) && + (!object->pager_created)) { + VME_OFFSET_SET(entry, (vm_object_offset_t)offset); + VME_OBJECT_SET(entry, VM_OBJECT_NULL); + vm_object_deallocate(object); + entry->is_sub_map = TRUE; + entry->use_pmap = FALSE; + VME_SUBMAP_SET(entry, submap); + vm_map_reference(submap); + if (submap->mapped_in_other_pmaps == FALSE && + vm_map_pmap(submap) != PMAP_NULL && + vm_map_pmap(submap) != vm_map_pmap(map)) { /* - * We're removing the last user wire reference. Decrement the wired_count and the total - * user wired memory for this map. + * This submap is being mapped in a map + * that uses a different pmap. + * Set its "mapped_in_other_pmaps" flag + * to indicate that we now need to + * remove mappings from all pmaps rather + * than just the submap's pmap. */ - - assert(entry->wired_count >= 1); - entry->wired_count--; - map->user_wire_size -= entry->vme_end - entry->vme_start; + submap->mapped_in_other_pmaps = TRUE; } - assert(entry->user_wired_count >= 1); - entry->user_wired_count--; - - } else { - - /* - * The kernel is unwiring the memory. Just update the count. - */ - - assert(entry->wired_count >= 1); - entry->wired_count--; +#ifndef NO_NESTED_PMAP + if (use_pmap) { + /* nest if platform code will allow */ + if (submap->pmap == NULL) { + ledger_t ledger = map->pmap->ledger; + submap->pmap = pmap_create_options(ledger, + (vm_map_size_t) 0, 0); + if (submap->pmap == PMAP_NULL) { + vm_map_unlock(map); + return KERN_NO_SPACE; + } +#if defined(__arm__) || defined(__arm64__) + pmap_set_nested(submap->pmap); +#endif + } + result = pmap_nest(map->pmap, + (VME_SUBMAP(entry))->pmap, + (addr64_t)start, + (uint64_t)(end - start)); + if (result) { + panic("vm_map_submap: pmap_nest failed, rc = %08X\n", result); + } + entry->use_pmap = TRUE; + } +#else /* NO_NESTED_PMAP */ + pmap_remove(map->pmap, (addr64_t)start, (addr64_t)end); +#endif /* NO_NESTED_PMAP */ + result = KERN_SUCCESS; } + vm_map_unlock(map); + + return result; } /* - * vm_map_wire: - * - * Sets the pageability of the specified address range in the - * target map as wired. Regions specified as not pageable require - * locked-down physical memory and physical page maps. The - * access_type variable indicates types of accesses that must not - * generate page faults. This is checked against protection of - * memory being locked-down. + * vm_map_protect: * - * The map must not be locked, but a reference must remain to the - * map throughout the call. + * Sets the protection of the specified address + * region in the target map. If "set_max" is + * specified, the maximum protection is to be set; + * otherwise, only the current protection is affected. 
*/ -static kern_return_t -vm_map_wire_nested( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t access_type, - boolean_t user_wire, - pmap_t map_pmap, - vm_map_offset_t pmap_addr, - ppnum_t *physpage_p) -{ - register vm_map_entry_t entry; - struct vm_map_entry *first_entry, tmp_entry; - vm_map_t real_map; - register vm_map_offset_t s,e; - kern_return_t rc; - boolean_t need_wakeup; - boolean_t main_map = FALSE; - wait_interrupt_t interruptible_state; - thread_t cur_thread; - unsigned int last_timestamp; - vm_map_size_t size; - boolean_t wire_and_extract; +kern_return_t +vm_map_protect( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t new_prot, + boolean_t set_max) +{ + vm_map_entry_t current; + vm_map_offset_t prev; + vm_map_entry_t entry; + vm_prot_t new_max; + int pmap_options = 0; + kern_return_t kr; + + if (new_prot & VM_PROT_COPY) { + vm_map_offset_t new_start; + vm_prot_t cur_prot, max_prot; + vm_map_kernel_flags_t kflags; + + /* LP64todo - see below */ + if (start >= map->max_offset) { + return KERN_INVALID_ADDRESS; + } + + if ((new_prot & VM_PROT_EXECUTE) && + map->pmap != kernel_pmap && + (vm_map_cs_enforcement(map) +#if XNU_TARGET_OS_OSX && __arm64__ + || !VM_MAP_IS_EXOTIC(map) +#endif /* XNU_TARGET_OS_OSX && __arm64__ */ + ) && + VM_MAP_POLICY_WX_FAIL(map)) { + DTRACE_VM3(cs_wx, + uint64_t, (uint64_t) start, + uint64_t, (uint64_t) end, + vm_prot_t, new_prot); + printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + return KERN_PROTECTION_FAILURE; + } - wire_and_extract = FALSE; - if (physpage_p != NULL) { /* - * The caller wants the physical page number of the - * wired page. We return only one physical page number - * so this works for only one page at a time. + * Let vm_map_remap_extract() know that it will need to: + * + make a copy of the mapping + * + add VM_PROT_WRITE to the max protections + * + remove any protections that are no longer allowed from the + * max protections (to avoid any WRITE/EXECUTE conflict, for + * example). + * Note that "max_prot" is an IN/OUT parameter only for this + * specific (VM_PROT_COPY) case. It's usually an OUT parameter + * only. */ - if ((end - start) != PAGE_SIZE) { - return KERN_INVALID_ARGUMENT; + max_prot = new_prot & VM_PROT_ALL; + cur_prot = VM_PROT_NONE; + kflags = VM_MAP_KERNEL_FLAGS_NONE; + kflags.vmkf_remap_prot_copy = TRUE; + kflags.vmkf_overwrite_immutable = TRUE; + new_start = start; + kr = vm_map_remap(map, + &new_start, + end - start, + 0, /* mask */ + VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, + kflags, + 0, + map, + start, + TRUE, /* copy-on-write remapping! */ + &cur_prot, /* IN/OUT */ + &max_prot, /* IN/OUT */ + VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) { + return kr; } - wire_and_extract = TRUE; - *physpage_p = 0; + new_prot &= ~VM_PROT_COPY; } vm_map_lock(map); - if(map_pmap == NULL) - main_map = TRUE; - last_timestamp = map->timestamp; - VM_MAP_RANGE_CHECK(map, start, end); - assert(page_aligned(start)); - assert(page_aligned(end)); - assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); - assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); - if (start == end) { - /* We wired what the caller asked for, zero pages */ + /* LP64todo - remove this check when vm_map_commpage64() + * no longer has to stuff in a map_entry for the commpage + * above the map's max_offset. 
+ */ + if (start >= map->max_offset) { vm_map_unlock(map); - return KERN_SUCCESS; + return KERN_INVALID_ADDRESS; } - need_wakeup = FALSE; - cur_thread = current_thread(); - - s = start; - rc = KERN_SUCCESS; - - if (vm_map_lookup_entry(map, s, &first_entry)) { - entry = first_entry; + while (1) { /* - * vm_map_clip_start will be done later. - * We don't want to unnest any nested submaps here ! + * Lookup the entry. If it doesn't start in a valid + * entry, return an error. */ - } else { - /* Start address is not in map */ - rc = KERN_INVALID_ADDRESS; - goto done; + if (!vm_map_lookup_entry(map, start, &entry)) { + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; + } + + if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */ + start = SUPERPAGE_ROUND_DOWN(start); + continue; + } + break; + } + if (entry->superpage_size) { + end = SUPERPAGE_ROUND_UP(end); } - while ((entry != vm_map_to_entry(map)) && (s < end)) { + /* + * Make a first pass to check for protection and address + * violations. + */ + + current = entry; + prev = current->vme_start; + while ((current != vm_map_to_entry(map)) && + (current->vme_start < end)) { /* - * At this point, we have wired from "start" to "s". - * We still need to wire from "s" to "end". - * - * "entry" hasn't been clipped, so it could start before "s" - * and/or end after "end". + * If there is a hole, return an error. */ + if (current->vme_start != prev) { + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; + } - /* "e" is how far we want to wire in this entry */ - e = entry->vme_end; - if (e > end) - e = end; + new_max = current->max_protection; + if ((new_prot & new_max) != new_prot) { + vm_map_unlock(map); + return KERN_PROTECTION_FAILURE; + } + + if (current->used_for_jit && + pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) { + vm_map_unlock(map); + return KERN_PROTECTION_FAILURE; + } + + if ((new_prot & VM_PROT_WRITE) && + (new_prot & VM_PROT_EXECUTE) && +#if XNU_TARGET_OS_OSX + map->pmap != kernel_pmap && + (vm_map_cs_enforcement(map) +#if __arm64__ + || !VM_MAP_IS_EXOTIC(map) +#endif /* __arm64__ */ + ) && +#endif /* XNU_TARGET_OS_OSX */ + !(current->used_for_jit)) { + DTRACE_VM3(cs_wx, + uint64_t, (uint64_t) current->vme_start, + uint64_t, (uint64_t) current->vme_end, + vm_prot_t, new_prot); + printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + new_prot &= ~VM_PROT_EXECUTE; + if (VM_MAP_POLICY_WX_FAIL(map)) { + vm_map_unlock(map); + return KERN_PROTECTION_FAILURE; + } + } /* - * If another thread is wiring/unwiring this entry then - * block after informing other thread to wake us up. + * If the task has requested executable lockdown, + * deny both: + * - adding executable protections OR + * - adding write protections to an existing executable mapping. */ - if (entry->in_transition) { - wait_result_t wait_result; + if (map->map_disallow_new_exec == TRUE) { + if ((new_prot & VM_PROT_EXECUTE) || + ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) { + vm_map_unlock(map); + return KERN_PROTECTION_FAILURE; + } + } - /* - * We have not clipped the entry. Make sure that - * the start address is in range so that the lookup - * below will succeed. - * "s" is the current starting point: we've already - * wired from "start" to "s" and we still have - * to wire from "s" to "end". 
- */ + prev = current->vme_end; + current = current->vme_next; + } - entry->needs_wakeup = TRUE; +#if __arm64__ + if (end > prev && + end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) { + vm_map_entry_t prev_entry; + prev_entry = current->vme_prev; + if (prev_entry != vm_map_to_entry(map) && + !prev_entry->map_aligned && + (vm_map_round_page(prev_entry->vme_end, + VM_MAP_PAGE_MASK(map)) + == end)) { /* - * wake up anybody waiting on entries that we have - * already wired. - */ - if (need_wakeup) { - vm_map_entry_wakeup(map); - need_wakeup = FALSE; - } - /* - * User wiring is interruptible + * The last entry in our range is not "map-aligned" + * but it would have reached all the way to "end" + * if it had been map-aligned, so this is not really + * a hole in the range and we can proceed. */ - wait_result = vm_map_entry_wait(map, - (user_wire) ? THREAD_ABORTSAFE : - THREAD_UNINT); - if (user_wire && wait_result == THREAD_INTERRUPTED) { - /* - * undo the wirings we have done so far - * We do not clear the needs_wakeup flag, - * because we cannot tell if we were the - * only one waiting. - */ - rc = KERN_FAILURE; - goto done; - } + prev = end; + } + } +#endif /* __arm64__ */ - /* - * Cannot avoid a lookup here. reset timestamp. - */ - last_timestamp = map->timestamp; + if (end > prev) { + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; + } - /* - * The entry could have been clipped, look it up again. - * Worse that can happen is, it may not exist anymore. - */ - if (!vm_map_lookup_entry(map, s, &first_entry)) { - /* - * User: undo everything upto the previous - * entry. let vm_map_unwire worry about - * checking the validity of the range. - */ - rc = KERN_FAILURE; - goto done; - } - entry = first_entry; - continue; - } - - if (entry->is_sub_map) { - vm_map_offset_t sub_start; - vm_map_offset_t sub_end; - vm_map_offset_t local_start; - vm_map_offset_t local_end; - pmap_t pmap; + /* + * Go back and fix up protections. + * Clip to start here if the range starts within + * the entry. + */ - if (wire_and_extract) { - /* - * Wiring would result in copy-on-write - * which would not be compatible with - * the sharing we have with the original - * provider of this memory. - */ - rc = KERN_INVALID_ARGUMENT; - goto done; - } + current = entry; + if (current != vm_map_to_entry(map)) { + /* clip and unnest if necessary */ + vm_map_clip_start(map, current, start); + } - vm_map_clip_start(map, entry, s); - vm_map_clip_end(map, entry, end); + while ((current != vm_map_to_entry(map)) && + (current->vme_start < end)) { + vm_prot_t old_prot; - sub_start = entry->offset; - sub_end = entry->vme_end; - sub_end += entry->offset - entry->vme_start; - - local_end = entry->vme_end; - if(map_pmap == NULL) { - vm_object_t object; - vm_object_offset_t offset; - vm_prot_t prot; - boolean_t wired; - vm_map_entry_t local_entry; - vm_map_version_t version; - vm_map_t lookup_map; - - if(entry->use_pmap) { - pmap = entry->object.sub_map->pmap; - /* ppc implementation requires that */ - /* submaps pmap address ranges line */ - /* up with parent map */ -#ifdef notdef - pmap_addr = sub_start; -#endif - pmap_addr = s; - } else { - pmap = map->pmap; - pmap_addr = s; - } + vm_map_clip_end(map, current, end); - if (entry->wired_count) { - if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) - goto done; + if (current->is_sub_map) { + /* clipping did unnest if needed */ + assert(!current->use_pmap); + } - /* - * The map was not unlocked: - * no need to goto re-lookup. - * Just go directly to next entry. 
- */ - entry = entry->vme_next; - s = entry->vme_start; - continue; + old_prot = current->protection; - } + if (set_max) { + current->max_protection = new_prot; + current->protection = new_prot & old_prot; + } else { + current->protection = new_prot; + } - /* call vm_map_lookup_locked to */ - /* cause any needs copy to be */ - /* evaluated */ - local_start = entry->vme_start; - lookup_map = map; - vm_map_lock_write_to_read(map); - if(vm_map_lookup_locked( - &lookup_map, local_start, - access_type, - OBJECT_LOCK_EXCLUSIVE, - &version, &object, - &offset, &prot, &wired, - NULL, - &real_map)) { + /* + * Update physical map if necessary. + * If the request is to turn off write protection, + * we won't do it for real (in pmap). This is because + * it would cause copy-on-write to fail. We've already + * set, the new protection in the map, so if a + * write-protect fault occurred, it will be fixed up + * properly, COW or not. + */ + if (current->protection != old_prot) { + /* Look one level in we support nested pmaps */ + /* from mapped submaps which are direct entries */ + /* in our map */ - vm_map_unlock_read(lookup_map); - vm_map_unwire(map, start, - s, user_wire); - return(KERN_FAILURE); - } - vm_object_unlock(object); - if(real_map != lookup_map) - vm_map_unlock(real_map); - vm_map_unlock_read(lookup_map); - vm_map_lock(map); + vm_prot_t prot; - /* we unlocked, so must re-lookup */ - if (!vm_map_lookup_entry(map, - local_start, - &local_entry)) { - rc = KERN_FAILURE; - goto done; + prot = current->protection; + if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) { + prot &= ~VM_PROT_WRITE; + } else { + assert(!VME_OBJECT(current)->code_signed); + assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE); + } + + if (override_nx(map, VME_ALIAS(current)) && prot) { + prot |= VM_PROT_EXECUTE; + } + +#if DEVELOPMENT || DEBUG + if (!(old_prot & VM_PROT_EXECUTE) && + (prot & VM_PROT_EXECUTE) && + panic_on_unsigned_execute && + (proc_selfcsflags() & CS_KILL)) { + panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - code-signing bypass?\n", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot); + } +#endif /* DEVELOPMENT || DEBUG */ + + if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) { + if (current->wired_count) { + panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x\n", + map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count); } + /* If the pmap layer cares about this + * protection type, force a fault for + * each page so that vm_fault will + * repopulate the page with the full + * set of protections. + */ /* - * entry could have been "simplified", - * so re-clip + * TODO: We don't seem to need this, + * but this is due to an internal + * implementation detail of + * pmap_protect. Do we want to rely + * on this? */ - entry = local_entry; - assert(s == local_start); - vm_map_clip_start(map, entry, s); - vm_map_clip_end(map, entry, end); - /* re-compute "e" */ - e = entry->vme_end; - if (e > end) - e = end; + prot = VM_PROT_NONE; + } - /* did we have a change of type? 
*/ - if (!entry->is_sub_map) { - last_timestamp = map->timestamp; - continue; - } + if (current->is_sub_map && current->use_pmap) { + pmap_protect(VME_SUBMAP(current)->pmap, + current->vme_start, + current->vme_end, + prot); } else { - local_start = entry->vme_start; - pmap = map_pmap; - } + if (prot & VM_PROT_WRITE) { + if (VME_OBJECT(current) == compressor_object) { + /* + * For write requests on the + * compressor, we wil ask the + * pmap layer to prevent us from + * taking a write fault when we + * attempt to access the mapping + * next. + */ + pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE; + } + } - if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) - goto done; + pmap_protect_options(map->pmap, + current->vme_start, + current->vme_end, + prot, + pmap_options, + NULL); + } + } + current = current->vme_next; + } - entry->in_transition = TRUE; + current = entry; + while ((current != vm_map_to_entry(map)) && + (current->vme_start <= end)) { + vm_map_simplify_entry(map, current); + current = current->vme_next; + } - vm_map_unlock(map); - rc = vm_map_wire_nested(entry->object.sub_map, - sub_start, sub_end, - access_type, - user_wire, pmap, pmap_addr, - NULL); - vm_map_lock(map); + vm_map_unlock(map); + return KERN_SUCCESS; +} - /* - * Find the entry again. It could have been clipped - * after we unlocked the map. - */ - if (!vm_map_lookup_entry(map, local_start, - &first_entry)) - panic("vm_map_wire: re-lookup failed"); - entry = first_entry; +/* + * vm_map_inherit: + * + * Sets the inheritance of the specified address + * range in the target map. Inheritance + * affects how the map will be shared with + * child maps at the time of vm_map_fork. + */ +kern_return_t +vm_map_inherit( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_inherit_t new_inheritance) +{ + vm_map_entry_t entry; + vm_map_entry_t temp_entry; - assert(local_start == s); - /* re-compute "e" */ - e = entry->vme_end; - if (e > end) - e = end; + vm_map_lock(map); - last_timestamp = map->timestamp; - while ((entry != vm_map_to_entry(map)) && - (entry->vme_start < e)) { - assert(entry->in_transition); - entry->in_transition = FALSE; - if (entry->needs_wakeup) { - entry->needs_wakeup = FALSE; - need_wakeup = TRUE; - } - if (rc != KERN_SUCCESS) {/* from vm_*_wire */ - subtract_wire_counts(map, entry, user_wire); - } - entry = entry->vme_next; - } - if (rc != KERN_SUCCESS) { /* from vm_*_wire */ - goto done; - } + VM_MAP_RANGE_CHECK(map, start, end); - /* no need to relookup again */ - s = entry->vme_start; - continue; + if (vm_map_lookup_entry(map, start, &temp_entry)) { + entry = temp_entry; + } else { + temp_entry = temp_entry->vme_next; + entry = temp_entry; + } + + /* first check entire range for submaps which can't support the */ + /* given inheritance. */ + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + if (entry->is_sub_map) { + if (new_inheritance == VM_INHERIT_COPY) { + vm_map_unlock(map); + return KERN_INVALID_ARGUMENT; + } } - /* - * If this entry is already wired then increment - * the appropriate wire reference count. 
- */ - if (entry->wired_count) { + entry = entry->vme_next; + } - if ((entry->protection & access_type) != access_type) { - /* found a protection problem */ + entry = temp_entry; + if (entry != vm_map_to_entry(map)) { + /* clip and unnest if necessary */ + vm_map_clip_start(map, entry, start); + } - /* - * XXX FBDP - * We should always return an error - * in this case but since we didn't - * enforce it before, let's do - * it only for the new "wire_and_extract" - * code path for now... - */ - if (wire_and_extract) { - rc = KERN_PROTECTION_FAILURE; - goto done; - } - } + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + vm_map_clip_end(map, entry, end); + if (entry->is_sub_map) { + /* clip did unnest if needed */ + assert(!entry->use_pmap); + } - /* - * entry is already wired down, get our reference - * after clipping to our range. - */ - vm_map_clip_start(map, entry, s); - vm_map_clip_end(map, entry, end); + entry->inheritance = new_inheritance; - if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) - goto done; + entry = entry->vme_next; + } - if (wire_and_extract) { - vm_object_t object; - vm_object_offset_t offset; - vm_page_t m; + vm_map_unlock(map); + return KERN_SUCCESS; +} - /* - * We don't have to "wire" the page again - * bit we still have to "extract" its - * physical page number, after some sanity - * checks. - */ - assert((entry->vme_end - entry->vme_start) - == PAGE_SIZE); - assert(!entry->needs_copy); - assert(!entry->is_sub_map); - assert(entry->object.vm_object); - if (((entry->vme_end - entry->vme_start) - != PAGE_SIZE) || - entry->needs_copy || - entry->is_sub_map || - entry->object.vm_object == VM_OBJECT_NULL) { - rc = KERN_INVALID_ARGUMENT; - goto done; - } +/* + * Update the accounting for the amount of wired memory in this map. If the user has + * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails. + */ - object = entry->object.vm_object; - offset = entry->offset; - /* need exclusive lock to update m->dirty */ - if (entry->protection & VM_PROT_WRITE) { - vm_object_lock(object); - } else { - vm_object_lock_shared(object); - } - m = vm_page_lookup(object, offset); - assert(m != VM_PAGE_NULL); - assert(m->wire_count); - if (m != VM_PAGE_NULL && m->wire_count) { - *physpage_p = m->phys_page; - if (entry->protection & VM_PROT_WRITE) { - vm_object_lock_assert_exclusive( - m->object); - m->dirty = TRUE; - } - } else { - /* not already wired !? */ - *physpage_p = 0; - } - vm_object_unlock(object); - } +static kern_return_t +add_wire_counts( + vm_map_t map, + vm_map_entry_t entry, + boolean_t user_wire) +{ + vm_map_size_t size; - /* map was not unlocked: no need to relookup */ - entry = entry->vme_next; - s = entry->vme_start; - continue; - } + if (user_wire) { + unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count; /* - * Unwired entry or wire request transmitted via submap + * We're wiring memory at the request of the user. Check if this is the first time the user is wiring + * this map entry. */ + if (entry->user_wired_count == 0) { + size = entry->vme_end - entry->vme_start; + + /* + * Since this is the first time the user is wiring this map entry, check to see if we're + * exceeding the user wire limits. There is a per map limit which is the smaller of either + * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also + * a system-wide limit on the amount of memory all users can wire. If the user is over either + * limit, then we fail. 
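The limit test described in the comment above (and performed just below in add_wire_counts()) boils down to a small predicate. The following is a standalone, simplified sketch: the function name and flat parameter list are assumptions for readability, whereas the kernel code works on the vm_map_t itself and uses ptoa_64() and MIN().

#include <stdbool.h>
#include <stdint.h>

#define SKETCH_MIN(a, b)   ((a) < (b) ? (a) : (b))

/*
 * Illustrative sketch only (not XNU source): returns true if wiring "size"
 * more bytes for this user would exceed either the per-map cap (the smaller
 * of the process rlimit and the per-task ceiling) or the system-wide cap on
 * user-wired memory.
 */
static bool
user_wire_over_limit(
	uint64_t size,                      /* bytes this request would wire */
	uint64_t map_user_wire_size,        /* bytes already user-wired in this map */
	uint64_t map_user_wire_limit,       /* per-map cap derived from the rlimit */
	uint64_t per_task_user_wire_limit,  /* global per-task cap */
	uint64_t wired_bytes_system_wide,   /* ptoa_64(total wired page count) */
	uint64_t global_user_wire_limit)    /* system-wide cap */
{
	if (size + map_user_wire_size >
	    SKETCH_MIN(map_user_wire_limit, per_task_user_wire_limit)) {
		return true;    /* over the per-task limit */
	}
	if (size + wired_bytes_system_wide > global_user_wire_limit) {
		return true;    /* over the system-wide limit */
	}
	return false;
}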
+ */ - /* - * Perform actions of vm_map_lookup that need the write - * lock on the map: create a shadow object for a - * copy-on-write region, or an object for a zero-fill - * region. - */ - size = entry->vme_end - entry->vme_start; - /* - * If wiring a copy-on-write page, we need to copy it now - * even if we're only (currently) requesting read access. - * This is aggressive, but once it's wired we can't move it. - */ - if (entry->needs_copy) { - if (wire_and_extract) { - /* - * We're supposed to share with the original - * provider so should not be "needs_copy" - */ - rc = KERN_INVALID_ARGUMENT; - goto done; + if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) || + size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) { + if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) { + os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed); + } else { + os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed); + } + return KERN_RESOURCE_SHORTAGE; } - - vm_object_shadow(&entry->object.vm_object, - &entry->offset, size); - entry->needs_copy = FALSE; - } else if (entry->object.vm_object == VM_OBJECT_NULL) { - if (wire_and_extract) { - /* - * We're supposed to share with the original - * provider so should already have an object. - */ - rc = KERN_INVALID_ARGUMENT; - goto done; + + /* + * The first time the user wires an entry, we also increment the wired_count and add this to + * the total that has been wired in the map. + */ + + if (entry->wired_count >= MAX_WIRE_COUNT) { + return KERN_FAILURE; } - entry->object.vm_object = vm_object_allocate(size); - entry->offset = (vm_object_offset_t)0; - assert(entry->use_pmap); - } - vm_map_clip_start(map, entry, s); - vm_map_clip_end(map, entry, end); + entry->wired_count++; + map->user_wire_size += size; + } - /* re-compute "e" */ - e = entry->vme_end; - if (e > end) - e = end; + if (entry->user_wired_count >= MAX_WIRE_COUNT) { + return KERN_FAILURE; + } + entry->user_wired_count++; + } else { /* - * Check for holes and protection mismatch. - * Holes: Next entry should be contiguous unless this - * is the end of the region. - * Protection: Access requested must be allowed, unless - * wiring is by protection class + * The kernel's wiring the memory. Just bump the count and continue. */ - if ((entry->vme_end < end) && - ((entry->vme_next == vm_map_to_entry(map)) || - (entry->vme_next->vme_start > entry->vme_end))) { - /* found a hole */ - rc = KERN_INVALID_ADDRESS; - goto done; - } - if ((entry->protection & access_type) != access_type) { - /* found a protection problem */ - rc = KERN_PROTECTION_FAILURE; - goto done; + + if (entry->wired_count >= MAX_WIRE_COUNT) { + panic("vm_map_wire: too many wirings"); } - assert(entry->wired_count == 0 && entry->user_wired_count == 0); + entry->wired_count++; + } - if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) - goto done; + return KERN_SUCCESS; +} - entry->in_transition = TRUE; +/* + * Update the memory wiring accounting now that the given map entry is being unwired. + */ +static void +subtract_wire_counts( + vm_map_t map, + vm_map_entry_t entry, + boolean_t user_wire) +{ + if (user_wire) { /* - * This entry might get split once we unlock the map. - * In vm_fault_wire(), we need the current range as - * defined by this entry. In order for this to work - * along with a simultaneous clip operation, we make a - * temporary copy of this entry and use that for the - * wiring. Note that the underlying objects do not - * change during a clip. 
- */ - tmp_entry = *entry; - - /* - * The in_transition state guarentees that the entry - * (or entries for this range, if split occured) will be - * there when the map lock is acquired for the second time. + * We're unwiring memory at the request of the user. See if we're removing the last user wire reference. */ - vm_map_unlock(map); - - if (!user_wire && cur_thread != THREAD_NULL) - interruptible_state = thread_interrupt_level(THREAD_UNINT); - else - interruptible_state = THREAD_UNINT; - - if(map_pmap) - rc = vm_fault_wire(map, - &tmp_entry, map_pmap, pmap_addr, - physpage_p); - else - rc = vm_fault_wire(map, - &tmp_entry, map->pmap, - tmp_entry.vme_start, - physpage_p); - - if (!user_wire && cur_thread != THREAD_NULL) - thread_interrupt_level(interruptible_state); - vm_map_lock(map); - - if (last_timestamp+1 != map->timestamp) { + if (entry->user_wired_count == 1) { /* - * Find the entry again. It could have been clipped - * after we unlocked the map. + * We're removing the last user wire reference. Decrement the wired_count and the total + * user wired memory for this map. */ - if (!vm_map_lookup_entry(map, tmp_entry.vme_start, - &first_entry)) - panic("vm_map_wire: re-lookup failed"); - - entry = first_entry; - } - - last_timestamp = map->timestamp; - - while ((entry != vm_map_to_entry(map)) && - (entry->vme_start < tmp_entry.vme_end)) { - assert(entry->in_transition); - entry->in_transition = FALSE; - if (entry->needs_wakeup) { - entry->needs_wakeup = FALSE; - need_wakeup = TRUE; - } - if (rc != KERN_SUCCESS) { /* from vm_*_wire */ - subtract_wire_counts(map, entry, user_wire); - } - entry = entry->vme_next; - } - if (rc != KERN_SUCCESS) { /* from vm_*_wire */ - goto done; + assert(entry->wired_count >= 1); + entry->wired_count--; + map->user_wire_size -= entry->vme_end - entry->vme_start; } - s = entry->vme_start; - } /* end while loop through map entries */ - -done: - if (rc == KERN_SUCCESS) { - /* repair any damage we may have made to the VM map */ - vm_map_simplify_range(map, start, end); - } - - vm_map_unlock(map); - - /* - * wake up anybody waiting on entries we wired. - */ - if (need_wakeup) - vm_map_entry_wakeup(map); + assert(entry->user_wired_count >= 1); + entry->user_wired_count--; + } else { + /* + * The kernel is unwiring the memory. Just update the count. + */ - if (rc != KERN_SUCCESS) { - /* undo what has been wired so far */ - vm_map_unwire(map, start, s, user_wire); - if (physpage_p) { - *physpage_p = 0; - } + assert(entry->wired_count >= 1); + entry->wired_count--; } - - return rc; - -} - -kern_return_t -vm_map_wire( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register vm_prot_t access_type, - boolean_t user_wire) -{ - - kern_return_t kret; - - kret = vm_map_wire_nested(map, start, end, access_type, - user_wire, (pmap_t)NULL, 0, NULL); - return kret; } -kern_return_t -vm_map_wire_and_extract( - vm_map_t map, - vm_map_offset_t start, - vm_prot_t access_type, - boolean_t user_wire, - ppnum_t *physpage_p) -{ - - kern_return_t kret; - - kret = vm_map_wire_nested(map, - start, - start+VM_MAP_PAGE_SIZE(map), - access_type, - user_wire, - (pmap_t)NULL, - 0, - physpage_p); - if (kret != KERN_SUCCESS && - physpage_p != NULL) { - *physpage_p = 0; - } - return kret; -} +int cs_executable_wire = 0; /* - * vm_map_unwire: - * - * Sets the pageability of the specified address range in the target - * as pageable. Regions specified must have been wired previously. 
+ * vm_map_wire: * - * The map must not be locked, but a reference must remain to the map - * throughout the call. + * Sets the pageability of the specified address range in the + * target map as wired. Regions specified as not pageable require + * locked-down physical memory and physical page maps. The + * access_type variable indicates types of accesses that must not + * generate page faults. This is checked against protection of + * memory being locked-down. * - * Kernel will panic on failures. User unwire ignores holes and - * unwired and intransition entries to avoid losing memory by leaving - * it unwired. + * The map must not be locked, but a reference must remain to the + * map throughout the call. */ static kern_return_t -vm_map_unwire_nested( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - boolean_t user_wire, - pmap_t map_pmap, - vm_map_offset_t pmap_addr) -{ - register vm_map_entry_t entry; - struct vm_map_entry *first_entry, tmp_entry; - boolean_t need_wakeup; - boolean_t main_map = FALSE; - unsigned int last_timestamp; +vm_map_wire_nested( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, + vm_tag_t tag, + boolean_t user_wire, + pmap_t map_pmap, + vm_map_offset_t pmap_addr, + ppnum_t *physpage_p) +{ + vm_map_entry_t entry; + vm_prot_t access_type; + struct vm_map_entry *first_entry, tmp_entry; + vm_map_t real_map; + vm_map_offset_t s, e; + kern_return_t rc; + boolean_t need_wakeup; + boolean_t main_map = FALSE; + wait_interrupt_t interruptible_state; + thread_t cur_thread; + unsigned int last_timestamp; + vm_map_size_t size; + boolean_t wire_and_extract; + vm_prot_t extra_prots; + + extra_prots = VM_PROT_COPY; + extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE; +#if XNU_TARGET_OS_OSX + if (map->pmap == kernel_pmap || + !vm_map_cs_enforcement(map)) { + extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE; + } +#endif /* XNU_TARGET_OS_OSX */ + + access_type = (caller_prot & VM_PROT_ALL); + + wire_and_extract = FALSE; + if (physpage_p != NULL) { + /* + * The caller wants the physical page number of the + * wired page. We return only one physical page number + * so this works for only one page at a time. + */ + if ((end - start) != PAGE_SIZE) { + return KERN_INVALID_ARGUMENT; + } + wire_and_extract = TRUE; + *physpage_p = 0; + } vm_map_lock(map); - if(map_pmap == NULL) + if (map_pmap == NULL) { main_map = TRUE; + } last_timestamp = map->timestamp; VM_MAP_RANGE_CHECK(map, start, end); - assert(page_aligned(start)); - assert(page_aligned(end)); assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); if (start == end) { - /* We unwired what the caller asked for: zero pages */ + /* We wired what the caller asked for, zero pages */ vm_map_unlock(map); return KERN_SUCCESS; } - if (vm_map_lookup_entry(map, start, &first_entry)) { + need_wakeup = FALSE; + cur_thread = current_thread(); + + s = start; + rc = KERN_SUCCESS; + + if (vm_map_lookup_entry(map, s, &first_entry)) { entry = first_entry; /* * vm_map_clip_start will be done later. - * We don't want to unnest any nested sub maps here ! + * We don't want to unnest any nested submaps here ! */ - } - else { - if (!user_wire) { - panic("vm_map_unwire: start not found"); - } - /* Start address is not in map. 
*/ - vm_map_unlock(map); - return(KERN_INVALID_ADDRESS); + } else { + /* Start address is not in map */ + rc = KERN_INVALID_ADDRESS; + goto done; } - if (entry->superpage_size) { - /* superpages are always wired */ - vm_map_unlock(map); - return KERN_INVALID_ADDRESS; - } + while ((entry != vm_map_to_entry(map)) && (s < end)) { + /* + * At this point, we have wired from "start" to "s". + * We still need to wire from "s" to "end". + * + * "entry" hasn't been clipped, so it could start before "s" + * and/or end after "end". + */ - need_wakeup = FALSE; - while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + /* "e" is how far we want to wire in this entry */ + e = entry->vme_end; + if (e > end) { + e = end; + } + + /* + * If another thread is wiring/unwiring this entry then + * block after informing other thread to wake us up. + */ if (entry->in_transition) { + wait_result_t wait_result; + /* - * 1) - * Another thread is wiring down this entry. Note - * that if it is not for the other thread we would - * be unwiring an unwired entry. This is not - * permitted. If we wait, we will be unwiring memory - * we did not wire. - * - * 2) - * Another thread is unwiring this entry. We did not - * have a reference to it, because if we did, this - * entry will not be getting unwired now. + * We have not clipped the entry. Make sure that + * the start address is in range so that the lookup + * below will succeed. + * "s" is the current starting point: we've already + * wired from "start" to "s" and we still have + * to wire from "s" to "end". */ - if (!user_wire) { + + entry->needs_wakeup = TRUE; + + /* + * wake up anybody waiting on entries that we have + * already wired. + */ + if (need_wakeup) { + vm_map_entry_wakeup(map); + need_wakeup = FALSE; + } + /* + * User wiring is interruptible + */ + wait_result = vm_map_entry_wait(map, + (user_wire) ? THREAD_ABORTSAFE : + THREAD_UNINT); + if (user_wire && wait_result == THREAD_INTERRUPTED) { /* - * XXX FBDP - * This could happen: there could be some - * overlapping vslock/vsunlock operations - * going on. - * We should probably just wait and retry, - * but then we have to be careful that this - * entry could get "simplified" after - * "in_transition" gets unset and before - * we re-lookup the entry, so we would - * have to re-clip the entry to avoid - * re-unwiring what we have already unwired... - * See vm_map_wire_nested(). - * - * Or we could just ignore "in_transition" - * here and proceed to decement the wired - * count(s) on this entry. That should be fine - * as long as "wired_count" doesn't drop all - * the way to 0 (and we should panic if THAT - * happens). + * undo the wirings we have done so far + * We do not clear the needs_wakeup flag, + * because we cannot tell if we were the + * only one waiting. */ - panic("vm_map_unwire: in_transition entry"); + rc = KERN_FAILURE; + goto done; } - entry = entry->vme_next; + /* + * Cannot avoid a lookup here. reset timestamp. + */ + last_timestamp = map->timestamp; + + /* + * The entry could have been clipped, look it up again. + * Worse that can happen is, it may not exist anymore. + */ + if (!vm_map_lookup_entry(map, s, &first_entry)) { + /* + * User: undo everything upto the previous + * entry. let vm_map_unwire worry about + * checking the validity of the range. 
+ */ + rc = KERN_FAILURE; + goto done; + } + entry = first_entry; continue; } if (entry->is_sub_map) { - vm_map_offset_t sub_start; - vm_map_offset_t sub_end; - vm_map_offset_t local_end; - pmap_t pmap; - - vm_map_clip_start(map, entry, start); + vm_map_offset_t sub_start; + vm_map_offset_t sub_end; + vm_map_offset_t local_start; + vm_map_offset_t local_end; + pmap_t pmap; + + if (wire_and_extract) { + /* + * Wiring would result in copy-on-write + * which would not be compatible with + * the sharing we have with the original + * provider of this memory. + */ + rc = KERN_INVALID_ARGUMENT; + goto done; + } + + vm_map_clip_start(map, entry, s); vm_map_clip_end(map, entry, end); - sub_start = entry->offset; - sub_end = entry->vme_end - entry->vme_start; - sub_end += entry->offset; + sub_start = VME_OFFSET(entry); + sub_end = entry->vme_end; + sub_end += VME_OFFSET(entry) - entry->vme_start; + local_end = entry->vme_end; - if(map_pmap == NULL) { - if(entry->use_pmap) { - pmap = entry->object.sub_map->pmap; + if (map_pmap == NULL) { + vm_object_t object; + vm_object_offset_t offset; + vm_prot_t prot; + boolean_t wired; + vm_map_entry_t local_entry; + vm_map_version_t version; + vm_map_t lookup_map; + + if (entry->use_pmap) { + pmap = VME_SUBMAP(entry)->pmap; + /* ppc implementation requires that */ + /* submaps pmap address ranges line */ + /* up with parent map */ +#ifdef notdef pmap_addr = sub_start; +#endif + pmap_addr = s; } else { pmap = map->pmap; - pmap_addr = start; - } - if (entry->wired_count == 0 || - (user_wire && entry->user_wired_count == 0)) { - if (!user_wire) - panic("vm_map_unwire: entry is unwired"); - entry = entry->vme_next; - continue; - } - - /* - * Check for holes - * Holes: Next entry should be contiguous unless - * this is the end of the region. - */ - if (((entry->vme_end < end) && - ((entry->vme_next == vm_map_to_entry(map)) || - (entry->vme_next->vme_start - > entry->vme_end)))) { - if (!user_wire) - panic("vm_map_unwire: non-contiguous region"); -/* - entry = entry->vme_next; - continue; -*/ + pmap_addr = s; } - subtract_wire_counts(map, entry, user_wire); + if (entry->wired_count) { + if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { + goto done; + } - if (entry->wired_count != 0) { + /* + * The map was not unlocked: + * no need to goto re-lookup. + * Just go directly to next entry. + */ entry = entry->vme_next; + s = entry->vme_start; continue; } - entry->in_transition = TRUE; - tmp_entry = *entry;/* see comment in vm_map_wire() */ - - /* - * We can unlock the map now. The in_transition state - * guarantees existance of the entry. - */ - vm_map_unlock(map); - vm_map_unwire_nested(entry->object.sub_map, - sub_start, sub_end, user_wire, pmap, pmap_addr); + /* call vm_map_lookup_locked to */ + /* cause any needs copy to be */ + /* evaluated */ + local_start = entry->vme_start; + lookup_map = map; + vm_map_lock_write_to_read(map); + rc = vm_map_lookup_locked( + &lookup_map, local_start, + (access_type | extra_prots), + OBJECT_LOCK_EXCLUSIVE, + &version, &object, + &offset, &prot, &wired, + NULL, + &real_map, NULL); + if (rc != KERN_SUCCESS) { + vm_map_unlock_read(lookup_map); + assert(map_pmap == NULL); + vm_map_unwire(map, start, + s, user_wire); + return rc; + } + vm_object_unlock(object); + if (real_map != lookup_map) { + vm_map_unlock(real_map); + } + vm_map_unlock_read(lookup_map); vm_map_lock(map); - if (last_timestamp+1 != map->timestamp) { - /* - * Find the entry again. It could have been - * clipped or deleted after we unlocked the map. 
- */ - if (!vm_map_lookup_entry(map, - tmp_entry.vme_start, - &first_entry)) { - if (!user_wire) - panic("vm_map_unwire: re-lookup failed"); - entry = first_entry->vme_next; - } else - entry = first_entry; + /* we unlocked, so must re-lookup */ + if (!vm_map_lookup_entry(map, + local_start, + &local_entry)) { + rc = KERN_FAILURE; + goto done; } - last_timestamp = map->timestamp; /* - * clear transition bit for all constituent entries - * that were in the original entry (saved in - * tmp_entry). Also check for waiters. + * entry could have been "simplified", + * so re-clip */ - while ((entry != vm_map_to_entry(map)) && - (entry->vme_start < tmp_entry.vme_end)) { - assert(entry->in_transition); - entry->in_transition = FALSE; - if (entry->needs_wakeup) { - entry->needs_wakeup = FALSE; - need_wakeup = TRUE; - } - entry = entry->vme_next; + entry = local_entry; + assert(s == local_start); + vm_map_clip_start(map, entry, s); + vm_map_clip_end(map, entry, end); + /* re-compute "e" */ + e = entry->vme_end; + if (e > end) { + e = end; } - continue; - } else { - vm_map_unlock(map); - vm_map_unwire_nested(entry->object.sub_map, - sub_start, sub_end, user_wire, map_pmap, - pmap_addr); - vm_map_lock(map); - if (last_timestamp+1 != map->timestamp) { - /* - * Find the entry again. It could have been - * clipped or deleted after we unlocked the map. - */ - if (!vm_map_lookup_entry(map, - tmp_entry.vme_start, - &first_entry)) { - if (!user_wire) - panic("vm_map_unwire: re-lookup failed"); - entry = first_entry->vme_next; - } else - entry = first_entry; + /* did we have a change of type? */ + if (!entry->is_sub_map) { + last_timestamp = map->timestamp; + continue; } - last_timestamp = map->timestamp; + } else { + local_start = entry->vme_start; + pmap = map_pmap; } - } + if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { + goto done; + } - if ((entry->wired_count == 0) || - (user_wire && entry->user_wired_count == 0)) { - if (!user_wire) - panic("vm_map_unwire: entry is unwired"); - - entry = entry->vme_next; - continue; - } - - assert(entry->wired_count > 0 && - (!user_wire || entry->user_wired_count > 0)); + entry->in_transition = TRUE; - vm_map_clip_start(map, entry, start); - vm_map_clip_end(map, entry, end); + vm_map_unlock(map); + rc = vm_map_wire_nested(VME_SUBMAP(entry), + sub_start, sub_end, + caller_prot, tag, + user_wire, pmap, pmap_addr, + NULL); + vm_map_lock(map); - /* - * Check for holes - * Holes: Next entry should be contiguous unless - * this is the end of the region. - */ - if (((entry->vme_end < end) && - ((entry->vme_next == vm_map_to_entry(map)) || - (entry->vme_next->vme_start > entry->vme_end)))) { + /* + * Find the entry again. It could have been clipped + * after we unlocked the map. 
+ */ + if (!vm_map_lookup_entry(map, local_start, + &first_entry)) { + panic("vm_map_wire: re-lookup failed"); + } + entry = first_entry; - if (!user_wire) - panic("vm_map_unwire: non-contiguous region"); - entry = entry->vme_next; - continue; - } + assert(local_start == s); + /* re-compute "e" */ + e = entry->vme_end; + if (e > end) { + e = end; + } - subtract_wire_counts(map, entry, user_wire); + last_timestamp = map->timestamp; + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < e)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if (entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + need_wakeup = TRUE; + } + if (rc != KERN_SUCCESS) {/* from vm_*_wire */ + subtract_wire_counts(map, entry, user_wire); + } + entry = entry->vme_next; + } + if (rc != KERN_SUCCESS) { /* from vm_*_wire */ + goto done; + } - if (entry->wired_count != 0) { - entry = entry->vme_next; + /* no need to relookup again */ + s = entry->vme_start; continue; } - if(entry->zero_wired_pages) { - entry->zero_wired_pages = FALSE; - } - - entry->in_transition = TRUE; - tmp_entry = *entry; /* see comment in vm_map_wire() */ - /* - * We can unlock the map now. The in_transition state - * guarantees existance of the entry. + * If this entry is already wired then increment + * the appropriate wire reference count. */ - vm_map_unlock(map); - if(map_pmap) { - vm_fault_unwire(map, - &tmp_entry, FALSE, map_pmap, pmap_addr); - } else { - vm_fault_unwire(map, - &tmp_entry, FALSE, map->pmap, - tmp_entry.vme_start); - } - vm_map_lock(map); + if (entry->wired_count) { + if ((entry->protection & access_type) != access_type) { + /* found a protection problem */ + + /* + * XXX FBDP + * We should always return an error + * in this case but since we didn't + * enforce it before, let's do + * it only for the new "wire_and_extract" + * code path for now... + */ + if (wire_and_extract) { + rc = KERN_PROTECTION_FAILURE; + goto done; + } + } - if (last_timestamp+1 != map->timestamp) { /* - * Find the entry again. It could have been clipped - * or deleted after we unlocked the map. + * entry is already wired down, get our reference + * after clipping to our range. */ - if (!vm_map_lookup_entry(map, tmp_entry.vme_start, - &first_entry)) { - if (!user_wire) - panic("vm_map_unwire: re-lookup failed"); - entry = first_entry->vme_next; - } else - entry = first_entry; - } - last_timestamp = map->timestamp; + vm_map_clip_start(map, entry, s); + vm_map_clip_end(map, entry, end); - /* - * clear transition bit for all constituent entries that - * were in the original entry (saved in tmp_entry). Also - * check for waiters. - */ - while ((entry != vm_map_to_entry(map)) && - (entry->vme_start < tmp_entry.vme_end)) { - assert(entry->in_transition); - entry->in_transition = FALSE; - if (entry->needs_wakeup) { - entry->needs_wakeup = FALSE; - need_wakeup = TRUE; + if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { + goto done; } - entry = entry->vme_next; - } - } - - /* - * We might have fragmented the address space when we wired this - * range of addresses. Attempt to re-coalesce these VM map entries - * with their neighbors now that they're no longer wired. - * Under some circumstances, address space fragmentation can - * prevent VM object shadow chain collapsing, which can cause - * swap space leaks. - */ - vm_map_simplify_range(map, start, end); - - vm_map_unlock(map); - /* - * wake up anybody waiting on entries that we have unwired. 
- */ - if (need_wakeup) - vm_map_entry_wakeup(map); - return(KERN_SUCCESS); -} + if (wire_and_extract) { + vm_object_t object; + vm_object_offset_t offset; + vm_page_t m; -kern_return_t -vm_map_unwire( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - boolean_t user_wire) -{ - return vm_map_unwire_nested(map, start, end, - user_wire, (pmap_t)NULL, 0); -} + /* + * We don't have to "wire" the page again + * bit we still have to "extract" its + * physical page number, after some sanity + * checks. + */ + assert((entry->vme_end - entry->vme_start) + == PAGE_SIZE); + assert(!entry->needs_copy); + assert(!entry->is_sub_map); + assert(VME_OBJECT(entry)); + if (((entry->vme_end - entry->vme_start) + != PAGE_SIZE) || + entry->needs_copy || + entry->is_sub_map || + VME_OBJECT(entry) == VM_OBJECT_NULL) { + rc = KERN_INVALID_ARGUMENT; + goto done; + } + object = VME_OBJECT(entry); + offset = VME_OFFSET(entry); + /* need exclusive lock to update m->dirty */ + if (entry->protection & VM_PROT_WRITE) { + vm_object_lock(object); + } else { + vm_object_lock_shared(object); + } + m = vm_page_lookup(object, offset); + assert(m != VM_PAGE_NULL); + assert(VM_PAGE_WIRED(m)); + if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) { + *physpage_p = VM_PAGE_GET_PHYS_PAGE(m); + if (entry->protection & VM_PROT_WRITE) { + vm_object_lock_assert_exclusive( + object); + m->vmp_dirty = TRUE; + } + } else { + /* not already wired !? */ + *physpage_p = 0; + } + vm_object_unlock(object); + } -/* - * vm_map_entry_delete: [ internal use only ] - * - * Deallocate the given entry from the target map. - */ -static void -vm_map_entry_delete( - register vm_map_t map, - register vm_map_entry_t entry) -{ - register vm_map_offset_t s, e; - register vm_object_t object; - register vm_map_t submap; + /* map was not unlocked: no need to relookup */ + entry = entry->vme_next; + s = entry->vme_start; + continue; + } - s = entry->vme_start; - e = entry->vme_end; - assert(page_aligned(s)); - assert(page_aligned(e)); - if (entry->map_aligned == TRUE) { - assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))); - assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map))); - } - assert(entry->wired_count == 0); - assert(entry->user_wired_count == 0); - assert(!entry->permanent); + /* + * Unwired entry or wire request transmitted via submap + */ - if (entry->is_sub_map) { - object = NULL; - submap = entry->object.sub_map; - } else { - submap = NULL; - object = entry->object.vm_object; - } + /* + * Wiring would copy the pages to the shadow object. + * The shadow object would not be code-signed so + * attempting to execute code from these copied pages + * would trigger a code-signing violation. + */ - vm_map_store_entry_unlink(map, entry); - map->size -= e - s; + if ((entry->protection & VM_PROT_EXECUTE) +#if XNU_TARGET_OS_OSX + && + map->pmap != kernel_pmap && + (vm_map_cs_enforcement(map) +#if __arm64__ + || !VM_MAP_IS_EXOTIC(map) +#endif /* __arm64__ */ + ) +#endif /* XNU_TARGET_OS_OSX */ + ) { +#if MACH_ASSERT + printf("pid %d[%s] wiring executable range from " + "0x%llx to 0x%llx: rejected to preserve " + "code-signing\n", + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?"), + (uint64_t) entry->vme_start, + (uint64_t) entry->vme_end); +#endif /* MACH_ASSERT */ + DTRACE_VM2(cs_executable_wire, + uint64_t, (uint64_t)entry->vme_start, + uint64_t, (uint64_t)entry->vme_end); + cs_executable_wire++; + rc = KERN_PROTECTION_FAILURE; + goto done; + } - vm_map_entry_dispose(map, entry); + /* + * Perform actions of vm_map_lookup that need the write + * lock on the map: create a shadow object for a + * copy-on-write region, or an object for a zero-fill + * region. + */ + size = entry->vme_end - entry->vme_start; + /* + * If wiring a copy-on-write page, we need to copy it now + * even if we're only (currently) requesting read access. + * This is aggressive, but once it's wired we can't move it. + */ + if (entry->needs_copy) { + if (wire_and_extract) { + /* + * We're supposed to share with the original + * provider so should not be "needs_copy" + */ + rc = KERN_INVALID_ARGUMENT; + goto done; + } - vm_map_unlock(map); - /* - * Deallocate the object only after removing all - * pmap entries pointing to its pages. - */ - if (submap) - vm_map_deallocate(submap); - else - vm_object_deallocate(object); + VME_OBJECT_SHADOW(entry, size); + entry->needs_copy = FALSE; + } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) { + if (wire_and_extract) { + /* + * We're supposed to share with the original + * provider so should already have an object. + */ + rc = KERN_INVALID_ARGUMENT; + goto done; + } + VME_OBJECT_SET(entry, vm_object_allocate(size)); + VME_OFFSET_SET(entry, (vm_object_offset_t)0); + assert(entry->use_pmap); + } -} + vm_map_clip_start(map, entry, s); + vm_map_clip_end(map, entry, end); -void -vm_map_submap_pmap_clean( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - vm_map_t sub_map, - vm_map_offset_t offset) -{ - vm_map_offset_t submap_start; - vm_map_offset_t submap_end; - vm_map_size_t remove_size; - vm_map_entry_t entry; + /* re-compute "e" */ + e = entry->vme_end; + if (e > end) { + e = end; + } - submap_end = offset + (end - start); - submap_start = offset; + /* + * Check for holes and protection mismatch. + * Holes: Next entry should be contiguous unless this + * is the end of the region. 
+ * Protection: Access requested must be allowed, unless + * wiring is by protection class + */ + if ((entry->vme_end < end) && + ((entry->vme_next == vm_map_to_entry(map)) || + (entry->vme_next->vme_start > entry->vme_end))) { + /* found a hole */ + rc = KERN_INVALID_ADDRESS; + goto done; + } + if ((entry->protection & access_type) != access_type) { + /* found a protection problem */ + rc = KERN_PROTECTION_FAILURE; + goto done; + } - vm_map_lock_read(sub_map); - if(vm_map_lookup_entry(sub_map, offset, &entry)) { - - remove_size = (entry->vme_end - entry->vme_start); - if(offset > entry->vme_start) - remove_size -= offset - entry->vme_start; - + assert(entry->wired_count == 0 && entry->user_wired_count == 0); - if(submap_end < entry->vme_end) { - remove_size -= - entry->vme_end - submap_end; + if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) { + goto done; } - if(entry->is_sub_map) { - vm_map_submap_pmap_clean( - sub_map, - start, - start + remove_size, - entry->object.sub_map, - entry->offset); - } else { - if((map->mapped_in_other_pmaps) && (map->ref_count) - && (entry->object.vm_object != NULL)) { - vm_object_pmap_protect( - entry->object.vm_object, - entry->offset+(offset-entry->vme_start), - remove_size, - PMAP_NULL, - entry->vme_start, - VM_PROT_NONE); - } else { - pmap_remove(map->pmap, - (addr64_t)start, - (addr64_t)(start + remove_size)); - } - } - } + entry->in_transition = TRUE; - entry = entry->vme_next; + /* + * This entry might get split once we unlock the map. + * In vm_fault_wire(), we need the current range as + * defined by this entry. In order for this to work + * along with a simultaneous clip operation, we make a + * temporary copy of this entry and use that for the + * wiring. Note that the underlying objects do not + * change during a clip. + */ + tmp_entry = *entry; - while((entry != vm_map_to_entry(sub_map)) - && (entry->vme_start < submap_end)) { - remove_size = (entry->vme_end - entry->vme_start); - if(submap_end < entry->vme_end) { - remove_size -= entry->vme_end - submap_end; + /* + * The in_transition state guarentees that the entry + * (or entries for this range, if split occured) will be + * there when the map lock is acquired for the second time. + */ + vm_map_unlock(map); + + if (!user_wire && cur_thread != THREAD_NULL) { + interruptible_state = thread_interrupt_level(THREAD_UNINT); + } else { + interruptible_state = THREAD_UNINT; } - if(entry->is_sub_map) { - vm_map_submap_pmap_clean( - sub_map, - (start + entry->vme_start) - offset, - ((start + entry->vme_start) - offset) + remove_size, - entry->object.sub_map, - entry->offset); + + if (map_pmap) { + rc = vm_fault_wire(map, + &tmp_entry, caller_prot, tag, map_pmap, pmap_addr, + physpage_p); } else { - if((map->mapped_in_other_pmaps) && (map->ref_count) - && (entry->object.vm_object != NULL)) { - vm_object_pmap_protect( - entry->object.vm_object, - entry->offset, - remove_size, - PMAP_NULL, - entry->vme_start, - VM_PROT_NONE); - } else { - pmap_remove(map->pmap, - (addr64_t)((start + entry->vme_start) - - offset), - (addr64_t)(((start + entry->vme_start) - - offset) + remove_size)); - } + rc = vm_fault_wire(map, + &tmp_entry, caller_prot, tag, map->pmap, + tmp_entry.vme_start, + physpage_p); } - entry = entry->vme_next; - } - vm_map_unlock_read(sub_map); - return; -} -/* - * vm_map_delete: [ internal use only ] - * - * Deallocates the given address range from the target map. - * Removes all user wirings. Unwires one kernel wiring if - * VM_MAP_REMOVE_KUNWIRE is set. 
Waits for kernel wirings to go - * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps - * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set. - * - * This routine is called with map locked and leaves map locked. - */ -static kern_return_t -vm_map_delete( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - int flags, - vm_map_t zap_map) -{ - vm_map_entry_t entry, next; - struct vm_map_entry *first_entry, tmp_entry; - register vm_map_offset_t s; - register vm_object_t object; - boolean_t need_wakeup; - unsigned int last_timestamp = ~0; /* unlikely value */ - int interruptible; + if (!user_wire && cur_thread != THREAD_NULL) { + thread_interrupt_level(interruptible_state); + } - interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ? - THREAD_ABORTSAFE : THREAD_UNINT; + vm_map_lock(map); - /* - * All our DMA I/O operations in IOKit are currently done by - * wiring through the map entries of the task requesting the I/O. - * Because of this, we must always wait for kernel wirings - * to go away on the entries before deleting them. - * - * Any caller who wants to actually remove a kernel wiring - * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to - * properly remove one wiring instead of blasting through - * them all. - */ - flags |= VM_MAP_REMOVE_WAIT_FOR_KWIRE; + if (last_timestamp + 1 != map->timestamp) { + /* + * Find the entry again. It could have been clipped + * after we unlocked the map. + */ + if (!vm_map_lookup_entry(map, tmp_entry.vme_start, + &first_entry)) { + panic("vm_map_wire: re-lookup failed"); + } - while(1) { - /* - * Find the start of the region, and clip it - */ - if (vm_map_lookup_entry(map, start, &first_entry)) { entry = first_entry; - if (map == kalloc_map && - (entry->vme_start != start || - entry->vme_end != end)) { - panic("vm_map_delete(%p,0x%llx,0x%llx): " - "mismatched entry %p [0x%llx:0x%llx]\n", - map, - (uint64_t)start, - (uint64_t)end, - entry, - (uint64_t)entry->vme_start, - (uint64_t)entry->vme_end); - } - if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { /* extend request to whole entry */ start = SUPERPAGE_ROUND_DOWN(start); - start = SUPERPAGE_ROUND_DOWN(start); - continue; + } + + last_timestamp = map->timestamp; + + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < tmp_entry.vme_end)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if (entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + need_wakeup = TRUE; } - if (start == entry->vme_start) { - /* - * No need to clip. We don't want to cause - * any unnecessary unnesting in this case... - */ - } else { - if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && - entry->map_aligned && - !VM_MAP_PAGE_ALIGNED( - start, - VM_MAP_PAGE_MASK(map))) { - /* - * The entry will no longer be - * map-aligned after clipping - * and the caller said it's OK. - */ - entry->map_aligned = FALSE; - } - if (map == kalloc_map) { - panic("vm_map_delete(%p,0x%llx,0x%llx):" - " clipping %p at 0x%llx\n", - map, - (uint64_t)start, - (uint64_t)end, - entry, - (uint64_t)start); - } - vm_map_clip_start(map, entry, start); + if (rc != KERN_SUCCESS) { /* from vm_*_wire */ + subtract_wire_counts(map, entry, user_wire); } + entry = entry->vme_next; + } - /* - * Fix the lookup hint now, rather than each - * time through the loop. 
- */ - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); - } else { - if (map->pmap == kernel_pmap && - map->ref_count != 0) { - panic("vm_map_delete(%p,0x%llx,0x%llx): " - "no map entry at 0x%llx\n", - map, - (uint64_t)start, - (uint64_t)end, - (uint64_t)start); - } - entry = first_entry->vme_next; + if (rc != KERN_SUCCESS) { /* from vm_*_wire */ + goto done; } - break; + + if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */ + (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */ + (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */ + /* found a "new" hole */ + s = tmp_entry.vme_end; + rc = KERN_INVALID_ADDRESS; + goto done; + } + + s = entry->vme_start; + } /* end while loop through map entries */ + +done: + if (rc == KERN_SUCCESS) { + /* repair any damage we may have made to the VM map */ + vm_map_simplify_range(map, start, end); } - if (entry->superpage_size) - end = SUPERPAGE_ROUND_UP(end); - need_wakeup = FALSE; + vm_map_unlock(map); + /* - * Step through all entries in this region + * wake up anybody waiting on entries we wired. */ - s = entry->vme_start; - while ((entry != vm_map_to_entry(map)) && (s < end)) { - /* - * At this point, we have deleted all the memory entries - * between "start" and "s". We still need to delete - * all memory entries between "s" and "end". - * While we were blocked and the map was unlocked, some - * new memory entries could have been re-allocated between - * "start" and "s" and we don't want to mess with those. - * Some of those entries could even have been re-assembled - * with an entry after "s" (in vm_map_simplify_entry()), so - * we may have to vm_map_clip_start() again. - */ - - if (entry->vme_start >= s) { - /* - * This entry starts on or after "s" - * so no need to clip its start. - */ - } else { - /* - * This entry has been re-assembled by a - * vm_map_simplify_entry(). We need to - * re-clip its start. - */ - if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && - entry->map_aligned && - !VM_MAP_PAGE_ALIGNED(s, - VM_MAP_PAGE_MASK(map))) { - /* - * The entry will no longer be map-aligned - * after clipping and the caller said it's OK. - */ - entry->map_aligned = FALSE; - } - if (map == kalloc_map) { - panic("vm_map_delete(%p,0x%llx,0x%llx): " - "clipping %p at 0x%llx\n", - map, - (uint64_t)start, - (uint64_t)end, - entry, - (uint64_t)s); - } - vm_map_clip_start(map, entry, s); - } - if (entry->vme_end <= end) { - /* - * This entry is going away completely, so no need - * to clip and possibly cause an unnecessary unnesting. - */ - } else { - if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && - entry->map_aligned && - !VM_MAP_PAGE_ALIGNED(end, - VM_MAP_PAGE_MASK(map))) { - /* - * The entry will no longer be map-aligned - * after clipping and the caller said it's OK. 
- */ - entry->map_aligned = FALSE; - } - if (map == kalloc_map) { - panic("vm_map_delete(%p,0x%llx,0x%llx): " - "clipping %p at 0x%llx\n", - map, - (uint64_t)start, - (uint64_t)end, - entry, - (uint64_t)end); - } - vm_map_clip_end(map, entry, end); - } + if (need_wakeup) { + vm_map_entry_wakeup(map); + } - if (entry->permanent) { - panic("attempt to remove permanent VM map entry " - "%p [0x%llx:0x%llx]\n", - entry, (uint64_t) s, (uint64_t) end); + if (rc != KERN_SUCCESS) { + /* undo what has been wired so far */ + vm_map_unwire_nested(map, start, s, user_wire, + map_pmap, pmap_addr); + if (physpage_p) { + *physpage_p = 0; } + } + return rc; +} - if (entry->in_transition) { - wait_result_t wait_result; +kern_return_t +vm_map_wire_external( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, + boolean_t user_wire) +{ + kern_return_t kret; - /* - * Another thread is wiring/unwiring this entry. - * Let the other thread know we are waiting. - */ - assert(s == entry->vme_start); - entry->needs_wakeup = TRUE; + kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(), + user_wire, (pmap_t)NULL, 0, NULL); + return kret; +} - /* - * wake up anybody waiting on entries that we have - * already unwired/deleted. - */ - if (need_wakeup) { - vm_map_entry_wakeup(map); - need_wakeup = FALSE; - } +kern_return_t +vm_map_wire_kernel( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_prot_t caller_prot, + vm_tag_t tag, + boolean_t user_wire) +{ + kern_return_t kret; - wait_result = vm_map_entry_wait(map, interruptible); + kret = vm_map_wire_nested(map, start, end, caller_prot, tag, + user_wire, (pmap_t)NULL, 0, NULL); + return kret; +} - if (interruptible && - wait_result == THREAD_INTERRUPTED) { - /* - * We do not clear the needs_wakeup flag, - * since we cannot tell if we were the only one. - */ - return KERN_ABORTED; - } +kern_return_t +vm_map_wire_and_extract_external( + vm_map_t map, + vm_map_offset_t start, + vm_prot_t caller_prot, + boolean_t user_wire, + ppnum_t *physpage_p) +{ + kern_return_t kret; - /* - * The entry could have been clipped or it - * may not exist anymore. Look it up again. 
- */ - if (!vm_map_lookup_entry(map, s, &first_entry)) { - assert((map != kernel_map) && - (!entry->is_sub_map)); - /* - * User: use the next entry - */ - entry = first_entry->vme_next; - s = entry->vme_start; - } else { - entry = first_entry; - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); - } - last_timestamp = map->timestamp; - continue; - } /* end in_transition */ + kret = vm_map_wire_nested(map, + start, + start + VM_MAP_PAGE_SIZE(map), + caller_prot, + vm_tag_bt(), + user_wire, + (pmap_t)NULL, + 0, + physpage_p); + if (kret != KERN_SUCCESS && + physpage_p != NULL) { + *physpage_p = 0; + } + return kret; +} - if (entry->wired_count) { - boolean_t user_wire; +kern_return_t +vm_map_wire_and_extract_kernel( + vm_map_t map, + vm_map_offset_t start, + vm_prot_t caller_prot, + vm_tag_t tag, + boolean_t user_wire, + ppnum_t *physpage_p) +{ + kern_return_t kret; - user_wire = entry->user_wired_count > 0; + kret = vm_map_wire_nested(map, + start, + start + VM_MAP_PAGE_SIZE(map), + caller_prot, + tag, + user_wire, + (pmap_t)NULL, + 0, + physpage_p); + if (kret != KERN_SUCCESS && + physpage_p != NULL) { + *physpage_p = 0; + } + return kret; +} - /* - * Remove a kernel wiring if requested - */ - if (flags & VM_MAP_REMOVE_KUNWIRE) { - entry->wired_count--; - } - - /* - * Remove all user wirings for proper accounting - */ - if (entry->user_wired_count > 0) { - while (entry->user_wired_count) - subtract_wire_counts(map, entry, user_wire); - } +/* + * vm_map_unwire: + * + * Sets the pageability of the specified address range in the target + * as pageable. Regions specified must have been wired previously. + * + * The map must not be locked, but a reference must remain to the map + * throughout the call. + * + * Kernel will panic on failures. User unwire ignores holes and + * unwired and intransition entries to avoid losing memory by leaving + * it unwired. + */ +static kern_return_t +vm_map_unwire_nested( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t user_wire, + pmap_t map_pmap, + vm_map_offset_t pmap_addr) +{ + vm_map_entry_t entry; + struct vm_map_entry *first_entry, tmp_entry; + boolean_t need_wakeup; + boolean_t main_map = FALSE; + unsigned int last_timestamp; - if (entry->wired_count != 0) { - assert(map != kernel_map); - /* - * Cannot continue. Typical case is when - * a user thread has physical io pending on - * on this page. Either wait for the - * kernel wiring to go away or return an - * error. - */ - if (flags & VM_MAP_REMOVE_WAIT_FOR_KWIRE) { - wait_result_t wait_result; + vm_map_lock(map); + if (map_pmap == NULL) { + main_map = TRUE; + } + last_timestamp = map->timestamp; - assert(s == entry->vme_start); - entry->needs_wakeup = TRUE; - wait_result = vm_map_entry_wait(map, - interruptible); + VM_MAP_RANGE_CHECK(map, start, end); + assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map))); + assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))); - if (interruptible && - wait_result == THREAD_INTERRUPTED) { - /* - * We do not clear the - * needs_wakeup flag, since we - * cannot tell if we were the - * only one. - */ - return KERN_ABORTED; - } + if (start == end) { + /* We unwired what the caller asked for: zero pages */ + vm_map_unlock(map); + return KERN_SUCCESS; + } - /* - * The entry could have been clipped or - * it may not exist anymore. Look it - * up again. 
- */ - if (!vm_map_lookup_entry(map, s, - &first_entry)) { - assert(map != kernel_map); - /* - * User: use the next entry - */ - entry = first_entry->vme_next; - s = entry->vme_start; - } else { - entry = first_entry; - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); - } - last_timestamp = map->timestamp; - continue; - } - else { - return KERN_FAILURE; - } - } + if (vm_map_lookup_entry(map, start, &first_entry)) { + entry = first_entry; + /* + * vm_map_clip_start will be done later. + * We don't want to unnest any nested sub maps here ! + */ + } else { + if (!user_wire) { + panic("vm_map_unwire: start not found"); + } + /* Start address is not in map. */ + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; + } - entry->in_transition = TRUE; - /* - * copy current entry. see comment in vm_map_wire() - */ - tmp_entry = *entry; - assert(s == entry->vme_start); + if (entry->superpage_size) { + /* superpages are always wired */ + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; + } + need_wakeup = FALSE; + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + if (entry->in_transition) { /* - * We can unlock the map now. The in_transition - * state guarentees existance of the entry. + * 1) + * Another thread is wiring down this entry. Note + * that if it is not for the other thread we would + * be unwiring an unwired entry. This is not + * permitted. If we wait, we will be unwiring memory + * we did not wire. + * + * 2) + * Another thread is unwiring this entry. We did not + * have a reference to it, because if we did, this + * entry will not be getting unwired now. */ - vm_map_unlock(map); + if (!user_wire) { + /* + * XXX FBDP + * This could happen: there could be some + * overlapping vslock/vsunlock operations + * going on. + * We should probably just wait and retry, + * but then we have to be careful that this + * entry could get "simplified" after + * "in_transition" gets unset and before + * we re-lookup the entry, so we would + * have to re-clip the entry to avoid + * re-unwiring what we have already unwired... + * See vm_map_wire_nested(). + * + * Or we could just ignore "in_transition" + * here and proceed to decement the wired + * count(s) on this entry. That should be fine + * as long as "wired_count" doesn't drop all + * the way to 0 (and we should panic if THAT + * happens). 
+ */ + panic("vm_map_unwire: in_transition entry"); + } - if (tmp_entry.is_sub_map) { - vm_map_t sub_map; - vm_map_offset_t sub_start, sub_end; - pmap_t pmap; - vm_map_offset_t pmap_addr; - + entry = entry->vme_next; + continue; + } - sub_map = tmp_entry.object.sub_map; - sub_start = tmp_entry.offset; - sub_end = sub_start + (tmp_entry.vme_end - - tmp_entry.vme_start); - if (tmp_entry.use_pmap) { - pmap = sub_map->pmap; - pmap_addr = tmp_entry.vme_start; + if (entry->is_sub_map) { + vm_map_offset_t sub_start; + vm_map_offset_t sub_end; + vm_map_offset_t local_end; + pmap_t pmap; + + vm_map_clip_start(map, entry, start); + vm_map_clip_end(map, entry, end); + + sub_start = VME_OFFSET(entry); + sub_end = entry->vme_end - entry->vme_start; + sub_end += VME_OFFSET(entry); + local_end = entry->vme_end; + if (map_pmap == NULL) { + if (entry->use_pmap) { + pmap = VME_SUBMAP(entry)->pmap; + pmap_addr = sub_start; } else { pmap = map->pmap; - pmap_addr = tmp_entry.vme_start; + pmap_addr = start; + } + if (entry->wired_count == 0 || + (user_wire && entry->user_wired_count == 0)) { + if (!user_wire) { + panic("vm_map_unwire: entry is unwired"); + } + entry = entry->vme_next; + continue; } - (void) vm_map_unwire_nested(sub_map, - sub_start, sub_end, - user_wire, - pmap, pmap_addr); - } else { - if (tmp_entry.object.vm_object == kernel_object) { - pmap_protect_options( - map->pmap, - tmp_entry.vme_start, - tmp_entry.vme_end, - VM_PROT_NONE, - PMAP_OPTIONS_REMOVE, - NULL); + /* + * Check for holes + * Holes: Next entry should be contiguous unless + * this is the end of the region. + */ + if (((entry->vme_end < end) && + ((entry->vme_next == vm_map_to_entry(map)) || + (entry->vme_next->vme_start + > entry->vme_end)))) { + if (!user_wire) { + panic("vm_map_unwire: non-contiguous region"); + } +/* + * entry = entry->vme_next; + * continue; + */ } - vm_fault_unwire(map, &tmp_entry, - tmp_entry.object.vm_object == kernel_object, - map->pmap, tmp_entry.vme_start); - } - vm_map_lock(map); + subtract_wire_counts(map, entry, user_wire); + + if (entry->wired_count != 0) { + entry = entry->vme_next; + continue; + } + + entry->in_transition = TRUE; + tmp_entry = *entry;/* see comment in vm_map_wire() */ - if (last_timestamp+1 != map->timestamp) { /* - * Find the entry again. It could have - * been clipped after we unlocked the map. + * We can unlock the map now. The in_transition state + * guarantees existance of the entry. */ - if (!vm_map_lookup_entry(map, s, &first_entry)){ - assert((map != kernel_map) && - (!entry->is_sub_map)); - first_entry = first_entry->vme_next; - s = first_entry->vme_start; - } else { - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + vm_map_unlock(map); + vm_map_unwire_nested(VME_SUBMAP(entry), + sub_start, sub_end, user_wire, pmap, pmap_addr); + vm_map_lock(map); + + if (last_timestamp + 1 != map->timestamp) { + /* + * Find the entry again. It could have been + * clipped or deleted after we unlocked the map. + */ + if (!vm_map_lookup_entry(map, + tmp_entry.vme_start, + &first_entry)) { + if (!user_wire) { + panic("vm_map_unwire: re-lookup failed"); + } + entry = first_entry->vme_next; + } else { + entry = first_entry; + } + } + last_timestamp = map->timestamp; + + /* + * clear transition bit for all constituent entries + * that were in the original entry (saved in + * tmp_entry). Also check for waiters. 
+ */ + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < tmp_entry.vme_end)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if (entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + need_wakeup = TRUE; + } + entry = entry->vme_next; } + continue; } else { - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); - first_entry = entry; + vm_map_unlock(map); + vm_map_unwire_nested(VME_SUBMAP(entry), + sub_start, sub_end, user_wire, map_pmap, + pmap_addr); + vm_map_lock(map); + + if (last_timestamp + 1 != map->timestamp) { + /* + * Find the entry again. It could have been + * clipped or deleted after we unlocked the map. + */ + if (!vm_map_lookup_entry(map, + tmp_entry.vme_start, + &first_entry)) { + if (!user_wire) { + panic("vm_map_unwire: re-lookup failed"); + } + entry = first_entry->vme_next; + } else { + entry = first_entry; + } + } + last_timestamp = map->timestamp; } + } - last_timestamp = map->timestamp; - entry = first_entry; - while ((entry != vm_map_to_entry(map)) && - (entry->vme_start < tmp_entry.vme_end)) { - assert(entry->in_transition); - entry->in_transition = FALSE; - if (entry->needs_wakeup) { - entry->needs_wakeup = FALSE; - need_wakeup = TRUE; - } - entry = entry->vme_next; + if ((entry->wired_count == 0) || + (user_wire && entry->user_wired_count == 0)) { + if (!user_wire) { + panic("vm_map_unwire: entry is unwired"); } - /* - * We have unwired the entry(s). Go back and - * delete them. - */ - entry = first_entry; + + entry = entry->vme_next; continue; } - /* entry is unwired */ - assert(entry->wired_count == 0); - assert(entry->user_wired_count == 0); + assert(entry->wired_count > 0 && + (!user_wire || entry->user_wired_count > 0)); - assert(s == entry->vme_start); + vm_map_clip_start(map, entry, start); + vm_map_clip_end(map, entry, end); - if (flags & VM_MAP_REMOVE_NO_PMAP_CLEANUP) { - /* - * XXX with the VM_MAP_REMOVE_SAVE_ENTRIES flag to - * vm_map_delete(), some map entries might have been - * transferred to a "zap_map", which doesn't have a - * pmap. The original pmap has already been flushed - * in the vm_map_delete() call targeting the original - * map, but when we get to destroying the "zap_map", - * we don't have any pmap to flush, so let's just skip - * all this. - */ - } else if (entry->is_sub_map) { - if (entry->use_pmap) { -#ifndef NO_NESTED_PMAP - pmap_unnest(map->pmap, - (addr64_t)entry->vme_start, - entry->vme_end - entry->vme_start); -#endif /* NO_NESTED_PMAP */ - if ((map->mapped_in_other_pmaps) && (map->ref_count)) { - /* clean up parent map/maps */ - vm_map_submap_pmap_clean( - map, entry->vme_start, - entry->vme_end, - entry->object.sub_map, - entry->offset); - } - } else { - vm_map_submap_pmap_clean( - map, entry->vme_start, entry->vme_end, - entry->object.sub_map, - entry->offset); + /* + * Check for holes + * Holes: Next entry should be contiguous unless + * this is the end of the region. 
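The "found a hole" condition referenced above appears in both the wire and unwire loops; read in isolation it is just the predicate below. A minimal sketch: the struct and the use of NULL to mark the end of the map are illustrative stand-ins for the real vm_map_entry links and the vm_map_to_entry() sentinel.

#include <stdbool.h>
#include <stdint.h>

struct sketch_entry {
	uint64_t vme_start;
	uint64_t vme_end;
};

/*
 * Illustrative sketch only (not XNU source): true if there is an unmapped
 * gap after "entry" before the end of the requested range [start, end).
 */
static bool
range_has_hole(const struct sketch_entry *entry,
    const struct sketch_entry *next,   /* NULL models "end of map" here */
    uint64_t end)
{
	if (entry->vme_end >= end) {
		return false;   /* the request is fully covered by this entry */
	}
	if (next == NULL) {
		return true;    /* ran off the end of the map */
	}
	return next->vme_start > entry->vme_end;  /* next entry not contiguous */
}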
+ */ + if (((entry->vme_end < end) && + ((entry->vme_next == vm_map_to_entry(map)) || + (entry->vme_next->vme_start > entry->vme_end)))) { + if (!user_wire) { + panic("vm_map_unwire: non-contiguous region"); } - } else if (entry->object.vm_object != kernel_object && - entry->object.vm_object != compressor_object) { - object = entry->object.vm_object; - if ((map->mapped_in_other_pmaps) && (map->ref_count)) { - vm_object_pmap_protect_options( - object, entry->offset, - entry->vme_end - entry->vme_start, - PMAP_NULL, - entry->vme_start, - VM_PROT_NONE, - PMAP_OPTIONS_REMOVE); - } else if ((entry->object.vm_object != - VM_OBJECT_NULL) || - (map->pmap == kernel_pmap)) { - /* Remove translations associated - * with this range unless the entry - * does not have an object, or - * it's the kernel map or a descendant - * since the platform could potentially - * create "backdoor" mappings invisible - * to the VM. It is expected that - * objectless, non-kernel ranges - * do not have such VM invisible - * translations. - */ - pmap_remove_options(map->pmap, - (addr64_t)entry->vme_start, - (addr64_t)entry->vme_end, - PMAP_OPTIONS_REMOVE); - } - } - - if (entry->iokit_acct) { - /* alternate accounting */ - vm_map_iokit_unmapped_region(map, - (entry->vme_end - - entry->vme_start)); - entry->iokit_acct = FALSE; + entry = entry->vme_next; + continue; } - /* - * All pmap mappings for this map entry must have been - * cleared by now. - */ -#if DEBUG - assert(vm_map_pmap_is_empty(map, - entry->vme_start, - entry->vme_end)); -#endif /* DEBUG */ + subtract_wire_counts(map, entry, user_wire); - next = entry->vme_next; + if (entry->wired_count != 0) { + entry = entry->vme_next; + continue; + } - if (map->pmap == kernel_pmap && - map->ref_count != 0 && - entry->vme_end < end && - (next == vm_map_to_entry(map) || - next->vme_start != entry->vme_end)) { - panic("vm_map_delete(%p,0x%llx,0x%llx): " - "hole after %p at 0x%llx\n", - map, - (uint64_t)start, - (uint64_t)end, - entry, - (uint64_t)entry->vme_end); + if (entry->zero_wired_pages) { + entry->zero_wired_pages = FALSE; } - s = next->vme_start; - last_timestamp = map->timestamp; + entry->in_transition = TRUE; + tmp_entry = *entry; /* see comment in vm_map_wire() */ - if ((flags & VM_MAP_REMOVE_SAVE_ENTRIES) && - zap_map != VM_MAP_NULL) { - vm_map_size_t entry_size; - /* - * The caller wants to save the affected VM map entries - * into the "zap_map". The caller will take care of - * these entries. - */ - /* unlink the entry from "map" ... */ - vm_map_store_entry_unlink(map, entry); - /* ... and add it to the end of the "zap_map" */ - vm_map_store_entry_link(zap_map, - vm_map_last_entry(zap_map), - entry); - entry_size = entry->vme_end - entry->vme_start; - map->size -= entry_size; - zap_map->size += entry_size; - /* we didn't unlock the map, so no timestamp increase */ - last_timestamp--; + /* + * We can unlock the map now. The in_transition state + * guarantees existance of the entry. 
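The in_transition / needs_wakeup hand-off mentioned above is what makes it safe to drop the map lock here. The toy helpers below summarize that protocol only; the type is a stand-in and there is no real lock or wait queue in this sketch (the kernel pairs these flags with vm_map_entry_wait() and vm_map_entry_wakeup() under the map lock).

#include <stdbool.h>

struct sketch_entry_state {
	bool in_transition;   /* some thread is wiring/unwiring this entry */
	bool needs_wakeup;    /* another thread is blocked until that finishes */
};

/* Before dropping the map lock: claim the entry for this operation. */
static void
sketch_begin_transition(struct sketch_entry_state *e)
{
	e->in_transition = true;
}

/*
 * After retaking the map lock: release the claim and report whether any
 * waiter must be woken (the caller would then do the actual wakeup).
 */
static bool
sketch_end_transition(struct sketch_entry_state *e)
{
	bool wake = e->needs_wakeup;

	e->in_transition = false;
	e->needs_wakeup = false;
	return wake;
}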
+ */ + vm_map_unlock(map); + if (map_pmap) { + vm_fault_unwire(map, + &tmp_entry, FALSE, map_pmap, pmap_addr); } else { - vm_map_entry_delete(map, entry); - /* vm_map_entry_delete unlocks the map */ - vm_map_lock(map); + vm_fault_unwire(map, + &tmp_entry, FALSE, map->pmap, + tmp_entry.vme_start); } + vm_map_lock(map); - entry = next; - - if(entry == vm_map_to_entry(map)) { - break; - } - if (last_timestamp+1 != map->timestamp) { + if (last_timestamp + 1 != map->timestamp) { /* - * we are responsible for deleting everything - * from the give space, if someone has interfered - * we pick up where we left off, back fills should - * be all right for anyone except map_delete and - * we have to assume that the task has been fully - * disabled before we get here - */ - if (!vm_map_lookup_entry(map, s, &entry)){ - entry = entry->vme_next; - s = entry->vme_start; - } else { - SAVE_HINT_MAP_WRITE(map, entry->vme_prev); - } - /* - * others can not only allocate behind us, we can - * also see coalesce while we don't have the map lock + * Find the entry again. It could have been clipped + * or deleted after we unlocked the map. */ - if(entry == vm_map_to_entry(map)) { - break; + if (!vm_map_lookup_entry(map, tmp_entry.vme_start, + &first_entry)) { + if (!user_wire) { + panic("vm_map_unwire: re-lookup failed"); + } + entry = first_entry->vme_next; + } else { + entry = first_entry; } } last_timestamp = map->timestamp; + + /* + * clear transition bit for all constituent entries that + * were in the original entry (saved in tmp_entry). Also + * check for waiters. + */ + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < tmp_entry.vme_end)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if (entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + need_wakeup = TRUE; + } + entry = entry->vme_next; + } } - if (map->wait_for_space) - thread_wakeup((event_t) map); /* - * wake up anybody waiting on entries that we have already deleted. + * We might have fragmented the address space when we wired this + * range of addresses. Attempt to re-coalesce these VM map entries + * with their neighbors now that they're no longer wired. + * Under some circumstances, address space fragmentation can + * prevent VM object shadow chain collapsing, which can cause + * swap space leaks. */ - if (need_wakeup) - vm_map_entry_wakeup(map); + vm_map_simplify_range(map, start, end); + vm_map_unlock(map); + /* + * wake up anybody waiting on entries that we have unwired. + */ + if (need_wakeup) { + vm_map_entry_wakeup(map); + } return KERN_SUCCESS; } -/* - * vm_map_remove: - * - * Remove the given address range from the target map. - * This is the exported form of vm_map_delete. - */ kern_return_t -vm_map_remove( - register vm_map_t map, - register vm_map_offset_t start, - register vm_map_offset_t end, - register boolean_t flags) +vm_map_unwire( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t user_wire) { - register kern_return_t result; - - vm_map_lock(map); - VM_MAP_RANGE_CHECK(map, start, end); - /* - * For the zone_map, the kernel controls the allocation/freeing of memory. - * Any free to the zone_map should be within the bounds of the map and - * should free up memory. If the VM_MAP_RANGE_CHECK() silently converts a - * free to the zone_map into a no-op, there is a problem and we should - * panic. - */ - if ((map == zone_map) && (start == end)) - panic("Nothing being freed to the zone_map. 
start = end = %p\n", (void *)start); - result = vm_map_delete(map, start, end, flags, VM_MAP_NULL); - vm_map_unlock(map); - - return(result); + return vm_map_unwire_nested(map, start, end, + user_wire, (pmap_t)NULL, 0); } /* - * Routine: vm_map_copy_discard + * vm_map_entry_delete: [ internal use only ] * - * Description: - * Dispose of a map copy object (returned by - * vm_map_copyin). + * Deallocate the given entry from the target map. */ -void -vm_map_copy_discard( - vm_map_copy_t copy) +static void +vm_map_entry_delete( + vm_map_t map, + vm_map_entry_t entry) { - if (copy == VM_MAP_COPY_NULL) - return; - - switch (copy->type) { - case VM_MAP_COPY_ENTRY_LIST: - while (vm_map_copy_first_entry(copy) != - vm_map_copy_to_entry(copy)) { - vm_map_entry_t entry = vm_map_copy_first_entry(copy); + vm_map_offset_t s, e; + vm_object_t object; + vm_map_t submap; - vm_map_copy_entry_unlink(copy, entry); - if (entry->is_sub_map) { - vm_map_deallocate(entry->object.sub_map); - } else { - vm_object_deallocate(entry->object.vm_object); - } - vm_map_copy_entry_dispose(copy, entry); - } - break; - case VM_MAP_COPY_OBJECT: - vm_object_deallocate(copy->cpy_object); - break; - case VM_MAP_COPY_KERNEL_BUFFER: + s = entry->vme_start; + e = entry->vme_end; + assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK)); + assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK)); + if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) { + assert(page_aligned(s)); + assert(page_aligned(e)); + } + if (entry->map_aligned == TRUE) { + assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))); + assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map))); + } + assert(entry->wired_count == 0); + assert(entry->user_wired_count == 0); + assert(!entry->permanent); - /* - * The vm_map_copy_t and possibly the data buffer were - * allocated by a single call to kalloc(), i.e. the - * vm_map_copy_t was not allocated out of the zone. - */ - kfree(copy, copy->cpy_kalloc_size); - return; + if (entry->is_sub_map) { + object = NULL; + submap = VME_SUBMAP(entry); + } else { + submap = NULL; + object = VME_OBJECT(entry); } - zfree(vm_map_copy_zone, copy); -} -/* - * Routine: vm_map_copy_copy - * - * Description: - * Move the information in a map copy object to - * a new map copy object, leaving the old one - * empty. - * - * This is used by kernel routines that need - * to look at out-of-line data (in copyin form) - * before deciding whether to return SUCCESS. - * If the routine returns FAILURE, the original - * copy object will be deallocated; therefore, - * these routines must make a copy of the copy - * object and leave the original empty so that - * deallocation will not fail. - */ -vm_map_copy_t -vm_map_copy_copy( - vm_map_copy_t copy) -{ - vm_map_copy_t new_copy; + vm_map_store_entry_unlink(map, entry); + map->size -= e - s; - if (copy == VM_MAP_COPY_NULL) - return VM_MAP_COPY_NULL; + vm_map_entry_dispose(map, entry); + vm_map_unlock(map); /* - * Allocate a new copy object, and copy the information - * from the old one into it. + * Deallocate the object only after removing all + * pmap entries pointing to its pages. */ - - new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - *new_copy = *copy; - - if (copy->type == VM_MAP_COPY_ENTRY_LIST) { - /* - * The links in the entry chain must be - * changed to point to the new copy object. 
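vm_map_copy_copy() gets away with patching just two pointers because the copied header already duplicates the list links; only the first entry's back pointer and the last entry's forward pointer still reference the old header. A small sketch of re-heading a circular doubly-linked list in the general case; demo_link is a simplified stand-in for the entry chain, not the real copy header.

#include <stddef.h>

struct demo_link {
    struct demo_link *next;
    struct demo_link *prev;
};

/* Move all elements from old_head's circular list onto new_head. */
static void
demo_list_rehead(struct demo_link *old_head, struct demo_link *new_head)
{
    if (old_head->next == old_head) {
        /* empty list: just initialize the new header */
        new_head->next = new_head->prev = new_head;
    } else {
        new_head->next = old_head->next;    /* first element */
        new_head->prev = old_head->prev;    /* last element */
        new_head->next->prev = new_head;    /* first element points back at new header */
        new_head->prev->next = new_head;    /* last element points forward at new header */
    }
    /* leave the old header empty so disposing of it frees nothing */
    old_head->next = old_head->prev = old_head;
}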
- */ - vm_map_copy_first_entry(copy)->vme_prev - = vm_map_copy_to_entry(new_copy); - vm_map_copy_last_entry(copy)->vme_next - = vm_map_copy_to_entry(new_copy); + if (submap) { + vm_map_deallocate(submap); + } else { + vm_object_deallocate(object); } - - /* - * Change the old copy object into one that contains - * nothing to be deallocated. - */ - copy->type = VM_MAP_COPY_OBJECT; - copy->cpy_object = VM_OBJECT_NULL; - - /* - * Return the new object. - */ - return new_copy; } -static kern_return_t -vm_map_overwrite_submap_recurse( - vm_map_t dst_map, - vm_map_offset_t dst_addr, - vm_map_size_t dst_size) +void +vm_map_submap_pmap_clean( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_map_t sub_map, + vm_map_offset_t offset) { - vm_map_offset_t dst_end; - vm_map_entry_t tmp_entry; - vm_map_entry_t entry; - kern_return_t result; - boolean_t encountered_sub_map = FALSE; + vm_map_offset_t submap_start; + vm_map_offset_t submap_end; + vm_map_size_t remove_size; + vm_map_entry_t entry; + submap_end = offset + (end - start); + submap_start = offset; + vm_map_lock_read(sub_map); + if (vm_map_lookup_entry(sub_map, offset, &entry)) { + remove_size = (entry->vme_end - entry->vme_start); + if (offset > entry->vme_start) { + remove_size -= offset - entry->vme_start; + } - /* - * Verify that the destination is all writeable - * initially. We have to trunc the destination - * address and round the copy size or we'll end up - * splitting entries in strange ways. - */ - - dst_end = vm_map_round_page(dst_addr + dst_size, - VM_MAP_PAGE_MASK(dst_map)); - vm_map_lock(dst_map); - -start_pass_1: - if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } - - vm_map_clip_start(dst_map, - tmp_entry, - vm_map_trunc_page(dst_addr, - VM_MAP_PAGE_MASK(dst_map))); - if (tmp_entry->is_sub_map) { - /* clipping did unnest if needed */ - assert(!tmp_entry->use_pmap); - } - for (entry = tmp_entry;;) { - vm_map_entry_t next; - - next = entry->vme_next; - while(entry->is_sub_map) { - vm_map_offset_t sub_start; - vm_map_offset_t sub_end; - vm_map_offset_t local_end; - - if (entry->in_transition) { - /* - * Say that we are waiting, and wait for entry. 
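The destination address is truncated down and its end rounded up with the destination map's own page mask before any clipping, so entries are only ever split on that map's page boundaries. The trunc/round helpers are plain mask arithmetic; a minimal sketch, assuming a power-of-two page size so the mask is page_size - 1 (16K in the example below).

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_offset_t;

/* round down / round up to a page boundary; mask is page_size - 1 */
static inline demo_offset_t
demo_trunc_page(demo_offset_t addr, demo_offset_t page_mask)
{
    return addr & ~page_mask;
}

static inline demo_offset_t
demo_round_page(demo_offset_t addr, demo_offset_t page_mask)
{
    return (addr + page_mask) & ~page_mask;
}

int
main(void)
{
    demo_offset_t mask = 0x3fff;        /* 16K pages */

    assert(demo_trunc_page(0x14005, mask) == 0x14000);
    assert(demo_round_page(0x14005, mask) == 0x18000);
    assert(demo_round_page(0x18000, mask) == 0x18000);  /* already aligned */
    printf("trunc/round behave as expected\n");
    return 0;
}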
- */ - entry->needs_wakeup = TRUE; - vm_map_entry_wait(dst_map, THREAD_UNINT); - - goto start_pass_1; + if (submap_end < entry->vme_end) { + remove_size -= + entry->vme_end - submap_end; + } + if (entry->is_sub_map) { + vm_map_submap_pmap_clean( + sub_map, + start, + start + remove_size, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } else { + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0 && + VME_OBJECT(entry) != NULL) { + vm_object_pmap_protect_options( + VME_OBJECT(entry), + (VME_OFFSET(entry) + + offset - + entry->vme_start), + remove_size, + PMAP_NULL, + PAGE_SIZE, + entry->vme_start, + VM_PROT_NONE, + PMAP_OPTIONS_REMOVE); + } else { + pmap_remove(map->pmap, + (addr64_t)start, + (addr64_t)(start + remove_size)); } + } + } - encountered_sub_map = TRUE; - sub_start = entry->offset; - - if(entry->vme_end < dst_end) - sub_end = entry->vme_end; - else - sub_end = dst_end; - sub_end -= entry->vme_start; - sub_end += entry->offset; - local_end = entry->vme_end; - vm_map_unlock(dst_map); - - result = vm_map_overwrite_submap_recurse( - entry->object.sub_map, - sub_start, - sub_end - sub_start); + entry = entry->vme_next; - if(result != KERN_SUCCESS) - return result; - if (dst_end <= entry->vme_end) - return KERN_SUCCESS; - vm_map_lock(dst_map); - if(!vm_map_lookup_entry(dst_map, local_end, - &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + while ((entry != vm_map_to_entry(sub_map)) + && (entry->vme_start < submap_end)) { + remove_size = (entry->vme_end - entry->vme_start); + if (submap_end < entry->vme_end) { + remove_size -= entry->vme_end - submap_end; + } + if (entry->is_sub_map) { + vm_map_submap_pmap_clean( + sub_map, + (start + entry->vme_start) - offset, + ((start + entry->vme_start) - offset) + remove_size, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } else { + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0 && + VME_OBJECT(entry) != NULL) { + vm_object_pmap_protect_options( + VME_OBJECT(entry), + VME_OFFSET(entry), + remove_size, + PMAP_NULL, + PAGE_SIZE, + entry->vme_start, + VM_PROT_NONE, + PMAP_OPTIONS_REMOVE); + } else { + pmap_remove(map->pmap, + (addr64_t)((start + entry->vme_start) + - offset), + (addr64_t)(((start + entry->vme_start) + - offset) + remove_size)); } - entry = tmp_entry; - next = entry->vme_next; } + entry = entry->vme_next; + } + vm_map_unlock_read(sub_map); + return; +} - if ( ! (entry->protection & VM_PROT_WRITE)) { - vm_map_unlock(dst_map); - return(KERN_PROTECTION_FAILURE); - } +/* + * virt_memory_guard_ast: + * + * Handle the AST callout for a virtual memory guard. + * raise an EXC_GUARD exception and terminate the task + * if configured to do so. + */ +void +virt_memory_guard_ast( + thread_t thread, + mach_exception_data_type_t code, + mach_exception_data_type_t subcode) +{ + task_t task = thread->task; + assert(task != kernel_task); + assert(task == current_task()); + uint32_t behavior; - /* - * If the entry is in transition, we must wait - * for it to exit that state. Anything could happen - * when we unlock the map, so start over. - */ - if (entry->in_transition) { + behavior = task->task_exc_guard; - /* - * Say that we are waiting, and wait for entry. 
- */ - entry->needs_wakeup = TRUE; - vm_map_entry_wait(dst_map, THREAD_UNINT); + /* Is delivery enabled */ + if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) { + return; + } - goto start_pass_1; - } + /* If only once, make sure we're that once */ + while (behavior & TASK_EXC_GUARD_VM_ONCE) { + uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER; -/* - * our range is contained completely within this map entry - */ - if (dst_end <= entry->vme_end) { - vm_map_unlock(dst_map); - return KERN_SUCCESS; - } -/* - * check that range specified is contiguous region - */ - if ((next == vm_map_to_entry(dst_map)) || - (next->vme_start != entry->vme_end)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) { + break; } - - /* - * Check for permanent objects in the destination. - */ - if ((entry->object.vm_object != VM_OBJECT_NULL) && - ((!entry->object.vm_object->internal) || - (entry->object.vm_object->true_share))) { - if(encountered_sub_map) { - vm_map_unlock(dst_map); - return(KERN_FAILURE); - } + behavior = task->task_exc_guard; + if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) { + return; } + } + /* Raise exception via corpse fork or synchronously */ + if ((task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) && + (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) == 0) { + task_violated_guard(code, subcode, NULL); + } else { + task_exception_notify(EXC_GUARD, code, subcode); + } - entry = next; - }/* for */ - vm_map_unlock(dst_map); - return(KERN_SUCCESS); + /* Terminate the task if desired */ + if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) { + task_bsdtask_kill(current_task()); + } } /* - * Routine: vm_map_copy_overwrite - * - * Description: - * Copy the memory described by the map copy - * object (copy; returned by vm_map_copyin) onto - * the specified destination region (dst_map, dst_addr). - * The destination must be writeable. - * - * Unlike vm_map_copyout, this routine actually - * writes over previously-mapped memory. If the - * previous mapping was to a permanent (user-supplied) - * memory object, it is preserved. - * - * The attributes (protection and inheritance) of the - * destination region are preserved. - * - * If successful, consumes the copy object. - * Otherwise, the caller is responsible for it. - * - * Implementation notes: - * To overwrite aligned temporary virtual memory, it is - * sufficient to remove the previous mapping and insert - * the new copy. This replacement is done either on - * the whole region (if no permanent virtual memory - * objects are embedded in the destination region) or - * in individual map entries. - * - * To overwrite permanent virtual memory , it is necessary - * to copy each page, as the external memory management - * interface currently does not provide any optimizations. - * - * Unaligned memory also has to be copied. It is possible - * to use 'vm_trickery' to copy the aligned data. This is - * not done but not hard to implement. + * vm_map_guard_exception: * - * Once a page of permanent memory has been overwritten, - * it is impossible to interrupt this function; otherwise, - * the call would be neither atomic nor location-independent. - * The kernel-state portion of a user thread must be - * interruptible. + * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception. * - * It may be expensive to forward all requests that might - * overwrite permanent memory (vm_write, vm_copy) to - * uninterruptible kernel threads. 
This routine may be - * called by interruptible threads; however, success is - * not guaranteed -- if the request cannot be performed - * atomically and interruptibly, an error indication is - * returned. + * Right now, we do this when we find nothing mapped, or a + * gap in the mapping when a user address space deallocate + * was requested. We report the address of the first gap found. */ +static void +vm_map_guard_exception( + vm_map_offset_t gap_start, + unsigned reason) +{ + mach_exception_code_t code = 0; + unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY; + unsigned int target = 0; /* should we pass in pid associated with map? */ + mach_exception_data_type_t subcode = (uint64_t)gap_start; + boolean_t fatal = FALSE; -static kern_return_t -vm_map_copy_overwrite_nested( - vm_map_t dst_map, - vm_map_address_t dst_addr, - vm_map_copy_t copy, - boolean_t interruptible, - pmap_t pmap, - boolean_t discard_on_success) -{ - vm_map_offset_t dst_end; - vm_map_entry_t tmp_entry; - vm_map_entry_t entry; - kern_return_t kr; - boolean_t aligned = TRUE; - boolean_t contains_permanent_objects = FALSE; - boolean_t encountered_sub_map = FALSE; - vm_map_offset_t base_addr; - vm_map_size_t copy_size; - vm_map_size_t total_size; - - - /* - * Check for null copy object. - */ - - if (copy == VM_MAP_COPY_NULL) - return(KERN_SUCCESS); - - /* - * Check for special kernel buffer allocated - * by new_ipc_kmsg_copyin. - */ + task_t task = current_task(); - if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { - return(vm_map_copyout_kernel_buffer( - dst_map, &dst_addr, - copy, TRUE, discard_on_success)); + /* Can't deliver exceptions to kernel task */ + if (task == kernel_task) { + return; } - /* - * Only works for entry lists at the moment. Will - * support page lists later. - */ - - assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + EXC_GUARD_ENCODE_TYPE(code, guard_type); + EXC_GUARD_ENCODE_FLAVOR(code, reason); + EXC_GUARD_ENCODE_TARGET(code, target); - if (copy->size == 0) { - if (discard_on_success) - vm_map_copy_discard(copy); - return(KERN_SUCCESS); + if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) { + fatal = TRUE; } + thread_guard_violation(current_thread(), code, subcode, fatal); +} - /* - * Verify that the destination is all writeable - * initially. We have to trunc the destination - * address and round the copy size or we'll end up - * splitting entries in strange ways. - */ - - if (!VM_MAP_PAGE_ALIGNED(copy->size, - VM_MAP_PAGE_MASK(dst_map)) || - !VM_MAP_PAGE_ALIGNED(copy->offset, - VM_MAP_PAGE_MASK(dst_map)) || - !VM_MAP_PAGE_ALIGNED(dst_addr, - VM_MAP_PAGE_MASK(dst_map))) - { - aligned = FALSE; - dst_end = vm_map_round_page(dst_addr + copy->size, - VM_MAP_PAGE_MASK(dst_map)); +/* + * vm_map_delete: [ internal use only ] + * + * Deallocates the given address range from the target map. + * Removes all user wirings. Unwires one kernel wiring if + * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go + * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps + * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set. + * + * This routine is called with map locked and leaves map locked. 
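vm_map_guard_exception() folds the guard type, the reason (flavor) and a target id into the single 64-bit exception code, and ships the offending address separately as the subcode. The sketch below shows that style of bit packing; the field widths, shift positions and the numeric values standing in for GUARD_TYPE_VIRT_MEMORY and kGUARD_EXC_DEALLOC_GAP are illustrative only, and the real layout is whatever the EXC_GUARD_ENCODE_* macros define.

#include <stdint.h>
#include <stdio.h>

/* Illustrative field layout: 3-bit type, 29-bit flavor, 32-bit target. */
#define DEMO_ENCODE_TYPE(code, t)    ((code) |= ((uint64_t)((t) & 0x7) << 61))
#define DEMO_ENCODE_FLAVOR(code, f)  ((code) |= ((uint64_t)((f) & 0x1fffffff) << 32))
#define DEMO_ENCODE_TARGET(code, tg) ((code) |= ((uint64_t)(tg) & 0xffffffffULL))

int
main(void)
{
    uint64_t code = 0;
    uint64_t subcode = 0x16fc0c000ULL;  /* e.g. the first gap address found */

    DEMO_ENCODE_TYPE(code, 5);          /* illustrative stand-in for the VM guard type */
    DEMO_ENCODE_FLAVOR(code, 4);        /* illustrative stand-in for the dealloc-gap reason */
    DEMO_ENCODE_TARGET(code, 0);        /* target/pid, left 0 as in the code here */

    printf("EXC_GUARD-style code=0x%016llx subcode=0x%llx\n",
        (unsigned long long)code, (unsigned long long)subcode);
    return 0;
}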
+ */ +static kern_return_t +vm_map_delete( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + int flags, + vm_map_t zap_map) +{ + vm_map_entry_t entry, next; + struct vm_map_entry *first_entry, tmp_entry; + vm_map_offset_t s; + vm_object_t object; + boolean_t need_wakeup; + unsigned int last_timestamp = ~0; /* unlikely value */ + int interruptible; + vm_map_offset_t gap_start; + __unused vm_map_offset_t save_start = start; + __unused vm_map_offset_t save_end = end; + const vm_map_offset_t FIND_GAP = 1; /* a not page aligned value */ + const vm_map_offset_t GAPS_OK = 2; /* a different not page aligned value */ + + if (map != kernel_map && !(flags & VM_MAP_REMOVE_GAPS_OK) && !map->terminated) { + gap_start = FIND_GAP; } else { - dst_end = dst_addr + copy->size; + gap_start = GAPS_OK; } - vm_map_lock(dst_map); + interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ? + THREAD_ABORTSAFE : THREAD_UNINT; - /* LP64todo - remove this check when vm_map_commpage64() - * no longer has to stuff in a map_entry for the commpage - * above the map's max_offset. + /* + * All our DMA I/O operations in IOKit are currently done by + * wiring through the map entries of the task requesting the I/O. + * Because of this, we must always wait for kernel wirings + * to go away on the entries before deleting them. + * + * Any caller who wants to actually remove a kernel wiring + * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to + * properly remove one wiring instead of blasting through + * them all. */ - if (dst_addr >= dst_map->max_offset) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } - -start_pass_1: - if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } - vm_map_clip_start(dst_map, - tmp_entry, - vm_map_trunc_page(dst_addr, - VM_MAP_PAGE_MASK(dst_map))); - for (entry = tmp_entry;;) { - vm_map_entry_t next = entry->vme_next; - - while(entry->is_sub_map) { - vm_map_offset_t sub_start; - vm_map_offset_t sub_end; - vm_map_offset_t local_end; - - if (entry->in_transition) { - - /* - * Say that we are waiting, and wait for entry. - */ - entry->needs_wakeup = TRUE; - vm_map_entry_wait(dst_map, THREAD_UNINT); + flags |= VM_MAP_REMOVE_WAIT_FOR_KWIRE; - goto start_pass_1; + while (1) { + /* + * Find the start of the region, and clip it + */ + if (vm_map_lookup_entry(map, start, &first_entry)) { + entry = first_entry; + if (map == kalloc_map && + (entry->vme_start != start || + entry->vme_end != end)) { + panic("vm_map_delete(%p,0x%llx,0x%llx): " + "mismatched entry %p [0x%llx:0x%llx]\n", + map, + (uint64_t)start, + (uint64_t)end, + entry, + (uint64_t)entry->vme_start, + (uint64_t)entry->vme_end); } - local_end = entry->vme_end; - if (!(entry->needs_copy)) { - /* if needs_copy we are a COW submap */ - /* in such a case we just replace so */ - /* there is no need for the follow- */ - /* ing check. */ - encountered_sub_map = TRUE; - sub_start = entry->offset; - - if(entry->vme_end < dst_end) - sub_end = entry->vme_end; - else - sub_end = dst_end; - sub_end -= entry->vme_start; - sub_end += entry->offset; - vm_map_unlock(dst_map); - - kr = vm_map_overwrite_submap_recurse( - entry->object.sub_map, - sub_start, - sub_end - sub_start); - if(kr != KERN_SUCCESS) - return kr; - vm_map_lock(dst_map); + /* + * If in a superpage, extend the range to include the start of the mapping. 
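gap_start does double duty in vm_map_delete(): FIND_GAP and GAPS_OK are deliberately non-page-aligned values, so they can never be mistaken for a real gap address, and only the first gap found is ever recorded. A self-contained sketch of that sentinel technique; the demo names and addresses are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_offset_t;

/* Deliberately not page aligned, so they can never collide with a real gap address. */
static const demo_offset_t DEMO_FIND_GAP = 1;
static const demo_offset_t DEMO_GAPS_OK  = 2;

static void
demo_note_gap(demo_offset_t *gap_start, demo_offset_t where)
{
    if (*gap_start == DEMO_FIND_GAP) {
        *gap_start = where;             /* remember only the first gap found */
    }
}

int
main(void)
{
    demo_offset_t gap_start = DEMO_FIND_GAP;        /* caller wants gaps reported */

    demo_note_gap(&gap_start, 0x7f0000000000ULL);
    demo_note_gap(&gap_start, 0x7f0000004000ULL);   /* ignored: one already recorded */

    bool should_report = (gap_start != DEMO_FIND_GAP && gap_start != DEMO_GAPS_OK);
    printf("report=%d first_gap=0x%llx\n", (int)should_report,
        (unsigned long long)gap_start);
    return 0;
}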
+ */ + if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { + start = SUPERPAGE_ROUND_DOWN(start); + continue; } - if (dst_end <= entry->vme_end) - goto start_overwrite; - if(!vm_map_lookup_entry(dst_map, local_end, - &entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + if (start == entry->vme_start) { + /* + * No need to clip. We don't want to cause + * any unnecessary unnesting in this case... + */ + } else { + if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && + entry->map_aligned && + !VM_MAP_PAGE_ALIGNED( + start, + VM_MAP_PAGE_MASK(map))) { + /* + * The entry will no longer be + * map-aligned after clipping + * and the caller said it's OK. + */ + entry->map_aligned = FALSE; + } + if (map == kalloc_map) { + panic("vm_map_delete(%p,0x%llx,0x%llx):" + " clipping %p at 0x%llx\n", + map, + (uint64_t)start, + (uint64_t)end, + entry, + (uint64_t)start); + } + vm_map_clip_start(map, entry, start); } - next = entry->vme_next; - } - if ( ! (entry->protection & VM_PROT_WRITE)) { - vm_map_unlock(dst_map); - return(KERN_PROTECTION_FAILURE); + /* + * Fix the lookup hint now, rather than each + * time through the loop. + */ + SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + } else { + if (map->pmap == kernel_pmap && + os_ref_get_count(&map->map_refcnt) != 0) { + panic("vm_map_delete(%p,0x%llx,0x%llx): " + "no map entry at 0x%llx\n", + map, + (uint64_t)start, + (uint64_t)end, + (uint64_t)start); + } + entry = first_entry->vme_next; + if (gap_start == FIND_GAP) { + gap_start = start; + } } + break; + } + if (entry->superpage_size) { + end = SUPERPAGE_ROUND_UP(end); + } + need_wakeup = FALSE; + /* + * Step through all entries in this region + */ + s = entry->vme_start; + while ((entry != vm_map_to_entry(map)) && (s < end)) { /* - * If the entry is in transition, we must wait - * for it to exit that state. Anything could happen - * when we unlock the map, so start over. + * At this point, we have deleted all the memory entries + * between "start" and "s". We still need to delete + * all memory entries between "s" and "end". + * While we were blocked and the map was unlocked, some + * new memory entries could have been re-allocated between + * "start" and "s" and we don't want to mess with those. + * Some of those entries could even have been re-assembled + * with an entry after "s" (in vm_map_simplify_entry()), so + * we may have to vm_map_clip_start() again. */ - if (entry->in_transition) { - - /* - * Say that we are waiting, and wait for entry. - */ - entry->needs_wakeup = TRUE; - vm_map_entry_wait(dst_map, THREAD_UNINT); - goto start_pass_1; + if (entry->vme_start >= s) { + /* + * This entry starts on or after "s" + * so no need to clip its start. + */ + } else { + /* + * This entry has been re-assembled by a + * vm_map_simplify_entry(). We need to + * re-clip its start. + */ + if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && + entry->map_aligned && + !VM_MAP_PAGE_ALIGNED(s, + VM_MAP_PAGE_MASK(map))) { + /* + * The entry will no longer be map-aligned + * after clipping and the caller said it's OK. 
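vm_map_clip_start()/vm_map_clip_end() split one entry in two at the requested boundary so that only the portion inside the range being removed is affected; the real versions also fix up object offsets, the map store and the hints, which this minimal sketch leaves out. The demo_* names are stand-ins, not XNU's types.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

typedef uint64_t demo_offset_t;

struct demo_entry {
    demo_offset_t      vme_start;
    demo_offset_t      vme_end;
    struct demo_entry *vme_next;
};

/*
 * Split *e at addr (vme_start < addr < vme_end): *e keeps [vme_start, addr),
 * the new entry covers [addr, vme_end) and is linked right after it.
 */
static struct demo_entry *
demo_clip(struct demo_entry *e, demo_offset_t addr)
{
    assert(addr > e->vme_start && addr < e->vme_end);

    struct demo_entry *tail = malloc(sizeof(*tail));
    assert(tail != NULL);

    *tail = *e;                 /* inherit the remaining fields */
    tail->vme_start = addr;
    e->vme_end = addr;
    e->vme_next = tail;
    return tail;
}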
+ */ + entry->map_aligned = FALSE; + } + if (map == kalloc_map) { + panic("vm_map_delete(%p,0x%llx,0x%llx): " + "clipping %p at 0x%llx\n", + map, + (uint64_t)start, + (uint64_t)end, + entry, + (uint64_t)s); + } + vm_map_clip_start(map, entry, s); } - -/* - * our range is contained completely within this map entry - */ - if (dst_end <= entry->vme_end) - break; -/* - * check that range specified is contiguous region - */ - if ((next == vm_map_to_entry(dst_map)) || - (next->vme_start != entry->vme_end)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + if (entry->vme_end <= end) { + /* + * This entry is going away completely, so no need + * to clip and possibly cause an unnecessary unnesting. + */ + } else { + if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) && + entry->map_aligned && + !VM_MAP_PAGE_ALIGNED(end, + VM_MAP_PAGE_MASK(map))) { + /* + * The entry will no longer be map-aligned + * after clipping and the caller said it's OK. + */ + entry->map_aligned = FALSE; + } + if (map == kalloc_map) { + panic("vm_map_delete(%p,0x%llx,0x%llx): " + "clipping %p at 0x%llx\n", + map, + (uint64_t)start, + (uint64_t)end, + entry, + (uint64_t)end); + } + vm_map_clip_end(map, entry, end); } - - /* - * Check for permanent objects in the destination. - */ - if ((entry->object.vm_object != VM_OBJECT_NULL) && - ((!entry->object.vm_object->internal) || - (entry->object.vm_object->true_share))) { - contains_permanent_objects = TRUE; + if (entry->permanent) { + if (map->pmap == kernel_pmap) { + panic("%s(%p,0x%llx,0x%llx): " + "attempt to remove permanent " + "VM map entry " + "%p [0x%llx:0x%llx]\n", + __FUNCTION__, + map, + (uint64_t) start, + (uint64_t) end, + entry, + (uint64_t) entry->vme_start, + (uint64_t) entry->vme_end); + } else if (flags & VM_MAP_REMOVE_IMMUTABLE) { +// printf("FBDP %d[%s] removing permanent entry %p [0x%llx:0x%llx] prot 0x%x/0x%x\n", proc_selfpid(), (current_task()->bsd_info ? proc_name_address(current_task()->bsd_info) : "?"), entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection); + entry->permanent = FALSE; + } else { + if (vm_map_executable_immutable_verbose) { + printf("%d[%s] %s(0x%llx,0x%llx): " + "permanent entry [0x%llx:0x%llx] " + "prot 0x%x/0x%x\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__, + (uint64_t) start, + (uint64_t) end, + (uint64_t)entry->vme_start, + (uint64_t)entry->vme_end, + entry->protection, + entry->max_protection); + } + /* + * dtrace -n 'vm_map_delete_permanent { print("start=0x%llx end=0x%llx prot=0x%x/0x%x\n", arg0, arg1, arg2, arg3); stack(); ustack(); }' + */ + DTRACE_VM5(vm_map_delete_permanent, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + vm_prot_t, entry->protection, + vm_prot_t, entry->max_protection, + int, VME_ALIAS(entry)); + } } - entry = next; - }/* for */ -start_overwrite: - /* - * If there are permanent objects in the destination, then - * the copy cannot be interrupted. - */ + if (entry->in_transition) { + wait_result_t wait_result; - if (interruptible && contains_permanent_objects) { - vm_map_unlock(dst_map); - return(KERN_FAILURE); /* XXX */ - } + /* + * Another thread is wiring/unwiring this entry. + * Let the other thread know we are waiting. 
+ */ + assert(s == entry->vme_start); + entry->needs_wakeup = TRUE; - /* - * - * Make a second pass, overwriting the data - * At the beginning of each loop iteration, - * the next entry to be overwritten is "tmp_entry" - * (initially, the value returned from the lookup above), - * and the starting address expected in that entry - * is "start". - */ + /* + * wake up anybody waiting on entries that we have + * already unwired/deleted. + */ + if (need_wakeup) { + vm_map_entry_wakeup(map); + need_wakeup = FALSE; + } - total_size = copy->size; - if(encountered_sub_map) { - copy_size = 0; - /* re-calculate tmp_entry since we've had the map */ - /* unlocked */ - if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } - } else { - copy_size = copy->size; - } - - base_addr = dst_addr; - while(TRUE) { - /* deconstruct the copy object and do in parts */ - /* only in sub_map, interruptable case */ - vm_map_entry_t copy_entry; - vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL; - vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL; - int nentries; - int remaining_entries = 0; - vm_map_offset_t new_offset = 0; - - for (entry = tmp_entry; copy_size == 0;) { - vm_map_entry_t next; + wait_result = vm_map_entry_wait(map, interruptible); - next = entry->vme_next; + if (interruptible && + wait_result == THREAD_INTERRUPTED) { + /* + * We do not clear the needs_wakeup flag, + * since we cannot tell if we were the only one. + */ + return KERN_ABORTED; + } - /* tmp_entry and base address are moved along */ - /* each time we encounter a sub-map. Otherwise */ - /* entry can outpase tmp_entry, and the copy_size */ - /* may reflect the distance between them */ - /* if the current entry is found to be in transition */ - /* we will start over at the beginning or the last */ - /* encounter of a submap as dictated by base_addr */ - /* we will zero copy_size accordingly. */ - if (entry->in_transition) { - /* - * Say that we are waiting, and wait for entry. - */ - entry->needs_wakeup = TRUE; - vm_map_entry_wait(dst_map, THREAD_UNINT); - - if(!vm_map_lookup_entry(dst_map, base_addr, - &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + /* + * The entry could have been clipped or it + * may not exist anymore. Look it up again. 
+ */ + if (!vm_map_lookup_entry(map, s, &first_entry)) { + /* + * User: use the next entry + */ + if (gap_start == FIND_GAP) { + gap_start = s; } - copy_size = 0; - entry = tmp_entry; - continue; + entry = first_entry->vme_next; + s = entry->vme_start; + } else { + entry = first_entry; + SAVE_HINT_MAP_WRITE(map, entry->vme_prev); } - if(entry->is_sub_map) { - vm_map_offset_t sub_start; - vm_map_offset_t sub_end; - vm_map_offset_t local_end; + last_timestamp = map->timestamp; + continue; + } /* end in_transition */ - if (entry->needs_copy) { - /* if this is a COW submap */ - /* just back the range with a */ - /* anonymous entry */ - if(entry->vme_end < dst_end) - sub_end = entry->vme_end; - else - sub_end = dst_end; - if(entry->vme_start < base_addr) - sub_start = base_addr; - else - sub_start = entry->vme_start; - vm_map_clip_end( - dst_map, entry, sub_end); - vm_map_clip_start( - dst_map, entry, sub_start); - assert(!entry->use_pmap); - entry->is_sub_map = FALSE; - vm_map_deallocate( - entry->object.sub_map); - entry->object.sub_map = NULL; - entry->is_shared = FALSE; - entry->needs_copy = FALSE; - entry->offset = 0; - /* - * XXX FBDP - * We should propagate the protections - * of the submap entry here instead - * of forcing them to VM_PROT_ALL... - * Or better yet, we should inherit - * the protection of the copy_entry. - */ - entry->protection = VM_PROT_ALL; - entry->max_protection = VM_PROT_ALL; - entry->wired_count = 0; - entry->user_wired_count = 0; - if(entry->inheritance - == VM_INHERIT_SHARE) - entry->inheritance = VM_INHERIT_COPY; - continue; - } - /* first take care of any non-sub_map */ - /* entries to send */ - if(base_addr < entry->vme_start) { - /* stuff to send */ - copy_size = - entry->vme_start - base_addr; - break; - } - sub_start = entry->offset; + if (entry->wired_count) { + boolean_t user_wire; - if(entry->vme_end < dst_end) - sub_end = entry->vme_end; - else - sub_end = dst_end; - sub_end -= entry->vme_start; - sub_end += entry->offset; - local_end = entry->vme_end; - vm_map_unlock(dst_map); - copy_size = sub_end - sub_start; + user_wire = entry->user_wired_count > 0; - /* adjust the copy object */ - if (total_size > copy_size) { - vm_map_size_t local_size = 0; - vm_map_size_t entry_size; + /* + * Remove a kernel wiring if requested + */ + if (flags & VM_MAP_REMOVE_KUNWIRE) { + entry->wired_count--; + } - nentries = 1; - new_offset = copy->offset; - copy_entry = vm_map_copy_first_entry(copy); - while(copy_entry != - vm_map_copy_to_entry(copy)){ - entry_size = copy_entry->vme_end - - copy_entry->vme_start; - if((local_size < copy_size) && - ((local_size + entry_size) - >= copy_size)) { - vm_map_copy_clip_end(copy, - copy_entry, - copy_entry->vme_start + - (copy_size - local_size)); - entry_size = copy_entry->vme_end - - copy_entry->vme_start; - local_size += entry_size; - new_offset += entry_size; - } - if(local_size >= copy_size) { - next_copy = copy_entry->vme_next; - copy_entry->vme_next = - vm_map_copy_to_entry(copy); - previous_prev = - copy->cpy_hdr.links.prev; - copy->cpy_hdr.links.prev = copy_entry; - copy->size = copy_size; - remaining_entries = - copy->cpy_hdr.nentries; - remaining_entries -= nentries; - copy->cpy_hdr.nentries = nentries; - break; - } else { - local_size += entry_size; - new_offset += entry_size; - nentries++; - } - copy_entry = copy_entry->vme_next; - } - } - - if((entry->use_pmap) && (pmap == NULL)) { - kr = vm_map_copy_overwrite_nested( - entry->object.sub_map, - sub_start, - copy, - interruptible, - entry->object.sub_map->pmap, - TRUE); 
- } else if (pmap != NULL) { - kr = vm_map_copy_overwrite_nested( - entry->object.sub_map, - sub_start, - copy, - interruptible, pmap, - TRUE); - } else { - kr = vm_map_copy_overwrite_nested( - entry->object.sub_map, - sub_start, - copy, - interruptible, - dst_map->pmap, - TRUE); - } - if(kr != KERN_SUCCESS) { - if(next_copy != NULL) { - copy->cpy_hdr.nentries += - remaining_entries; - copy->cpy_hdr.links.prev->vme_next = - next_copy; - copy->cpy_hdr.links.prev - = previous_prev; - copy->size = total_size; - } - return kr; - } - if (dst_end <= local_end) { - return(KERN_SUCCESS); + /* + * Remove all user wirings for proper accounting + */ + if (entry->user_wired_count > 0) { + while (entry->user_wired_count) { + subtract_wire_counts(map, entry, user_wire); } - /* otherwise copy no longer exists, it was */ - /* destroyed after successful copy_overwrite */ - copy = (vm_map_copy_t) - zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = - vm_map_copy_to_entry(copy); - copy->type = VM_MAP_COPY_ENTRY_LIST; - copy->offset = new_offset; + } + if (entry->wired_count != 0) { + assert(map != kernel_map); /* - * XXX FBDP - * this does not seem to deal with - * the VM map store (R&B tree) + * Cannot continue. Typical case is when + * a user thread has physical io pending on + * on this page. Either wait for the + * kernel wiring to go away or return an + * error. */ + if (flags & VM_MAP_REMOVE_WAIT_FOR_KWIRE) { + wait_result_t wait_result; - total_size -= copy_size; - copy_size = 0; - /* put back remainder of copy in container */ - if(next_copy != NULL) { - copy->cpy_hdr.nentries = remaining_entries; - copy->cpy_hdr.links.next = next_copy; - copy->cpy_hdr.links.prev = previous_prev; - copy->size = total_size; - next_copy->vme_prev = - vm_map_copy_to_entry(copy); - next_copy = NULL; - } - base_addr = local_end; - vm_map_lock(dst_map); - if(!vm_map_lookup_entry(dst_map, - local_end, &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + assert(s == entry->vme_start); + entry->needs_wakeup = TRUE; + wait_result = vm_map_entry_wait(map, + interruptible); + + if (interruptible && + wait_result == THREAD_INTERRUPTED) { + /* + * We do not clear the + * needs_wakeup flag, since we + * cannot tell if we were the + * only one. + */ + return KERN_ABORTED; + } + + /* + * The entry could have been clipped or + * it may not exist anymore. Look it + * up again. + */ + if (!vm_map_lookup_entry(map, s, + &first_entry)) { + assert(map != kernel_map); + /* + * User: use the next entry + */ + if (gap_start == FIND_GAP) { + gap_start = s; + } + entry = first_entry->vme_next; + s = entry->vme_start; + } else { + entry = first_entry; + SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + } + last_timestamp = map->timestamp; + continue; + } else { + return KERN_FAILURE; } - entry = tmp_entry; - continue; - } - if (dst_end <= entry->vme_end) { - copy_size = dst_end - base_addr; - break; } - if ((next == vm_map_to_entry(dst_map)) || - (next->vme_start != entry->vme_end)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } + entry->in_transition = TRUE; + /* + * copy current entry. see comment in vm_map_wire() + */ + tmp_entry = *entry; + assert(s == entry->vme_start); - entry = next; - }/* for */ + /* + * We can unlock the map now. The in_transition + * state guarentees existance of the entry. 
+ */ + vm_map_unlock(map); - next_copy = NULL; - nentries = 1; + if (tmp_entry.is_sub_map) { + vm_map_t sub_map; + vm_map_offset_t sub_start, sub_end; + pmap_t pmap; + vm_map_offset_t pmap_addr; - /* adjust the copy object */ - if (total_size > copy_size) { - vm_map_size_t local_size = 0; - vm_map_size_t entry_size; - new_offset = copy->offset; - copy_entry = vm_map_copy_first_entry(copy); - while(copy_entry != vm_map_copy_to_entry(copy)) { - entry_size = copy_entry->vme_end - - copy_entry->vme_start; - if((local_size < copy_size) && - ((local_size + entry_size) - >= copy_size)) { - vm_map_copy_clip_end(copy, copy_entry, - copy_entry->vme_start + - (copy_size - local_size)); - entry_size = copy_entry->vme_end - - copy_entry->vme_start; - local_size += entry_size; - new_offset += entry_size; - } - if(local_size >= copy_size) { - next_copy = copy_entry->vme_next; - copy_entry->vme_next = - vm_map_copy_to_entry(copy); - previous_prev = - copy->cpy_hdr.links.prev; - copy->cpy_hdr.links.prev = copy_entry; - copy->size = copy_size; - remaining_entries = - copy->cpy_hdr.nentries; - remaining_entries -= nentries; - copy->cpy_hdr.nentries = nentries; - break; + sub_map = VME_SUBMAP(&tmp_entry); + sub_start = VME_OFFSET(&tmp_entry); + sub_end = sub_start + (tmp_entry.vme_end - + tmp_entry.vme_start); + if (tmp_entry.use_pmap) { + pmap = sub_map->pmap; + pmap_addr = tmp_entry.vme_start; } else { - local_size += entry_size; - new_offset += entry_size; - nentries++; + pmap = map->pmap; + pmap_addr = tmp_entry.vme_start; } - copy_entry = copy_entry->vme_next; + (void) vm_map_unwire_nested(sub_map, + sub_start, sub_end, + user_wire, + pmap, pmap_addr); + } else { + if (VME_OBJECT(&tmp_entry) == kernel_object) { + pmap_protect_options( + map->pmap, + tmp_entry.vme_start, + tmp_entry.vme_end, + VM_PROT_NONE, + PMAP_OPTIONS_REMOVE, + NULL); + } + vm_fault_unwire(map, &tmp_entry, + VME_OBJECT(&tmp_entry) == kernel_object, + map->pmap, tmp_entry.vme_start); } - } - if (aligned) { - pmap_t local_pmap; + vm_map_lock(map); - if(pmap) - local_pmap = pmap; - else - local_pmap = dst_map->pmap; + if (last_timestamp + 1 != map->timestamp) { + /* + * Find the entry again. It could have + * been clipped after we unlocked the map. + */ + if (!vm_map_lookup_entry(map, s, &first_entry)) { + assert((map != kernel_map) && + (!entry->is_sub_map)); + if (gap_start == FIND_GAP) { + gap_start = s; + } + first_entry = first_entry->vme_next; + s = first_entry->vme_start; + } else { + SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + } + } else { + SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + first_entry = entry; + } - if ((kr = vm_map_copy_overwrite_aligned( - dst_map, tmp_entry, copy, - base_addr, local_pmap)) != KERN_SUCCESS) { - if(next_copy != NULL) { - copy->cpy_hdr.nentries += - remaining_entries; - copy->cpy_hdr.links.prev->vme_next = - next_copy; - copy->cpy_hdr.links.prev = - previous_prev; - copy->size += copy_size; + last_timestamp = map->timestamp; + + entry = first_entry; + while ((entry != vm_map_to_entry(map)) && + (entry->vme_start < tmp_entry.vme_end)) { + assert(entry->in_transition); + entry->in_transition = FALSE; + if (entry->needs_wakeup) { + entry->needs_wakeup = FALSE; + need_wakeup = TRUE; } - return kr; + entry = entry->vme_next; } - vm_map_unlock(dst_map); - } else { /* - * Performance gain: - * - * if the copy and dst address are misaligned but the same - * offset within the page we can copy_not_aligned the - * misaligned parts and copy aligned the rest. 
If they are - * aligned but len is unaligned we simply need to copy - * the end bit unaligned. We'll need to split the misaligned - * bits of the region in this case ! + * We have unwired the entry(s). Go back and + * delete them. */ - /* ALWAYS UNLOCKS THE dst_map MAP */ - kr = vm_map_copy_overwrite_unaligned( - dst_map, - tmp_entry, - copy, - base_addr, - discard_on_success); - if (kr != KERN_SUCCESS) { - if(next_copy != NULL) { - copy->cpy_hdr.nentries += - remaining_entries; - copy->cpy_hdr.links.prev->vme_next = - next_copy; - copy->cpy_hdr.links.prev = - previous_prev; - copy->size += copy_size; - } - return kr; - } - } - total_size -= copy_size; - if(total_size == 0) - break; - base_addr += copy_size; - copy_size = 0; - copy->offset = new_offset; - if(next_copy != NULL) { - copy->cpy_hdr.nentries = remaining_entries; - copy->cpy_hdr.links.next = next_copy; - copy->cpy_hdr.links.prev = previous_prev; - next_copy->vme_prev = vm_map_copy_to_entry(copy); - copy->size = total_size; - } - vm_map_lock(dst_map); - while(TRUE) { - if (!vm_map_lookup_entry(dst_map, - base_addr, &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } - if (tmp_entry->in_transition) { - entry->needs_wakeup = TRUE; - vm_map_entry_wait(dst_map, THREAD_UNINT); - } else { - break; - } + entry = first_entry; + continue; } - vm_map_clip_start(dst_map, - tmp_entry, - vm_map_trunc_page(base_addr, - VM_MAP_PAGE_MASK(dst_map))); - entry = tmp_entry; - } /* while */ + /* entry is unwired */ + assert(entry->wired_count == 0); + assert(entry->user_wired_count == 0); - /* - * Throw away the vm_map_copy object - */ - if (discard_on_success) - vm_map_copy_discard(copy); + assert(s == entry->vme_start); - return(KERN_SUCCESS); -}/* vm_map_copy_overwrite */ + if (flags & VM_MAP_REMOVE_NO_PMAP_CLEANUP) { + /* + * XXX with the VM_MAP_REMOVE_SAVE_ENTRIES flag to + * vm_map_delete(), some map entries might have been + * transferred to a "zap_map", which doesn't have a + * pmap. The original pmap has already been flushed + * in the vm_map_delete() call targeting the original + * map, but when we get to destroying the "zap_map", + * we don't have any pmap to flush, so let's just skip + * all this. + */ + } else if (entry->is_sub_map) { + assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map), + "map %p (%d) entry %p submap %p (%d)\n", + map, VM_MAP_PAGE_SHIFT(map), entry, + VME_SUBMAP(entry), + VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry))); + if (entry->use_pmap) { + assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) == VM_MAP_PAGE_SHIFT(map), + "map %p (%d) entry %p submap %p (%d)\n", + map, VM_MAP_PAGE_SHIFT(map), entry, + VME_SUBMAP(entry), + VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry))); +#ifndef NO_NESTED_PMAP + int pmap_flags; -kern_return_t -vm_map_copy_overwrite( - vm_map_t dst_map, - vm_map_offset_t dst_addr, - vm_map_copy_t copy, - boolean_t interruptible) -{ - vm_map_size_t head_size, tail_size; - vm_map_copy_t head_copy, tail_copy; - vm_map_offset_t head_addr, tail_addr; - vm_map_entry_t entry; - kern_return_t kr; + if (flags & VM_MAP_REMOVE_NO_UNNESTING) { + /* + * This is the final cleanup of the + * address space being terminated. + * No new mappings are expected and + * we don't really need to unnest the + * shared region (and lose the "global" + * pmap mappings, if applicable). + * + * Tell the pmap layer that we're + * "clean" wrt nesting. + */ + pmap_flags = PMAP_UNNEST_CLEAN; + } else { + /* + * We're unmapping part of the nested + * shared region, so we can't keep the + * nested pmap. 
+ */ + pmap_flags = 0; + } + pmap_unnest_options( + map->pmap, + (addr64_t)entry->vme_start, + entry->vme_end - entry->vme_start, + pmap_flags); +#endif /* NO_NESTED_PMAP */ + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0) { + /* clean up parent map/maps */ + vm_map_submap_pmap_clean( + map, entry->vme_start, + entry->vme_end, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } + } else { + vm_map_submap_pmap_clean( + map, entry->vme_start, entry->vme_end, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } + } else if (VME_OBJECT(entry) != kernel_object && + VME_OBJECT(entry) != compressor_object) { + object = VME_OBJECT(entry); + if (map->mapped_in_other_pmaps && + os_ref_get_count(&map->map_refcnt) != 0) { + vm_object_pmap_protect_options( + object, VME_OFFSET(entry), + entry->vme_end - entry->vme_start, + PMAP_NULL, + PAGE_SIZE, + entry->vme_start, + VM_PROT_NONE, + PMAP_OPTIONS_REMOVE); + } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) || + (map->pmap == kernel_pmap)) { + /* Remove translations associated + * with this range unless the entry + * does not have an object, or + * it's the kernel map or a descendant + * since the platform could potentially + * create "backdoor" mappings invisible + * to the VM. It is expected that + * objectless, non-kernel ranges + * do not have such VM invisible + * translations. + */ + pmap_remove_options(map->pmap, + (addr64_t)entry->vme_start, + (addr64_t)entry->vme_end, + PMAP_OPTIONS_REMOVE); + } + } - head_size = 0; - tail_size = 0; - head_copy = NULL; - tail_copy = NULL; - head_addr = 0; - tail_addr = 0; + if (entry->iokit_acct) { + /* alternate accounting */ + DTRACE_VM4(vm_map_iokit_unmapped_region, + vm_map_t, map, + vm_map_offset_t, entry->vme_start, + vm_map_offset_t, entry->vme_end, + int, VME_ALIAS(entry)); + vm_map_iokit_unmapped_region(map, + (entry->vme_end - + entry->vme_start)); + entry->iokit_acct = FALSE; + entry->use_pmap = FALSE; + } - if (interruptible || - copy == VM_MAP_COPY_NULL || - copy->type != VM_MAP_COPY_ENTRY_LIST) { /* - * We can't split the "copy" map if we're interruptible - * or if we don't have a "copy" map... + * All pmap mappings for this map entry must have been + * cleared by now. */ - blunt_copy: - return vm_map_copy_overwrite_nested(dst_map, - dst_addr, - copy, - interruptible, - (pmap_t) NULL, - TRUE); - } +#if DEBUG + assert(vm_map_pmap_is_empty(map, + entry->vme_start, + entry->vme_end)); +#endif /* DEBUG */ - if (copy->size < 3 * PAGE_SIZE) { - /* - * Too small to bother with optimizing... - */ - goto blunt_copy; - } + next = entry->vme_next; - if ((dst_addr & VM_MAP_PAGE_MASK(dst_map)) != - (copy->offset & VM_MAP_PAGE_MASK(dst_map))) { - /* - * Incompatible mis-alignment of source and destination... - */ - goto blunt_copy; - } + if (map->pmap == kernel_pmap && + os_ref_get_count(&map->map_refcnt) != 0 && + entry->vme_end < end && + (next == vm_map_to_entry(map) || + next->vme_start != entry->vme_end)) { + panic("vm_map_delete(%p,0x%llx,0x%llx): " + "hole after %p at 0x%llx\n", + map, + (uint64_t)start, + (uint64_t)end, + entry, + (uint64_t)entry->vme_end); + } - /* - * Proper alignment or identical mis-alignment at the beginning. - * Let's try and do a small unaligned copy first (if needed) - * and then an aligned copy for the rest. - */ - if (!page_aligned(dst_addr)) { - head_addr = dst_addr; - head_size = (VM_MAP_PAGE_SIZE(dst_map) - - (copy->offset & VM_MAP_PAGE_MASK(dst_map))); - } - if (!page_aligned(copy->offset + copy->size)) { /* - * Mis-alignment at the end. 
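The translation cleanup picks between two strategies: if the map is shared into other pmaps, it must go through the VM object so every pmap mapping those pages is visited; otherwise it can strip the local pmap directly. A simplified sketch of that decision, with logging stubs standing in for vm_object_pmap_protect_options() and pmap_remove_options(); the refcount/object conditions are condensed from the checks in the surrounding code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_addr_t;

/* Logging stubs standing in for the object-level and pmap-level removals. */
static void
demo_remove_via_object(demo_addr_t start, demo_addr_t end)
{
    printf("remove via VM object: [0x%llx, 0x%llx)\n",
        (unsigned long long)start, (unsigned long long)end);
}

static void
demo_remove_local_pmap(demo_addr_t start, demo_addr_t end)
{
    printf("remove from local pmap: [0x%llx, 0x%llx)\n",
        (unsigned long long)start, (unsigned long long)end);
}

static void
demo_clean_translations(bool mapped_in_other_pmaps, unsigned int map_refcnt,
    bool entry_has_object, demo_addr_t start, demo_addr_t end)
{
    if (mapped_in_other_pmaps && map_refcnt != 0) {
        /* other address spaces may map the same pages: go through the object */
        demo_remove_via_object(start, end);
    } else if (entry_has_object) {
        /* only this map can hold translations: strip the local pmap directly */
        demo_remove_local_pmap(start, end);
    }
    /* objectless, non-kernel ranges are expected to have no translations */
}

int
main(void)
{
    demo_clean_translations(true, 2, true, 0x10000, 0x20000);
    demo_clean_translations(false, 1, true, 0x10000, 0x20000);
    return 0;
}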
- * Do an aligned copy up to the last page and - * then an unaligned copy for the remaining bytes. + * If the desired range didn't end with "entry", then there is a gap if + * we wrapped around to the start of the map or if "entry" and "next" + * aren't contiguous. + * + * The vm_map_round_page() is needed since an entry can be less than VM_MAP_PAGE_MASK() sized. + * For example, devices which have h/w 4K pages, but entry sizes are all now 16K. */ - tail_size = ((copy->offset + copy->size) & - VM_MAP_PAGE_MASK(dst_map)); - tail_addr = dst_addr + copy->size - tail_size; - } + if (gap_start == FIND_GAP && + vm_map_round_page(entry->vme_end, VM_MAP_PAGE_MASK(map)) < end && + (next == vm_map_to_entry(map) || entry->vme_end != next->vme_start)) { + gap_start = entry->vme_end; + } + s = next->vme_start; + last_timestamp = map->timestamp; - if (head_size + tail_size == copy->size) { - /* - * It's all unaligned, no optimization possible... - */ - goto blunt_copy; + if (entry->permanent) { + /* + * A permanent entry can not be removed, so leave it + * in place but remove all access permissions. + */ + entry->protection = VM_PROT_NONE; + entry->max_protection = VM_PROT_NONE; + } else if ((flags & VM_MAP_REMOVE_SAVE_ENTRIES) && + zap_map != VM_MAP_NULL) { + vm_map_size_t entry_size; + /* + * The caller wants to save the affected VM map entries + * into the "zap_map". The caller will take care of + * these entries. + */ + /* unlink the entry from "map" ... */ + vm_map_store_entry_unlink(map, entry); + /* ... and add it to the end of the "zap_map" */ + vm_map_store_entry_link(zap_map, + vm_map_last_entry(zap_map), + entry, + VM_MAP_KERNEL_FLAGS_NONE); + entry_size = entry->vme_end - entry->vme_start; + map->size -= entry_size; + zap_map->size += entry_size; + /* we didn't unlock the map, so no timestamp increase */ + last_timestamp--; + } else { + vm_map_entry_delete(map, entry); + /* vm_map_entry_delete unlocks the map */ + vm_map_lock(map); + } + + entry = next; + + if (entry == vm_map_to_entry(map)) { + break; + } + if (last_timestamp + 1 != map->timestamp) { + /* + * We are responsible for deleting everything + * from the given space. If someone has interfered, + * we pick up where we left off. Back fills should + * be all right for anyone, except map_delete, and + * we have to assume that the task has been fully + * disabled before we get here + */ + if (!vm_map_lookup_entry(map, s, &entry)) { + entry = entry->vme_next; + + /* + * Nothing found for s. If we weren't already done, then there is a gap. + */ + if (gap_start == FIND_GAP && s < end) { + gap_start = s; + } + s = entry->vme_start; + } else { + SAVE_HINT_MAP_WRITE(map, entry->vme_prev); + } + /* + * others can not only allocate behind us, we can + * also see coalesce while we don't have the map lock + */ + if (entry == vm_map_to_entry(map)) { + break; + } + } + last_timestamp = map->timestamp; } + if (map->wait_for_space) { + thread_wakeup((event_t) map); + } /* - * Can't optimize if there are any submaps in the - * destination due to the way we free the "copy" map - * progressively in vm_map_copy_overwrite_nested() - * in that case. + * wake up anybody waiting on entries that we have already deleted. */ - vm_map_lock_read(dst_map); - if (! 
vm_map_lookup_entry(dst_map, dst_addr, &entry)) { - vm_map_unlock_read(dst_map); - goto blunt_copy; + if (need_wakeup) { + vm_map_entry_wakeup(map); } - for (; - (entry != vm_map_copy_to_entry(copy) && - entry->vme_start < dst_addr + copy->size); - entry = entry->vme_next) { - if (entry->is_sub_map) { - vm_map_unlock_read(dst_map); - goto blunt_copy; + + if (gap_start != FIND_GAP && gap_start != GAPS_OK) { + DTRACE_VM3(kern_vm_deallocate_gap, + vm_map_offset_t, gap_start, + vm_map_offset_t, save_start, + vm_map_offset_t, save_end); + if (!(flags & VM_MAP_REMOVE_GAPS_OK)) { + vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP); } } - vm_map_unlock_read(dst_map); - - if (head_size) { - /* - * Unaligned copy of the first "head_size" bytes, to reach - * a page boundary. - */ - - /* - * Extract "head_copy" out of "copy". - */ - head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(head_copy) = - vm_map_copy_to_entry(head_copy); - vm_map_copy_last_entry(head_copy) = - vm_map_copy_to_entry(head_copy); - head_copy->type = VM_MAP_COPY_ENTRY_LIST; - head_copy->cpy_hdr.nentries = 0; - head_copy->cpy_hdr.entries_pageable = - copy->cpy_hdr.entries_pageable; - vm_map_store_init(&head_copy->cpy_hdr); - head_copy->offset = copy->offset; - head_copy->size = head_size; + return KERN_SUCCESS; +} - copy->offset += head_size; - copy->size -= head_size; - entry = vm_map_copy_first_entry(copy); - vm_map_copy_clip_end(copy, entry, copy->offset); - vm_map_copy_entry_unlink(copy, entry); - vm_map_copy_entry_link(head_copy, - vm_map_copy_to_entry(head_copy), - entry); +/* + * vm_map_terminate: + * + * Clean out a task's map. + */ +kern_return_t +vm_map_terminate( + vm_map_t map) +{ + vm_map_lock(map); + map->terminated = TRUE; + vm_map_unlock(map); - /* - * Do the unaligned copy. - */ - kr = vm_map_copy_overwrite_nested(dst_map, - head_addr, - head_copy, - interruptible, - (pmap_t) NULL, - FALSE); - if (kr != KERN_SUCCESS) - goto done; - } + return vm_map_remove(map, + map->min_offset, + map->max_offset, + /* + * Final cleanup: + * + no unnesting + * + remove immutable mappings + * + allow gaps in range + */ + (VM_MAP_REMOVE_NO_UNNESTING | + VM_MAP_REMOVE_IMMUTABLE | + VM_MAP_REMOVE_GAPS_OK)); +} - if (tail_size) { - /* - * Extract "tail_copy" out of "copy". - */ - tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(tail_copy) = - vm_map_copy_to_entry(tail_copy); - vm_map_copy_last_entry(tail_copy) = - vm_map_copy_to_entry(tail_copy); - tail_copy->type = VM_MAP_COPY_ENTRY_LIST; - tail_copy->cpy_hdr.nentries = 0; - tail_copy->cpy_hdr.entries_pageable = - copy->cpy_hdr.entries_pageable; - vm_map_store_init(&tail_copy->cpy_hdr); - - tail_copy->offset = copy->offset + copy->size - tail_size; - tail_copy->size = tail_size; - - copy->size -= tail_size; - - entry = vm_map_copy_last_entry(copy); - vm_map_copy_clip_start(copy, entry, tail_copy->offset); - entry = vm_map_copy_last_entry(copy); - vm_map_copy_entry_unlink(copy, entry); - vm_map_copy_entry_link(tail_copy, - vm_map_copy_last_entry(tail_copy), - entry); - } +/* + * vm_map_remove: + * + * Remove the given address range from the target map. + * This is the exported form of vm_map_delete. 
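The head/tail optimization in vm_map_copy_overwrite() reduces to page arithmetic on the copy's offset and size: an unaligned head up to the first page boundary, an aligned body, and an unaligned tail after the last boundary. A sketch of that split, assuming 16K pages; the real code additionally falls back to a single "blunt" unaligned copy when the copy is under three pages or when source and destination are mis-aligned differently.

#include <assert.h>
#include <stdint.h>

typedef uint64_t demo_offset_t;
typedef uint64_t demo_size_t;

struct demo_split {
    demo_size_t head;   /* unaligned bytes before the first page boundary */
    demo_size_t body;   /* page-aligned middle */
    demo_size_t tail;   /* unaligned bytes after the last page boundary */
};

static struct demo_split
demo_split_copy(demo_offset_t offset, demo_size_t size, demo_offset_t page_mask)
{
    struct demo_split s = { 0, 0, 0 };

    if (offset & page_mask) {
        s.head = (page_mask + 1) - (offset & page_mask);
    }
    if ((offset + size) & page_mask) {
        s.tail = (offset + size) & page_mask;
    }
    if (s.head + s.tail >= size) {
        /* nothing aligned left to optimize: treat it all as one unaligned copy */
        s.head = size;
        s.tail = 0;
    } else {
        s.body = size - s.head - s.tail;
    }
    return s;
}

int
main(void)
{
    /* copy starting 0x100 into a 16K page and ending 0x80 into another page */
    struct demo_split s = demo_split_copy(0x4100, 0x7f80, 0x3fff);

    assert(s.head == 0x3f00);
    assert(s.body == 0x4000);
    assert(s.tail == 0x0080);
    return 0;
}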
+ */ +kern_return_t +vm_map_remove( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t flags) +{ + kern_return_t result; + vm_map_lock(map); + VM_MAP_RANGE_CHECK(map, start, end); /* - * Copy most (or possibly all) of the data. + * For the zone maps, the kernel controls the allocation/freeing of memory. + * Any free to the zone maps should be within the bounds of the map and + * should free up memory. If the VM_MAP_RANGE_CHECK() silently converts a + * free to the zone maps into a no-op, there is a problem and we should + * panic. */ - kr = vm_map_copy_overwrite_nested(dst_map, - dst_addr + head_size, - copy, - interruptible, - (pmap_t) NULL, - FALSE); - if (kr != KERN_SUCCESS) { - goto done; + if ((start == end) && zone_maps_owned(start, 1)) { + panic("Nothing being freed to a zone map. start = end = %p\n", (void *)start); } + result = vm_map_delete(map, start, end, flags, VM_MAP_NULL); + vm_map_unlock(map); - if (tail_size) { - kr = vm_map_copy_overwrite_nested(dst_map, - tail_addr, - tail_copy, - interruptible, - (pmap_t) NULL, - FALSE); + return result; +} + +/* + * vm_map_remove_locked: + * + * Remove the given address range from the target locked map. + * This is the exported form of vm_map_delete. + */ +kern_return_t +vm_map_remove_locked( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + boolean_t flags) +{ + kern_return_t result; + + VM_MAP_RANGE_CHECK(map, start, end); + result = vm_map_delete(map, start, end, flags, VM_MAP_NULL); + return result; +} + + +/* + * Routine: vm_map_copy_allocate + * + * Description: + * Allocates and initializes a map copy object. + */ +static vm_map_copy_t +vm_map_copy_allocate(void) +{ + vm_map_copy_t new_copy; + + new_copy = zalloc(vm_map_copy_zone); + bzero(new_copy, sizeof(*new_copy)); + new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; + vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy); + vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy); + return new_copy; +} + +/* + * Routine: vm_map_copy_discard + * + * Description: + * Dispose of a map copy object (returned by + * vm_map_copyin). + */ +void +vm_map_copy_discard( + vm_map_copy_t copy) +{ + if (copy == VM_MAP_COPY_NULL) { + return; } -done: - assert(copy->type == VM_MAP_COPY_ENTRY_LIST); - if (kr == KERN_SUCCESS) { - /* - * Discard all the copy maps. - */ - if (head_copy) { - vm_map_copy_discard(head_copy); - head_copy = NULL; - } - vm_map_copy_discard(copy); - if (tail_copy) { - vm_map_copy_discard(tail_copy); - tail_copy = NULL; + switch (copy->type) { + case VM_MAP_COPY_ENTRY_LIST: + while (vm_map_copy_first_entry(copy) != + vm_map_copy_to_entry(copy)) { + vm_map_entry_t entry = vm_map_copy_first_entry(copy); + + vm_map_copy_entry_unlink(copy, entry); + if (entry->is_sub_map) { + vm_map_deallocate(VME_SUBMAP(entry)); + } else { + vm_object_deallocate(VME_OBJECT(entry)); + } + vm_map_copy_entry_dispose(copy, entry); } - } else { + break; + case VM_MAP_COPY_OBJECT: + vm_object_deallocate(copy->cpy_object); + break; + case VM_MAP_COPY_KERNEL_BUFFER: + /* - * Re-assemble the original copy map. + * The vm_map_copy_t and possibly the data buffer were + * allocated by a single call to kheap_alloc(), i.e. the + * vm_map_copy_t was not allocated out of the zone. 
*/ - if (head_copy) { - entry = vm_map_copy_first_entry(head_copy); - vm_map_copy_entry_unlink(head_copy, entry); - vm_map_copy_entry_link(copy, - vm_map_copy_to_entry(copy), - entry); - copy->offset -= head_size; - copy->size += head_size; - vm_map_copy_discard(head_copy); - head_copy = NULL; - } - if (tail_copy) { - entry = vm_map_copy_last_entry(tail_copy); - vm_map_copy_entry_unlink(tail_copy, entry); - vm_map_copy_entry_link(copy, - vm_map_copy_last_entry(copy), - entry); - copy->size += tail_size; - vm_map_copy_discard(tail_copy); - tail_copy = NULL; + if (copy->size > msg_ool_size_small || copy->offset) { + panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld", + (long long)copy->size, (long long)copy->offset); } + kheap_free(KHEAP_DATA_BUFFERS, copy->cpy_kdata, copy->size); } - return kr; + zfree(vm_map_copy_zone, copy); } - /* - * Routine: vm_map_copy_overwrite_unaligned [internal use only] - * - * Decription: - * Physically copy unaligned data + * Routine: vm_map_copy_copy * - * Implementation: - * Unaligned parts of pages have to be physically copied. We use - * a modified form of vm_fault_copy (which understands none-aligned - * page offsets and sizes) to do the copy. We attempt to copy as - * much memory in one go as possibly, however vm_fault_copy copies - * within 1 memory object so we have to find the smaller of "amount left" - * "source object data size" and "target object data size". With - * unaligned data we don't need to split regions, therefore the source - * (copy) object should be one map entry, the target range may be split - * over multiple map entries however. In any event we are pessimistic - * about these assumptions. + * Description: + * Move the information in a map copy object to + * a new map copy object, leaving the old one + * empty. * - * Assumptions: - * dst_map is locked on entry and is return locked on success, - * unlocked on error. + * This is used by kernel routines that need + * to look at out-of-line data (in copyin form) + * before deciding whether to return SUCCESS. + * If the routine returns FAILURE, the original + * copy object will be deallocated; therefore, + * these routines must make a copy of the copy + * object and leave the original empty so that + * deallocation will not fail. */ +vm_map_copy_t +vm_map_copy_copy( + vm_map_copy_t copy) +{ + vm_map_copy_t new_copy; + + if (copy == VM_MAP_COPY_NULL) { + return VM_MAP_COPY_NULL; + } + + /* + * Allocate a new copy object, and copy the information + * from the old one into it. + */ + + new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); + memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy)); +#if __has_feature(ptrauth_calls) + if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { + new_copy->cpy_kdata = copy->cpy_kdata; + } +#endif + + if (copy->type == VM_MAP_COPY_ENTRY_LIST) { + /* + * The links in the entry chain must be + * changed to point to the new copy object. + */ + vm_map_copy_first_entry(copy)->vme_prev + = vm_map_copy_to_entry(new_copy); + vm_map_copy_last_entry(copy)->vme_next + = vm_map_copy_to_entry(new_copy); + } + + /* + * Change the old copy object into one that contains + * nothing to be deallocated. + */ + copy->type = VM_MAP_COPY_OBJECT; + copy->cpy_object = VM_OBJECT_NULL; + + /* + * Return the new object. 
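+ * (The original, now an empty VM_MAP_COPY_OBJECT, can still be
+ * handed to vm_map_copy_discard(), which should just free its
+ * header without touching any data.)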
+ */ + return new_copy; +} static kern_return_t -vm_map_copy_overwrite_unaligned( - vm_map_t dst_map, - vm_map_entry_t entry, - vm_map_copy_t copy, - vm_map_offset_t start, - boolean_t discard_on_success) -{ - vm_map_entry_t copy_entry; - vm_map_entry_t copy_entry_next; - vm_map_version_t version; - vm_object_t dst_object; - vm_object_offset_t dst_offset; - vm_object_offset_t src_offset; - vm_object_offset_t entry_offset; - vm_map_offset_t entry_end; - vm_map_size_t src_size, - dst_size, - copy_size, - amount_left; - kern_return_t kr = KERN_SUCCESS; - - - copy_entry = vm_map_copy_first_entry(copy); +vm_map_overwrite_submap_recurse( + vm_map_t dst_map, + vm_map_offset_t dst_addr, + vm_map_size_t dst_size) +{ + vm_map_offset_t dst_end; + vm_map_entry_t tmp_entry; + vm_map_entry_t entry; + kern_return_t result; + boolean_t encountered_sub_map = FALSE; - vm_map_lock_write_to_read(dst_map); - src_offset = copy->offset - vm_object_trunc_page(copy->offset); - amount_left = copy->size; -/* - * unaligned so we never clipped this entry, we need the offset into - * the vm_object not just the data. - */ - while (amount_left > 0) { - if (entry == vm_map_to_entry(dst_map)) { - vm_map_unlock_read(dst_map); - return KERN_INVALID_ADDRESS; - } + /* + * Verify that the destination is all writeable + * initially. We have to trunc the destination + * address and round the copy size or we'll end up + * splitting entries in strange ways. + */ - /* "start" must be within the current map entry */ - assert ((start>=entry->vme_start) && (startvme_end)); + dst_end = vm_map_round_page(dst_addr + dst_size, + VM_MAP_PAGE_MASK(dst_map)); + vm_map_lock(dst_map); - dst_offset = start - entry->vme_start; +start_pass_1: + if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } - dst_size = entry->vme_end - start; + vm_map_clip_start(dst_map, + tmp_entry, + vm_map_trunc_page(dst_addr, + VM_MAP_PAGE_MASK(dst_map))); + if (tmp_entry->is_sub_map) { + /* clipping did unnest if needed */ + assert(!tmp_entry->use_pmap); + } - src_size = copy_entry->vme_end - - (copy_entry->vme_start + src_offset); + for (entry = tmp_entry;;) { + vm_map_entry_t next; - if (dst_size < src_size) { -/* - * we can only copy dst_size bytes before - * we have to get the next destination entry - */ - copy_size = dst_size; - } else { -/* - * we can only copy src_size bytes before - * we have to get the next source copy entry - */ - copy_size = src_size; - } + next = entry->vme_next; + while (entry->is_sub_map) { + vm_map_offset_t sub_start; + vm_map_offset_t sub_end; + vm_map_offset_t local_end; - if (copy_size > amount_left) { - copy_size = amount_left; - } -/* - * Entry needs copy, create a shadow shadow object for - * Copy on write region. - */ - if (entry->needs_copy && - ((entry->protection & VM_PROT_WRITE) != 0)) - { - if (vm_map_lock_read_to_write(dst_map)) { - vm_map_lock_read(dst_map); - goto RetryLookup; - } - vm_object_shadow(&entry->object.vm_object, - &entry->offset, - (vm_map_size_t)(entry->vme_end - - entry->vme_start)); - entry->needs_copy = FALSE; - vm_map_lock_write_to_read(dst_map); - } - dst_object = entry->object.vm_object; -/* - * unlike with the virtual (aligned) copy we're going - * to fault on it therefore we need a target object. 
- */ - if (dst_object == VM_OBJECT_NULL) { - if (vm_map_lock_read_to_write(dst_map)) { - vm_map_lock_read(dst_map); - goto RetryLookup; - } - dst_object = vm_object_allocate((vm_map_size_t) - entry->vme_end - entry->vme_start); - entry->object.vm_object = dst_object; - entry->offset = 0; - assert(entry->use_pmap); - vm_map_lock_write_to_read(dst_map); - } -/* - * Take an object reference and unlock map. The "entry" may - * disappear or change when the map is unlocked. - */ - vm_object_reference(dst_object); - version.main_timestamp = dst_map->timestamp; - entry_offset = entry->offset; - entry_end = entry->vme_end; - vm_map_unlock_read(dst_map); -/* - * Copy as much as possible in one pass - */ - kr = vm_fault_copy( - copy_entry->object.vm_object, - copy_entry->offset + src_offset, - ©_size, - dst_object, - entry_offset + dst_offset, - dst_map, - &version, - THREAD_UNINT ); + if (entry->in_transition) { + /* + * Say that we are waiting, and wait for entry. + */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, THREAD_UNINT); - start += copy_size; - src_offset += copy_size; - amount_left -= copy_size; -/* - * Release the object reference - */ - vm_object_deallocate(dst_object); -/* - * If a hard error occurred, return it now - */ - if (kr != KERN_SUCCESS) - return kr; + goto start_pass_1; + } - if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end - || amount_left == 0) - { -/* - * all done with this copy entry, dispose. - */ - copy_entry_next = copy_entry->vme_next; + encountered_sub_map = TRUE; + sub_start = VME_OFFSET(entry); - if (discard_on_success) { - vm_map_copy_entry_unlink(copy, copy_entry); - assert(!copy_entry->is_sub_map); - vm_object_deallocate( - copy_entry->object.vm_object); - vm_map_copy_entry_dispose(copy, copy_entry); + if (entry->vme_end < dst_end) { + sub_end = entry->vme_end; + } else { + sub_end = dst_end; } + sub_end -= entry->vme_start; + sub_end += VME_OFFSET(entry); + local_end = entry->vme_end; + vm_map_unlock(dst_map); - if (copy_entry_next == vm_map_copy_to_entry(copy) && - amount_left) { -/* - * not finished copying but run out of source - */ + result = vm_map_overwrite_submap_recurse( + VME_SUBMAP(entry), + sub_start, + sub_end - sub_start); + + if (result != KERN_SUCCESS) { + return result; + } + if (dst_end <= entry->vme_end) { + return KERN_SUCCESS; + } + vm_map_lock(dst_map); + if (!vm_map_lookup_entry(dst_map, local_end, + &tmp_entry)) { + vm_map_unlock(dst_map); return KERN_INVALID_ADDRESS; } + entry = tmp_entry; + next = entry->vme_next; + } - copy_entry = copy_entry_next; - - src_offset = 0; + if (!(entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return KERN_PROTECTION_FAILURE; } - if (amount_left == 0) - return KERN_SUCCESS; + /* + * If the entry is in transition, we must wait + * for it to exit that state. Anything could happen + * when we unlock the map, so start over. + */ + if (entry->in_transition) { + /* + * Say that we are waiting, and wait for entry. + */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, THREAD_UNINT); + + goto start_pass_1; + } - vm_map_lock_read(dst_map); - if (version.main_timestamp == dst_map->timestamp) { - if (start == entry_end) { -/* - * destination region is split. Use the version - * information to avoid a lookup in the normal - * case. - */ - entry = entry->vme_next; /* - * should be contiguous. Fail if we encounter - * a hole in the destination. 
+ * our range is contained completely within this map entry */ - if (start != entry->vme_start) { - vm_map_unlock_read(dst_map); - return KERN_INVALID_ADDRESS ; - } - } - } else { + if (dst_end <= entry->vme_end) { + vm_map_unlock(dst_map); + return KERN_SUCCESS; + } /* - * Map version check failed. - * we must lookup the entry because somebody - * might have changed the map behind our backs. + * check that range specified is contiguous region */ - RetryLookup: - if (!vm_map_lookup_entry(dst_map, start, &entry)) - { - vm_map_unlock_read(dst_map); - return KERN_INVALID_ADDRESS ; + if ((next == vm_map_to_entry(dst_map)) || + (next->vme_start != entry->vme_end)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + + /* + * Check for permanent objects in the destination. + */ + if ((VME_OBJECT(entry) != VM_OBJECT_NULL) && + ((!VME_OBJECT(entry)->internal) || + (VME_OBJECT(entry)->true_share))) { + if (encountered_sub_map) { + vm_map_unlock(dst_map); + return KERN_FAILURE; } } - }/* while */ + + entry = next; + }/* for */ + vm_map_unlock(dst_map); return KERN_SUCCESS; -}/* vm_map_copy_overwrite_unaligned */ +} /* - * Routine: vm_map_copy_overwrite_aligned [internal use only] + * Routine: vm_map_copy_overwrite * * Description: - * Does all the vm_trickery possible for whole pages. + * Copy the memory described by the map copy + * object (copy; returned by vm_map_copyin) onto + * the specified destination region (dst_map, dst_addr). + * The destination must be writeable. * - * Implementation: + * Unlike vm_map_copyout, this routine actually + * writes over previously-mapped memory. If the + * previous mapping was to a permanent (user-supplied) + * memory object, it is preserved. * - * If there are no permanent objects in the destination, - * and the source and destination map entry zones match, - * and the destination map entry is not shared, - * then the map entries can be deleted and replaced - * with those from the copy. The following code is the - * basic idea of what to do, but there are lots of annoying - * little details about getting protection and inheritance - * right. Should add protection, inheritance, and sharing checks - * to the above pass and make sure that no wiring is involved. + * The attributes (protection and inheritance) of the + * destination region are preserved. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + * + * Implementation notes: + * To overwrite aligned temporary virtual memory, it is + * sufficient to remove the previous mapping and insert + * the new copy. This replacement is done either on + * the whole region (if no permanent virtual memory + * objects are embedded in the destination region) or + * in individual map entries. + * + * To overwrite permanent virtual memory , it is necessary + * to copy each page, as the external memory management + * interface currently does not provide any optimizations. + * + * Unaligned memory also has to be copied. It is possible + * to use 'vm_trickery' to copy the aligned data. This is + * not done but not hard to implement. + * + * Once a page of permanent memory has been overwritten, + * it is impossible to interrupt this function; otherwise, + * the call would be neither atomic nor location-independent. + * The kernel-state portion of a user thread must be + * interruptible. + * + * It may be expensive to forward all requests that might + * overwrite permanent memory (vm_write, vm_copy) to + * uninterruptible kernel threads. 
This routine may be + * called by interruptible threads; however, success is + * not guaranteed -- if the request cannot be performed + * atomically and interruptibly, an error indication is + * returned. */ -int vm_map_copy_overwrite_aligned_src_not_internal = 0; -int vm_map_copy_overwrite_aligned_src_not_symmetric = 0; -int vm_map_copy_overwrite_aligned_src_large = 0; - static kern_return_t -vm_map_copy_overwrite_aligned( - vm_map_t dst_map, - vm_map_entry_t tmp_entry, - vm_map_copy_t copy, - vm_map_offset_t start, - __unused pmap_t pmap) -{ - vm_object_t object; - vm_map_entry_t copy_entry; - vm_map_size_t copy_size; - vm_map_size_t size; - vm_map_entry_t entry; - - while ((copy_entry = vm_map_copy_first_entry(copy)) - != vm_map_copy_to_entry(copy)) - { - copy_size = (copy_entry->vme_end - copy_entry->vme_start); - - entry = tmp_entry; - if (entry->is_sub_map) { - /* unnested when clipped earlier */ - assert(!entry->use_pmap); - } - if (entry == vm_map_to_entry(dst_map)) { - vm_map_unlock(dst_map); - return KERN_INVALID_ADDRESS; - } - size = (entry->vme_end - entry->vme_start); - /* - * Make sure that no holes popped up in the - * address map, and that the protection is - * still valid, in case the map was unlocked - * earlier. - */ +vm_map_copy_overwrite_nested( + vm_map_t dst_map, + vm_map_address_t dst_addr, + vm_map_copy_t copy, + boolean_t interruptible, + pmap_t pmap, + boolean_t discard_on_success) +{ + vm_map_offset_t dst_end; + vm_map_entry_t tmp_entry; + vm_map_entry_t entry; + kern_return_t kr; + boolean_t aligned = TRUE; + boolean_t contains_permanent_objects = FALSE; + boolean_t encountered_sub_map = FALSE; + vm_map_offset_t base_addr; + vm_map_size_t copy_size; + vm_map_size_t total_size; + int copy_page_shift; - if ((entry->vme_start != start) || ((entry->is_sub_map) - && !entry->needs_copy)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); - } - assert(entry != vm_map_to_entry(dst_map)); - /* - * Check protection again - */ + /* + * Check for null copy object. + */ - if ( ! (entry->protection & VM_PROT_WRITE)) { - vm_map_unlock(dst_map); - return(KERN_PROTECTION_FAILURE); - } + if (copy == VM_MAP_COPY_NULL) { + return KERN_SUCCESS; + } - /* - * Adjust to source size first - */ + /* + * Assert that the vm_map_copy is coming from the right + * zone and hasn't been forged + */ + vm_map_copy_require(copy); - if (copy_size < size) { - if (entry->map_aligned && - !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size, - VM_MAP_PAGE_MASK(dst_map))) { - /* no longer map-aligned */ - entry->map_aligned = FALSE; - } - vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); - size = copy_size; - } + /* + * Check for special kernel buffer allocated + * by new_ipc_kmsg_copyin. + */ - /* - * Adjust to destination size - */ + if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { + return vm_map_copyout_kernel_buffer( + dst_map, &dst_addr, + copy, copy->size, TRUE, discard_on_success); + } - if (size < copy_size) { - vm_map_copy_clip_end(copy, copy_entry, - copy_entry->vme_start + size); - copy_size = size; + /* + * Only works for entry lists at the moment. Will + * support page lists later. 
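+ * (Kernel-buffer copies were already handed off to
+ * vm_map_copyout_kernel_buffer() above, which is why the assert
+ * below insists on an entry-list copy.)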
+ */ + + assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + + if (copy->size == 0) { + if (discard_on_success) { + vm_map_copy_discard(copy); } + return KERN_SUCCESS; + } - assert((entry->vme_end - entry->vme_start) == size); - assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); - assert((copy_entry->vme_end - copy_entry->vme_start) == size); + copy_page_shift = copy->cpy_hdr.page_shift; - /* - * If the destination contains temporary unshared memory, - * we can perform the copy by throwing it away and - * installing the source data. - */ + /* + * Verify that the destination is all writeable + * initially. We have to trunc the destination + * address and round the copy size or we'll end up + * splitting entries in strange ways. + */ - object = entry->object.vm_object; - if ((!entry->is_shared && - ((object == VM_OBJECT_NULL) || - (object->internal && !object->true_share))) || - entry->needs_copy) { - vm_object_t old_object = entry->object.vm_object; - vm_object_offset_t old_offset = entry->offset; - vm_object_offset_t offset; + if (!VM_MAP_PAGE_ALIGNED(copy->size, + VM_MAP_PAGE_MASK(dst_map)) || + !VM_MAP_PAGE_ALIGNED(copy->offset, + VM_MAP_PAGE_MASK(dst_map)) || + !VM_MAP_PAGE_ALIGNED(dst_addr, + VM_MAP_PAGE_MASK(dst_map)) || + copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) { + aligned = FALSE; + dst_end = vm_map_round_page(dst_addr + copy->size, + VM_MAP_PAGE_MASK(dst_map)); + } else { + dst_end = dst_addr + copy->size; + } - /* - * Ensure that the source and destination aren't - * identical - */ - if (old_object == copy_entry->object.vm_object && - old_offset == copy_entry->offset) { - vm_map_copy_entry_unlink(copy, copy_entry); - vm_map_copy_entry_dispose(copy, copy_entry); + vm_map_lock(dst_map); - if (old_object != VM_OBJECT_NULL) - vm_object_deallocate(old_object); + /* LP64todo - remove this check when vm_map_commpage64() + * no longer has to stuff in a map_entry for the commpage + * above the map's max_offset. + */ + if (dst_addr >= dst_map->max_offset) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } - start = tmp_entry->vme_end; - tmp_entry = tmp_entry->vme_next; - continue; - } +start_pass_1: + if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + vm_map_clip_start(dst_map, + tmp_entry, + vm_map_trunc_page(dst_addr, + VM_MAP_PAGE_MASK(dst_map))); + for (entry = tmp_entry;;) { + vm_map_entry_t next = entry->vme_next; -#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */ -#define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */ - if (copy_entry->object.vm_object != VM_OBJECT_NULL && - copy_entry->object.vm_object->vo_size >= __TRADEOFF1_OBJ_SIZE && - copy_size <= __TRADEOFF1_COPY_SIZE) { + while (entry->is_sub_map) { + vm_map_offset_t sub_start; + vm_map_offset_t sub_end; + vm_map_offset_t local_end; + + if (entry->in_transition) { /* - * Virtual vs. Physical copy tradeoff #1. - * - * Copying only a few pages out of a large - * object: do a physical copy instead of - * a virtual copy, to avoid possibly keeping - * the entire large object alive because of - * those few copy-on-write pages. + * Say that we are waiting, and wait for entry. 
*/ - vm_map_copy_overwrite_aligned_src_large++; - goto slow_copy; + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, THREAD_UNINT); + + goto start_pass_1; } - if (entry->alias >= VM_MEMORY_MALLOC && - entry->alias <= VM_MEMORY_MALLOC_LARGE_REUSED) { - vm_object_t new_object, new_shadow; + local_end = entry->vme_end; + if (!(entry->needs_copy)) { + /* if needs_copy we are a COW submap */ + /* in such a case we just replace so */ + /* there is no need for the follow- */ + /* ing check. */ + encountered_sub_map = TRUE; + sub_start = VME_OFFSET(entry); - /* - * We're about to map something over a mapping - * established by malloc()... - */ - new_object = copy_entry->object.vm_object; - if (new_object != VM_OBJECT_NULL) { - vm_object_lock_shared(new_object); - } - while (new_object != VM_OBJECT_NULL && - !new_object->true_share && - new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && - new_object->internal) { - new_shadow = new_object->shadow; - if (new_shadow == VM_OBJECT_NULL) { - break; - } - vm_object_lock_shared(new_shadow); - vm_object_unlock(new_object); - new_object = new_shadow; + if (entry->vme_end < dst_end) { + sub_end = entry->vme_end; + } else { + sub_end = dst_end; } - if (new_object != VM_OBJECT_NULL) { - if (!new_object->internal) { - /* - * The new mapping is backed - * by an external object. We - * don't want malloc'ed memory - * to be replaced with such a - * non-anonymous mapping, so - * let's go off the optimized - * path... - */ - vm_map_copy_overwrite_aligned_src_not_internal++; - vm_object_unlock(new_object); - goto slow_copy; - } - if (new_object->true_share || - new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { - /* - * Same if there's a "true_share" - * object in the shadow chain, or - * an object with a non-default - * (SYMMETRIC) copy strategy. - */ - vm_map_copy_overwrite_aligned_src_not_symmetric++; - vm_object_unlock(new_object); - goto slow_copy; - } - vm_object_unlock(new_object); + sub_end -= entry->vme_start; + sub_end += VME_OFFSET(entry); + vm_map_unlock(dst_map); + + kr = vm_map_overwrite_submap_recurse( + VME_SUBMAP(entry), + sub_start, + sub_end - sub_start); + if (kr != KERN_SUCCESS) { + return kr; } - /* - * The new mapping is still backed by - * anonymous (internal) memory, so it's - * OK to substitute it for the original - * malloc() mapping. 
- */ + vm_map_lock(dst_map); } - if (old_object != VM_OBJECT_NULL) { - if(entry->is_sub_map) { - if(entry->use_pmap) { -#ifndef NO_NESTED_PMAP - pmap_unnest(dst_map->pmap, - (addr64_t)entry->vme_start, - entry->vme_end - entry->vme_start); -#endif /* NO_NESTED_PMAP */ - if(dst_map->mapped_in_other_pmaps) { - /* clean up parent */ - /* map/maps */ - vm_map_submap_pmap_clean( - dst_map, entry->vme_start, - entry->vme_end, - entry->object.sub_map, - entry->offset); - } - } else { - vm_map_submap_pmap_clean( - dst_map, entry->vme_start, - entry->vme_end, - entry->object.sub_map, - entry->offset); - } - vm_map_deallocate( - entry->object.sub_map); - } else { - if(dst_map->mapped_in_other_pmaps) { - vm_object_pmap_protect_options( - entry->object.vm_object, - entry->offset, - entry->vme_end - - entry->vme_start, - PMAP_NULL, - entry->vme_start, - VM_PROT_NONE, - PMAP_OPTIONS_REMOVE); - } else { - pmap_remove_options( - dst_map->pmap, - (addr64_t)(entry->vme_start), - (addr64_t)(entry->vme_end), - PMAP_OPTIONS_REMOVE); - } - vm_object_deallocate(old_object); - } + if (dst_end <= entry->vme_end) { + goto start_overwrite; } + if (!vm_map_lookup_entry(dst_map, local_end, + &entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + next = entry->vme_next; + } - entry->is_sub_map = FALSE; - entry->object = copy_entry->object; - object = entry->object.vm_object; - entry->needs_copy = copy_entry->needs_copy; - entry->wired_count = 0; - entry->user_wired_count = 0; - offset = entry->offset = copy_entry->offset; - - vm_map_copy_entry_unlink(copy, copy_entry); - vm_map_copy_entry_dispose(copy, copy_entry); + if (!(entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return KERN_PROTECTION_FAILURE; + } + /* + * If the entry is in transition, we must wait + * for it to exit that state. Anything could happen + * when we unlock the map, so start over. + */ + if (entry->in_transition) { /* - * we could try to push pages into the pmap at this point, BUT - * this optimization only saved on average 2 us per page if ALL - * the pages in the source were currently mapped - * and ALL the pages in the dest were touched, if there were fewer - * than 2/3 of the pages touched, this optimization actually cost more cycles - * it also puts a lot of pressure on the pmap layer w/r to mapping structures + * Say that we are waiting, and wait for entry. */ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, THREAD_UNINT); - /* - * Set up for the next iteration. The map - * has not been unlocked, so the next - * address should be at the end of this - * entry, and the next map entry should be - * the one following it. - */ + goto start_pass_1; + } - start = tmp_entry->vme_end; - tmp_entry = tmp_entry->vme_next; - } else { - vm_map_version_t version; - vm_object_t dst_object; - vm_object_offset_t dst_offset; - kern_return_t r; +/* + * our range is contained completely within this map entry + */ + if (dst_end <= entry->vme_end) { + break; + } +/* + * check that range specified is contiguous region + */ + if ((next == vm_map_to_entry(dst_map)) || + (next->vme_start != entry->vme_end)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } - slow_copy: - if (entry->needs_copy) { - vm_object_shadow(&entry->object.vm_object, - &entry->offset, - (entry->vme_end - - entry->vme_start)); - entry->needs_copy = FALSE; - } - dst_object = entry->object.vm_object; - dst_offset = entry->offset; + /* + * Check for permanent objects in the destination. 
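+ * An entry is treated as "permanent" here when its object is
+ * external (not internal) or marked true_share, as the test below
+ * shows; if any such entry is found, an interruptible overwrite is
+ * refused at start_overwrite (see the contains_permanent_objects
+ * check).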
+ */ + if ((VME_OBJECT(entry) != VM_OBJECT_NULL) && + ((!VME_OBJECT(entry)->internal) || + (VME_OBJECT(entry)->true_share))) { + contains_permanent_objects = TRUE; + } - /* - * Take an object reference, and record - * the map version information so that the - * map can be safely unlocked. - */ + entry = next; + }/* for */ - if (dst_object == VM_OBJECT_NULL) { - /* - * We would usually have just taken the - * optimized path above if the destination - * object has not been allocated yet. But we - * now disable that optimization if the copy - * entry's object is not backed by anonymous - * memory to avoid replacing malloc'ed - * (i.e. re-usable) anonymous memory with a - * not-so-anonymous mapping. - * So we have to handle this case here and - * allocate a new VM object for this map entry. - */ - dst_object = vm_object_allocate( - entry->vme_end - entry->vme_start); - dst_offset = 0; - entry->object.vm_object = dst_object; - entry->offset = dst_offset; - assert(entry->use_pmap); - - } +start_overwrite: + /* + * If there are permanent objects in the destination, then + * the copy cannot be interrupted. + */ - vm_object_reference(dst_object); + if (interruptible && contains_permanent_objects) { + vm_map_unlock(dst_map); + return KERN_FAILURE; /* XXX */ + } - /* account for unlock bumping up timestamp */ - version.main_timestamp = dst_map->timestamp + 1; + /* + * + * Make a second pass, overwriting the data + * At the beginning of each loop iteration, + * the next entry to be overwritten is "tmp_entry" + * (initially, the value returned from the lookup above), + * and the starting address expected in that entry + * is "start". + */ + total_size = copy->size; + if (encountered_sub_map) { + copy_size = 0; + /* re-calculate tmp_entry since we've had the map */ + /* unlocked */ + if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) { vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + } else { + copy_size = copy->size; + } - /* - * Copy as much as possible in one pass - */ - - copy_size = size; - r = vm_fault_copy( - copy_entry->object.vm_object, - copy_entry->offset, - ©_size, - dst_object, - dst_offset, - dst_map, - &version, - THREAD_UNINT ); - - /* - * Release the object reference - */ - - vm_object_deallocate(dst_object); + base_addr = dst_addr; + while (TRUE) { + /* deconstruct the copy object and do in parts */ + /* only in sub_map, interruptable case */ + vm_map_entry_t copy_entry; + vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL; + vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL; + int nentries; + int remaining_entries = 0; + vm_map_offset_t new_offset = 0; - /* - * If a hard error occurred, return it now - */ + for (entry = tmp_entry; copy_size == 0;) { + vm_map_entry_t next; - if (r != KERN_SUCCESS) - return(r); + next = entry->vme_next; - if (copy_size != 0) { + /* tmp_entry and base address are moved along */ + /* each time we encounter a sub-map. Otherwise */ + /* entry can outpase tmp_entry, and the copy_size */ + /* may reflect the distance between them */ + /* if the current entry is found to be in transition */ + /* we will start over at the beginning or the last */ + /* encounter of a submap as dictated by base_addr */ + /* we will zero copy_size accordingly. */ + if (entry->in_transition) { /* - * Dispose of the copied region + * Say that we are waiting, and wait for entry. 
*/ + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, THREAD_UNINT); - vm_map_copy_clip_end(copy, copy_entry, - copy_entry->vme_start + copy_size); - vm_map_copy_entry_unlink(copy, copy_entry); - vm_object_deallocate(copy_entry->object.vm_object); - vm_map_copy_entry_dispose(copy, copy_entry); + if (!vm_map_lookup_entry(dst_map, base_addr, + &tmp_entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + copy_size = 0; + entry = tmp_entry; + continue; } + if (entry->is_sub_map) { + vm_map_offset_t sub_start; + vm_map_offset_t sub_end; + vm_map_offset_t local_end; - /* - * Pick up in the destination map where we left off. - * - * Use the version information to avoid a lookup - * in the normal case. - */ + if (entry->needs_copy) { + /* if this is a COW submap */ + /* just back the range with a */ + /* anonymous entry */ + if (entry->vme_end < dst_end) { + sub_end = entry->vme_end; + } else { + sub_end = dst_end; + } + if (entry->vme_start < base_addr) { + sub_start = base_addr; + } else { + sub_start = entry->vme_start; + } + vm_map_clip_end( + dst_map, entry, sub_end); + vm_map_clip_start( + dst_map, entry, sub_start); + assert(!entry->use_pmap); + assert(!entry->iokit_acct); + entry->use_pmap = TRUE; + entry->is_sub_map = FALSE; + vm_map_deallocate( + VME_SUBMAP(entry)); + VME_OBJECT_SET(entry, VM_OBJECT_NULL); + VME_OFFSET_SET(entry, 0); + entry->is_shared = FALSE; + entry->needs_copy = FALSE; + entry->protection = VM_PROT_DEFAULT; + entry->max_protection = VM_PROT_ALL; + entry->wired_count = 0; + entry->user_wired_count = 0; + if (entry->inheritance + == VM_INHERIT_SHARE) { + entry->inheritance = VM_INHERIT_COPY; + } + continue; + } + /* first take care of any non-sub_map */ + /* entries to send */ + if (base_addr < entry->vme_start) { + /* stuff to send */ + copy_size = + entry->vme_start - base_addr; + break; + } + sub_start = VME_OFFSET(entry); - start += copy_size; - vm_map_lock(dst_map); - if (version.main_timestamp == dst_map->timestamp && - copy_size != 0) { - /* We can safely use saved tmp_entry value */ + if (entry->vme_end < dst_end) { + sub_end = entry->vme_end; + } else { + sub_end = dst_end; + } + sub_end -= entry->vme_start; + sub_end += VME_OFFSET(entry); + local_end = entry->vme_end; + vm_map_unlock(dst_map); + copy_size = sub_end - sub_start; - if (tmp_entry->map_aligned && - !VM_MAP_PAGE_ALIGNED( - start, - VM_MAP_PAGE_MASK(dst_map))) { - /* no longer map-aligned */ - tmp_entry->map_aligned = FALSE; + /* adjust the copy object */ + if (total_size > copy_size) { + vm_map_size_t local_size = 0; + vm_map_size_t entry_size; + + nentries = 1; + new_offset = copy->offset; + copy_entry = vm_map_copy_first_entry(copy); + while (copy_entry != + vm_map_copy_to_entry(copy)) { + entry_size = copy_entry->vme_end - + copy_entry->vme_start; + if ((local_size < copy_size) && + ((local_size + entry_size) + >= copy_size)) { + vm_map_copy_clip_end(copy, + copy_entry, + copy_entry->vme_start + + (copy_size - local_size)); + entry_size = copy_entry->vme_end - + copy_entry->vme_start; + local_size += entry_size; + new_offset += entry_size; + } + if (local_size >= copy_size) { + next_copy = copy_entry->vme_next; + copy_entry->vme_next = + vm_map_copy_to_entry(copy); + previous_prev = + copy->cpy_hdr.links.prev; + copy->cpy_hdr.links.prev = copy_entry; + copy->size = copy_size; + remaining_entries = + copy->cpy_hdr.nentries; + remaining_entries -= nentries; + copy->cpy_hdr.nentries = nentries; + break; + } else { + local_size += entry_size; + new_offset += entry_size; + 
nentries++; + } + copy_entry = copy_entry->vme_next; + } } - vm_map_clip_end(dst_map, tmp_entry, start); - tmp_entry = tmp_entry->vme_next; - } else { - /* Must do lookup of tmp_entry */ - if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) { - vm_map_unlock(dst_map); - return(KERN_INVALID_ADDRESS); + if ((entry->use_pmap) && (pmap == NULL)) { + kr = vm_map_copy_overwrite_nested( + VME_SUBMAP(entry), + sub_start, + copy, + interruptible, + VME_SUBMAP(entry)->pmap, + TRUE); + } else if (pmap != NULL) { + kr = vm_map_copy_overwrite_nested( + VME_SUBMAP(entry), + sub_start, + copy, + interruptible, pmap, + TRUE); + } else { + kr = vm_map_copy_overwrite_nested( + VME_SUBMAP(entry), + sub_start, + copy, + interruptible, + dst_map->pmap, + TRUE); } - if (tmp_entry->map_aligned && - !VM_MAP_PAGE_ALIGNED( - start, - VM_MAP_PAGE_MASK(dst_map))) { - /* no longer map-aligned */ - tmp_entry->map_aligned = FALSE; + if (kr != KERN_SUCCESS) { + if (next_copy != NULL) { + copy->cpy_hdr.nentries += + remaining_entries; + copy->cpy_hdr.links.prev->vme_next = + next_copy; + copy->cpy_hdr.links.prev + = previous_prev; + copy->size = total_size; + } + return kr; } - vm_map_clip_start(dst_map, tmp_entry, start); + if (dst_end <= local_end) { + return KERN_SUCCESS; + } + /* otherwise copy no longer exists, it was */ + /* destroyed after successful copy_overwrite */ + copy = vm_map_copy_allocate(); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->offset = new_offset; + copy->cpy_hdr.page_shift = copy_page_shift; + + /* + * XXX FBDP + * this does not seem to deal with + * the VM map store (R&B tree) + */ + + total_size -= copy_size; + copy_size = 0; + /* put back remainder of copy in container */ + if (next_copy != NULL) { + copy->cpy_hdr.nentries = remaining_entries; + copy->cpy_hdr.links.next = next_copy; + copy->cpy_hdr.links.prev = previous_prev; + copy->size = total_size; + next_copy->vme_prev = + vm_map_copy_to_entry(copy); + next_copy = NULL; + } + base_addr = local_end; + vm_map_lock(dst_map); + if (!vm_map_lookup_entry(dst_map, + local_end, &tmp_entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + entry = tmp_entry; + continue; + } + if (dst_end <= entry->vme_end) { + copy_size = dst_end - base_addr; + break; } - } - }/* while */ - return(KERN_SUCCESS); -}/* vm_map_copy_overwrite_aligned */ + if ((next == vm_map_to_entry(dst_map)) || + (next->vme_start != entry->vme_end)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } -/* - * Routine: vm_map_copyin_kernel_buffer [internal use only] - * - * Description: - * Copy in data to a kernel buffer from space in the - * source map. The original space may be optionally - * deallocated. - * - * If successful, returns a new copy object. 
- */ -static kern_return_t -vm_map_copyin_kernel_buffer( - vm_map_t src_map, - vm_map_offset_t src_addr, - vm_map_size_t len, - boolean_t src_destroy, - vm_map_copy_t *copy_result) -{ - kern_return_t kr; - vm_map_copy_t copy; - vm_size_t kalloc_size; + entry = next; + }/* for */ - if ((vm_size_t) len != len) { - /* "len" is too big and doesn't fit in a "vm_size_t" */ - return KERN_RESOURCE_SHORTAGE; - } - kalloc_size = (vm_size_t) (sizeof(struct vm_map_copy) + len); - assert((vm_map_size_t) kalloc_size == sizeof (struct vm_map_copy) + len); + next_copy = NULL; + nentries = 1; - copy = (vm_map_copy_t) kalloc(kalloc_size); - if (copy == VM_MAP_COPY_NULL) { - return KERN_RESOURCE_SHORTAGE; - } - copy->type = VM_MAP_COPY_KERNEL_BUFFER; - copy->size = len; - copy->offset = 0; - copy->cpy_kdata = (void *) (copy + 1); - copy->cpy_kalloc_size = kalloc_size; + /* adjust the copy object */ + if (total_size > copy_size) { + vm_map_size_t local_size = 0; + vm_map_size_t entry_size; - kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t) len); - if (kr != KERN_SUCCESS) { - kfree(copy, kalloc_size); - return kr; - } - if (src_destroy) { - (void) vm_map_remove( - src_map, - vm_map_trunc_page(src_addr, - VM_MAP_PAGE_MASK(src_map)), - vm_map_round_page(src_addr + len, - VM_MAP_PAGE_MASK(src_map)), - (VM_MAP_REMOVE_INTERRUPTIBLE | - VM_MAP_REMOVE_WAIT_FOR_KWIRE | - (src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : 0)); + new_offset = copy->offset; + copy_entry = vm_map_copy_first_entry(copy); + while (copy_entry != vm_map_copy_to_entry(copy)) { + entry_size = copy_entry->vme_end - + copy_entry->vme_start; + if ((local_size < copy_size) && + ((local_size + entry_size) + >= copy_size)) { + vm_map_copy_clip_end(copy, copy_entry, + copy_entry->vme_start + + (copy_size - local_size)); + entry_size = copy_entry->vme_end - + copy_entry->vme_start; + local_size += entry_size; + new_offset += entry_size; + } + if (local_size >= copy_size) { + next_copy = copy_entry->vme_next; + copy_entry->vme_next = + vm_map_copy_to_entry(copy); + previous_prev = + copy->cpy_hdr.links.prev; + copy->cpy_hdr.links.prev = copy_entry; + copy->size = copy_size; + remaining_entries = + copy->cpy_hdr.nentries; + remaining_entries -= nentries; + copy->cpy_hdr.nentries = nentries; + break; + } else { + local_size += entry_size; + new_offset += entry_size; + nentries++; + } + copy_entry = copy_entry->vme_next; + } + } + + if (aligned) { + pmap_t local_pmap; + + if (pmap) { + local_pmap = pmap; + } else { + local_pmap = dst_map->pmap; + } + + if ((kr = vm_map_copy_overwrite_aligned( + dst_map, tmp_entry, copy, + base_addr, local_pmap)) != KERN_SUCCESS) { + if (next_copy != NULL) { + copy->cpy_hdr.nentries += + remaining_entries; + copy->cpy_hdr.links.prev->vme_next = + next_copy; + copy->cpy_hdr.links.prev = + previous_prev; + copy->size += copy_size; + } + return kr; + } + vm_map_unlock(dst_map); + } else { + /* + * Performance gain: + * + * if the copy and dst address are misaligned but the same + * offset within the page we can copy_not_aligned the + * misaligned parts and copy aligned the rest. If they are + * aligned but len is unaligned we simply need to copy + * the end bit unaligned. We'll need to split the misaligned + * bits of the region in this case ! 
+ */ + /* ALWAYS UNLOCKS THE dst_map MAP */ + kr = vm_map_copy_overwrite_unaligned( + dst_map, + tmp_entry, + copy, + base_addr, + discard_on_success); + if (kr != KERN_SUCCESS) { + if (next_copy != NULL) { + copy->cpy_hdr.nentries += + remaining_entries; + copy->cpy_hdr.links.prev->vme_next = + next_copy; + copy->cpy_hdr.links.prev = + previous_prev; + copy->size += copy_size; + } + return kr; + } + } + total_size -= copy_size; + if (total_size == 0) { + break; + } + base_addr += copy_size; + copy_size = 0; + copy->offset = new_offset; + if (next_copy != NULL) { + copy->cpy_hdr.nentries = remaining_entries; + copy->cpy_hdr.links.next = next_copy; + copy->cpy_hdr.links.prev = previous_prev; + next_copy->vme_prev = vm_map_copy_to_entry(copy); + copy->size = total_size; + } + vm_map_lock(dst_map); + while (TRUE) { + if (!vm_map_lookup_entry(dst_map, + base_addr, &tmp_entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + if (tmp_entry->in_transition) { + entry->needs_wakeup = TRUE; + vm_map_entry_wait(dst_map, THREAD_UNINT); + } else { + break; + } + } + vm_map_clip_start(dst_map, + tmp_entry, + vm_map_trunc_page(base_addr, + VM_MAP_PAGE_MASK(dst_map))); + + entry = tmp_entry; + } /* while */ + + /* + * Throw away the vm_map_copy object + */ + if (discard_on_success) { + vm_map_copy_discard(copy); } - *copy_result = copy; + return KERN_SUCCESS; -} +}/* vm_map_copy_overwrite */ -/* - * Routine: vm_map_copyout_kernel_buffer [internal use only] - * - * Description: - * Copy out data from a kernel buffer into space in the - * destination map. The space may be otpionally dynamically - * allocated. - * - * If successful, consumes the copy object. - * Otherwise, the caller is responsible for it. - */ -static int vm_map_copyout_kernel_buffer_failures = 0; -static kern_return_t -vm_map_copyout_kernel_buffer( - vm_map_t map, - vm_map_address_t *addr, /* IN/OUT */ - vm_map_copy_t copy, - boolean_t overwrite, - boolean_t consume_on_success) +kern_return_t +vm_map_copy_overwrite( + vm_map_t dst_map, + vm_map_offset_t dst_addr, + vm_map_copy_t copy, + vm_map_size_t copy_size, + boolean_t interruptible) { - kern_return_t kr = KERN_SUCCESS; - thread_t thread = current_thread(); + vm_map_size_t head_size, tail_size; + vm_map_copy_t head_copy, tail_copy; + vm_map_offset_t head_addr, tail_addr; + vm_map_entry_t entry; + kern_return_t kr; + vm_map_offset_t effective_page_mask, effective_page_size; + int copy_page_shift; - if (!overwrite) { + head_size = 0; + tail_size = 0; + head_copy = NULL; + tail_copy = NULL; + head_addr = 0; + tail_addr = 0; + if (interruptible || + copy == VM_MAP_COPY_NULL || + copy->type != VM_MAP_COPY_ENTRY_LIST) { /* - * Allocate space in the target map for the data + * We can't split the "copy" map if we're interruptible + * or if we don't have a "copy" map... */ - *addr = 0; - kr = vm_map_enter(map, - addr, - vm_map_round_page(copy->size, - VM_MAP_PAGE_MASK(map)), - (vm_map_offset_t) 0, - VM_FLAGS_ANYWHERE, - VM_OBJECT_NULL, - (vm_object_offset_t) 0, - FALSE, - VM_PROT_DEFAULT, - VM_PROT_ALL, - VM_INHERIT_DEFAULT); - if (kr != KERN_SUCCESS) - return kr; +blunt_copy: + return vm_map_copy_overwrite_nested(dst_map, + dst_addr, + copy, + interruptible, + (pmap_t) NULL, + TRUE); } - /* - * Copyout the data from the kernel buffer to the target map. 
- */ - if (thread->map == map) { - + copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy); + if (copy_page_shift < PAGE_SHIFT || + VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) { + goto blunt_copy; + } + + if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) { + effective_page_mask = VM_MAP_PAGE_MASK(dst_map); + } else { + effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK); + effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy), + effective_page_mask); + } + effective_page_size = effective_page_mask + 1; + + if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) { /* - * If the target map is the current map, just do - * the copy. + * Too small to bother with optimizing... */ - assert((vm_size_t) copy->size == copy->size); - if (copyout(copy->cpy_kdata, *addr, (vm_size_t) copy->size)) { - kr = KERN_INVALID_ADDRESS; - } + goto blunt_copy; } - else { - vm_map_t oldmap; + if ((dst_addr & effective_page_mask) != + (copy->offset & effective_page_mask)) { /* - * If the target map is another map, assume the - * target's address space identity for the duration - * of the copy. + * Incompatible mis-alignment of source and destination... */ - vm_map_reference(map); - oldmap = vm_map_switch(map); + goto blunt_copy; + } - assert((vm_size_t) copy->size == copy->size); - if (copyout(copy->cpy_kdata, *addr, (vm_size_t) copy->size)) { - vm_map_copyout_kernel_buffer_failures++; - kr = KERN_INVALID_ADDRESS; - } - - (void) vm_map_switch(oldmap); - vm_map_deallocate(map); + /* + * Proper alignment or identical mis-alignment at the beginning. + * Let's try and do a small unaligned copy first (if needed) + * and then an aligned copy for the rest. + */ + if (!vm_map_page_aligned(dst_addr, effective_page_mask)) { + head_addr = dst_addr; + head_size = (effective_page_size - + (copy->offset & effective_page_mask)); + head_size = MIN(head_size, copy_size); + } + if (!vm_map_page_aligned(copy->offset + copy_size, + effective_page_mask)) { + /* + * Mis-alignment at the end. + * Do an aligned copy up to the last page and + * then an unaligned copy for the remaining bytes. + */ + tail_size = ((copy->offset + copy_size) & + effective_page_mask); + tail_size = MIN(tail_size, copy_size); + tail_addr = dst_addr + copy_size - tail_size; + assert(tail_addr >= head_addr + head_size); } + assert(head_size + tail_size <= copy_size); - if (kr != KERN_SUCCESS) { - /* the copy failed, clean up */ - if (!overwrite) { - /* - * Deallocate the space we allocated in the target map. - */ - (void) vm_map_remove( - map, - vm_map_trunc_page(*addr, - VM_MAP_PAGE_MASK(map)), - vm_map_round_page((*addr + - vm_map_round_page(copy->size, - VM_MAP_PAGE_MASK(map))), - VM_MAP_PAGE_MASK(map)), - VM_MAP_NO_FLAGS); - *addr = 0; - } - } else { - /* copy was successful, dicard the copy structure */ - if (consume_on_success) { - kfree(copy, copy->cpy_kalloc_size); + if (head_size + tail_size == copy_size) { + /* + * It's all unaligned, no optimization possible... + */ + goto blunt_copy; + } + + /* + * Can't optimize if there are any submaps in the + * destination due to the way we free the "copy" map + * progressively in vm_map_copy_overwrite_nested() + * in that case. 
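+ *
+ * To illustrate the head/tail split with hypothetical numbers
+ * (4KB effective pages): for copy->offset = 0x1800, copy_size =
+ * 0x5000 and a dst_addr with the same 0x800 offset within the page,
+ * head_size = 0x1000 - 0x800 = 0x800 and
+ * tail_size = (0x1800 + 0x5000) & 0xfff = 0x800, leaving an aligned
+ * middle of 0x4000 bytes for the fast path below.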
+ */ + vm_map_lock_read(dst_map); + if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) { + vm_map_unlock_read(dst_map); + goto blunt_copy; + } + for (; + (entry != vm_map_copy_to_entry(copy) && + entry->vme_start < dst_addr + copy_size); + entry = entry->vme_next) { + if (entry->is_sub_map) { + vm_map_unlock_read(dst_map); + goto blunt_copy; } } + vm_map_unlock_read(dst_map); - return kr; -} - -/* - * Macro: vm_map_copy_insert - * - * Description: - * Link a copy chain ("copy") into a map at the - * specified location (after "where"). - * Side effects: - * The copy chain is destroyed. - * Warning: - * The arguments are evaluated multiple times. - */ -#define vm_map_copy_insert(map, where, copy) \ -MACRO_BEGIN \ - vm_map_store_copy_insert(map, where, copy); \ - zfree(vm_map_copy_zone, copy); \ -MACRO_END + if (head_size) { + /* + * Unaligned copy of the first "head_size" bytes, to reach + * a page boundary. + */ -void -vm_map_copy_remap( - vm_map_t map, - vm_map_entry_t where, - vm_map_copy_t copy, - vm_map_offset_t adjustment, - vm_prot_t cur_prot, - vm_prot_t max_prot, - vm_inherit_t inheritance) -{ - vm_map_entry_t copy_entry, new_entry; + /* + * Extract "head_copy" out of "copy". + */ + head_copy = vm_map_copy_allocate(); + head_copy->type = VM_MAP_COPY_ENTRY_LIST; + head_copy->cpy_hdr.entries_pageable = + copy->cpy_hdr.entries_pageable; + vm_map_store_init(&head_copy->cpy_hdr); + head_copy->cpy_hdr.page_shift = copy_page_shift; - for (copy_entry = vm_map_copy_first_entry(copy); - copy_entry != vm_map_copy_to_entry(copy); - copy_entry = copy_entry->vme_next) { - /* get a new VM map entry for the map */ - new_entry = vm_map_entry_create(map, - !map->hdr.entries_pageable); - /* copy the "copy entry" to the new entry */ - vm_map_entry_copy(new_entry, copy_entry); - /* adjust "start" and "end" */ - new_entry->vme_start += adjustment; - new_entry->vme_end += adjustment; - /* clear some attributes */ - new_entry->inheritance = inheritance; - new_entry->protection = cur_prot; - new_entry->max_protection = max_prot; - new_entry->behavior = VM_BEHAVIOR_DEFAULT; - /* take an extra reference on the entry's "object" */ - if (new_entry->is_sub_map) { - assert(!new_entry->use_pmap); /* not nested */ - vm_map_lock(new_entry->object.sub_map); - vm_map_reference(new_entry->object.sub_map); - vm_map_unlock(new_entry->object.sub_map); - } else { - vm_object_reference(new_entry->object.vm_object); + entry = vm_map_copy_first_entry(copy); + if (entry->vme_end < copy->offset + head_size) { + head_size = entry->vme_end - copy->offset; } - /* insert the new entry in the map */ - vm_map_store_entry_link(map, where, new_entry); - /* continue inserting the "copy entries" after the new entry */ - where = new_entry; - } -} - -/* - * Routine: vm_map_copyout - * - * Description: - * Copy out a copy chain ("copy") into newly-allocated - * space in the destination map. - * - * If successful, consumes the copy object. - * Otherwise, the caller is responsible for it. 
- */ - -kern_return_t -vm_map_copyout( - vm_map_t dst_map, - vm_map_address_t *dst_addr, /* OUT */ - vm_map_copy_t copy) -{ - return vm_map_copyout_internal(dst_map, dst_addr, copy, - TRUE, /* consume_on_success */ - VM_PROT_DEFAULT, - VM_PROT_ALL, - VM_INHERIT_DEFAULT); -} -kern_return_t -vm_map_copyout_internal( - vm_map_t dst_map, - vm_map_address_t *dst_addr, /* OUT */ - vm_map_copy_t copy, - boolean_t consume_on_success, - vm_prot_t cur_protection, - vm_prot_t max_protection, - vm_inherit_t inheritance) -{ - vm_map_size_t size; - vm_map_size_t adjustment; - vm_map_offset_t start; - vm_object_offset_t vm_copy_start; - vm_map_entry_t last; - vm_map_entry_t entry; + head_copy->offset = copy->offset; + head_copy->size = head_size; + copy->offset += head_size; + copy->size -= head_size; + copy_size -= head_size; + assert(copy_size > 0); - /* - * Check for null copy object. - */ + vm_map_copy_clip_end(copy, entry, copy->offset); + vm_map_copy_entry_unlink(copy, entry); + vm_map_copy_entry_link(head_copy, + vm_map_copy_to_entry(head_copy), + entry); - if (copy == VM_MAP_COPY_NULL) { - *dst_addr = 0; - return(KERN_SUCCESS); + /* + * Do the unaligned copy. + */ + kr = vm_map_copy_overwrite_nested(dst_map, + head_addr, + head_copy, + interruptible, + (pmap_t) NULL, + FALSE); + if (kr != KERN_SUCCESS) { + goto done; + } } - /* - * Check for special copy object, created - * by vm_map_copyin_object. - */ + if (tail_size) { + /* + * Extract "tail_copy" out of "copy". + */ + tail_copy = vm_map_copy_allocate(); + tail_copy->type = VM_MAP_COPY_ENTRY_LIST; + tail_copy->cpy_hdr.entries_pageable = + copy->cpy_hdr.entries_pageable; + vm_map_store_init(&tail_copy->cpy_hdr); + tail_copy->cpy_hdr.page_shift = copy_page_shift; - if (copy->type == VM_MAP_COPY_OBJECT) { - vm_object_t object = copy->cpy_object; - kern_return_t kr; - vm_object_offset_t offset; + tail_copy->offset = copy->offset + copy_size - tail_size; + tail_copy->size = tail_size; - offset = vm_object_trunc_page(copy->offset); - size = vm_map_round_page((copy->size + - (vm_map_size_t)(copy->offset - - offset)), - VM_MAP_PAGE_MASK(dst_map)); - *dst_addr = 0; - kr = vm_map_enter(dst_map, dst_addr, size, - (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE, - object, offset, FALSE, - VM_PROT_DEFAULT, VM_PROT_ALL, - VM_INHERIT_DEFAULT); - if (kr != KERN_SUCCESS) - return(kr); - /* Account for non-pagealigned copy object */ - *dst_addr += (vm_map_offset_t)(copy->offset - offset); - if (consume_on_success) - zfree(vm_map_copy_zone, copy); - return(KERN_SUCCESS); + copy->size -= tail_size; + copy_size -= tail_size; + assert(copy_size > 0); + + entry = vm_map_copy_last_entry(copy); + vm_map_copy_clip_start(copy, entry, tail_copy->offset); + entry = vm_map_copy_last_entry(copy); + vm_map_copy_entry_unlink(copy, entry); + vm_map_copy_entry_link(tail_copy, + vm_map_copy_last_entry(tail_copy), + entry); } /* - * Check for special kernel buffer allocated - * by new_ipc_kmsg_copyin. + * If we are here from ipc_kmsg_copyout_ool_descriptor(), + * we want to avoid TOCTOU issues w.r.t copy->size but + * we don't need to change vm_map_copy_overwrite_nested() + * and all other vm_map_copy_overwrite variants. + * + * So we assign the original copy_size that was passed into + * this routine back to copy. + * + * This use of local 'copy_size' passed into this routine is + * to try and protect against TOCTOU attacks where the kernel + * has been exploited. We don't expect this to be an issue + * during normal system operation. 
*/ - - if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { - return vm_map_copyout_kernel_buffer(dst_map, dst_addr, - copy, FALSE, - consume_on_success); - } - + assertf(copy->size == copy_size, + "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size); + copy->size = copy_size; /* - * Find space for the data + * Copy most (or possibly all) of the data. */ + kr = vm_map_copy_overwrite_nested(dst_map, + dst_addr + head_size, + copy, + interruptible, + (pmap_t) NULL, + FALSE); + if (kr != KERN_SUCCESS) { + goto done; + } - vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset, - VM_MAP_COPY_PAGE_MASK(copy)); - size = vm_map_round_page((vm_map_size_t)copy->offset + copy->size, - VM_MAP_COPY_PAGE_MASK(copy)) - - vm_copy_start; - - -StartAgain: ; + if (tail_size) { + kr = vm_map_copy_overwrite_nested(dst_map, + tail_addr, + tail_copy, + interruptible, + (pmap_t) NULL, + FALSE); + } - vm_map_lock(dst_map); - if( dst_map->disable_vmentry_reuse == TRUE) { - VM_MAP_HIGHEST_ENTRY(dst_map, entry, start); - last = entry; +done: + assert(copy->type == VM_MAP_COPY_ENTRY_LIST); + if (kr == KERN_SUCCESS) { + /* + * Discard all the copy maps. + */ + if (head_copy) { + vm_map_copy_discard(head_copy); + head_copy = NULL; + } + vm_map_copy_discard(copy); + if (tail_copy) { + vm_map_copy_discard(tail_copy); + tail_copy = NULL; + } } else { - assert(first_free_is_valid(dst_map)); - start = ((last = dst_map->first_free) == vm_map_to_entry(dst_map)) ? - vm_map_min(dst_map) : last->vme_end; - start = vm_map_round_page(start, - VM_MAP_PAGE_MASK(dst_map)); + /* + * Re-assemble the original copy map. + */ + if (head_copy) { + entry = vm_map_copy_first_entry(head_copy); + vm_map_copy_entry_unlink(head_copy, entry); + vm_map_copy_entry_link(copy, + vm_map_copy_to_entry(copy), + entry); + copy->offset -= head_size; + copy->size += head_size; + vm_map_copy_discard(head_copy); + head_copy = NULL; + } + if (tail_copy) { + entry = vm_map_copy_last_entry(tail_copy); + vm_map_copy_entry_unlink(tail_copy, entry); + vm_map_copy_entry_link(copy, + vm_map_copy_last_entry(copy), + entry); + copy->size += tail_size; + vm_map_copy_discard(tail_copy); + tail_copy = NULL; + } } + return kr; +} - while (TRUE) { - vm_map_entry_t next = last->vme_next; - vm_map_offset_t end = start + size; - if ((end > dst_map->max_offset) || (end < start)) { - if (dst_map->wait_for_space) { - if (size <= (dst_map->max_offset - dst_map->min_offset)) { - assert_wait((event_t) dst_map, - THREAD_INTERRUPTIBLE); - vm_map_unlock(dst_map); - thread_block(THREAD_CONTINUE_NULL); - goto StartAgain; - } - } - vm_map_unlock(dst_map); - return(KERN_NO_SPACE); - } +/* + * Routine: vm_map_copy_overwrite_unaligned [internal use only] + * + * Decription: + * Physically copy unaligned data + * + * Implementation: + * Unaligned parts of pages have to be physically copied. We use + * a modified form of vm_fault_copy (which understands none-aligned + * page offsets and sizes) to do the copy. We attempt to copy as + * much memory in one go as possibly, however vm_fault_copy copies + * within 1 memory object so we have to find the smaller of "amount left" + * "source object data size" and "target object data size". With + * unaligned data we don't need to split regions, therefore the source + * (copy) object should be one map entry, the target range may be split + * over multiple map entries however. In any event we are pessimistic + * about these assumptions. 
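+ *
+ * Concretely, each pass of the loop below copies
+ * min(bytes left in the destination entry, bytes left in the
+ * source copy entry, amount_left) with vm_fault_copy(), then
+ * advances whichever side was exhausted.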
+ * + * Assumptions: + * dst_map is locked on entry and is return locked on success, + * unlocked on error. + */ - if ((next == vm_map_to_entry(dst_map)) || - (next->vme_start >= end)) - break; +static kern_return_t +vm_map_copy_overwrite_unaligned( + vm_map_t dst_map, + vm_map_entry_t entry, + vm_map_copy_t copy, + vm_map_offset_t start, + boolean_t discard_on_success) +{ + vm_map_entry_t copy_entry; + vm_map_entry_t copy_entry_next; + vm_map_version_t version; + vm_object_t dst_object; + vm_object_offset_t dst_offset; + vm_object_offset_t src_offset; + vm_object_offset_t entry_offset; + vm_map_offset_t entry_end; + vm_map_size_t src_size, + dst_size, + copy_size, + amount_left; + kern_return_t kr = KERN_SUCCESS; - last = next; - start = last->vme_end; - start = vm_map_round_page(start, - VM_MAP_PAGE_MASK(dst_map)); - } - adjustment = start - vm_copy_start; - if (! consume_on_success) { - /* - * We're not allowed to consume "copy", so we'll have to - * copy its map entries into the destination map below. - * No need to re-allocate map entries from the correct - * (pageable or not) zone, since we'll get new map entries - * during the transfer. - * We'll also adjust the map entries's "start" and "end" - * during the transfer, to keep "copy"'s entries consistent - * with its "offset". - */ - goto after_adjustments; - } + copy_entry = vm_map_copy_first_entry(copy); - /* - * Since we're going to just drop the map - * entries from the copy into the destination - * map, they must come from the same pool. - */ + vm_map_lock_write_to_read(dst_map); - if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) { - /* - * Mismatches occur when dealing with the default - * pager. - */ - zone_t old_zone; - vm_map_entry_t next, new; + src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy)); + amount_left = copy->size; +/* + * unaligned so we never clipped this entry, we need the offset into + * the vm_object not just the data. + */ + while (amount_left > 0) { + if (entry == vm_map_to_entry(dst_map)) { + vm_map_unlock_read(dst_map); + return KERN_INVALID_ADDRESS; + } - /* - * Find the zone that the copies were allocated from - */ + /* "start" must be within the current map entry */ + assert((start >= entry->vme_start) && (start < entry->vme_end)); - entry = vm_map_copy_first_entry(copy); + dst_offset = start - entry->vme_start; - /* - * Reinitialize the copy so that vm_map_copy_entry_link - * will work. - */ - vm_map_store_copy_reset(copy, entry); - copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable; + dst_size = entry->vme_end - start; - /* - * Copy each entry. - */ - while (entry != vm_map_copy_to_entry(copy)) { - new = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); - vm_map_entry_copy_full(new, entry); - assert(!new->iokit_acct); - if (new->is_sub_map) { - /* clr address space specifics */ - new->use_pmap = FALSE; - } - vm_map_copy_entry_link(copy, - vm_map_copy_last_entry(copy), - new); - next = entry->vme_next; - old_zone = entry->from_reserved_zone ? vm_map_entry_reserved_zone : vm_map_entry_zone; - zfree(old_zone, entry); - entry = next; - } - } - - /* - * Adjust the addresses in the copy chain, and - * reset the region attributes. 
- */ + src_size = copy_entry->vme_end - + (copy_entry->vme_start + src_offset); - for (entry = vm_map_copy_first_entry(copy); - entry != vm_map_copy_to_entry(copy); - entry = entry->vme_next) { - if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) { - /* - * We're injecting this copy entry into a map that - * has the standard page alignment, so clear - * "map_aligned" (which might have been inherited - * from the original map entry). - */ - entry->map_aligned = FALSE; + if (dst_size < src_size) { +/* + * we can only copy dst_size bytes before + * we have to get the next destination entry + */ + copy_size = dst_size; + } else { +/* + * we can only copy src_size bytes before + * we have to get the next source copy entry + */ + copy_size = src_size; } - entry->vme_start += adjustment; - entry->vme_end += adjustment; + if (copy_size > amount_left) { + copy_size = amount_left; + } +/* + * Entry needs copy, create a shadow shadow object for + * Copy on write region. + */ + if (entry->needs_copy && + ((entry->protection & VM_PROT_WRITE) != 0)) { + if (vm_map_lock_read_to_write(dst_map)) { + vm_map_lock_read(dst_map); + goto RetryLookup; + } + VME_OBJECT_SHADOW(entry, + (vm_map_size_t)(entry->vme_end + - entry->vme_start)); + entry->needs_copy = FALSE; + vm_map_lock_write_to_read(dst_map); + } + dst_object = VME_OBJECT(entry); +/* + * unlike with the virtual (aligned) copy we're going + * to fault on it therefore we need a target object. + */ + if (dst_object == VM_OBJECT_NULL) { + if (vm_map_lock_read_to_write(dst_map)) { + vm_map_lock_read(dst_map); + goto RetryLookup; + } + dst_object = vm_object_allocate((vm_map_size_t) + entry->vme_end - entry->vme_start); + VME_OBJECT_SET(entry, dst_object); + VME_OFFSET_SET(entry, 0); + assert(entry->use_pmap); + vm_map_lock_write_to_read(dst_map); + } +/* + * Take an object reference and unlock map. The "entry" may + * disappear or change when the map is unlocked. + */ + vm_object_reference(dst_object); + version.main_timestamp = dst_map->timestamp; + entry_offset = VME_OFFSET(entry); + entry_end = entry->vme_end; + vm_map_unlock_read(dst_map); +/* + * Copy as much as possible in one pass + */ + kr = vm_fault_copy( + VME_OBJECT(copy_entry), + VME_OFFSET(copy_entry) + src_offset, + ©_size, + dst_object, + entry_offset + dst_offset, + dst_map, + &version, + THREAD_UNINT ); - if (entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, - VM_MAP_PAGE_MASK(dst_map))); - assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, - VM_MAP_PAGE_MASK(dst_map))); + start += copy_size; + src_offset += copy_size; + amount_left -= copy_size; +/* + * Release the object reference + */ + vm_object_deallocate(dst_object); +/* + * If a hard error occurred, return it now + */ + if (kr != KERN_SUCCESS) { + return kr; } - entry->inheritance = VM_INHERIT_DEFAULT; - entry->protection = VM_PROT_DEFAULT; - entry->max_protection = VM_PROT_ALL; - entry->behavior = VM_BEHAVIOR_DEFAULT; + if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end + || amount_left == 0) { +/* + * all done with this copy entry, dispose. + */ + copy_entry_next = copy_entry->vme_next; - /* - * If the entry is now wired, - * map the pages into the destination map. 
- */ - if (entry->wired_count != 0) { - register vm_map_offset_t va; - vm_object_offset_t offset; - register vm_object_t object; - vm_prot_t prot; - int type_of_fault; + if (discard_on_success) { + vm_map_copy_entry_unlink(copy, copy_entry); + assert(!copy_entry->is_sub_map); + vm_object_deallocate(VME_OBJECT(copy_entry)); + vm_map_copy_entry_dispose(copy, copy_entry); + } - object = entry->object.vm_object; - offset = entry->offset; - va = entry->vme_start; + if (copy_entry_next == vm_map_copy_to_entry(copy) && + amount_left) { +/* + * not finished copying but run out of source + */ + return KERN_INVALID_ADDRESS; + } - pmap_pageable(dst_map->pmap, - entry->vme_start, - entry->vme_end, - TRUE); + copy_entry = copy_entry_next; - while (va < entry->vme_end) { - register vm_page_t m; + src_offset = 0; + } - /* - * Look up the page in the object. - * Assert that the page will be found in the - * top object: - * either - * the object was newly created by - * vm_object_copy_slowly, and has - * copies of all of the pages from - * the source object - * or - * the object was moved from the old - * map entry; because the old map - * entry was wired, all of the pages - * were in the top-level object. - * (XXX not true if we wire pages for - * reading) - */ - vm_object_lock(object); + if (amount_left == 0) { + return KERN_SUCCESS; + } - m = vm_page_lookup(object, offset); - if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) || - m->absent) - panic("vm_map_copyout: wiring %p", m); + vm_map_lock_read(dst_map); + if (version.main_timestamp == dst_map->timestamp) { + if (start == entry_end) { +/* + * destination region is split. Use the version + * information to avoid a lookup in the normal + * case. + */ + entry = entry->vme_next; +/* + * should be contiguous. Fail if we encounter + * a hole in the destination. + */ + if (start != entry->vme_start) { + vm_map_unlock_read(dst_map); + return KERN_INVALID_ADDRESS; + } + } + } else { +/* + * Map version check failed. + * we must lookup the entry because somebody + * might have changed the map behind our backs. + */ +RetryLookup: + if (!vm_map_lookup_entry(dst_map, start, &entry)) { + vm_map_unlock_read(dst_map); + return KERN_INVALID_ADDRESS; + } + } + }/* while */ - /* - * ENCRYPTED SWAP: - * The page is assumed to be wired here, so it - * shouldn't be encrypted. Otherwise, we - * couldn't enter it in the page table, since - * we don't want the user to see the encrypted - * data. - */ - ASSERT_PAGE_DECRYPTED(m); + return KERN_SUCCESS; +}/* vm_map_copy_overwrite_unaligned */ - prot = entry->protection; +/* + * Routine: vm_map_copy_overwrite_aligned [internal use only] + * + * Description: + * Does all the vm_trickery possible for whole pages. + * + * Implementation: + * + * If there are no permanent objects in the destination, + * and the source and destination map entry zones match, + * and the destination map entry is not shared, + * then the map entries can be deleted and replaced + * with those from the copy. The following code is the + * basic idea of what to do, but there are lots of annoying + * little details about getting protection and inheritance + * right. Should add protection, inheritance, and sharing checks + * to the above pass and make sure that no wiring is involved. 
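The "temporary unshared memory" test that the aligned path applies to each destination entry can be summarized with a small model. The struct and field names below are placeholders, not the kernel's types; they only mirror the fields the real check reads (is_shared, internal, true_share, needs_copy), which appear later in this hunk.

#include <stdbool.h>
#include <stddef.h>

/* Toy model of the fields the aligned path inspects. */
struct toy_object {
	bool internal;    /* anonymous memory, no external pager */
	bool true_share;  /* explicitly shared object */
};

struct toy_entry {
	struct toy_object *object;  /* NULL means no backing object yet */
	bool is_shared;
	bool needs_copy;
};

/*
 * Can the destination entry simply be thrown away and replaced with the
 * source copy entry (fast path), or must the data be physically copied?
 */
static bool
can_substitute_entry(const struct toy_entry *e)
{
	if (e->needs_copy) {
		return true;
	}
	return !e->is_shared &&
	       (e->object == NULL ||
	        (e->object->internal && !e->object->true_share));
}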
+ */ - if (override_nx(dst_map, entry->alias) && prot) - prot |= VM_PROT_EXECUTE; +int vm_map_copy_overwrite_aligned_src_not_internal = 0; +int vm_map_copy_overwrite_aligned_src_not_symmetric = 0; +int vm_map_copy_overwrite_aligned_src_large = 0; - type_of_fault = DBG_CACHE_HIT_FAULT; +static kern_return_t +vm_map_copy_overwrite_aligned( + vm_map_t dst_map, + vm_map_entry_t tmp_entry, + vm_map_copy_t copy, + vm_map_offset_t start, + __unused pmap_t pmap) +{ + vm_object_t object; + vm_map_entry_t copy_entry; + vm_map_size_t copy_size; + vm_map_size_t size; + vm_map_entry_t entry; - vm_fault_enter(m, dst_map->pmap, va, prot, prot, - VM_PAGE_WIRED(m), FALSE, FALSE, - FALSE, entry->alias, - ((entry->iokit_acct || - (!entry->is_sub_map && - !entry->use_pmap)) - ? PMAP_OPTIONS_ALT_ACCT - : 0), - NULL, &type_of_fault); + while ((copy_entry = vm_map_copy_first_entry(copy)) + != vm_map_copy_to_entry(copy)) { + copy_size = (copy_entry->vme_end - copy_entry->vme_start); - vm_object_unlock(object); + entry = tmp_entry; + if (entry->is_sub_map) { + /* unnested when clipped earlier */ + assert(!entry->use_pmap); + } + if (entry == vm_map_to_entry(dst_map)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + size = (entry->vme_end - entry->vme_start); + /* + * Make sure that no holes popped up in the + * address map, and that the protection is + * still valid, in case the map was unlocked + * earlier. + */ - offset += PAGE_SIZE_64; - va += PAGE_SIZE; - } + if ((entry->vme_start != start) || ((entry->is_sub_map) + && !entry->needs_copy)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; } - } + assert(entry != vm_map_to_entry(dst_map)); -after_adjustments: + /* + * Check protection again + */ - /* - * Correct the page alignment for the result - */ + if (!(entry->protection & VM_PROT_WRITE)) { + vm_map_unlock(dst_map); + return KERN_PROTECTION_FAILURE; + } - *dst_addr = start + (copy->offset - vm_copy_start); + /* + * Adjust to source size first + */ - /* - * Update the hints and the map size - */ + if (copy_size < size) { + if (entry->map_aligned && + !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size, + VM_MAP_PAGE_MASK(dst_map))) { + /* no longer map-aligned */ + entry->map_aligned = FALSE; + } + vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size); + size = copy_size; + } + + /* + * Adjust to destination size + */ + + if (size < copy_size) { + vm_map_copy_clip_end(copy, copy_entry, + copy_entry->vme_start + size); + copy_size = size; + } + + assert((entry->vme_end - entry->vme_start) == size); + assert((tmp_entry->vme_end - tmp_entry->vme_start) == size); + assert((copy_entry->vme_end - copy_entry->vme_start) == size); + + /* + * If the destination contains temporary unshared memory, + * we can perform the copy by throwing it away and + * installing the source data. 
+ */ + + object = VME_OBJECT(entry); + if ((!entry->is_shared && + ((object == VM_OBJECT_NULL) || + (object->internal && !object->true_share))) || + entry->needs_copy) { + vm_object_t old_object = VME_OBJECT(entry); + vm_object_offset_t old_offset = VME_OFFSET(entry); + vm_object_offset_t offset; + + /* + * Ensure that the source and destination aren't + * identical + */ + if (old_object == VME_OBJECT(copy_entry) && + old_offset == VME_OFFSET(copy_entry)) { + vm_map_copy_entry_unlink(copy, copy_entry); + vm_map_copy_entry_dispose(copy, copy_entry); + + if (old_object != VM_OBJECT_NULL) { + vm_object_deallocate(old_object); + } + + start = tmp_entry->vme_end; + tmp_entry = tmp_entry->vme_next; + continue; + } + +#if XNU_TARGET_OS_OSX +#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */ +#define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */ + if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL && + VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE && + copy_size <= __TRADEOFF1_COPY_SIZE) { + /* + * Virtual vs. Physical copy tradeoff #1. + * + * Copying only a few pages out of a large + * object: do a physical copy instead of + * a virtual copy, to avoid possibly keeping + * the entire large object alive because of + * those few copy-on-write pages. + */ + vm_map_copy_overwrite_aligned_src_large++; + goto slow_copy; + } +#endif /* XNU_TARGET_OS_OSX */ + + if ((dst_map->pmap != kernel_pmap) && + (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) && + (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) { + vm_object_t new_object, new_shadow; + + /* + * We're about to map something over a mapping + * established by malloc()... + */ + new_object = VME_OBJECT(copy_entry); + if (new_object != VM_OBJECT_NULL) { + vm_object_lock_shared(new_object); + } + while (new_object != VM_OBJECT_NULL && +#if XNU_TARGET_OS_OSX + !new_object->true_share && + new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && +#endif /* XNU_TARGET_OS_OSX */ + new_object->internal) { + new_shadow = new_object->shadow; + if (new_shadow == VM_OBJECT_NULL) { + break; + } + vm_object_lock_shared(new_shadow); + vm_object_unlock(new_object); + new_object = new_shadow; + } + if (new_object != VM_OBJECT_NULL) { + if (!new_object->internal) { + /* + * The new mapping is backed + * by an external object. We + * don't want malloc'ed memory + * to be replaced with such a + * non-anonymous mapping, so + * let's go off the optimized + * path... + */ + vm_map_copy_overwrite_aligned_src_not_internal++; + vm_object_unlock(new_object); + goto slow_copy; + } +#if XNU_TARGET_OS_OSX + if (new_object->true_share || + new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + /* + * Same if there's a "true_share" + * object in the shadow chain, or + * an object with a non-default + * (SYMMETRIC) copy strategy. + */ + vm_map_copy_overwrite_aligned_src_not_symmetric++; + vm_object_unlock(new_object); + goto slow_copy; + } +#endif /* XNU_TARGET_OS_OSX */ + vm_object_unlock(new_object); + } + /* + * The new mapping is still backed by + * anonymous (internal) memory, so it's + * OK to substitute it for the original + * malloc() mapping. 
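The tradeoff #1 check above is a pure size heuristic: copying only a few pages out of a very large object is done physically so that a handful of copy-on-write pages cannot keep the whole large object alive. A sketch of that heuristic in isolation, reusing the 64 MB / 128 KB thresholds defined in this hunk (the function name is illustrative):

#include <stdbool.h>
#include <stdint.h>

#define TRADEOFF1_OBJ_SIZE   (64ULL * 1024 * 1024)  /* 64 MB, as in the source */
#define TRADEOFF1_COPY_SIZE  (128ULL * 1024)        /* 128 KB, as in the source */

/*
 * Virtual vs. physical copy tradeoff #1: a small copy out of a very large
 * object is performed physically so the COW pages do not pin the object.
 */
static bool
prefer_physical_copy(uint64_t src_object_size, uint64_t copy_size)
{
	return src_object_size >= TRADEOFF1_OBJ_SIZE &&
	       copy_size <= TRADEOFF1_COPY_SIZE;
}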
+ */ + } + + if (old_object != VM_OBJECT_NULL) { + if (entry->is_sub_map) { + if (entry->use_pmap) { +#ifndef NO_NESTED_PMAP + pmap_unnest(dst_map->pmap, + (addr64_t)entry->vme_start, + entry->vme_end - entry->vme_start); +#endif /* NO_NESTED_PMAP */ + if (dst_map->mapped_in_other_pmaps) { + /* clean up parent */ + /* map/maps */ + vm_map_submap_pmap_clean( + dst_map, entry->vme_start, + entry->vme_end, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } + } else { + vm_map_submap_pmap_clean( + dst_map, entry->vme_start, + entry->vme_end, + VME_SUBMAP(entry), + VME_OFFSET(entry)); + } + vm_map_deallocate(VME_SUBMAP(entry)); + } else { + if (dst_map->mapped_in_other_pmaps) { + vm_object_pmap_protect_options( + VME_OBJECT(entry), + VME_OFFSET(entry), + entry->vme_end + - entry->vme_start, + PMAP_NULL, + PAGE_SIZE, + entry->vme_start, + VM_PROT_NONE, + PMAP_OPTIONS_REMOVE); + } else { + pmap_remove_options( + dst_map->pmap, + (addr64_t)(entry->vme_start), + (addr64_t)(entry->vme_end), + PMAP_OPTIONS_REMOVE); + } + vm_object_deallocate(old_object); + } + } + + if (entry->iokit_acct) { + /* keep using iokit accounting */ + entry->use_pmap = FALSE; + } else { + /* use pmap accounting */ + entry->use_pmap = TRUE; + } + entry->is_sub_map = FALSE; + VME_OBJECT_SET(entry, VME_OBJECT(copy_entry)); + object = VME_OBJECT(entry); + entry->needs_copy = copy_entry->needs_copy; + entry->wired_count = 0; + entry->user_wired_count = 0; + offset = VME_OFFSET(copy_entry); + VME_OFFSET_SET(entry, offset); + + vm_map_copy_entry_unlink(copy, copy_entry); + vm_map_copy_entry_dispose(copy, copy_entry); + + /* + * we could try to push pages into the pmap at this point, BUT + * this optimization only saved on average 2 us per page if ALL + * the pages in the source were currently mapped + * and ALL the pages in the dest were touched, if there were fewer + * than 2/3 of the pages touched, this optimization actually cost more cycles + * it also puts a lot of pressure on the pmap layer w/r to mapping structures + */ + + /* + * Set up for the next iteration. The map + * has not been unlocked, so the next + * address should be at the end of this + * entry, and the next map entry should be + * the one following it. + */ + + start = tmp_entry->vme_end; + tmp_entry = tmp_entry->vme_next; + } else { + vm_map_version_t version; + vm_object_t dst_object; + vm_object_offset_t dst_offset; + kern_return_t r; + +slow_copy: + if (entry->needs_copy) { + VME_OBJECT_SHADOW(entry, + (entry->vme_end - + entry->vme_start)); + entry->needs_copy = FALSE; + } + + dst_object = VME_OBJECT(entry); + dst_offset = VME_OFFSET(entry); + + /* + * Take an object reference, and record + * the map version information so that the + * map can be safely unlocked. + */ + + if (dst_object == VM_OBJECT_NULL) { + /* + * We would usually have just taken the + * optimized path above if the destination + * object has not been allocated yet. But we + * now disable that optimization if the copy + * entry's object is not backed by anonymous + * memory to avoid replacing malloc'ed + * (i.e. re-usable) anonymous memory with a + * not-so-anonymous mapping. + * So we have to handle this case here and + * allocate a new VM object for this map entry. 
+ */ + dst_object = vm_object_allocate( + entry->vme_end - entry->vme_start); + dst_offset = 0; + VME_OBJECT_SET(entry, dst_object); + VME_OFFSET_SET(entry, dst_offset); + assert(entry->use_pmap); + } + + vm_object_reference(dst_object); + + /* account for unlock bumping up timestamp */ + version.main_timestamp = dst_map->timestamp + 1; + + vm_map_unlock(dst_map); + + /* + * Copy as much as possible in one pass + */ + + copy_size = size; + r = vm_fault_copy( + VME_OBJECT(copy_entry), + VME_OFFSET(copy_entry), + ©_size, + dst_object, + dst_offset, + dst_map, + &version, + THREAD_UNINT ); + + /* + * Release the object reference + */ + + vm_object_deallocate(dst_object); + + /* + * If a hard error occurred, return it now + */ + + if (r != KERN_SUCCESS) { + return r; + } + + if (copy_size != 0) { + /* + * Dispose of the copied region + */ + + vm_map_copy_clip_end(copy, copy_entry, + copy_entry->vme_start + copy_size); + vm_map_copy_entry_unlink(copy, copy_entry); + vm_object_deallocate(VME_OBJECT(copy_entry)); + vm_map_copy_entry_dispose(copy, copy_entry); + } + + /* + * Pick up in the destination map where we left off. + * + * Use the version information to avoid a lookup + * in the normal case. + */ + + start += copy_size; + vm_map_lock(dst_map); + if (version.main_timestamp == dst_map->timestamp && + copy_size != 0) { + /* We can safely use saved tmp_entry value */ + + if (tmp_entry->map_aligned && + !VM_MAP_PAGE_ALIGNED( + start, + VM_MAP_PAGE_MASK(dst_map))) { + /* no longer map-aligned */ + tmp_entry->map_aligned = FALSE; + } + vm_map_clip_end(dst_map, tmp_entry, start); + tmp_entry = tmp_entry->vme_next; + } else { + /* Must do lookup of tmp_entry */ + + if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) { + vm_map_unlock(dst_map); + return KERN_INVALID_ADDRESS; + } + if (tmp_entry->map_aligned && + !VM_MAP_PAGE_ALIGNED( + start, + VM_MAP_PAGE_MASK(dst_map))) { + /* no longer map-aligned */ + tmp_entry->map_aligned = FALSE; + } + vm_map_clip_start(dst_map, tmp_entry, start); + } + } + }/* while */ + + return KERN_SUCCESS; +}/* vm_map_copy_overwrite_aligned */ + +/* + * Routine: vm_map_copyin_kernel_buffer [internal use only] + * + * Description: + * Copy in data to a kernel buffer from space in the + * source map. The original space may be optionally + * deallocated. + * + * If successful, returns a new copy object. + */ +static kern_return_t +vm_map_copyin_kernel_buffer( + vm_map_t src_map, + vm_map_offset_t src_addr, + vm_map_size_t len, + boolean_t src_destroy, + vm_map_copy_t *copy_result) +{ + kern_return_t kr; + vm_map_copy_t copy; + + if (len > msg_ool_size_small) { + return KERN_INVALID_ARGUMENT; + } + + copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO); + if (copy == VM_MAP_COPY_NULL) { + return KERN_RESOURCE_SHORTAGE; + } + copy->cpy_kdata = kheap_alloc(KHEAP_DATA_BUFFERS, len, Z_WAITOK); + if (copy->cpy_kdata == NULL) { + zfree(vm_map_copy_zone, copy); + return KERN_RESOURCE_SHORTAGE; + } + + copy->type = VM_MAP_COPY_KERNEL_BUFFER; + copy->size = len; + copy->offset = 0; + + kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len); + if (kr != KERN_SUCCESS) { + kheap_free(KHEAP_DATA_BUFFERS, copy->cpy_kdata, len); + zfree(vm_map_copy_zone, copy); + return kr; + } + if (src_destroy) { + (void) vm_map_remove( + src_map, + vm_map_trunc_page(src_addr, + VM_MAP_PAGE_MASK(src_map)), + vm_map_round_page(src_addr + len, + VM_MAP_PAGE_MASK(src_map)), + (VM_MAP_REMOVE_INTERRUPTIBLE | + VM_MAP_REMOVE_WAIT_FOR_KWIRE | + ((src_map == kernel_map) ? 
VM_MAP_REMOVE_KUNWIRE : VM_MAP_REMOVE_NO_FLAGS))); + } + *copy_result = copy; + return KERN_SUCCESS; +} + +/* + * Routine: vm_map_copyout_kernel_buffer [internal use only] + * + * Description: + * Copy out data from a kernel buffer into space in the + * destination map. The space may be otpionally dynamically + * allocated. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + */ +static int vm_map_copyout_kernel_buffer_failures = 0; +static kern_return_t +vm_map_copyout_kernel_buffer( + vm_map_t map, + vm_map_address_t *addr, /* IN/OUT */ + vm_map_copy_t copy, + vm_map_size_t copy_size, + boolean_t overwrite, + boolean_t consume_on_success) +{ + kern_return_t kr = KERN_SUCCESS; + thread_t thread = current_thread(); + + assert(copy->size == copy_size); + + /* + * check for corrupted vm_map_copy structure + */ + if (copy_size > msg_ool_size_small || copy->offset) { + panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld", + (long long)copy->size, (long long)copy->offset); + } + + if (!overwrite) { + /* + * Allocate space in the target map for the data + */ + *addr = 0; + kr = vm_map_enter(map, + addr, + vm_map_round_page(copy_size, + VM_MAP_PAGE_MASK(map)), + (vm_map_offset_t) 0, + VM_FLAGS_ANYWHERE, + VM_MAP_KERNEL_FLAGS_NONE, + VM_KERN_MEMORY_NONE, + VM_OBJECT_NULL, + (vm_object_offset_t) 0, + FALSE, + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) { + return kr; + } +#if KASAN + if (map->pmap == kernel_pmap) { + kasan_notify_address(*addr, copy->size); + } +#endif + } + + /* + * Copyout the data from the kernel buffer to the target map. + */ + if (thread->map == map) { + /* + * If the target map is the current map, just do + * the copy. + */ + assert((vm_size_t)copy_size == copy_size); + if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) { + kr = KERN_INVALID_ADDRESS; + } + } else { + vm_map_t oldmap; + + /* + * If the target map is another map, assume the + * target's address space identity for the duration + * of the copy. + */ + vm_map_reference(map); + oldmap = vm_map_switch(map); + + assert((vm_size_t)copy_size == copy_size); + if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) { + vm_map_copyout_kernel_buffer_failures++; + kr = KERN_INVALID_ADDRESS; + } + + (void) vm_map_switch(oldmap); + vm_map_deallocate(map); + } + + if (kr != KERN_SUCCESS) { + /* the copy failed, clean up */ + if (!overwrite) { + /* + * Deallocate the space we allocated in the target map. + */ + (void) vm_map_remove( + map, + vm_map_trunc_page(*addr, + VM_MAP_PAGE_MASK(map)), + vm_map_round_page((*addr + + vm_map_round_page(copy_size, + VM_MAP_PAGE_MASK(map))), + VM_MAP_PAGE_MASK(map)), + VM_MAP_REMOVE_NO_FLAGS); + *addr = 0; + } + } else { + /* copy was successful, dicard the copy structure */ + if (consume_on_success) { + kheap_free(KHEAP_DATA_BUFFERS, copy->cpy_kdata, copy_size); + zfree(vm_map_copy_zone, copy); + } + } + + return kr; +} + +/* + * Routine: vm_map_copy_insert [internal use only] + * + * Description: + * Link a copy chain ("copy") into a map at the + * specified location (after "where"). + * Side effects: + * The copy chain is destroyed. 
+ */ +static void +vm_map_copy_insert( + vm_map_t map, + vm_map_entry_t after_where, + vm_map_copy_t copy) +{ + vm_map_entry_t entry; + + while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) { + entry = vm_map_copy_first_entry(copy); + vm_map_copy_entry_unlink(copy, entry); + vm_map_store_entry_link(map, after_where, entry, + VM_MAP_KERNEL_FLAGS_NONE); + after_where = entry; + } + zfree(vm_map_copy_zone, copy); +} + +void +vm_map_copy_remap( + vm_map_t map, + vm_map_entry_t where, + vm_map_copy_t copy, + vm_map_offset_t adjustment, + vm_prot_t cur_prot, + vm_prot_t max_prot, + vm_inherit_t inheritance) +{ + vm_map_entry_t copy_entry, new_entry; + + for (copy_entry = vm_map_copy_first_entry(copy); + copy_entry != vm_map_copy_to_entry(copy); + copy_entry = copy_entry->vme_next) { + /* get a new VM map entry for the map */ + new_entry = vm_map_entry_create(map, + !map->hdr.entries_pageable); + /* copy the "copy entry" to the new entry */ + vm_map_entry_copy(map, new_entry, copy_entry); + /* adjust "start" and "end" */ + new_entry->vme_start += adjustment; + new_entry->vme_end += adjustment; + /* clear some attributes */ + new_entry->inheritance = inheritance; + new_entry->protection = cur_prot; + new_entry->max_protection = max_prot; + new_entry->behavior = VM_BEHAVIOR_DEFAULT; + /* take an extra reference on the entry's "object" */ + if (new_entry->is_sub_map) { + assert(!new_entry->use_pmap); /* not nested */ + vm_map_lock(VME_SUBMAP(new_entry)); + vm_map_reference(VME_SUBMAP(new_entry)); + vm_map_unlock(VME_SUBMAP(new_entry)); + } else { + vm_object_reference(VME_OBJECT(new_entry)); + } + /* insert the new entry in the map */ + vm_map_store_entry_link(map, where, new_entry, + VM_MAP_KERNEL_FLAGS_NONE); + /* continue inserting the "copy entries" after the new entry */ + where = new_entry; + } +} + + +/* + * Returns true if *size matches (or is in the range of) copy->size. + * Upon returning true, the *size field is updated with the actual size of the + * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types) + */ +boolean_t +vm_map_copy_validate_size( + vm_map_t dst_map, + vm_map_copy_t copy, + vm_map_size_t *size) +{ + if (copy == VM_MAP_COPY_NULL) { + return FALSE; + } + vm_map_size_t copy_sz = copy->size; + vm_map_size_t sz = *size; + switch (copy->type) { + case VM_MAP_COPY_OBJECT: + case VM_MAP_COPY_KERNEL_BUFFER: + if (sz == copy_sz) { + return TRUE; + } + break; + case VM_MAP_COPY_ENTRY_LIST: + /* + * potential page-size rounding prevents us from exactly + * validating this flavor of vm_map_copy, but we can at least + * assert that it's within a range. + */ + if (copy_sz >= sz && + copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) { + *size = copy_sz; + return TRUE; + } + break; + default: + break; + } + return FALSE; +} + +/* + * Routine: vm_map_copyout_size + * + * Description: + * Copy out a copy chain ("copy") into newly-allocated + * space in the destination map. Uses a prevalidated + * size for the copy object (vm_map_copy_validate_size). + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. 
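For VM_MAP_COPY_ENTRY_LIST copies, vm_map_copy_validate_size accepts any copy size between the requested size and its page-rounded equivalent, since page rounding can legitimately inflate the copy. A stand-alone sketch of that range check (toy_round_page is a local stand-in for the map's page rounding):

#include <stdbool.h>
#include <stdint.h>

/* Round up to a page boundary; mask is page_size - 1 (e.g. 0xfff for 4K). */
static uint64_t
toy_round_page(uint64_t x, uint64_t mask)
{
	return (x + mask) & ~mask;
}

/*
 * Model of the VM_MAP_COPY_ENTRY_LIST case: the copy's size may exceed the
 * requested size by page rounding, but by no more than that.
 */
static bool
entry_list_size_ok(uint64_t copy_sz, uint64_t requested_sz, uint64_t page_mask)
{
	return copy_sz >= requested_sz &&
	       copy_sz <= toy_round_page(requested_sz, page_mask);
}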
+ */ +kern_return_t +vm_map_copyout_size( + vm_map_t dst_map, + vm_map_address_t *dst_addr, /* OUT */ + vm_map_copy_t copy, + vm_map_size_t copy_size) +{ + return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size, + TRUE, /* consume_on_success */ + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); +} + +/* + * Routine: vm_map_copyout + * + * Description: + * Copy out a copy chain ("copy") into newly-allocated + * space in the destination map. + * + * If successful, consumes the copy object. + * Otherwise, the caller is responsible for it. + */ +kern_return_t +vm_map_copyout( + vm_map_t dst_map, + vm_map_address_t *dst_addr, /* OUT */ + vm_map_copy_t copy) +{ + return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0, + TRUE, /* consume_on_success */ + VM_PROT_DEFAULT, + VM_PROT_ALL, + VM_INHERIT_DEFAULT); +} + +kern_return_t +vm_map_copyout_internal( + vm_map_t dst_map, + vm_map_address_t *dst_addr, /* OUT */ + vm_map_copy_t copy, + vm_map_size_t copy_size, + boolean_t consume_on_success, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_inherit_t inheritance) +{ + vm_map_size_t size; + vm_map_size_t adjustment; + vm_map_offset_t start; + vm_object_offset_t vm_copy_start; + vm_map_entry_t last; + vm_map_entry_t entry; + vm_map_entry_t hole_entry; + vm_map_copy_t original_copy; + + /* + * Check for null copy object. + */ + + if (copy == VM_MAP_COPY_NULL) { + *dst_addr = 0; + return KERN_SUCCESS; + } + + /* + * Assert that the vm_map_copy is coming from the right + * zone and hasn't been forged + */ + vm_map_copy_require(copy); + + if (copy->size != copy_size) { + *dst_addr = 0; + return KERN_FAILURE; + } + + /* + * Check for special copy object, created + * by vm_map_copyin_object. + */ + + if (copy->type == VM_MAP_COPY_OBJECT) { + vm_object_t object = copy->cpy_object; + kern_return_t kr; + vm_object_offset_t offset; + + offset = vm_object_trunc_page(copy->offset); + size = vm_map_round_page((copy_size + + (vm_map_size_t)(copy->offset - + offset)), + VM_MAP_PAGE_MASK(dst_map)); + *dst_addr = 0; + kr = vm_map_enter(dst_map, dst_addr, size, + (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE, + VM_MAP_KERNEL_FLAGS_NONE, + VM_KERN_MEMORY_NONE, + object, offset, FALSE, + VM_PROT_DEFAULT, VM_PROT_ALL, + VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) { + return kr; + } + /* Account for non-pagealigned copy object */ + *dst_addr += (vm_map_offset_t)(copy->offset - offset); + if (consume_on_success) { + zfree(vm_map_copy_zone, copy); + } + return KERN_SUCCESS; + } + + /* + * Check for special kernel buffer allocated + * by new_ipc_kmsg_copyin. 
+ */ + + if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) { + return vm_map_copyout_kernel_buffer(dst_map, dst_addr, + copy, copy_size, FALSE, + consume_on_success); + } + + original_copy = copy; + if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) { + kern_return_t kr; + vm_map_copy_t target_copy; + vm_map_offset_t overmap_start, overmap_end, trimmed_start; + + target_copy = VM_MAP_COPY_NULL; + DEBUG4K_ADJUST("adjusting...\n"); + kr = vm_map_copy_adjust_to_target( + copy, + 0, /* offset */ + copy->size, /* size */ + dst_map, + TRUE, /* copy */ + &target_copy, + &overmap_start, + &overmap_end, + &trimmed_start); + if (kr != KERN_SUCCESS) { + DEBUG4K_COPY("adjust failed 0x%x\n", kr); + return kr; + } + DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start); + if (target_copy != copy) { + copy = target_copy; + } + copy_size = copy->size; + } + + /* + * Find space for the data + */ + + vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset, + VM_MAP_COPY_PAGE_MASK(copy)); + size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size, + VM_MAP_COPY_PAGE_MASK(copy)) + - vm_copy_start; + + +StartAgain:; + + vm_map_lock(dst_map); + if (dst_map->disable_vmentry_reuse == TRUE) { + VM_MAP_HIGHEST_ENTRY(dst_map, entry, start); + last = entry; + } else { + if (dst_map->holelistenabled) { + hole_entry = CAST_TO_VM_MAP_ENTRY(dst_map->holes_list); + + if (hole_entry == NULL) { + /* + * No more space in the map? + */ + vm_map_unlock(dst_map); + return KERN_NO_SPACE; + } + + last = hole_entry; + start = last->vme_start; + } else { + assert(first_free_is_valid(dst_map)); + start = ((last = dst_map->first_free) == vm_map_to_entry(dst_map)) ? + vm_map_min(dst_map) : last->vme_end; + } + start = vm_map_round_page(start, + VM_MAP_PAGE_MASK(dst_map)); + } + + while (TRUE) { + vm_map_entry_t next = last->vme_next; + vm_map_offset_t end = start + size; + + if ((end > dst_map->max_offset) || (end < start)) { + if (dst_map->wait_for_space) { + if (size <= (dst_map->max_offset - dst_map->min_offset)) { + assert_wait((event_t) dst_map, + THREAD_INTERRUPTIBLE); + vm_map_unlock(dst_map); + thread_block(THREAD_CONTINUE_NULL); + goto StartAgain; + } + } + vm_map_unlock(dst_map); + return KERN_NO_SPACE; + } + + if (dst_map->holelistenabled) { + if (last->vme_end >= end) { + break; + } + } else { + /* + * If there are no more entries, we must win. + * + * OR + * + * If there is another entry, it must be + * after the end of the potential new region. 
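The find-space loop that follows is essentially a first-fit scan over the destination map's entries (or its hole list). The simplified version below captures only the scan itself over a sorted array of existing mappings; it omits the page rounding, the wait_for_space blocking, and the hole-list variant, and all names are illustrative.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct toy_mapping {
	uint64_t start;
	uint64_t end;   /* exclusive */
};

/*
 * First-fit: walk existing mappings in address order and return the first
 * gap large enough for "size" bytes between min_addr and max_addr.
 */
static bool
find_space(const struct toy_mapping *maps, size_t nmaps,
    uint64_t min_addr, uint64_t max_addr, uint64_t size, uint64_t *out)
{
	uint64_t start = min_addr;

	for (size_t i = 0; i <= nmaps; i++) {
		uint64_t limit = (i < nmaps) ? maps[i].start : max_addr;

		if (limit >= start && limit - start >= size) {
			*out = start;
			return true;
		}
		if (i < nmaps && maps[i].end > start) {
			start = maps[i].end;
		}
	}
	return false;
}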
+ */ + + if (next == vm_map_to_entry(dst_map)) { + break; + } + + if (next->vme_start >= end) { + break; + } + } + + last = next; + + if (dst_map->holelistenabled) { + if (last == CAST_TO_VM_MAP_ENTRY(dst_map->holes_list)) { + /* + * Wrapped around + */ + vm_map_unlock(dst_map); + return KERN_NO_SPACE; + } + start = last->vme_start; + } else { + start = last->vme_end; + } + start = vm_map_round_page(start, + VM_MAP_PAGE_MASK(dst_map)); + } + + if (dst_map->holelistenabled) { + if (vm_map_lookup_entry(dst_map, last->vme_start, &last)) { + panic("Found an existing entry (%p) instead of potential hole at address: 0x%llx.\n", last, (unsigned long long)last->vme_start); + } + } + + + adjustment = start - vm_copy_start; + if (!consume_on_success) { + /* + * We're not allowed to consume "copy", so we'll have to + * copy its map entries into the destination map below. + * No need to re-allocate map entries from the correct + * (pageable or not) zone, since we'll get new map entries + * during the transfer. + * We'll also adjust the map entries's "start" and "end" + * during the transfer, to keep "copy"'s entries consistent + * with its "offset". + */ + goto after_adjustments; + } + + /* + * Since we're going to just drop the map + * entries from the copy into the destination + * map, they must come from the same pool. + */ + + if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) { + /* + * Mismatches occur when dealing with the default + * pager. + */ + zone_t old_zone; + vm_map_entry_t next, new; + + /* + * Find the zone that the copies were allocated from + */ + + entry = vm_map_copy_first_entry(copy); + + /* + * Reinitialize the copy so that vm_map_copy_entry_link + * will work. + */ + vm_map_store_copy_reset(copy, entry); + copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable; + + /* + * Copy each entry. + */ + while (entry != vm_map_copy_to_entry(copy)) { + new = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); + vm_map_entry_copy_full(new, entry); + new->vme_no_copy_on_read = FALSE; + assert(!new->iokit_acct); + if (new->is_sub_map) { + /* clr address space specifics */ + new->use_pmap = FALSE; + } + vm_map_copy_entry_link(copy, + vm_map_copy_last_entry(copy), + new); + next = entry->vme_next; + old_zone = entry->from_reserved_zone ? vm_map_entry_reserved_zone : vm_map_entry_zone; + zfree(old_zone, entry); + entry = next; + } + } + + /* + * Adjust the addresses in the copy chain, and + * reset the region attributes. + */ + + for (entry = vm_map_copy_first_entry(copy); + entry != vm_map_copy_to_entry(copy); + entry = entry->vme_next) { + if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) { + /* + * We're injecting this copy entry into a map that + * has the standard page alignment, so clear + * "map_aligned" (which might have been inherited + * from the original map entry). + */ + entry->map_aligned = FALSE; + } + + entry->vme_start += adjustment; + entry->vme_end += adjustment; + + if (entry->map_aligned) { + assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, + VM_MAP_PAGE_MASK(dst_map))); + assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, + VM_MAP_PAGE_MASK(dst_map))); + } + + entry->inheritance = VM_INHERIT_DEFAULT; + entry->protection = VM_PROT_DEFAULT; + entry->max_protection = VM_PROT_ALL; + entry->behavior = VM_BEHAVIOR_DEFAULT; + + /* + * If the entry is now wired, + * map the pages into the destination map. 
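Once a start address has been found, the copyout path shifts every copy entry by the difference between that start and the page-truncated copy offset, and reports the unaligned address back to the caller. The address math, pulled out on its own (entry_shift, dst_addr and reserved are names chosen here for illustration):

#include <stdint.h>

static uint64_t
toy_trunc_page(uint64_t x, uint64_t mask)
{
	return x & ~mask;
}

static uint64_t
toy_round_page(uint64_t x, uint64_t mask)
{
	return (x + mask) & ~mask;
}

/*
 * Given the copy's (possibly unaligned) offset and size and the page-aligned
 * "start" chosen by the find-space loop, compute the shift applied to every
 * copy entry, the address reported to the caller, and the reserved length.
 */
static void
copyout_adjustment(uint64_t copy_offset, uint64_t copy_size, uint64_t page_mask,
    uint64_t start, uint64_t *entry_shift, uint64_t *dst_addr, uint64_t *reserved)
{
	uint64_t vm_copy_start = toy_trunc_page(copy_offset, page_mask);

	*reserved    = toy_round_page(copy_offset + copy_size, page_mask) - vm_copy_start;
	*entry_shift = start - vm_copy_start;                 /* "adjustment" */
	*dst_addr    = start + (copy_offset - vm_copy_start); /* unaligned result */
}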
+ */ + if (entry->wired_count != 0) { + vm_map_offset_t va; + vm_object_offset_t offset; + vm_object_t object; + vm_prot_t prot; + int type_of_fault; + + /* TODO4K would need to use actual page size */ + assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT); + + object = VME_OBJECT(entry); + offset = VME_OFFSET(entry); + va = entry->vme_start; + + pmap_pageable(dst_map->pmap, + entry->vme_start, + entry->vme_end, + TRUE); + + while (va < entry->vme_end) { + vm_page_t m; + struct vm_object_fault_info fault_info = {}; + + /* + * Look up the page in the object. + * Assert that the page will be found in the + * top object: + * either + * the object was newly created by + * vm_object_copy_slowly, and has + * copies of all of the pages from + * the source object + * or + * the object was moved from the old + * map entry; because the old map + * entry was wired, all of the pages + * were in the top-level object. + * (XXX not true if we wire pages for + * reading) + */ + vm_object_lock(object); + + m = vm_page_lookup(object, offset); + if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) || + m->vmp_absent) { + panic("vm_map_copyout: wiring %p", m); + } + + prot = entry->protection; + + if (override_nx(dst_map, VME_ALIAS(entry)) && + prot) { + prot |= VM_PROT_EXECUTE; + } + + type_of_fault = DBG_CACHE_HIT_FAULT; + + fault_info.user_tag = VME_ALIAS(entry); + fault_info.pmap_options = 0; + if (entry->iokit_acct || + (!entry->is_sub_map && !entry->use_pmap)) { + fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; + } + + vm_fault_enter(m, + dst_map->pmap, + va, + PAGE_SIZE, 0, + prot, + prot, + VM_PAGE_WIRED(m), + FALSE, /* change_wiring */ + VM_KERN_MEMORY_NONE, /* tag - not wiring */ + &fault_info, + NULL, /* need_retry */ + &type_of_fault); + + vm_object_unlock(object); + + offset += PAGE_SIZE_64; + va += PAGE_SIZE; + } + } + } + +after_adjustments: + + /* + * Correct the page alignment for the result + */ + + *dst_addr = start + (copy->offset - vm_copy_start); + +#if KASAN + kasan_notify_address(*dst_addr, size); +#endif + + /* + * Update the hints and the map size + */ + + if (consume_on_success) { + SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy)); + } else { + SAVE_HINT_MAP_WRITE(dst_map, last); + } + + dst_map->size += size; + + /* + * Link in the copy + */ + + if (consume_on_success) { + vm_map_copy_insert(dst_map, last, copy); + if (copy != original_copy) { + vm_map_copy_discard(original_copy); + original_copy = VM_MAP_COPY_NULL; + } + } else { + vm_map_copy_remap(dst_map, last, copy, adjustment, + cur_protection, max_protection, + inheritance); + if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) { + vm_map_copy_discard(copy); + copy = original_copy; + } + } + + + vm_map_unlock(dst_map); + + /* + * XXX If wiring_required, call vm_map_pageable + */ + + return KERN_SUCCESS; +} + +/* + * Routine: vm_map_copyin + * + * Description: + * see vm_map_copyin_common. Exported via Unsupported.exports. + * + */ + +#undef vm_map_copyin + +kern_return_t +vm_map_copyin( + vm_map_t src_map, + vm_map_address_t src_addr, + vm_map_size_t len, + boolean_t src_destroy, + vm_map_copy_t *copy_result) /* OUT */ +{ + return vm_map_copyin_common(src_map, src_addr, len, src_destroy, + FALSE, copy_result, FALSE); +} + +/* + * Routine: vm_map_copyin_common + * + * Description: + * Copy the specified region (src_addr, len) from the + * source address space (src_map), possibly removing + * the region from the source address space (src_destroy). 
+ * + * Returns: + * A vm_map_copy_t object (copy_result), suitable for + * insertion into another address space (using vm_map_copyout), + * copying over another address space region (using + * vm_map_copy_overwrite). If the copy is unused, it + * should be destroyed (using vm_map_copy_discard). + * + * In/out conditions: + * The source map should not be locked on entry. + */ + +typedef struct submap_map { + vm_map_t parent_map; + vm_map_offset_t base_start; + vm_map_offset_t base_end; + vm_map_size_t base_len; + struct submap_map *next; +} submap_map_t; + +kern_return_t +vm_map_copyin_common( + vm_map_t src_map, + vm_map_address_t src_addr, + vm_map_size_t len, + boolean_t src_destroy, + __unused boolean_t src_volatile, + vm_map_copy_t *copy_result, /* OUT */ + boolean_t use_maxprot) +{ + int flags; + + flags = 0; + if (src_destroy) { + flags |= VM_MAP_COPYIN_SRC_DESTROY; + } + if (use_maxprot) { + flags |= VM_MAP_COPYIN_USE_MAXPROT; + } + return vm_map_copyin_internal(src_map, + src_addr, + len, + flags, + copy_result); +} +kern_return_t +vm_map_copyin_internal( + vm_map_t src_map, + vm_map_address_t src_addr, + vm_map_size_t len, + int flags, + vm_map_copy_t *copy_result) /* OUT */ +{ + vm_map_entry_t tmp_entry; /* Result of last map lookup -- + * in multi-level lookup, this + * entry contains the actual + * vm_object/offset. + */ + vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */ + + vm_map_offset_t src_start; /* Start of current entry -- + * where copy is taking place now + */ + vm_map_offset_t src_end; /* End of entire region to be + * copied */ + vm_map_offset_t src_base; + vm_map_t base_map = src_map; + boolean_t map_share = FALSE; + submap_map_t *parent_maps = NULL; + + vm_map_copy_t copy; /* Resulting copy */ + vm_map_address_t copy_addr; + vm_map_size_t copy_size; + boolean_t src_destroy; + boolean_t use_maxprot; + boolean_t preserve_purgeable; + boolean_t entry_was_shared; + vm_map_entry_t saved_src_entry; + + if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) { + return KERN_INVALID_ARGUMENT; + } + + src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE; + use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE; + preserve_purgeable = + (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE; + + /* + * Check for copies of zero bytes. + */ + + if (len == 0) { + *copy_result = VM_MAP_COPY_NULL; + return KERN_SUCCESS; + } + + /* + * Check that the end address doesn't overflow + */ + src_end = src_addr + len; + if (src_end < src_addr) { + return KERN_INVALID_ADDRESS; + } + + /* + * Compute (page aligned) start and end of region + */ + src_start = vm_map_trunc_page(src_addr, + VM_MAP_PAGE_MASK(src_map)); + src_end = vm_map_round_page(src_end, + VM_MAP_PAGE_MASK(src_map)); + + /* + * If the copy is sufficiently small, use a kernel buffer instead + * of making a virtual copy. The theory being that the cost of + * setting up VM (and taking C-O-W faults) dominates the copy costs + * for small regions. + */ + if ((len < msg_ool_size_small) && + !use_maxprot && + !preserve_purgeable && + !(flags & VM_MAP_COPYIN_ENTRY_LIST) && + /* + * Since the "msg_ool_size_small" threshold was increased and + * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the + * address space limits, we revert to doing a virtual copy if the + * copied range goes beyond those limits. Otherwise, mach_vm_read() + * of the commpage would now fail when it used to work. 
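Before walking any entries, the copyin path rejects a length whose end address wraps around and then widens the requested range to the source map's page boundaries (a zero length is handled even earlier and yields a null copy). A stand-alone sketch of those entry checks, with the page mask passed in explicitly:

#include <stdbool.h>
#include <stdint.h>

/*
 * Reject an overflowing [addr, addr + len) range, then widen it to the
 * map's page boundaries before walking map entries.
 */
static bool
copyin_bounds(uint64_t src_addr, uint64_t len, uint64_t page_mask,
    uint64_t *src_start, uint64_t *src_end)
{
	uint64_t end = src_addr + len;

	if (end < src_addr) {           /* wrapped around: overflow */
		return false;
	}
	*src_start = src_addr & ~page_mask;          /* trunc to page */
	*src_end   = (end + page_mask) & ~page_mask; /* round to page */
	return true;
}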
+ */ + (src_start >= vm_map_min(src_map) && + src_start < vm_map_max(src_map) && + src_end >= vm_map_min(src_map) && + src_end < vm_map_max(src_map))) { + return vm_map_copyin_kernel_buffer(src_map, src_addr, len, + src_destroy, copy_result); + } + + /* + * Allocate a header element for the list. + * + * Use the start and end in the header to + * remember the endpoints prior to rounding. + */ + + copy = vm_map_copy_allocate(); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->cpy_hdr.entries_pageable = TRUE; + copy->cpy_hdr.page_shift = VM_MAP_PAGE_SHIFT(src_map); + + vm_map_store_init( &(copy->cpy_hdr)); + + copy->offset = src_addr; + copy->size = len; + + new_entry = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); + +#define RETURN(x) \ + MACRO_BEGIN \ + vm_map_unlock(src_map); \ + if(src_map != base_map) \ + vm_map_deallocate(src_map); \ + if (new_entry != VM_MAP_ENTRY_NULL) \ + vm_map_copy_entry_dispose(copy,new_entry); \ + vm_map_copy_discard(copy); \ + { \ + submap_map_t *_ptr; \ + \ + for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \ + parent_maps=parent_maps->next; \ + if (_ptr->parent_map != base_map) \ + vm_map_deallocate(_ptr->parent_map); \ + kfree(_ptr, sizeof(submap_map_t)); \ + } \ + } \ + MACRO_RETURN(x); \ + MACRO_END + + /* + * Find the beginning of the region. + */ + + vm_map_lock(src_map); + + /* + * Lookup the original "src_addr" rather than the truncated + * "src_start", in case "src_start" falls in a non-map-aligned + * map entry *before* the map entry that contains "src_addr"... + */ + if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) { + RETURN(KERN_INVALID_ADDRESS); + } + if (!tmp_entry->is_sub_map) { + /* + * ... but clip to the map-rounded "src_start" rather than + * "src_addr" to preserve map-alignment. We'll adjust the + * first copy entry at the end, if needed. + */ + vm_map_clip_start(src_map, tmp_entry, src_start); + } + if (src_start < tmp_entry->vme_start) { + /* + * Move "src_start" up to the start of the + * first map entry to copy. + */ + src_start = tmp_entry->vme_start; + } + /* set for later submap fix-up */ + copy_addr = src_start; + + /* + * Go through entries until we get to the end. + */ + + while (TRUE) { + vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */ + vm_map_size_t src_size; /* Size of source + * map entry (in both + * maps) + */ + + vm_object_t src_object; /* Object to copy */ + vm_object_offset_t src_offset; + + boolean_t src_needs_copy; /* Should source map + * be made read-only + * for copy-on-write? + */ + + boolean_t new_entry_needs_copy; /* Will new entry be COW? */ + + boolean_t was_wired; /* Was source wired? */ + vm_map_version_t version; /* Version before locks + * dropped to make copy + */ + kern_return_t result; /* Return value from + * copy_strategically. 
+ */ + while (tmp_entry->is_sub_map) { + vm_map_size_t submap_len; + submap_map_t *ptr; + + ptr = (submap_map_t *)kalloc(sizeof(submap_map_t)); + ptr->next = parent_maps; + parent_maps = ptr; + ptr->parent_map = src_map; + ptr->base_start = src_start; + ptr->base_end = src_end; + submap_len = tmp_entry->vme_end - src_start; + if (submap_len > (src_end - src_start)) { + submap_len = src_end - src_start; + } + ptr->base_len = submap_len; + + src_start -= tmp_entry->vme_start; + src_start += VME_OFFSET(tmp_entry); + src_end = src_start + submap_len; + src_map = VME_SUBMAP(tmp_entry); + vm_map_lock(src_map); + /* keep an outstanding reference for all maps in */ + /* the parents tree except the base map */ + vm_map_reference(src_map); + vm_map_unlock(ptr->parent_map); + if (!vm_map_lookup_entry( + src_map, src_start, &tmp_entry)) { + RETURN(KERN_INVALID_ADDRESS); + } + map_share = TRUE; + if (!tmp_entry->is_sub_map) { + vm_map_clip_start(src_map, tmp_entry, src_start); + } + src_entry = tmp_entry; + } + /* we are now in the lowest level submap... */ + + if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) && + (VME_OBJECT(tmp_entry)->phys_contiguous)) { + /* This is not, supported for now.In future */ + /* we will need to detect the phys_contig */ + /* condition and then upgrade copy_slowly */ + /* to do physical copy from the device mem */ + /* based object. We can piggy-back off of */ + /* the was wired boolean to set-up the */ + /* proper handling */ + RETURN(KERN_PROTECTION_FAILURE); + } + /* + * Create a new address map entry to hold the result. + * Fill in the fields from the appropriate source entries. + * We must unlock the source map to do this if we need + * to allocate a map entry. + */ + if (new_entry == VM_MAP_ENTRY_NULL) { + version.main_timestamp = src_map->timestamp; + vm_map_unlock(src_map); + + new_entry = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); + + vm_map_lock(src_map); + if ((version.main_timestamp + 1) != src_map->timestamp) { + if (!vm_map_lookup_entry(src_map, src_start, + &tmp_entry)) { + RETURN(KERN_INVALID_ADDRESS); + } + if (!tmp_entry->is_sub_map) { + vm_map_clip_start(src_map, tmp_entry, src_start); + } + continue; /* restart w/ new tmp_entry */ + } + } + + /* + * Verify that the region can be read. + */ + if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE && + !use_maxprot) || + (src_entry->max_protection & VM_PROT_READ) == 0) { + RETURN(KERN_PROTECTION_FAILURE); + } + + /* + * Clip against the endpoints of the entire region. + */ + + vm_map_clip_end(src_map, src_entry, src_end); + + src_size = src_entry->vme_end - src_start; + src_object = VME_OBJECT(src_entry); + src_offset = VME_OFFSET(src_entry); + was_wired = (src_entry->wired_count != 0); + + vm_map_entry_copy(src_map, new_entry, src_entry); + if (new_entry->is_sub_map) { + /* clr address space specifics */ + new_entry->use_pmap = FALSE; + } else { + /* + * We're dealing with a copy-on-write operation, + * so the resulting mapping should not inherit the + * original mapping's accounting settings. + * "iokit_acct" should have been cleared in + * vm_map_entry_copy(). + * "use_pmap" should be reset to its default (TRUE) + * so that the new mapping gets accounted for in + * the task's memory footprint. + */ + assert(!new_entry->iokit_acct); + new_entry->use_pmap = TRUE; + } + + /* + * Attempt non-blocking copy-on-write optimizations. + */ + + /* + * If we are destroying the source, and the object + * is internal, we could move the object reference + * from the source to the copy. 
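While descending through submaps, the loop above pushes a submap_map_t frame (parent map, base start/end/length) onto the singly linked parent_maps list and pops it again once that submap's range has been copied. A simplified user-space model of that bookkeeping, using malloc/free in place of kalloc/kfree and a void pointer in place of vm_map_t:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Simplified stand-in for submap_map_t. */
struct parent_frame {
	void     *parent_map;   /* stands in for vm_map_t */
	uint64_t  base_start;
	uint64_t  base_end;
	uint64_t  base_len;
	struct parent_frame *next;
};

static bool
push_parent(struct parent_frame **stack, void *parent_map,
    uint64_t base_start, uint64_t base_end, uint64_t base_len)
{
	struct parent_frame *f = malloc(sizeof(*f));

	if (f == NULL) {
		return false;   /* old stack left untouched */
	}
	f->parent_map = parent_map;
	f->base_start = base_start;
	f->base_end   = base_end;
	f->base_len   = base_len;
	f->next       = *stack;
	*stack = f;
	return true;
}

static void
pop_parent(struct parent_frame **stack)
{
	struct parent_frame *top = *stack;

	if (top != NULL) {
		*stack = top->next;
		free(top);
	}
}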
The copy is + * copy-on-write only if the source is. + * We make another reference to the object, because + * destroying the source entry will deallocate it. + * + * This memory transfer has to be atomic, (to prevent + * the VM object from being shared or copied while + * it's being moved here), so we could only do this + * if we won't have to unlock the VM map until the + * original mapping has been fully removed. + */ + +RestartCopy: + if ((src_object == VM_OBJECT_NULL || + (!was_wired && !map_share && !tmp_entry->is_shared + && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) && + vm_object_copy_quickly( + VME_OBJECT_PTR(new_entry), + src_offset, + src_size, + &src_needs_copy, + &new_entry_needs_copy)) { + new_entry->needs_copy = new_entry_needs_copy; + + /* + * Handle copy-on-write obligations + */ + + if (src_needs_copy && !tmp_entry->needs_copy) { + vm_prot_t prot; + + prot = src_entry->protection & ~VM_PROT_WRITE; + + if (override_nx(src_map, VME_ALIAS(src_entry)) + && prot) { + prot |= VM_PROT_EXECUTE; + } + + vm_object_pmap_protect( + src_object, + src_offset, + src_size, + (src_entry->is_shared ? + PMAP_NULL + : src_map->pmap), + VM_MAP_PAGE_SIZE(src_map), + src_entry->vme_start, + prot); + + assert(tmp_entry->wired_count == 0); + tmp_entry->needs_copy = TRUE; + } + + /* + * The map has never been unlocked, so it's safe + * to move to the next entry rather than doing + * another lookup. + */ + + goto CopySuccessful; + } + + entry_was_shared = tmp_entry->is_shared; + + /* + * Take an object reference, so that we may + * release the map lock(s). + */ + + assert(src_object != VM_OBJECT_NULL); + vm_object_reference(src_object); + + /* + * Record the timestamp for later verification. + * Unlock the map. + */ + + version.main_timestamp = src_map->timestamp; + vm_map_unlock(src_map); /* Increments timestamp once! 
*/ + saved_src_entry = src_entry; + tmp_entry = VM_MAP_ENTRY_NULL; + src_entry = VM_MAP_ENTRY_NULL; + + /* + * Perform the copy + */ + + if (was_wired || + (debug4k_no_cow_copyin && + VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) { +CopySlowly: + vm_object_lock(src_object); + result = vm_object_copy_slowly( + src_object, + src_offset, + src_size, + THREAD_UNINT, + VME_OBJECT_PTR(new_entry)); + VME_OFFSET_SET(new_entry, + src_offset - vm_object_trunc_page(src_offset)); + new_entry->needs_copy = FALSE; + } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && + (entry_was_shared || map_share)) { + vm_object_t new_object; + + vm_object_lock_shared(src_object); + new_object = vm_object_copy_delayed( + src_object, + src_offset, + src_size, + TRUE); + if (new_object == VM_OBJECT_NULL) { + goto CopySlowly; + } + + VME_OBJECT_SET(new_entry, new_object); + assert(new_entry->wired_count == 0); + new_entry->needs_copy = TRUE; + assert(!new_entry->iokit_acct); + assert(new_object->purgable == VM_PURGABLE_DENY); + assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry); + result = KERN_SUCCESS; + } else { + vm_object_offset_t new_offset; + new_offset = VME_OFFSET(new_entry); + result = vm_object_copy_strategically(src_object, + src_offset, + src_size, + VME_OBJECT_PTR(new_entry), + &new_offset, + &new_entry_needs_copy); + if (new_offset != VME_OFFSET(new_entry)) { + VME_OFFSET_SET(new_entry, new_offset); + } + + new_entry->needs_copy = new_entry_needs_copy; + } + + if (result == KERN_SUCCESS && + ((preserve_purgeable && + src_object->purgable != VM_PURGABLE_DENY) || + new_entry->used_for_jit)) { + /* + * Purgeable objects should be COPY_NONE, true share; + * this should be propogated to the copy. + * + * Also force mappings the pmap specially protects to + * be COPY_NONE; trying to COW these mappings would + * change the effective protections, which could have + * side effects if the pmap layer relies on the + * specified protections. + */ + + vm_object_t new_object; + + new_object = VME_OBJECT(new_entry); + assert(new_object != src_object); + vm_object_lock(new_object); + assert(new_object->ref_count == 1); + assert(new_object->shadow == VM_OBJECT_NULL); + assert(new_object->copy == VM_OBJECT_NULL); + assert(new_object->vo_owner == NULL); + + new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; + + if (preserve_purgeable && + src_object->purgable != VM_PURGABLE_DENY) { + new_object->true_share = TRUE; + + /* start as non-volatile with no owner... */ + new_object->purgable = VM_PURGABLE_NONVOLATILE; + vm_purgeable_nonvolatile_enqueue(new_object, NULL); + /* ... and move to src_object's purgeable state */ + if (src_object->purgable != VM_PURGABLE_NONVOLATILE) { + int state; + state = src_object->purgable; + vm_object_purgable_control( + new_object, + VM_PURGABLE_SET_STATE_FROM_KERNEL, + &state); + } + /* no pmap accounting for purgeable objects */ + new_entry->use_pmap = FALSE; + } + + vm_object_unlock(new_object); + new_object = VM_OBJECT_NULL; + } + + if (result != KERN_SUCCESS && + result != KERN_MEMORY_RESTART_COPY) { + vm_map_lock(src_map); + RETURN(result); + } + + /* + * Throw away the extra reference + */ + + vm_object_deallocate(src_object); + + /* + * Verify that the map has not substantially + * changed while the copy was being made. 
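The branches above pick one of three copy mechanisms: a slow physical copy for wired sources (or when the 4K-debug knob forbids COW copyin), a delayed symmetric copy-on-write copy for shared sources, and a "strategic" copy otherwise, with the delayed path falling back to the slow one if it cannot be set up. A rough model of that selection, with illustrative names (force_physical stands in for the debug4k_no_cow_copyin condition):

#include <stdbool.h>

enum toy_copy_method {
	TOY_COPY_SLOWLY,         /* physical copy, page by page            */
	TOY_COPY_DELAYED,        /* symmetric copy-on-write, shared source */
	TOY_COPY_STRATEGICALLY,  /* let the object pick its own strategy   */
};

static enum toy_copy_method
choose_copy_method(bool was_wired, bool force_physical,
    bool copy_symmetric, bool entry_was_shared, bool map_share)
{
	if (was_wired || force_physical) {
		return TOY_COPY_SLOWLY;
	}
	if (copy_symmetric && (entry_was_shared || map_share)) {
		return TOY_COPY_DELAYED;
	}
	return TOY_COPY_STRATEGICALLY;
}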
+ */ + + vm_map_lock(src_map); + + if ((version.main_timestamp + 1) == src_map->timestamp) { + /* src_map hasn't changed: src_entry is still valid */ + src_entry = saved_src_entry; + goto VerificationSuccessful; + } + + /* + * Simple version comparison failed. + * + * Retry the lookup and verify that the + * same object/offset are still present. + * + * [Note: a memory manager that colludes with + * the calling task can detect that we have + * cheated. While the map was unlocked, the + * mapping could have been changed and restored.] + */ + + if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { + if (result != KERN_MEMORY_RESTART_COPY) { + vm_object_deallocate(VME_OBJECT(new_entry)); + VME_OBJECT_SET(new_entry, VM_OBJECT_NULL); + /* reset accounting state */ + new_entry->iokit_acct = FALSE; + new_entry->use_pmap = TRUE; + } + RETURN(KERN_INVALID_ADDRESS); + } + + src_entry = tmp_entry; + vm_map_clip_start(src_map, src_entry, src_start); + + if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) && + !use_maxprot) || + ((src_entry->max_protection & VM_PROT_READ) == 0)) { + goto VerificationFailed; + } + + if (src_entry->vme_end < new_entry->vme_end) { + /* + * This entry might have been shortened + * (vm_map_clip_end) or been replaced with + * an entry that ends closer to "src_start" + * than before. + * Adjust "new_entry" accordingly; copying + * less memory would be correct but we also + * redo the copy (see below) if the new entry + * no longer points at the same object/offset. + */ + assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end, + VM_MAP_COPY_PAGE_MASK(copy))); + new_entry->vme_end = src_entry->vme_end; + src_size = new_entry->vme_end - src_start; + } else if (src_entry->vme_end > new_entry->vme_end) { + /* + * This entry might have been extended + * (vm_map_entry_simplify() or coalesce) + * or been replaced with an entry that ends farther + * from "src_start" than before. + * + * We've called vm_object_copy_*() only on + * the previous range, so we can't + * just extend new_entry. We have to re-do + * the copy based on the new entry as if it was + * pointing at a different object/offset (see + * "Verification failed" below). + */ + } + + if ((VME_OBJECT(src_entry) != src_object) || + (VME_OFFSET(src_entry) != src_offset) || + (src_entry->vme_end > new_entry->vme_end)) { + /* + * Verification failed. + * + * Start over with this top-level entry. + */ + +VerificationFailed: ; + + vm_object_deallocate(VME_OBJECT(new_entry)); + tmp_entry = src_entry; + continue; + } + + /* + * Verification succeeded. + */ + +VerificationSuccessful:; + + if (result == KERN_MEMORY_RESTART_COPY) { + goto RestartCopy; + } + + /* + * Copy succeeded. + */ + +CopySuccessful: ; + + /* + * Link in the new copy entry. + */ + + vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), + new_entry); + + /* + * Determine whether the entire region + * has been copied. 
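The verification above relies on the map timestamp: the caller records it, drops the lock (which itself bumps the counter once), does the slow copy work, re-takes the lock, and trusts its cached entry only if nobody else bumped the counter in between. The pattern in isolation, with a toy struct standing in for the real map:

#include <stdbool.h>
#include <stdint.h>

/* Toy stand-in for the map's modification timestamp. */
struct toy_map {
	uint64_t timestamp;   /* bumped by every unlock / modification */
};

/*
 * True only if nothing else modified the map while we had it unlocked;
 * the +1 accounts for our own unlock having bumped the counter.
 */
static bool
map_unchanged_since(const struct toy_map *map, uint64_t saved_timestamp)
{
	return saved_timestamp + 1 == map->timestamp;
}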
+ */
+ src_base = src_start;
+ src_start = new_entry->vme_end;
+ new_entry = VM_MAP_ENTRY_NULL;
+ while ((src_start >= src_end) && (src_end != 0)) {
+ submap_map_t *ptr;
+
+ if (src_map == base_map) {
+ /* back to the top */
+ break;
+ }
+
+ ptr = parent_maps;
+ assert(ptr != NULL);
+ parent_maps = parent_maps->next;
+
+ /* fix up the damage we did in that submap */
+ vm_map_simplify_range(src_map,
+ src_base,
+ src_end);
+
+ vm_map_unlock(src_map);
+ vm_map_deallocate(src_map);
+ vm_map_lock(ptr->parent_map);
+ src_map = ptr->parent_map;
+ src_base = ptr->base_start;
+ src_start = ptr->base_start + ptr->base_len;
+ src_end = ptr->base_end;
+ if (!vm_map_lookup_entry(src_map,
+ src_start,
+ &tmp_entry) &&
+ (src_end > src_start)) {
+ RETURN(KERN_INVALID_ADDRESS);
+ }
+ kfree(ptr, sizeof(submap_map_t));
+ if (parent_maps == NULL) {
+ map_share = FALSE;
+ }
+ src_entry = tmp_entry->vme_prev;
+ }
+
+ if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
+ (src_start >= src_addr + len) &&
+ (src_addr + len != 0)) {
+ /*
+ * Stop copying now, even though we haven't reached
+ * "src_end". We'll adjust the end of the last copy
+ * entry at the end, if needed.
+ *
+ * If src_map's alignment is different from the
+ * system's page-alignment, there could be
+ * extra non-map-aligned map entries between
+ * the original (non-rounded) "src_addr + len"
+ * and the rounded "src_end".
+ * We do not want to copy those map entries since
+ * they're not part of the copied range.
+ */
+ break;
+ }
+
+ if ((src_start >= src_end) && (src_end != 0)) {
+ break;
+ }
+
+ /*
+ * Verify that there are no gaps in the region
+ */
+
+ tmp_entry = src_entry->vme_next;
+ if ((tmp_entry->vme_start != src_start) ||
+ (tmp_entry == vm_map_to_entry(src_map))) {
+ RETURN(KERN_INVALID_ADDRESS);
+ }
+ }
+
+ /*
+ * If the source should be destroyed, do it now, since the
+ * copy was successful.
+ */
+ if (src_destroy) {
+ (void) vm_map_delete(
+ src_map,
+ vm_map_trunc_page(src_addr,
+ VM_MAP_PAGE_MASK(src_map)),
+ src_end,
+ ((src_map == kernel_map) ?
+ VM_MAP_REMOVE_KUNWIRE :
+ VM_MAP_REMOVE_NO_FLAGS),
+ VM_MAP_NULL);
+ } else {
+ /* fix up the damage we did in the base map */
+ vm_map_simplify_range(
+ src_map,
+ vm_map_trunc_page(src_addr,
+ VM_MAP_PAGE_MASK(src_map)),
+ vm_map_round_page(src_end,
+ VM_MAP_PAGE_MASK(src_map)));
+ }
+
+ vm_map_unlock(src_map);
+ tmp_entry = VM_MAP_ENTRY_NULL;
+
+ if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
+ VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
+ vm_map_offset_t original_start, original_offset, original_end;
+
+ assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
+
+ /* adjust alignment of first copy_entry's "vme_start" */
+ tmp_entry = vm_map_copy_first_entry(copy);
+ if (tmp_entry != vm_map_copy_to_entry(copy)) {
+ vm_map_offset_t adjustment;
+
+ original_start = tmp_entry->vme_start;
+ original_offset = VME_OFFSET(tmp_entry);
+
+ /* map-align the start of the first copy entry... */
+ adjustment = (tmp_entry->vme_start -
+ vm_map_trunc_page(
+ tmp_entry->vme_start,
+ VM_MAP_PAGE_MASK(src_map)));
+ tmp_entry->vme_start -= adjustment;
+ VME_OFFSET_SET(tmp_entry,
+ VME_OFFSET(tmp_entry) - adjustment);
+ copy_addr -= adjustment;
+ assert(tmp_entry->vme_start < tmp_entry->vme_end);
+ /* ... 
adjust for mis-aligned start of copy range */
+ adjustment =
+ (vm_map_trunc_page(copy->offset,
+ PAGE_MASK) -
+ vm_map_trunc_page(copy->offset,
+ VM_MAP_PAGE_MASK(src_map)));
+ if (adjustment) {
+ assert(page_aligned(adjustment));
+ assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
+ tmp_entry->vme_start += adjustment;
+ VME_OFFSET_SET(tmp_entry,
+ (VME_OFFSET(tmp_entry) +
+ adjustment));
+ copy_addr += adjustment;
+ assert(tmp_entry->vme_start < tmp_entry->vme_end);
+ }
+
+ /*
+ * Assert that the adjustments haven't exposed
+ * more than was originally copied...
+ */
+ assert(tmp_entry->vme_start >= original_start);
+ assert(VME_OFFSET(tmp_entry) >= original_offset);
+ /*
+ * ... and that it did not adjust outside of
+ * a single 16K page.
+ */
+ assert(vm_map_trunc_page(tmp_entry->vme_start,
+ VM_MAP_PAGE_MASK(src_map)) ==
+ vm_map_trunc_page(original_start,
+ VM_MAP_PAGE_MASK(src_map)));
+ }
+
+ /* adjust alignment of last copy_entry's "vme_end" */
+ tmp_entry = vm_map_copy_last_entry(copy);
+ if (tmp_entry != vm_map_copy_to_entry(copy)) {
+ vm_map_offset_t adjustment;
+
+ original_end = tmp_entry->vme_end;
+
+ /* map-align the end of the last copy entry... */
+ tmp_entry->vme_end =
+ vm_map_round_page(tmp_entry->vme_end,
+ VM_MAP_PAGE_MASK(src_map));
+ /* ... adjust for mis-aligned end of copy range */
+ adjustment =
+ (vm_map_round_page((copy->offset +
+ copy->size),
+ VM_MAP_PAGE_MASK(src_map)) -
+ vm_map_round_page((copy->offset +
+ copy->size),
+ PAGE_MASK));
+ if (adjustment) {
+ assert(page_aligned(adjustment));
+ assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
+ tmp_entry->vme_end -= adjustment;
+ assert(tmp_entry->vme_start < tmp_entry->vme_end);
+ }
+
+ /*
+ * Assert that the adjustments haven't exposed
+ * more than was originally copied...
+ */
+ assert(tmp_entry->vme_end <= original_end);
+ /*
+ * ... and that it did not adjust outside of
+ * a single 16K page.
+ */
+ assert(vm_map_round_page(tmp_entry->vme_end,
+ VM_MAP_PAGE_MASK(src_map)) ==
+ vm_map_round_page(original_end,
+ VM_MAP_PAGE_MASK(src_map)));
+ }
+ }
+
+ /* Fix-up start and end points in copy. This is necessary */
+ /* when the various entries in the copy object were picked */
+ /* up from different sub-maps */
+
+ tmp_entry = vm_map_copy_first_entry(copy);
+ copy_size = 0; /* compute actual size */
+ while (tmp_entry != vm_map_copy_to_entry(copy)) {
+ assert(VM_MAP_PAGE_ALIGNED(
+ copy_addr + (tmp_entry->vme_end -
+ tmp_entry->vme_start),
+ MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
+ assert(VM_MAP_PAGE_ALIGNED(
+ copy_addr,
+ MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
+
+ /*
+ * The copy_entries will be injected directly into the
+ * destination map and might not be "map aligned" there...
+ */
+ tmp_entry->map_aligned = FALSE;
+
+ tmp_entry->vme_end = copy_addr +
+ (tmp_entry->vme_end - tmp_entry->vme_start);
+ tmp_entry->vme_start = copy_addr;
+ assert(tmp_entry->vme_start < tmp_entry->vme_end);
+ copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
+ copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
+ tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
+ }
+
+ if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
+ copy_size < copy->size) {
+ /*
+ * The actual size of the VM map copy is smaller than what
+ * was requested by the caller. This must be because some
+ * PAGE_SIZE-sized pages are missing at the end of the last
+ * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range. 
+ * The caller might not have been aware of those missing + * pages and might not want to be aware of it, which is + * fine as long as they don't try to access (and crash on) + * those missing pages. + * Let's adjust the size of the "copy", to avoid failing + * in vm_map_copyout() or vm_map_copy_overwrite(). + */ + assert(vm_map_round_page(copy_size, + VM_MAP_PAGE_MASK(src_map)) == + vm_map_round_page(copy->size, + VM_MAP_PAGE_MASK(src_map))); + copy->size = copy_size; + } + + *copy_result = copy; + return KERN_SUCCESS; + +#undef RETURN +} + +kern_return_t +vm_map_copy_extract( + vm_map_t src_map, + vm_map_address_t src_addr, + vm_map_size_t len, + boolean_t do_copy, + vm_map_copy_t *copy_result, /* OUT */ + vm_prot_t *cur_prot, /* IN/OUT */ + vm_prot_t *max_prot, /* IN/OUT */ + vm_inherit_t inheritance, + vm_map_kernel_flags_t vmk_flags) +{ + vm_map_copy_t copy; + kern_return_t kr; + vm_prot_t required_cur_prot, required_max_prot; + + /* + * Check for copies of zero bytes. + */ + + if (len == 0) { + *copy_result = VM_MAP_COPY_NULL; + return KERN_SUCCESS; + } + + /* + * Check that the end address doesn't overflow + */ + if (src_addr + len < src_addr) { + return KERN_INVALID_ADDRESS; + } + + if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) { + DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len)); + } + + required_cur_prot = *cur_prot; + required_max_prot = *max_prot; + + /* + * Allocate a header element for the list. + * + * Use the start and end in the header to + * remember the endpoints prior to rounding. + */ + + copy = vm_map_copy_allocate(); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable; + + vm_map_store_init(©->cpy_hdr); + + copy->offset = 0; + copy->size = len; + + kr = vm_map_remap_extract(src_map, + src_addr, + len, + do_copy, /* copy */ + ©->cpy_hdr, + cur_prot, /* IN/OUT */ + max_prot, /* IN/OUT */ + inheritance, + vmk_flags); + if (kr != KERN_SUCCESS) { + vm_map_copy_discard(copy); + return kr; + } + if (required_cur_prot != VM_PROT_NONE) { + assert((*cur_prot & required_cur_prot) == required_cur_prot); + assert((*max_prot & required_max_prot) == required_max_prot); + } + + *copy_result = copy; + return KERN_SUCCESS; +} + +/* + * vm_map_copyin_object: + * + * Create a copy object from an object. + * Our caller donates an object reference. + */ + +kern_return_t +vm_map_copyin_object( + vm_object_t object, + vm_object_offset_t offset, /* offset of region in object */ + vm_object_size_t size, /* size of region in object */ + vm_map_copy_t *copy_result) /* OUT */ +{ + vm_map_copy_t copy; /* Resulting copy */ + + /* + * We drop the object into a special copy object + * that contains the object directly. + */ + + copy = vm_map_copy_allocate(); + copy->type = VM_MAP_COPY_OBJECT; + copy->cpy_object = object; + copy->offset = offset; + copy->size = size; + + *copy_result = copy; + return KERN_SUCCESS; +} + +static void +vm_map_fork_share( + vm_map_t old_map, + vm_map_entry_t old_entry, + vm_map_t new_map) +{ + vm_object_t object; + vm_map_entry_t new_entry; + + /* + * New sharing code. New map entry + * references original object. Internal + * objects use asynchronous copy algorithm for + * future copies. First make sure we have + * the right object. If we need a shadow, + * or someone else already has one, then + * make a new shadow and share it. 
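+ * (The three cases that require a new shadow are spelled
+ * out in detail below.)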
+ */
+
+ object = VME_OBJECT(old_entry);
+ if (old_entry->is_sub_map) {
+ assert(old_entry->wired_count == 0);
+#ifndef NO_NESTED_PMAP
+ if (old_entry->use_pmap) {
+ kern_return_t result;
+
+ result = pmap_nest(new_map->pmap,
+ (VME_SUBMAP(old_entry))->pmap,
+ (addr64_t)old_entry->vme_start,
+ (uint64_t)(old_entry->vme_end - old_entry->vme_start));
+ if (result) {
+ panic("vm_map_fork_share: pmap_nest failed!");
+ }
+ }
+#endif /* NO_NESTED_PMAP */
+ } else if (object == VM_OBJECT_NULL) {
+ object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
+ old_entry->vme_start));
+ VME_OFFSET_SET(old_entry, 0);
+ VME_OBJECT_SET(old_entry, object);
+ old_entry->use_pmap = TRUE;
+// assert(!old_entry->needs_copy);
+ } else if (object->copy_strategy !=
+ MEMORY_OBJECT_COPY_SYMMETRIC) {
+ /*
+ * We are already using an asymmetric
+ * copy, and therefore we already have
+ * the right object.
+ */
+
+ assert(!old_entry->needs_copy);
+ } else if (old_entry->needs_copy || /* case 1 */
+ object->shadowed || /* case 2 */
+ (!object->true_share && /* case 3 */
+ !old_entry->is_shared &&
+ (object->vo_size >
+ (vm_map_size_t)(old_entry->vme_end -
+ old_entry->vme_start)))) {
+ /*
+ * We need to create a shadow.
+ * There are three cases here.
+ * In the first case, we need to
+ * complete a deferred symmetrical
+ * copy that we participated in.
+ * In the second and third cases,
+ * we need to create the shadow so
+ * that changes that we make to the
+ * object do not interfere with
+ * any symmetrical copies which
+ * have occurred (case 2) or which
+ * might occur (case 3).
+ *
+ * The first case is when we had
+ * deferred shadow object creation
+ * via the entry->needs_copy mechanism.
+ * This mechanism only works when
+ * only one entry points to the source
+ * object, and we are about to create
+ * a second entry pointing to the
+ * same object. The problem is that
+ * there is no way of mapping from
+ * an object to the entries pointing
+ * to it. (Deferred shadow creation
+ * works with one entry because it occurs
+ * at fault time, and we walk from the
+ * entry to the object when handling
+ * the fault.)
+ *
+ * The second case is when the object
+ * to be shared has already been copied
+ * with a symmetric copy, but we point
+ * directly to the object without
+ * needs_copy set in our entry. (This
+ * can happen because different ranges
+ * of an object can be pointed to by
+ * different entries. In particular,
+ * a single entry pointing to an object
+ * can be split by a call to vm_inherit,
+ * which, combined with task_create, can
+ * result in the different entries
+ * having different needs_copy values.)
+ * The shadowed flag in the object allows
+ * us to detect this case. The problem
+ * with this case is that if this object
+ * has or will have shadows, then we
+ * must not perform an asymmetric copy
+ * of this object, since such a copy
+ * allows the object to be changed, which
+ * will break the previous symmetrical
+ * copies (which rely upon the object
+ * not changing). In a sense, the shadowed
+ * flag says "don't change this object".
+ * We fix this by creating a shadow
+ * object for this object, and sharing
+ * that. This works because we are free
+ * to change the shadow object (and thus
+ * to use an asymmetric copy strategy);
+ * this is also semantically correct,
+ * since this object is temporary, and
+ * therefore a copy of the object is
+ * as good as the object itself. 
(This + * is not true for permanent objects, + * since the pager needs to see changes, + * which won't happen if the changes + * are made to a copy.) + * + * The third case is when the object + * to be shared has parts sticking + * outside of the entry we're working + * with, and thus may in the future + * be subject to a symmetrical copy. + * (This is a preemptive version of + * case 2.) + */ + VME_OBJECT_SHADOW(old_entry, + (vm_map_size_t) (old_entry->vme_end - + old_entry->vme_start)); + + /* + * If we're making a shadow for other than + * copy on write reasons, then we have + * to remove write permission. + */ - if (consume_on_success) { - SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy)); - } else { - SAVE_HINT_MAP_WRITE(dst_map, last); + if (!old_entry->needs_copy && + (old_entry->protection & VM_PROT_WRITE)) { + vm_prot_t prot; + + assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)); + + prot = old_entry->protection & ~VM_PROT_WRITE; + + assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)); + + if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) { + prot |= VM_PROT_EXECUTE; + } + + + if (old_map->mapped_in_other_pmaps) { + vm_object_pmap_protect( + VME_OBJECT(old_entry), + VME_OFFSET(old_entry), + (old_entry->vme_end - + old_entry->vme_start), + PMAP_NULL, + PAGE_SIZE, + old_entry->vme_start, + prot); + } else { + pmap_protect(old_map->pmap, + old_entry->vme_start, + old_entry->vme_end, + prot); + } + } + + old_entry->needs_copy = FALSE; + object = VME_OBJECT(old_entry); } - dst_map->size += size; /* - * Link in the copy + * If object was using a symmetric copy strategy, + * change its copy strategy to the default + * asymmetric copy strategy, which is copy_delay + * in the non-norma case and copy_call in the + * norma case. Bump the reference count for the + * new entry. */ - if (consume_on_success) { - vm_map_copy_insert(dst_map, last, copy); + if (old_entry->is_sub_map) { + vm_map_lock(VME_SUBMAP(old_entry)); + vm_map_reference(VME_SUBMAP(old_entry)); + vm_map_unlock(VME_SUBMAP(old_entry)); } else { - vm_map_copy_remap(dst_map, last, copy, adjustment, - cur_protection, max_protection, - inheritance); + vm_object_lock(object); + vm_object_reference_locked(object); + if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { + object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; + } + vm_object_unlock(object); } - vm_map_unlock(dst_map); + /* + * Clone the entry, using object ref from above. + * Mark both entries as shared. + */ + + new_entry = vm_map_entry_create(new_map, FALSE); /* Never the kernel + * map or descendants */ + vm_map_entry_copy(old_map, new_entry, old_entry); + old_entry->is_shared = TRUE; + new_entry->is_shared = TRUE; /* - * XXX If wiring_required, call vm_map_pageable + * We're dealing with a shared mapping, so the resulting mapping + * should inherit some of the original mapping's accounting settings. + * "iokit_acct" should have been cleared in vm_map_entry_copy(). + * "use_pmap" should stay the same as before (if it hasn't been reset + * to TRUE when we cleared "iokit_acct"). */ + assert(!new_entry->iokit_acct); - return(KERN_SUCCESS); -} + /* + * If old entry's inheritence is VM_INHERIT_NONE, + * the new entry is for corpse fork, remove the + * write permission from the new entry. 
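+ * (The corpse mapping is only examined after the fact,
+ * so it does not need to remain writable.)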
+ */ + if (old_entry->inheritance == VM_INHERIT_NONE) { + new_entry->protection &= ~VM_PROT_WRITE; + new_entry->max_protection &= ~VM_PROT_WRITE; + } -/* - * Routine: vm_map_copyin - * - * Description: - * see vm_map_copyin_common. Exported via Unsupported.exports. - * - */ + /* + * Insert the entry into the new map -- we + * know we're inserting at the end of the new + * map. + */ -#undef vm_map_copyin + vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry, + VM_MAP_KERNEL_FLAGS_NONE); -kern_return_t -vm_map_copyin( - vm_map_t src_map, - vm_map_address_t src_addr, - vm_map_size_t len, - boolean_t src_destroy, - vm_map_copy_t *copy_result) /* OUT */ + /* + * Update the physical map + */ + + if (old_entry->is_sub_map) { + /* Bill Angell pmap support goes here */ + } else { + pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start, + old_entry->vme_end - old_entry->vme_start, + old_entry->vme_start); + } +} + +static boolean_t +vm_map_fork_copy( + vm_map_t old_map, + vm_map_entry_t *old_entry_p, + vm_map_t new_map, + int vm_map_copyin_flags) { - return(vm_map_copyin_common(src_map, src_addr, len, src_destroy, - FALSE, copy_result, FALSE)); + vm_map_entry_t old_entry = *old_entry_p; + vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start; + vm_map_offset_t start = old_entry->vme_start; + vm_map_copy_t copy; + vm_map_entry_t last = vm_map_last_entry(new_map); + + vm_map_unlock(old_map); + /* + * Use maxprot version of copyin because we + * care about whether this memory can ever + * be accessed, not just whether it's accessible + * right now. + */ + vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT; + if (vm_map_copyin_internal(old_map, start, entry_size, + vm_map_copyin_flags, ©) + != KERN_SUCCESS) { + /* + * The map might have changed while it + * was unlocked, check it again. Skip + * any blank space or permanently + * unreadable region. + */ + vm_map_lock(old_map); + if (!vm_map_lookup_entry(old_map, start, &last) || + (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) { + last = last->vme_next; + } + *old_entry_p = last; + + /* + * XXX For some error returns, want to + * XXX skip to the next element. Note + * that INVALID_ADDRESS and + * PROTECTION_FAILURE are handled above. + */ + + return FALSE; + } + + /* + * Assert that the vm_map_copy is coming from the right + * zone and hasn't been forged + */ + vm_map_copy_require(copy); + + /* + * Insert the copy into the new map + */ + vm_map_copy_insert(new_map, last, copy); + + /* + * Pick up the traversal at the end of + * the copied region. + */ + + vm_map_lock(old_map); + start += entry_size; + if (!vm_map_lookup_entry(old_map, start, &last)) { + last = last->vme_next; + } else { + if (last->vme_start == start) { + /* + * No need to clip here and we don't + * want to cause any unnecessary + * unnesting... + */ + } else { + vm_map_clip_start(old_map, last, start); + } + } + *old_entry_p = last; + + return TRUE; } /* - * Routine: vm_map_copyin_common - * - * Description: - * Copy the specified region (src_addr, len) from the - * source address space (src_map), possibly removing - * the region from the source address space (src_destroy). + * vm_map_fork: * - * Returns: - * A vm_map_copy_t object (copy_result), suitable for - * insertion into another address space (using vm_map_copyout), - * copying over another address space region (using - * vm_map_copy_overwrite). If the copy is unused, it - * should be destroyed (using vm_map_copy_discard). 
+ * Create and return a new map based on the old + * map, according to the inheritance values on the + * regions in that map and the options. * - * In/out conditions: - * The source map should not be locked on entry. + * The source map must not be locked. */ +vm_map_t +vm_map_fork( + ledger_t ledger, + vm_map_t old_map, + int options) +{ + pmap_t new_pmap; + vm_map_t new_map; + vm_map_entry_t old_entry; + vm_map_size_t new_size = 0, entry_size; + vm_map_entry_t new_entry; + boolean_t src_needs_copy; + boolean_t new_entry_needs_copy; + boolean_t pmap_is64bit; + int vm_map_copyin_flags; + vm_inherit_t old_entry_inheritance; + int map_create_options; + kern_return_t footprint_collect_kr; + + if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE | + VM_MAP_FORK_PRESERVE_PURGEABLE | + VM_MAP_FORK_CORPSE_FOOTPRINT)) { + /* unsupported option */ + return VM_MAP_NULL; + } + + pmap_is64bit = +#if defined(__i386__) || defined(__x86_64__) + old_map->pmap->pm_task_map != TASK_MAP_32BIT; +#elif defined(__arm64__) + old_map->pmap->max == MACH_VM_MAX_ADDRESS; +#elif defined(__arm__) + FALSE; +#else +#error Unknown architecture. +#endif -typedef struct submap_map { - vm_map_t parent_map; - vm_map_offset_t base_start; - vm_map_offset_t base_end; - vm_map_size_t base_len; - struct submap_map *next; -} submap_map_t; + unsigned int pmap_flags = 0; + pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0; +#if defined(HAS_APPLE_PAC) + pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0; +#endif +#if PMAP_CREATE_FORCE_4K_PAGES + if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE && + PAGE_SIZE != FOURK_PAGE_SIZE) { + pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES; + } +#endif /* PMAP_CREATE_FORCE_4K_PAGES */ + new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags); -kern_return_t -vm_map_copyin_common( - vm_map_t src_map, - vm_map_address_t src_addr, - vm_map_size_t len, - boolean_t src_destroy, - __unused boolean_t src_volatile, - vm_map_copy_t *copy_result, /* OUT */ - boolean_t use_maxprot) -{ - vm_map_entry_t tmp_entry; /* Result of last map lookup -- - * in multi-level lookup, this - * entry contains the actual - * vm_object/offset. 
- */ - register - vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */ + vm_map_reference(old_map); + vm_map_lock(old_map); - vm_map_offset_t src_start; /* Start of current entry -- - * where copy is taking place now - */ - vm_map_offset_t src_end; /* End of entire region to be - * copied */ - vm_map_offset_t src_base; - vm_map_t base_map = src_map; - boolean_t map_share=FALSE; - submap_map_t *parent_maps = NULL; + map_create_options = 0; + if (old_map->hdr.entries_pageable) { + map_create_options |= VM_MAP_CREATE_PAGEABLE; + } + if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) { + map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT; + footprint_collect_kr = KERN_SUCCESS; + } + new_map = vm_map_create_options(new_pmap, + old_map->min_offset, + old_map->max_offset, + map_create_options); + /* inherit cs_enforcement */ + vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement); + vm_map_lock(new_map); + vm_commit_pagezero_status(new_map); + /* inherit the parent map's page size */ + vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map)); + for ( + old_entry = vm_map_first_entry(old_map); + old_entry != vm_map_to_entry(old_map); + ) { + entry_size = old_entry->vme_end - old_entry->vme_start; - register - vm_map_copy_t copy; /* Resulting copy */ - vm_map_address_t copy_addr; - vm_map_size_t copy_size; + old_entry_inheritance = old_entry->inheritance; + /* + * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option + * share VM_INHERIT_NONE entries that are not backed by a + * device pager. + */ + if (old_entry_inheritance == VM_INHERIT_NONE && + (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) && + (old_entry->protection & VM_PROT_READ) && + !(!old_entry->is_sub_map && + VME_OBJECT(old_entry) != NULL && + VME_OBJECT(old_entry)->pager != NULL && + is_device_pager_ops( + VME_OBJECT(old_entry)->pager->mo_pager_ops))) { + old_entry_inheritance = VM_INHERIT_SHARE; + } + + if (old_entry_inheritance != VM_INHERIT_NONE && + (options & VM_MAP_FORK_CORPSE_FOOTPRINT) && + footprint_collect_kr == KERN_SUCCESS) { + /* + * The corpse won't have old_map->pmap to query + * footprint information, so collect that data now + * and store it in new_map->vmmap_corpse_footprint + * for later autopsy. + */ + footprint_collect_kr = + vm_map_corpse_footprint_collect(old_map, + old_entry, + new_map); + } - /* - * Check for copies of zero bytes. - */ + switch (old_entry_inheritance) { + case VM_INHERIT_NONE: + break; + + case VM_INHERIT_SHARE: + vm_map_fork_share(old_map, old_entry, new_map); + new_size += entry_size; + break; + + case VM_INHERIT_COPY: + + /* + * Inline the copy_quickly case; + * upon failure, fall back on call + * to vm_map_fork_copy. + */ + + if (old_entry->is_sub_map) { + break; + } + if ((old_entry->wired_count != 0) || + ((VME_OBJECT(old_entry) != NULL) && + (VME_OBJECT(old_entry)->true_share))) { + goto slow_vm_map_fork_copy; + } + + new_entry = vm_map_entry_create(new_map, FALSE); /* never the kernel map or descendants */ + vm_map_entry_copy(old_map, new_entry, old_entry); + + if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) { + new_map->jit_entry_exists = TRUE; + } + + if (new_entry->is_sub_map) { + /* clear address space specifics */ + new_entry->use_pmap = FALSE; + } else { + /* + * We're dealing with a copy-on-write operation, + * so the resulting mapping should not inherit + * the original mapping's accounting settings. + * "iokit_acct" should have been cleared in + * vm_map_entry_copy(). 
+ * "use_pmap" should be reset to its default + * (TRUE) so that the new mapping gets + * accounted for in the task's memory footprint. + */ + assert(!new_entry->iokit_acct); + new_entry->use_pmap = TRUE; + } + + if (!vm_object_copy_quickly( + VME_OBJECT_PTR(new_entry), + VME_OFFSET(old_entry), + (old_entry->vme_end - + old_entry->vme_start), + &src_needs_copy, + &new_entry_needs_copy)) { + vm_map_entry_dispose(new_map, new_entry); + goto slow_vm_map_fork_copy; + } + + /* + * Handle copy-on-write obligations + */ - if (len == 0) { - *copy_result = VM_MAP_COPY_NULL; - return(KERN_SUCCESS); - } + if (src_needs_copy && !old_entry->needs_copy) { + vm_prot_t prot; - /* - * Check that the end address doesn't overflow - */ - src_end = src_addr + len; - if (src_end < src_addr) - return KERN_INVALID_ADDRESS; + assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)); - /* - * If the copy is sufficiently small, use a kernel buffer instead - * of making a virtual copy. The theory being that the cost of - * setting up VM (and taking C-O-W faults) dominates the copy costs - * for small regions. - */ - if ((len < msg_ool_size_small) && !use_maxprot) - return vm_map_copyin_kernel_buffer(src_map, src_addr, len, - src_destroy, copy_result); + prot = old_entry->protection & ~VM_PROT_WRITE; - /* - * Compute (page aligned) start and end of region - */ - src_start = vm_map_trunc_page(src_addr, - VM_MAP_PAGE_MASK(src_map)); - src_end = vm_map_round_page(src_end, - VM_MAP_PAGE_MASK(src_map)); + if (override_nx(old_map, VME_ALIAS(old_entry)) + && prot) { + prot |= VM_PROT_EXECUTE; + } - XPR(XPR_VM_MAP, "vm_map_copyin_common map 0x%x addr 0x%x len 0x%x dest %d\n", src_map, src_addr, len, src_destroy, 0); + assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)); - /* - * Allocate a header element for the list. - * - * Use the start and end in the header to - * remember the endpoints prior to rounding. - */ + vm_object_pmap_protect( + VME_OBJECT(old_entry), + VME_OFFSET(old_entry), + (old_entry->vme_end - + old_entry->vme_start), + ((old_entry->is_shared + || old_map->mapped_in_other_pmaps) + ? PMAP_NULL : + old_map->pmap), + VM_MAP_PAGE_SIZE(old_map), + old_entry->vme_start, + prot); - copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); - copy->type = VM_MAP_COPY_ENTRY_LIST; - copy->cpy_hdr.nentries = 0; - copy->cpy_hdr.entries_pageable = TRUE; -#if 00 - copy->cpy_hdr.page_shift = src_map->hdr.page_shift; -#else - /* - * The copy entries can be broken down for a variety of reasons, - * so we can't guarantee that they will remain map-aligned... - * Will need to adjust the first copy_entry's "vme_start" and - * the last copy_entry's "vme_end" to be rounded to PAGE_MASK - * rather than the original map's alignment. - */ - copy->cpy_hdr.page_shift = PAGE_SHIFT; -#endif + assert(old_entry->wired_count == 0); + old_entry->needs_copy = TRUE; + } + new_entry->needs_copy = new_entry_needs_copy; - vm_map_store_init( &(copy->cpy_hdr) ); + /* + * Insert the entry at the end + * of the map. 
+ */ - copy->offset = src_addr; - copy->size = len; - - new_entry = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); + vm_map_store_entry_link(new_map, + vm_map_last_entry(new_map), + new_entry, + VM_MAP_KERNEL_FLAGS_NONE); + new_size += entry_size; + break; -#define RETURN(x) \ - MACRO_BEGIN \ - vm_map_unlock(src_map); \ - if(src_map != base_map) \ - vm_map_deallocate(src_map); \ - if (new_entry != VM_MAP_ENTRY_NULL) \ - vm_map_copy_entry_dispose(copy,new_entry); \ - vm_map_copy_discard(copy); \ - { \ - submap_map_t *_ptr; \ - \ - for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \ - parent_maps=parent_maps->next; \ - if (_ptr->parent_map != base_map) \ - vm_map_deallocate(_ptr->parent_map); \ - kfree(_ptr, sizeof(submap_map_t)); \ - } \ - } \ - MACRO_RETURN(x); \ - MACRO_END +slow_vm_map_fork_copy: + vm_map_copyin_flags = 0; + if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) { + vm_map_copyin_flags |= + VM_MAP_COPYIN_PRESERVE_PURGEABLE; + } + if (vm_map_fork_copy(old_map, + &old_entry, + new_map, + vm_map_copyin_flags)) { + new_size += entry_size; + } + continue; + } + old_entry = old_entry->vme_next; + } - /* - * Find the beginning of the region. - */ +#if defined(__arm64__) + pmap_insert_sharedpage(new_map->pmap); +#endif /* __arm64__ */ - vm_map_lock(src_map); + new_map->size = new_size; - /* - * Lookup the original "src_addr" rather than the truncated - * "src_start", in case "src_start" falls in a non-map-aligned - * map entry *before* the map entry that contains "src_addr"... - */ - if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) - RETURN(KERN_INVALID_ADDRESS); - if(!tmp_entry->is_sub_map) { - /* - * ... but clip to the map-rounded "src_start" rather than - * "src_addr" to preserve map-alignment. We'll adjust the - * first copy entry at the end, if needed. - */ - vm_map_clip_start(src_map, tmp_entry, src_start); + if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) { + vm_map_corpse_footprint_collect_done(new_map); } - if (src_start < tmp_entry->vme_start) { - /* - * Move "src_start" up to the start of the - * first map entry to copy. - */ - src_start = tmp_entry->vme_start; + + /* Propagate JIT entitlement for the pmap layer. */ + if (pmap_get_jit_entitled(old_map->pmap)) { + /* Tell the pmap that it supports JIT. */ + pmap_set_jit_entitled(new_map->pmap); } - /* set for later submap fix-up */ - copy_addr = src_start; - /* - * Go through entries until we get to the end. - */ + vm_map_unlock(new_map); + vm_map_unlock(old_map); + vm_map_deallocate(old_map); - while (TRUE) { - register - vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */ - vm_map_size_t src_size; /* Size of source - * map entry (in both - * maps) - */ + return new_map; +} - register - vm_object_t src_object; /* Object to copy */ - vm_object_offset_t src_offset; +/* + * vm_map_exec: + * + * Setup the "new_map" with the proper execution environment according + * to the type of executable (platform, 64bit, chroot environment). + * Map the comm page and shared region, etc... 
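+ * Also reserve any machine-dependent regions that user
+ * processes are not allowed to allocate.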
+ */ +kern_return_t +vm_map_exec( + vm_map_t new_map, + task_t task, + boolean_t is64bit, + void *fsroot, + cpu_type_t cpu, + cpu_subtype_t cpu_subtype, + boolean_t reslide) +{ + SHARED_REGION_TRACE_DEBUG( + ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n", + (void *)VM_KERNEL_ADDRPERM(current_task()), + (void *)VM_KERNEL_ADDRPERM(new_map), + (void *)VM_KERNEL_ADDRPERM(task), + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, + cpu_subtype)); + (void) vm_commpage_enter(new_map, task, is64bit); - boolean_t src_needs_copy; /* Should source map - * be made read-only - * for copy-on-write? - */ + (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide); - boolean_t new_entry_needs_copy; /* Will new entry be COW? */ + SHARED_REGION_TRACE_DEBUG( + ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n", + (void *)VM_KERNEL_ADDRPERM(current_task()), + (void *)VM_KERNEL_ADDRPERM(new_map), + (void *)VM_KERNEL_ADDRPERM(task), + (void *)VM_KERNEL_ADDRPERM(fsroot), + cpu, + cpu_subtype)); - boolean_t was_wired; /* Was source wired? */ - vm_map_version_t version; /* Version before locks - * dropped to make copy - */ - kern_return_t result; /* Return value from - * copy_strategically. - */ - while(tmp_entry->is_sub_map) { - vm_map_size_t submap_len; - submap_map_t *ptr; + /* + * Some devices have region(s) of memory that shouldn't get allocated by + * user processes. The following code creates dummy vm_map_entry_t's for each + * of the regions that needs to be reserved to prevent any allocations in + * those regions. + */ + kern_return_t kr = KERN_FAILURE; + vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE; + vmk_flags.vmkf_permanent = TRUE; + vmk_flags.vmkf_beyond_max = TRUE; + + struct vm_reserved_region *regions = NULL; + size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions); + assert((num_regions == 0) || (num_regions > 0 && regions != NULL)); + + for (size_t i = 0; i < num_regions; ++i) { + kr = vm_map_enter( + new_map, + ®ions[i].vmrr_addr, + regions[i].vmrr_size, + (vm_map_offset_t)0, + VM_FLAGS_FIXED, + vmk_flags, + VM_KERN_MEMORY_NONE, + VM_OBJECT_NULL, + (vm_object_offset_t)0, + FALSE, + VM_PROT_NONE, + VM_PROT_NONE, + VM_INHERIT_NONE); - ptr = (submap_map_t *)kalloc(sizeof(submap_map_t)); - ptr->next = parent_maps; - parent_maps = ptr; - ptr->parent_map = src_map; - ptr->base_start = src_start; - ptr->base_end = src_end; - submap_len = tmp_entry->vme_end - src_start; - if(submap_len > (src_end-src_start)) - submap_len = src_end-src_start; - ptr->base_len = submap_len; - - src_start -= tmp_entry->vme_start; - src_start += tmp_entry->offset; - src_end = src_start + submap_len; - src_map = tmp_entry->object.sub_map; - vm_map_lock(src_map); - /* keep an outstanding reference for all maps in */ - /* the parents tree except the base map */ - vm_map_reference(src_map); - vm_map_unlock(ptr->parent_map); - if (!vm_map_lookup_entry( - src_map, src_start, &tmp_entry)) - RETURN(KERN_INVALID_ADDRESS); - map_share = TRUE; - if(!tmp_entry->is_sub_map) - vm_map_clip_start(src_map, tmp_entry, src_start); - src_entry = tmp_entry; + if (kr != KERN_SUCCESS) { + panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr); } - /* we are now in the lowest level submap... 
*/ + } - if ((tmp_entry->object.vm_object != VM_OBJECT_NULL) && - (tmp_entry->object.vm_object->phys_contiguous)) { - /* This is not, supported for now.In future */ - /* we will need to detect the phys_contig */ - /* condition and then upgrade copy_slowly */ - /* to do physical copy from the device mem */ - /* based object. We can piggy-back off of */ - /* the was wired boolean to set-up the */ - /* proper handling */ - RETURN(KERN_PROTECTION_FAILURE); - } - /* - * Create a new address map entry to hold the result. - * Fill in the fields from the appropriate source entries. - * We must unlock the source map to do this if we need - * to allocate a map entry. - */ - if (new_entry == VM_MAP_ENTRY_NULL) { - version.main_timestamp = src_map->timestamp; - vm_map_unlock(src_map); + new_map->reserved_regions = (num_regions ? TRUE : FALSE); - new_entry = vm_map_copy_entry_create(copy, !copy->cpy_hdr.entries_pageable); + return KERN_SUCCESS; +} - vm_map_lock(src_map); - if ((version.main_timestamp + 1) != src_map->timestamp) { - if (!vm_map_lookup_entry(src_map, src_start, - &tmp_entry)) { - RETURN(KERN_INVALID_ADDRESS); - } - if (!tmp_entry->is_sub_map) - vm_map_clip_start(src_map, tmp_entry, src_start); - continue; /* restart w/ new tmp_entry */ - } - } +uint64_t vm_map_lookup_locked_copy_slowly_count = 0; +uint64_t vm_map_lookup_locked_copy_slowly_size = 0; +uint64_t vm_map_lookup_locked_copy_slowly_max = 0; +uint64_t vm_map_lookup_locked_copy_slowly_restart = 0; +uint64_t vm_map_lookup_locked_copy_slowly_error = 0; +uint64_t vm_map_lookup_locked_copy_strategically_count = 0; +uint64_t vm_map_lookup_locked_copy_strategically_size = 0; +uint64_t vm_map_lookup_locked_copy_strategically_max = 0; +uint64_t vm_map_lookup_locked_copy_strategically_restart = 0; +uint64_t vm_map_lookup_locked_copy_strategically_error = 0; +uint64_t vm_map_lookup_locked_copy_shadow_count = 0; +uint64_t vm_map_lookup_locked_copy_shadow_size = 0; +uint64_t vm_map_lookup_locked_copy_shadow_max = 0; +/* + * vm_map_lookup_locked: + * + * Finds the VM object, offset, and + * protection for a given virtual address in the + * specified map, assuming a page fault of the + * type specified. + * + * Returns the (object, offset, protection) for + * this address, whether it is wired down, and whether + * this map has the only reference to the data in question. + * In order to later verify this lookup, a "version" + * is returned. + * If contended != NULL, *contended will be set to + * true iff the thread had to spin or block to acquire + * an exclusive lock. + * + * The map MUST be locked by the caller and WILL be + * locked on exit. In order to guarantee the + * existence of the returned object, it is returned + * locked. + * + * If a lookup is requested with "write protection" + * specified, the map may be changed to perform virtual + * copying operations, although the data referenced will + * remain the same. 
+ */ +kern_return_t +vm_map_lookup_locked( + vm_map_t *var_map, /* IN/OUT */ + vm_map_offset_t vaddr, + vm_prot_t fault_type, + int object_lock_type, + vm_map_version_t *out_version, /* OUT */ + vm_object_t *object, /* OUT */ + vm_object_offset_t *offset, /* OUT */ + vm_prot_t *out_prot, /* OUT */ + boolean_t *wired, /* OUT */ + vm_object_fault_info_t fault_info, /* OUT */ + vm_map_t *real_map, /* OUT */ + bool *contended) /* OUT */ +{ + vm_map_entry_t entry; + vm_map_t map = *var_map; + vm_map_t old_map = *var_map; + vm_map_t cow_sub_map_parent = VM_MAP_NULL; + vm_map_offset_t cow_parent_vaddr = 0; + vm_map_offset_t old_start = 0; + vm_map_offset_t old_end = 0; + vm_prot_t prot; + boolean_t mask_protections; + boolean_t force_copy; + boolean_t no_force_copy_if_executable; + boolean_t submap_needed_copy; + vm_prot_t original_fault_type; + vm_map_size_t fault_page_mask; - /* - * Verify that the region can be read. - */ - if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE && - !use_maxprot) || - (src_entry->max_protection & VM_PROT_READ) == 0) - RETURN(KERN_PROTECTION_FAILURE); + /* + * VM_PROT_MASK means that the caller wants us to use "fault_type" + * as a mask against the mapping's actual protections, not as an + * absolute value. + */ + mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE; + force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE; + no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE; + fault_type &= VM_PROT_ALL; + original_fault_type = fault_type; + if (contended) { + *contended = false; + } - /* - * Clip against the endpoints of the entire region. - */ + *real_map = map; - vm_map_clip_end(src_map, src_entry, src_end); + fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK); + vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask); - src_size = src_entry->vme_end - src_start; - src_object = src_entry->object.vm_object; - src_offset = src_entry->offset; - was_wired = (src_entry->wired_count != 0); +RetryLookup: + fault_type = original_fault_type; - vm_map_entry_copy(new_entry, src_entry); - if (new_entry->is_sub_map) { - /* clr address space specifics */ - new_entry->use_pmap = FALSE; - } + /* + * If the map has an interesting hint, try it before calling + * full blown lookup routine. + */ + entry = map->hint; + + if ((entry == vm_map_to_entry(map)) || + (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) { + vm_map_entry_t tmp_entry; /* - * Attempt non-blocking copy-on-write optimizations. + * Entry was either not a valid hint, or the vaddr + * was not contained in the entry, so do a full lookup. */ + if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + return KERN_INVALID_ADDRESS; + } - if (src_destroy && - (src_object == VM_OBJECT_NULL || - (src_object->internal && !src_object->true_share - && !map_share))) { - /* - * If we are destroying the source, and the object - * is internal, we can move the object reference - * from the source to the copy. The copy is - * copy-on-write only if the source is. - * We make another reference to the object, because - * destroying the source entry will deallocate it. - */ - vm_object_reference(src_object); - - /* - * Copy is always unwired. vm_map_copy_entry - * set its wired count to zero. 
- */ - - goto CopySuccessful; - } - - - RestartCopy: - XPR(XPR_VM_MAP, "vm_map_copyin_common src_obj 0x%x ent 0x%x obj 0x%x was_wired %d\n", - src_object, new_entry, new_entry->object.vm_object, - was_wired, 0); - if ((src_object == VM_OBJECT_NULL || - (!was_wired && !map_share && !tmp_entry->is_shared)) && - vm_object_copy_quickly( - &new_entry->object.vm_object, - src_offset, - src_size, - &src_needs_copy, - &new_entry_needs_copy)) { - - new_entry->needs_copy = new_entry_needs_copy; - - /* - * Handle copy-on-write obligations - */ - - if (src_needs_copy && !tmp_entry->needs_copy) { - vm_prot_t prot; + entry = tmp_entry; + } + if (map == old_map) { + old_start = entry->vme_start; + old_end = entry->vme_end; + } - prot = src_entry->protection & ~VM_PROT_WRITE; + /* + * Handle submaps. Drop lock on upper map, submap is + * returned locked. + */ - if (override_nx(src_map, src_entry->alias) && prot) - prot |= VM_PROT_EXECUTE; + submap_needed_copy = FALSE; +submap_recurse: + if (entry->is_sub_map) { + vm_map_offset_t local_vaddr; + vm_map_offset_t end_delta; + vm_map_offset_t start_delta; + vm_map_entry_t submap_entry, saved_submap_entry; + vm_object_offset_t submap_entry_offset; + vm_object_size_t submap_entry_size; + vm_prot_t subentry_protection; + vm_prot_t subentry_max_protection; + boolean_t subentry_no_copy_on_read; + boolean_t mapped_needs_copy = FALSE; + vm_map_version_t version; + + assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map), + "map %p (%d) entry %p submap %p (%d)\n", + map, VM_MAP_PAGE_SHIFT(map), entry, + VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry))); - vm_object_pmap_protect( - src_object, - src_offset, - src_size, - (src_entry->is_shared ? - PMAP_NULL - : src_map->pmap), - src_entry->vme_start, - prot); + local_vaddr = vaddr; - tmp_entry->needs_copy = TRUE; + if ((entry->use_pmap && + !((fault_type & VM_PROT_WRITE) || + force_copy))) { + /* if real_map equals map we unlock below */ + if ((*real_map != map) && + (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); } + *real_map = VME_SUBMAP(entry); + } - /* - * The map has never been unlocked, so it's safe - * to move to the next entry rather than doing - * another lookup. - */ - - goto CopySuccessful; + if (entry->needs_copy && + ((fault_type & VM_PROT_WRITE) || + force_copy)) { + if (!mapped_needs_copy) { + if (vm_map_lock_read_to_write(map)) { + vm_map_lock_read(map); + *real_map = map; + goto RetryLookup; + } + vm_map_lock_read(VME_SUBMAP(entry)); + *var_map = VME_SUBMAP(entry); + cow_sub_map_parent = map; + /* reset base to map before cow object */ + /* this is the map which will accept */ + /* the new cow object */ + old_start = entry->vme_start; + old_end = entry->vme_end; + cow_parent_vaddr = vaddr; + mapped_needs_copy = TRUE; + } else { + vm_map_lock_read(VME_SUBMAP(entry)); + *var_map = VME_SUBMAP(entry); + if ((cow_sub_map_parent != map) && + (*real_map != map)) { + vm_map_unlock(map); + } + } + } else { + if (entry->needs_copy) { + submap_needed_copy = TRUE; + } + vm_map_lock_read(VME_SUBMAP(entry)); + *var_map = VME_SUBMAP(entry); + /* leave map locked if it is a target */ + /* cow sub_map above otherwise, just */ + /* follow the maps down to the object */ + /* here we unlock knowing we are not */ + /* revisiting the map. */ + if ((*real_map != map) && (map != cow_sub_map_parent)) { + vm_map_unlock_read(map); + } } - /* - * Take an object reference, so that we may - * release the map lock(s). 
- */ + map = *var_map; - assert(src_object != VM_OBJECT_NULL); - vm_object_reference(src_object); + /* calculate the offset in the submap for vaddr */ + local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry); + assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask), + "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n", + (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask); + +RetrySubMap: + if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + return KERN_INVALID_ADDRESS; + } - /* - * Record the timestamp for later verification. - * Unlock the map. - */ + /* find the attenuated shadow of the underlying object */ + /* on our target map */ - version.main_timestamp = src_map->timestamp; - vm_map_unlock(src_map); /* Increments timestamp once! */ + /* in english the submap object may extend beyond the */ + /* region mapped by the entry or, may only fill a portion */ + /* of it. For our purposes, we only care if the object */ + /* doesn't fill. In this case the area which will */ + /* ultimately be clipped in the top map will only need */ + /* to be as big as the portion of the underlying entry */ + /* which is mapped */ + start_delta = submap_entry->vme_start > VME_OFFSET(entry) ? + submap_entry->vme_start - VME_OFFSET(entry) : 0; - /* - * Perform the copy - */ + end_delta = + (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <= + submap_entry->vme_end ? + 0 : (VME_OFFSET(entry) + + (old_end - old_start)) + - submap_entry->vme_end; - if (was_wired) { - CopySlowly: - vm_object_lock(src_object); - result = vm_object_copy_slowly( - src_object, - src_offset, - src_size, - THREAD_UNINT, - &new_entry->object.vm_object); - new_entry->offset = 0; - new_entry->needs_copy = FALSE; + old_start += start_delta; + old_end -= end_delta; + if (submap_entry->is_sub_map) { + entry = submap_entry; + vaddr = local_vaddr; + goto submap_recurse; } - else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && - (tmp_entry->is_shared || map_share)) { - vm_object_t new_object; - vm_object_lock_shared(src_object); - new_object = vm_object_copy_delayed( - src_object, - src_offset, - src_size, - TRUE); - if (new_object == VM_OBJECT_NULL) - goto CopySlowly; + if (((fault_type & VM_PROT_WRITE) || + force_copy) + && cow_sub_map_parent) { + vm_object_t sub_object, copy_object; + vm_object_offset_t copy_offset; + vm_map_offset_t local_start; + vm_map_offset_t local_end; + boolean_t object_copied = FALSE; + vm_object_offset_t object_copied_offset = 0; + boolean_t object_copied_needs_copy = FALSE; + kern_return_t kr = KERN_SUCCESS; - new_entry->object.vm_object = new_object; - new_entry->needs_copy = TRUE; - assert(!new_entry->iokit_acct); - assert(new_object->purgable == VM_PURGABLE_DENY); - new_entry->use_pmap = TRUE; - result = KERN_SUCCESS; + if (vm_map_lock_read_to_write(map)) { + vm_map_lock_read(map); + old_start -= start_delta; + old_end += end_delta; + goto RetrySubMap; + } - } else { - result = vm_object_copy_strategically(src_object, - src_offset, - src_size, - &new_entry->object.vm_object, - &new_entry->offset, - &new_entry_needs_copy); - new_entry->needs_copy = new_entry_needs_copy; - } + sub_object = VME_OBJECT(submap_entry); + if (sub_object == VM_OBJECT_NULL) { + sub_object = + vm_object_allocate( + 
(vm_map_size_t) + (submap_entry->vme_end - + submap_entry->vme_start)); + VME_OBJECT_SET(submap_entry, sub_object); + VME_OFFSET_SET(submap_entry, 0); + assert(!submap_entry->is_sub_map); + assert(submap_entry->use_pmap); + } + local_start = local_vaddr - + (cow_parent_vaddr - old_start); + local_end = local_vaddr + + (old_end - cow_parent_vaddr); + vm_map_clip_start(map, submap_entry, local_start); + vm_map_clip_end(map, submap_entry, local_end); + if (submap_entry->is_sub_map) { + /* unnesting was done when clipping */ + assert(!submap_entry->use_pmap); + } - if (result != KERN_SUCCESS && - result != KERN_MEMORY_RESTART_COPY) { - vm_map_lock(src_map); - RETURN(result); - } + /* This is the COW case, lets connect */ + /* an entry in our space to the underlying */ + /* object in the submap, bypassing the */ + /* submap. */ + submap_entry_offset = VME_OFFSET(submap_entry); + submap_entry_size = submap_entry->vme_end - submap_entry->vme_start; + + if ((submap_entry->wired_count != 0 || + sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) && + (submap_entry->protection & VM_PROT_EXECUTE) && + no_force_copy_if_executable) { +// printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy); + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + vm_map_lock_write_to_read(map); + kr = KERN_PROTECTION_FAILURE; + DTRACE_VM4(submap_no_copy_executable, + vm_map_t, map, + vm_object_offset_t, submap_entry_offset, + vm_object_size_t, submap_entry_size, + int, kr); + return kr; + } - /* - * Throw away the extra reference - */ + if (submap_entry->wired_count != 0) { + vm_object_reference(sub_object); - vm_object_deallocate(src_object); + assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)), + "submap_entry %p offset 0x%llx\n", + submap_entry, VME_OFFSET(submap_entry)); - /* - * Verify that the map has not substantially - * changed while the copy was being made. - */ + DTRACE_VM6(submap_copy_slowly, + vm_map_t, cow_sub_map_parent, + vm_map_offset_t, vaddr, + vm_map_t, map, + vm_object_size_t, submap_entry_size, + int, submap_entry->wired_count, + int, sub_object->copy_strategy); - vm_map_lock(src_map); + saved_submap_entry = submap_entry; + version.main_timestamp = map->timestamp; + vm_map_unlock(map); /* Increments timestamp by 1 */ + submap_entry = VM_MAP_ENTRY_NULL; - if ((version.main_timestamp + 1) == src_map->timestamp) - goto VerificationSuccessful; + vm_object_lock(sub_object); + kr = vm_object_copy_slowly(sub_object, + submap_entry_offset, + submap_entry_size, + FALSE, + ©_object); + object_copied = TRUE; + object_copied_offset = 0; + /* 4k: account for extra offset in physical page */ + object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset); + object_copied_needs_copy = FALSE; + vm_object_deallocate(sub_object); - /* - * Simple version comparison failed. - * - * Retry the lookup and verify that the - * same object/offset are still present. - * - * [Note: a memory manager that colludes with - * the calling task can detect that we have - * cheated. While the map was unlocked, the - * mapping could have been changed and restored.] 
- */ + vm_map_lock(map); - if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) { - if (result != KERN_MEMORY_RESTART_COPY) { - vm_object_deallocate(new_entry->object.vm_object); - new_entry->object.vm_object = VM_OBJECT_NULL; - assert(!new_entry->iokit_acct); - new_entry->use_pmap = TRUE; - } - RETURN(KERN_INVALID_ADDRESS); - } + if (kr != KERN_SUCCESS && + kr != KERN_MEMORY_RESTART_COPY) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + DTRACE_VM4(submap_copy_error_slowly, + vm_object_t, sub_object, + vm_object_offset_t, submap_entry_offset, + vm_object_size_t, submap_entry_size, + int, kr); + vm_map_lookup_locked_copy_slowly_error++; + return kr; + } - src_entry = tmp_entry; - vm_map_clip_start(src_map, src_entry, src_start); + if ((kr == KERN_SUCCESS) && + (version.main_timestamp + 1) == map->timestamp) { + submap_entry = saved_submap_entry; + } else { + saved_submap_entry = NULL; + old_start -= start_delta; + old_end += end_delta; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + vm_map_lookup_locked_copy_slowly_restart++; + goto RetrySubMap; + } + vm_map_lookup_locked_copy_slowly_count++; + vm_map_lookup_locked_copy_slowly_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) { + vm_map_lookup_locked_copy_slowly_max = submap_entry_size; + } + } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + submap_entry_offset = VME_OFFSET(submap_entry); + copy_object = VM_OBJECT_NULL; + object_copied_offset = submap_entry_offset; + object_copied_needs_copy = FALSE; + DTRACE_VM6(submap_copy_strategically, + vm_map_t, cow_sub_map_parent, + vm_map_offset_t, vaddr, + vm_map_t, map, + vm_object_size_t, submap_entry_size, + int, submap_entry->wired_count, + int, sub_object->copy_strategy); + kr = vm_object_copy_strategically( + sub_object, + submap_entry_offset, + submap_entry->vme_end - submap_entry->vme_start, + ©_object, + &object_copied_offset, + &object_copied_needs_copy); + if (kr == KERN_MEMORY_RESTART_COPY) { + old_start -= start_delta; + old_end += end_delta; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + vm_map_lookup_locked_copy_strategically_restart++; + goto RetrySubMap; + } + if (kr != KERN_SUCCESS) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + vm_object_deallocate(copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + DTRACE_VM4(submap_copy_error_strategically, + vm_object_t, sub_object, + vm_object_offset_t, submap_entry_offset, + vm_object_size_t, submap_entry_size, + int, kr); + vm_map_lookup_locked_copy_strategically_error++; + return kr; + } + assert(copy_object != VM_OBJECT_NULL); + assert(copy_object != sub_object); + object_copied = TRUE; + vm_map_lookup_locked_copy_strategically_count++; + vm_map_lookup_locked_copy_strategically_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) { + vm_map_lookup_locked_copy_strategically_max = submap_entry_size; + } + } else { + /* set up shadow object 
*/ + object_copied = FALSE; + copy_object = sub_object; + vm_object_lock(sub_object); + vm_object_reference_locked(sub_object); + sub_object->shadowed = TRUE; + vm_object_unlock(sub_object); - if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) && - !use_maxprot) || - ((src_entry->max_protection & VM_PROT_READ) == 0)) - goto VerificationFailed; + assert(submap_entry->wired_count == 0); + submap_entry->needs_copy = TRUE; - if (src_entry->vme_end < new_entry->vme_end) { - assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end, - VM_MAP_COPY_PAGE_MASK(copy))); - new_entry->vme_end = src_entry->vme_end; - src_size = new_entry->vme_end - src_start; - } + prot = submap_entry->protection; + assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)); + prot = prot & ~VM_PROT_WRITE; + assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)); + + if (override_nx(old_map, + VME_ALIAS(submap_entry)) + && prot) { + prot |= VM_PROT_EXECUTE; + } - if ((src_entry->object.vm_object != src_object) || - (src_entry->offset != src_offset) ) { + vm_object_pmap_protect( + sub_object, + VME_OFFSET(submap_entry), + submap_entry->vme_end - + submap_entry->vme_start, + (submap_entry->is_shared + || map->mapped_in_other_pmaps) ? + PMAP_NULL : map->pmap, + VM_MAP_PAGE_SIZE(map), + submap_entry->vme_start, + prot); + vm_map_lookup_locked_copy_shadow_count++; + vm_map_lookup_locked_copy_shadow_size += submap_entry_size; + if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) { + vm_map_lookup_locked_copy_shadow_max = submap_entry_size; + } + } /* - * Verification failed. - * - * Start over with this top-level entry. + * Adjust the fault offset to the submap entry. */ + copy_offset = (local_vaddr - + submap_entry->vme_start + + VME_OFFSET(submap_entry)); - VerificationFailed: ; - - vm_object_deallocate(new_entry->object.vm_object); - tmp_entry = src_entry; - continue; - } + /* This works diffently than the */ + /* normal submap case. We go back */ + /* to the parent of the cow map and*/ + /* clip out the target portion of */ + /* the sub_map, substituting the */ + /* new copy object, */ - /* - * Verification succeeded. - */ + subentry_protection = submap_entry->protection; + subentry_max_protection = submap_entry->max_protection; + subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read; + vm_map_unlock(map); + submap_entry = NULL; /* not valid after map unlock */ - VerificationSuccessful: ; + local_start = old_start; + local_end = old_end; + map = cow_sub_map_parent; + *var_map = cow_sub_map_parent; + vaddr = cow_parent_vaddr; + cow_sub_map_parent = NULL; - if (result == KERN_MEMORY_RESTART_COPY) - goto RestartCopy; + if (!vm_map_lookup_entry(map, + vaddr, &entry)) { + if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); + } + if ((*real_map != map) + && (*real_map != cow_sub_map_parent)) { + vm_map_unlock(*real_map); + } + *real_map = map; + vm_object_deallocate( + copy_object); + copy_object = VM_OBJECT_NULL; + vm_map_lock_write_to_read(map); + DTRACE_VM4(submap_lookup_post_unlock, + uint64_t, (uint64_t)entry->vme_start, + uint64_t, (uint64_t)entry->vme_end, + vm_map_offset_t, vaddr, + int, object_copied); + return KERN_INVALID_ADDRESS; + } - /* - * Copy succeeded. 
- */ + /* clip out the portion of space */ + /* mapped by the sub map which */ + /* corresponds to the underlying */ + /* object */ - CopySuccessful: ; + /* + * Clip (and unnest) the smallest nested chunk + * possible around the faulting address... + */ + local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1); + local_end = local_start + pmap_shared_region_size_min(map->pmap); + /* + * ... but don't go beyond the "old_start" to "old_end" + * range, to avoid spanning over another VM region + * with a possibly different VM object and/or offset. + */ + if (local_start < old_start) { + local_start = old_start; + } + if (local_end > old_end) { + local_end = old_end; + } + /* + * Adjust copy_offset to the start of the range. + */ + copy_offset -= (vaddr - local_start); - /* - * Link in the new copy entry. - */ + vm_map_clip_start(map, entry, local_start); + vm_map_clip_end(map, entry, local_end); + if (entry->is_sub_map) { + /* unnesting was done when clipping */ + assert(!entry->use_pmap); + } - vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), - new_entry); - - /* - * Determine whether the entire region - * has been copied. - */ - src_base = src_start; - src_start = new_entry->vme_end; - new_entry = VM_MAP_ENTRY_NULL; - while ((src_start >= src_end) && (src_end != 0)) { - submap_map_t *ptr; + /* substitute copy object for */ + /* shared map entry */ + vm_map_deallocate(VME_SUBMAP(entry)); + assert(!entry->iokit_acct); + entry->is_sub_map = FALSE; + entry->use_pmap = TRUE; + VME_OBJECT_SET(entry, copy_object); - if (src_map == base_map) { - /* back to the top */ - break; + /* propagate the submap entry's protections */ + if (entry->protection != VM_PROT_READ) { + /* + * Someone has already altered the top entry's + * protections via vm_protect(VM_PROT_COPY). + * Respect these new values and ignore the + * submap entry's protections. + */ + } else { + /* + * Regular copy-on-write: propagate the submap + * entry's protections to the top map entry. + */ + entry->protection |= subentry_protection; + } + entry->max_protection |= subentry_max_protection; + /* propagate no_copy_on_read */ + entry->vme_no_copy_on_read = subentry_no_copy_on_read; + + if ((entry->protection & VM_PROT_WRITE) && + (entry->protection & VM_PROT_EXECUTE) && +#if XNU_TARGET_OS_OSX + map->pmap != kernel_pmap && + (vm_map_cs_enforcement(map) +#if __arm64__ + || !VM_MAP_IS_EXOTIC(map) +#endif /* __arm64__ */ + ) && +#endif /* XNU_TARGET_OS_OSX */ + !(entry->used_for_jit) && + VM_MAP_POLICY_WX_STRIP_X(map)) { + DTRACE_VM3(cs_wx, + uint64_t, (uint64_t)entry->vme_start, + uint64_t, (uint64_t)entry->vme_end, + vm_prot_t, entry->protection); + printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n", + proc_selfpid(), + (current_task()->bsd_info + ? 
proc_name_address(current_task()->bsd_info) + : "?"), + __FUNCTION__); + entry->protection &= ~VM_PROT_EXECUTE; } - ptr = parent_maps; - assert(ptr != NULL); - parent_maps = parent_maps->next; - - /* fix up the damage we did in that submap */ - vm_map_simplify_range(src_map, - src_base, - src_end); + if (object_copied) { + VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset); + entry->needs_copy = object_copied_needs_copy; + entry->is_shared = FALSE; + } else { + assert(VME_OBJECT(entry) != VM_OBJECT_NULL); + assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC); + assert(entry->wired_count == 0); + VME_OFFSET_SET(entry, copy_offset); + entry->needs_copy = TRUE; + if (map != old_map) { + entry->is_shared = TRUE; + } + } + if (entry->inheritance == VM_INHERIT_SHARE) { + entry->inheritance = VM_INHERIT_COPY; + } - vm_map_unlock(src_map); - vm_map_deallocate(src_map); - vm_map_lock(ptr->parent_map); - src_map = ptr->parent_map; - src_base = ptr->base_start; - src_start = ptr->base_start + ptr->base_len; - src_end = ptr->base_end; - if (!vm_map_lookup_entry(src_map, - src_start, - &tmp_entry) && - (src_end > src_start)) { - RETURN(KERN_INVALID_ADDRESS); + vm_map_lock_write_to_read(map); + } else { + if ((cow_sub_map_parent) + && (cow_sub_map_parent != *real_map) + && (cow_sub_map_parent != map)) { + vm_map_unlock(cow_sub_map_parent); } - kfree(ptr, sizeof(submap_map_t)); - if (parent_maps == NULL) - map_share = FALSE; - src_entry = tmp_entry->vme_prev; + entry = submap_entry; + vaddr = local_vaddr; } + } - if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) && - (src_start >= src_addr + len) && - (src_addr + len != 0)) { - /* - * Stop copying now, even though we haven't reached - * "src_end". We'll adjust the end of the last copy - * entry at the end, if needed. - * - * If src_map's aligment is different from the - * system's page-alignment, there could be - * extra non-map-aligned map entries between - * the original (non-rounded) "src_addr + len" - * and the rounded "src_end". - * We do not want to copy those map entries since - * they're not part of the copied range. - */ - break; - } + /* + * Check whether this task is allowed to have + * this page. + */ - if ((src_start >= src_end) && (src_end != 0)) - break; + prot = entry->protection; + if (override_nx(old_map, VME_ALIAS(entry)) && prot) { /* - * Verify that there are no gaps in the region + * HACK -- if not a stack, then allow execution */ + prot |= VM_PROT_EXECUTE; + } - tmp_entry = src_entry->vme_next; - if ((tmp_entry->vme_start != src_start) || - (tmp_entry == vm_map_to_entry(src_map))) { - RETURN(KERN_INVALID_ADDRESS); + if (mask_protections) { + fault_type &= prot; + if (fault_type == VM_PROT_NONE) { + goto protection_failure; + } + } + if (((fault_type & prot) != fault_type) +#if __arm64__ + /* prefetch abort in execute-only page */ + && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE)) +#endif + ) { +protection_failure: + if (*real_map != map) { + vm_map_unlock(*real_map); } + *real_map = map; + + if ((fault_type & VM_PROT_EXECUTE) && prot) { + log_stack_execution_failure((addr64_t)vaddr, prot); + } + + DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL); + return KERN_PROTECTION_FAILURE; } /* - * If the source should be destroyed, do it now, since the - * copy was successful. + * If this page is not pageable, we have to get + * it for all possible accesses. 
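The cs_wx path above strips VM_PROT_EXECUTE whenever an entry would end up both writable and executable on a map where the W^X policy applies, and the protection check right after it is where a disallowed access surfaces as KERN_PROTECTION_FAILURE. The same policy is visible from userspace when asking for PROT_WRITE|PROT_EXEC directly; whether the request fails outright or is silently downgraded depends on the platform, code-signing enforcement and entitlements, so the output of this probe is deliberately not asserted:

/* wx_probe.c - observe how a writable+executable request is treated */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t len = 16384;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		printf("rwx mmap refused: %s\n", strerror(errno));
		return 0;
	}
	printf("rwx mmap succeeded at %p\n", p);

	/*
	 * Downgrading to rw- and then asking for r-x separately is the
	 * sequence W^X policies expect: write first, flip to execute later.
	 */
	if (mprotect(p, len, PROT_READ | PROT_WRITE) != 0 ||
	    mprotect(p, len, PROT_READ | PROT_EXEC) != 0) {
		printf("mprotect: %s\n", strerror(errno));
	}
	munmap(p, len);
	return 0;
}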
*/ - if (src_destroy) { - (void) vm_map_delete( - src_map, - vm_map_trunc_page(src_addr, - VM_MAP_PAGE_MASK(src_map)), - src_end, - ((src_map == kernel_map) ? - VM_MAP_REMOVE_KUNWIRE : - VM_MAP_NO_FLAGS), - VM_MAP_NULL); - } else { - /* fix up the damage we did in the base map */ - vm_map_simplify_range( - src_map, - vm_map_trunc_page(src_addr, - VM_MAP_PAGE_MASK(src_map)), - vm_map_round_page(src_end, - VM_MAP_PAGE_MASK(src_map))); - } - - vm_map_unlock(src_map); - - if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) { - vm_map_offset_t original_start, original_offset, original_end; - - assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK); - /* adjust alignment of first copy_entry's "vme_start" */ - tmp_entry = vm_map_copy_first_entry(copy); - if (tmp_entry != vm_map_copy_to_entry(copy)) { - vm_map_offset_t adjustment; + *wired = (entry->wired_count != 0); + if (*wired) { + fault_type = prot; + } - original_start = tmp_entry->vme_start; - original_offset = tmp_entry->offset; + /* + * If the entry was copy-on-write, we either ... + */ - /* map-align the start of the first copy entry... */ - adjustment = (tmp_entry->vme_start - - vm_map_trunc_page( - tmp_entry->vme_start, - VM_MAP_PAGE_MASK(src_map))); - tmp_entry->vme_start -= adjustment; - tmp_entry->offset -= adjustment; - copy_addr -= adjustment; - assert(tmp_entry->vme_start < tmp_entry->vme_end); - /* ... adjust for mis-aligned start of copy range */ - adjustment = - (vm_map_trunc_page(copy->offset, - PAGE_MASK) - - vm_map_trunc_page(copy->offset, - VM_MAP_PAGE_MASK(src_map))); - if (adjustment) { - assert(page_aligned(adjustment)); - assert(adjustment < VM_MAP_PAGE_SIZE(src_map)); - tmp_entry->vme_start += adjustment; - tmp_entry->offset += adjustment; - copy_addr += adjustment; - assert(tmp_entry->vme_start < tmp_entry->vme_end); - } + if (entry->needs_copy) { + /* + * If we want to write the page, we may as well + * handle that now since we've got the map locked. + * + * If we don't need to write the page, we just + * demote the permissions allowed. + */ + if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) { /* - * Assert that the adjustments haven't exposed - * more than was originally copied... - */ - assert(tmp_entry->vme_start >= original_start); - assert(tmp_entry->offset >= original_offset); - /* - * ... and that it did not adjust outside of a - * a single 16K page. + * Make a new object, and place it in the + * object chain. Note that no new references + * have appeared -- one just moved from the + * map to the new object. */ - assert(vm_map_trunc_page(tmp_entry->vme_start, - VM_MAP_PAGE_MASK(src_map)) == - vm_map_trunc_page(original_start, - VM_MAP_PAGE_MASK(src_map))); - } - - /* adjust alignment of last copy_entry's "vme_end" */ - tmp_entry = vm_map_copy_last_entry(copy); - if (tmp_entry != vm_map_copy_to_entry(copy)) { - vm_map_offset_t adjustment; - original_end = tmp_entry->vme_end; + if (vm_map_lock_read_to_write(map)) { + vm_map_lock_read(map); + goto RetryLookup; + } - /* map-align the end of the last copy entry... */ - tmp_entry->vme_end = - vm_map_round_page(tmp_entry->vme_end, - VM_MAP_PAGE_MASK(src_map)); - /* ... 
adjust for mis-aligned end of copy range */ - adjustment = - (vm_map_round_page((copy->offset + - copy->size), - VM_MAP_PAGE_MASK(src_map)) - - vm_map_round_page((copy->offset + - copy->size), - PAGE_MASK)); - if (adjustment) { - assert(page_aligned(adjustment)); - assert(adjustment < VM_MAP_PAGE_SIZE(src_map)); - tmp_entry->vme_end -= adjustment; - assert(tmp_entry->vme_start < tmp_entry->vme_end); + if (VME_OBJECT(entry)->shadowed == FALSE) { + vm_object_lock(VME_OBJECT(entry)); + VME_OBJECT(entry)->shadowed = TRUE; + vm_object_unlock(VME_OBJECT(entry)); } + VME_OBJECT_SHADOW(entry, + (vm_map_size_t) (entry->vme_end - + entry->vme_start)); + entry->needs_copy = FALSE; - /* - * Assert that the adjustments haven't exposed - * more than was originally copied... - */ - assert(tmp_entry->vme_end <= original_end); - /* - * ... and that it did not adjust outside of a - * a single 16K page. - */ - assert(vm_map_round_page(tmp_entry->vme_end, - VM_MAP_PAGE_MASK(src_map)) == - vm_map_round_page(original_end, - VM_MAP_PAGE_MASK(src_map))); + vm_map_lock_write_to_read(map); } - } - - /* Fix-up start and end points in copy. This is necessary */ - /* when the various entries in the copy object were picked */ - /* up from different sub-maps */ - - tmp_entry = vm_map_copy_first_entry(copy); - copy_size = 0; /* compute actual size */ - while (tmp_entry != vm_map_copy_to_entry(copy)) { - assert(VM_MAP_PAGE_ALIGNED( - copy_addr + (tmp_entry->vme_end - - tmp_entry->vme_start), - VM_MAP_COPY_PAGE_MASK(copy))); - assert(VM_MAP_PAGE_ALIGNED( - copy_addr, - VM_MAP_COPY_PAGE_MASK(copy))); - - /* - * The copy_entries will be injected directly into the - * destination map and might not be "map aligned" there... - */ - tmp_entry->map_aligned = FALSE; + if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) { + /* + * We're attempting to read a copy-on-write + * page -- don't allow writes. + */ - tmp_entry->vme_end = copy_addr + - (tmp_entry->vme_end - tmp_entry->vme_start); - tmp_entry->vme_start = copy_addr; - assert(tmp_entry->vme_start < tmp_entry->vme_end); - copy_addr += tmp_entry->vme_end - tmp_entry->vme_start; - copy_size += tmp_entry->vme_end - tmp_entry->vme_start; - tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next; + prot &= (~VM_PROT_WRITE); + } } - if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT && - copy_size < copy->size) { + if (submap_needed_copy && (prot & VM_PROT_WRITE)) { /* - * The actual size of the VM map copy is smaller than what - * was requested by the caller. This must be because some - * PAGE_SIZE-sized pages are missing at the end of the last - * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range. - * The caller might not have been aware of those missing - * pages and might not want to be aware of it, which is - * fine as long as they don't try to access (and crash on) - * those missing pages. - * Let's adjust the size of the "copy", to avoid failing - * in vm_map_copyout() or vm_map_copy_overwrite(). + * We went through a "needs_copy" submap without triggering + * a copy, so granting write access to the page would bypass + * that submap's "needs_copy". 
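The copy-entry alignment and address fix-ups above belong to vm_map_copyin_internal(), which is also the machinery behind out-of-line transfers such as mach_vm_read(): the kernel builds a vm_map_copy from the source range and the copy is mapped into the receiver when the reply is consumed. A userspace view of that round trip, reading one of our own pages out of line and touching the resulting copy (a sketch; error handling kept minimal):

/* ool_read.c - out-of-line copy of one of our own pages via mach_vm_read() */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	mach_vm_size_t size = (mach_vm_size_t)getpagesize();
	mach_vm_address_t src = 0;
	mach_vm_allocate(mach_task_self(), &src, size, VM_FLAGS_ANYWHERE);
	strcpy((char *)(uintptr_t)src, "hello from the source page");

	vm_offset_t data = 0;
	mach_msg_type_number_t data_cnt = 0;
	kern_return_t kr = mach_vm_read(mach_task_self(), src, size,
	    &data, &data_cnt);
	if (kr != KERN_SUCCESS) {
		fprintf(stderr, "mach_vm_read: %s\n", mach_error_string(kr));
		return 1;
	}
	/* "data" is a fresh mapping of the copied range in our own map */
	printf("copy says: %s (%u bytes)\n", (char *)(uintptr_t)data, data_cnt);

	mach_vm_deallocate(mach_task_self(), data, data_cnt);
	mach_vm_deallocate(mach_task_self(), src, size);
	return 0;
}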
*/ - assert(vm_map_round_page(copy_size, - VM_MAP_PAGE_MASK(src_map)) == - vm_map_round_page(copy->size, - VM_MAP_PAGE_MASK(src_map))); - copy->size = copy_size; + assert(!(fault_type & VM_PROT_WRITE)); + assert(!*wired); + assert(!force_copy); + // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr); + prot &= ~VM_PROT_WRITE; } - *copy_result = copy; - return(KERN_SUCCESS); - -#undef RETURN -} - -kern_return_t -vm_map_copy_extract( - vm_map_t src_map, - vm_map_address_t src_addr, - vm_map_size_t len, - vm_map_copy_t *copy_result, /* OUT */ - vm_prot_t *cur_prot, /* OUT */ - vm_prot_t *max_prot) -{ - vm_map_offset_t src_start, src_end; - vm_map_copy_t copy; - kern_return_t kr; - /* - * Check for copies of zero bytes. + * Create an object if necessary. */ + if (VME_OBJECT(entry) == VM_OBJECT_NULL) { + if (vm_map_lock_read_to_write(map)) { + vm_map_lock_read(map); + goto RetryLookup; + } - if (len == 0) { - *copy_result = VM_MAP_COPY_NULL; - return(KERN_SUCCESS); + VME_OBJECT_SET(entry, + vm_object_allocate( + (vm_map_size_t)(entry->vme_end - + entry->vme_start))); + VME_OFFSET_SET(entry, 0); + assert(entry->use_pmap); + vm_map_lock_write_to_read(map); } /* - * Check that the end address doesn't overflow + * Return the object/offset from this entry. If the entry + * was copy-on-write or empty, it has been fixed up. Also + * return the protection. */ - src_end = src_addr + len; - if (src_end < src_addr) - return KERN_INVALID_ADDRESS; + + *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry); + *object = VME_OBJECT(entry); + *out_prot = prot; + KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0); + + if (fault_info) { + fault_info->interruptible = THREAD_UNINT; /* for now... */ + /* ... the caller will change "interruptible" if needed */ + fault_info->cluster_size = 0; + fault_info->user_tag = VME_ALIAS(entry); + fault_info->pmap_options = 0; + if (entry->iokit_acct || + (!entry->is_sub_map && !entry->use_pmap)) { + fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT; + } + fault_info->behavior = entry->behavior; + fault_info->lo_offset = VME_OFFSET(entry); + fault_info->hi_offset = + (entry->vme_end - entry->vme_start) + VME_OFFSET(entry); + fault_info->no_cache = entry->no_cache; + fault_info->stealth = FALSE; + fault_info->io_sync = FALSE; + if (entry->used_for_jit || + entry->vme_resilient_codesign) { + fault_info->cs_bypass = TRUE; + } else { + fault_info->cs_bypass = FALSE; + } + fault_info->pmap_cs_associated = FALSE; +#if CONFIG_PMAP_CS + if (entry->pmap_cs_associated) { + /* + * The pmap layer will validate this page + * before allowing it to be executed from. 
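Entries flagged used_for_jit get fault_info->cs_bypass above, which is how MAP_JIT regions coexist with code-signing enforcement. From userspace the usual arm64 macOS pattern is a MAP_JIT mapping plus pthread_jit_write_protect_np() to flip the calling thread between the writable and executable views; whether the mmap succeeds at all depends on the process's runtime and entitlements, so this is only a sketch of the calling convention (no generated code is executed):

/* jit_sketch.c - MAP_JIT calling convention; availability depends on
 * platform and entitlements, and running the copied bytes is omitted */
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int
main(void)
{
	size_t len = 16384;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE | PROT_EXEC,
	    MAP_ANON | MAP_PRIVATE | MAP_JIT, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_JIT)");
		return 1;
	}

	static const char payload[] = "not real instructions";
	pthread_jit_write_protect_np(0);        /* writable view for this thread */
	memcpy(p, payload, sizeof(payload));
	pthread_jit_write_protect_np(1);        /* back to executable view */

	printf("wrote %zu bytes into the JIT region at %p\n", sizeof(payload), p);
	munmap(p, len);
	return 0;
}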
+ */ + fault_info->pmap_cs_associated = TRUE; + } +#endif /* CONFIG_PMAP_CS */ + fault_info->mark_zf_absent = FALSE; + fault_info->batch_pmap_op = FALSE; + fault_info->resilient_media = entry->vme_resilient_media; + fault_info->no_copy_on_read = entry->vme_no_copy_on_read; + if (entry->translated_allow_execute) { + fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE; + } + } /* - * Compute (page aligned) start and end of region + * Lock the object to prevent it from disappearing */ - src_start = vm_map_trunc_page(src_addr, PAGE_MASK); - src_end = vm_map_round_page(src_end, PAGE_MASK); + if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) { + if (contended == NULL) { + vm_object_lock(*object); + } else { + *contended = vm_object_lock_check_contended(*object); + } + } else { + vm_object_lock_shared(*object); + } /* - * Allocate a header element for the list. - * - * Use the start and end in the header to - * remember the endpoints prior to rounding. + * Save the version number */ - copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - vm_map_copy_first_entry(copy) = - vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy); - copy->type = VM_MAP_COPY_ENTRY_LIST; - copy->cpy_hdr.nentries = 0; - copy->cpy_hdr.entries_pageable = TRUE; - - vm_map_store_init(©->cpy_hdr); - - copy->offset = 0; - copy->size = len; - - kr = vm_map_remap_extract(src_map, - src_addr, - len, - FALSE, /* copy */ - ©->cpy_hdr, - cur_prot, - max_prot, - VM_INHERIT_SHARE, - TRUE); /* pageable */ - if (kr != KERN_SUCCESS) { - vm_map_copy_discard(copy); - return kr; - } + out_version->main_timestamp = map->timestamp; - *copy_result = copy; return KERN_SUCCESS; } + /* - * vm_map_copyin_object: + * vm_map_verify: * - * Create a copy object from an object. - * Our caller donates an object reference. + * Verifies that the map in question has not changed + * since the given version. The map has to be locked + * ("shared" mode is fine) before calling this function + * and it will be returned locked too. */ - -kern_return_t -vm_map_copyin_object( - vm_object_t object, - vm_object_offset_t offset, /* offset of region in object */ - vm_object_size_t size, /* size of region in object */ - vm_map_copy_t *copy_result) /* OUT */ +boolean_t +vm_map_verify( + vm_map_t map, + vm_map_version_t *version) /* REF */ { - vm_map_copy_t copy; /* Resulting copy */ - - /* - * We drop the object into a special copy object - * that contains the object directly. - */ + boolean_t result; - copy = (vm_map_copy_t) zalloc(vm_map_copy_zone); - copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE; - copy->type = VM_MAP_COPY_OBJECT; - copy->cpy_object = object; - copy->offset = offset; - copy->size = size; + vm_map_lock_assert_held(map); + result = (map->timestamp == version->main_timestamp); - *copy_result = copy; - return(KERN_SUCCESS); + return result; } -static void -vm_map_fork_share( - vm_map_t old_map, - vm_map_entry_t old_entry, - vm_map_t new_map) -{ - vm_object_t object; - vm_map_entry_t new_entry; - - /* - * New sharing code. New map entry - * references original object. Internal - * objects use asynchronous copy algorithm for - * future copies. First make sure we have - * the right object. If we need a shadow, - * or someone else already has one, then - * make a new shadow and share it. 
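vm_map_verify() above is just a timestamp comparison: a caller snapshots map->timestamp into a vm_map_version_t while holding the lock, drops the lock to do slow work, then re-takes it and verifies the version before trusting the earlier lookup. A toy userspace model of that optimistic pattern (plain C, not kernel code; every name here is made up for illustration):

/* version_toy.c - hypothetical model of the lookup/unlock/verify/retry pattern */
#include <pthread.h>
#include <stdio.h>

struct toy_map {
	pthread_mutex_t lock;
	unsigned int timestamp;       /* bumped on every modification, like map->timestamp */
	int value;
};

static int
lookup_with_retry(struct toy_map *m)
{
	for (;;) {
		pthread_mutex_lock(&m->lock);
		unsigned int version = m->timestamp;   /* the vm_map_version_t equivalent */
		int snapshot = m->value;               /* result of the "lookup" */
		pthread_mutex_unlock(&m->lock);

		/* ... slow work that must not hold the lock (e.g. paging something in) ... */

		pthread_mutex_lock(&m->lock);
		int still_valid = (m->timestamp == version);   /* vm_map_verify() */
		pthread_mutex_unlock(&m->lock);
		if (still_valid) {
			return snapshot;
		}
		/* the map changed underneath us: redo the lookup */
	}
}

int
main(void)
{
	struct toy_map m = { PTHREAD_MUTEX_INITIALIZER, 0, 42 };
	printf("looked up %d\n", lookup_with_retry(&m));
	return 0;
}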
- */ - - object = old_entry->object.vm_object; - if (old_entry->is_sub_map) { - assert(old_entry->wired_count == 0); -#ifndef NO_NESTED_PMAP - if(old_entry->use_pmap) { - kern_return_t result; - - result = pmap_nest(new_map->pmap, - (old_entry->object.sub_map)->pmap, - (addr64_t)old_entry->vme_start, - (addr64_t)old_entry->vme_start, - (uint64_t)(old_entry->vme_end - old_entry->vme_start)); - if(result) - panic("vm_map_fork_share: pmap_nest failed!"); - } -#endif /* NO_NESTED_PMAP */ - } else if (object == VM_OBJECT_NULL) { - object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end - - old_entry->vme_start)); - old_entry->offset = 0; - old_entry->object.vm_object = object; - old_entry->use_pmap = TRUE; - assert(!old_entry->needs_copy); - } else if (object->copy_strategy != - MEMORY_OBJECT_COPY_SYMMETRIC) { - - /* - * We are already using an asymmetric - * copy, and therefore we already have - * the right object. - */ - - assert(! old_entry->needs_copy); - } - else if (old_entry->needs_copy || /* case 1 */ - object->shadowed || /* case 2 */ - (!object->true_share && /* case 3 */ - !old_entry->is_shared && - (object->vo_size > - (vm_map_size_t)(old_entry->vme_end - - old_entry->vme_start)))) { - - /* - * We need to create a shadow. - * There are three cases here. - * In the first case, we need to - * complete a deferred symmetrical - * copy that we participated in. - * In the second and third cases, - * we need to create the shadow so - * that changes that we make to the - * object do not interfere with - * any symmetrical copies which - * have occured (case 2) or which - * might occur (case 3). - * - * The first case is when we had - * deferred shadow object creation - * via the entry->needs_copy mechanism. - * This mechanism only works when - * only one entry points to the source - * object, and we are about to create - * a second entry pointing to the - * same object. The problem is that - * there is no way of mapping from - * an object to the entries pointing - * to it. (Deferred shadow creation - * works with one entry because occurs - * at fault time, and we walk from the - * entry to the object when handling - * the fault.) - * - * The second case is when the object - * to be shared has already been copied - * with a symmetric copy, but we point - * directly to the object without - * needs_copy set in our entry. (This - * can happen because different ranges - * of an object can be pointed to by - * different entries. In particular, - * a single entry pointing to an object - * can be split by a call to vm_inherit, - * which, combined with task_create, can - * result in the different entries - * having different needs_copy values.) - * The shadowed flag in the object allows - * us to detect this case. The problem - * with this case is that if this object - * has or will have shadows, then we - * must not perform an asymmetric copy - * of this object, since such a copy - * allows the object to be changed, which - * will break the previous symmetrical - * copies (which rely upon the object - * not changing). In a sense, the shadowed - * flag says "don't change this object". - * We fix this by creating a shadow - * object for this object, and sharing - * that. This works because we are free - * to change the shadow object (and thus - * to use an asymmetric copy strategy); - * this is also semantically correct, - * since this object is temporary, and - * therefore a copy of the object is - * as good as the object itself. 
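The long comment in vm_map_fork_share() above is about symmetric, deferred copies and why a shadow object must be interposed before the source can be shared for writing. The userspace-visible form of a symmetric copy is vm_copy()/mach_vm_copy(): right after the call both ranges still reference the same pages, and the first write to either side is what actually duplicates them. A small check that the copy is decoupled from later writes (a sketch; the destination must already be allocated and writable):

/* vm_copy_demo.c - symmetric copy via mach_vm_copy(), then diverge */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	mach_vm_size_t size = (mach_vm_size_t)getpagesize();
	mach_vm_address_t src = 0, dst = 0;

	mach_vm_allocate(mach_task_self(), &src, size, VM_FLAGS_ANYWHERE);
	mach_vm_allocate(mach_task_self(), &dst, size, VM_FLAGS_ANYWHERE);
	strcpy((char *)(uintptr_t)src, "before copy");

	kern_return_t kr = mach_vm_copy(mach_task_self(), src, size, dst);
	if (kr != KERN_SUCCESS) {
		fprintf(stderr, "mach_vm_copy: %s\n", mach_error_string(kr));
		return 1;
	}

	strcpy((char *)(uintptr_t)src, "after copy");   /* write to the source only */
	printf("src: %s\ndst: %s\n",
	    (char *)(uintptr_t)src, (char *)(uintptr_t)dst);   /* dst keeps "before copy" */

	mach_vm_deallocate(mach_task_self(), src, size);
	mach_vm_deallocate(mach_task_self(), dst, size);
	return 0;
}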
(This - * is not true for permanent objects, - * since the pager needs to see changes, - * which won't happen if the changes - * are made to a copy.) - * - * The third case is when the object - * to be shared has parts sticking - * outside of the entry we're working - * with, and thus may in the future - * be subject to a symmetrical copy. - * (This is a preemptive version of - * case 2.) - */ - vm_object_shadow(&old_entry->object.vm_object, - &old_entry->offset, - (vm_map_size_t) (old_entry->vme_end - - old_entry->vme_start)); - - /* - * If we're making a shadow for other than - * copy on write reasons, then we have - * to remove write permission. - */ - - if (!old_entry->needs_copy && - (old_entry->protection & VM_PROT_WRITE)) { - vm_prot_t prot; - - prot = old_entry->protection & ~VM_PROT_WRITE; - - if (override_nx(old_map, old_entry->alias) && prot) - prot |= VM_PROT_EXECUTE; +/* + * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY + * Goes away after regular vm_region_recurse function migrates to + * 64 bits + * vm_region_recurse: A form of vm_region which follows the + * submaps in a target map + * + */ - if (old_map->mapped_in_other_pmaps) { - vm_object_pmap_protect( - old_entry->object.vm_object, - old_entry->offset, - (old_entry->vme_end - - old_entry->vme_start), - PMAP_NULL, - old_entry->vme_start, - prot); - } else { - pmap_protect(old_map->pmap, - old_entry->vme_start, - old_entry->vme_end, - prot); - } - } - - old_entry->needs_copy = FALSE; - object = old_entry->object.vm_object; - } +kern_return_t +vm_map_region_recurse_64( + vm_map_t map, + vm_map_offset_t *address, /* IN/OUT */ + vm_map_size_t *size, /* OUT */ + natural_t *nesting_depth, /* IN/OUT */ + vm_region_submap_info_64_t submap_info, /* IN/OUT */ + mach_msg_type_number_t *count) /* IN/OUT */ +{ + mach_msg_type_number_t original_count; + vm_region_extended_info_data_t extended; + vm_map_entry_t tmp_entry; + vm_map_offset_t user_address; + unsigned int user_max_depth; - /* - * If object was using a symmetric copy strategy, - * change its copy strategy to the default - * asymmetric copy strategy, which is copy_delay - * in the non-norma case and copy_call in the - * norma case. Bump the reference count for the - * new entry. - */ - - if(old_entry->is_sub_map) { - vm_map_lock(old_entry->object.sub_map); - vm_map_reference(old_entry->object.sub_map); - vm_map_unlock(old_entry->object.sub_map); - } else { - vm_object_lock(object); - vm_object_reference_locked(object); - if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { - object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; - } - vm_object_unlock(object); - } - - /* - * Clone the entry, using object ref from above. - * Mark both entries as shared. - */ - - new_entry = vm_map_entry_create(new_map, FALSE); /* Never the kernel - * map or descendants */ - vm_map_entry_copy(new_entry, old_entry); - old_entry->is_shared = TRUE; - new_entry->is_shared = TRUE; - - /* - * Insert the entry into the new map -- we - * know we're inserting at the end of the new - * map. + * "curr_entry" is the VM map entry preceding or including the + * address we're looking for. + * "curr_map" is the map or sub-map containing "curr_entry". + * "curr_address" is the equivalent of the top map's "user_address" + * in the current map. + * "curr_offset" is the cumulated offset of "curr_map" in the + * target task's address space. + * "curr_depth" is the depth of "curr_map" in the chain of + * sub-maps. 
+ * + * "curr_max_below" and "curr_max_above" limit the range (around + * "curr_address") we should take into account in the current (sub)map. + * They limit the range to what's visible through the map entries + * we've traversed from the top map to the current map. + * */ - - vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry); - + vm_map_entry_t curr_entry; + vm_map_address_t curr_address; + vm_map_offset_t curr_offset; + vm_map_t curr_map; + unsigned int curr_depth; + vm_map_offset_t curr_max_below, curr_max_above; + vm_map_offset_t curr_skip; + /* - * Update the physical map + * "next_" is the same as "curr_" but for the VM region immediately + * after the address we're looking for. We need to keep track of this + * too because we want to return info about that region if the + * address we're looking for is not mapped. */ - - if (old_entry->is_sub_map) { - /* Bill Angell pmap support goes here */ - } else { - pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start, - old_entry->vme_end - old_entry->vme_start, - old_entry->vme_start); - } -} + vm_map_entry_t next_entry; + vm_map_offset_t next_offset; + vm_map_offset_t next_address; + vm_map_t next_map; + unsigned int next_depth; + vm_map_offset_t next_max_below, next_max_above; + vm_map_offset_t next_skip; + + boolean_t look_for_pages; + vm_region_submap_short_info_64_t short_info; + boolean_t do_region_footprint; + int effective_page_size, effective_page_shift; + boolean_t submap_needed_copy; -static boolean_t -vm_map_fork_copy( - vm_map_t old_map, - vm_map_entry_t *old_entry_p, - vm_map_t new_map) -{ - vm_map_entry_t old_entry = *old_entry_p; - vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start; - vm_map_offset_t start = old_entry->vme_start; - vm_map_copy_t copy; - vm_map_entry_t last = vm_map_last_entry(new_map); + if (map == VM_MAP_NULL) { + /* no address space to work on */ + return KERN_INVALID_ARGUMENT; + } - vm_map_unlock(old_map); - /* - * Use maxprot version of copyin because we - * care about whether this memory can ever - * be accessed, not just whether it's accessible - * right now. - */ - if (vm_map_copyin_maxprot(old_map, start, entry_size, FALSE, ©) - != KERN_SUCCESS) { - /* - * The map might have changed while it - * was unlocked, check it again. Skip - * any blank space or permanently - * unreadable region. - */ - vm_map_lock(old_map); - if (!vm_map_lookup_entry(old_map, start, &last) || - (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) { - last = last->vme_next; - } - *old_entry_p = last; + effective_page_shift = vm_self_region_page_shift(map); + effective_page_size = (1 << effective_page_shift); + if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) { /* - * XXX For some error returns, want to - * XXX skip to the next element. Note - * that INVALID_ADDRESS and - * PROTECTION_FAILURE are handled above. + * "info" structure is not big enough and + * would overflow */ - - return FALSE; + return KERN_INVALID_ARGUMENT; } - - /* - * Insert the copy into the new map - */ - - vm_map_copy_insert(new_map, last, copy); - - /* - * Pick up the traversal at the end of - * the copied region. - */ - - vm_map_lock(old_map); - start += entry_size; - if (! 
vm_map_lookup_entry(old_map, start, &last)) { - last = last->vme_next; + + do_region_footprint = task_self_region_footprint(); + original_count = *count; + + if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) { + *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + look_for_pages = FALSE; + short_info = (vm_region_submap_short_info_64_t) submap_info; + submap_info = NULL; } else { - if (last->vme_start == start) { - /* - * No need to clip here and we don't - * want to cause any unnecessary - * unnesting... - */ - } else { - vm_map_clip_start(old_map, last, start); + look_for_pages = TRUE; + *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64; + short_info = NULL; + + if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { + *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64; + } + if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) { + *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64; } } - *old_entry_p = last; - return TRUE; -} + user_address = *address; + user_max_depth = *nesting_depth; + submap_needed_copy = FALSE; -/* - * vm_map_fork: - * - * Create and return a new map based on the old - * map, according to the inheritance values on the - * regions in that map. - * - * The source map must not be locked. - */ -vm_map_t -vm_map_fork( - ledger_t ledger, - vm_map_t old_map) -{ - pmap_t new_pmap; - vm_map_t new_map; - vm_map_entry_t old_entry; - vm_map_size_t new_size = 0, entry_size; - vm_map_entry_t new_entry; - boolean_t src_needs_copy; - boolean_t new_entry_needs_copy; - - new_pmap = pmap_create(ledger, (vm_map_size_t) 0, -#if defined(__i386__) || defined(__x86_64__) - old_map->pmap->pm_task_map != TASK_MAP_32BIT -#else -#error Unknown architecture. -#endif - ); + if (not_in_kdp) { + vm_map_lock_read(map); + } - vm_map_reference_swap(old_map); - vm_map_lock(old_map); +recurse_again: + curr_entry = NULL; + curr_map = map; + curr_address = user_address; + curr_offset = 0; + curr_skip = 0; + curr_depth = 0; + curr_max_above = ((vm_map_offset_t) -1) - curr_address; + curr_max_below = curr_address; - new_map = vm_map_create(new_pmap, - old_map->min_offset, - old_map->max_offset, - old_map->hdr.entries_pageable); - /* inherit the parent map's page size */ - vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map)); - for ( - old_entry = vm_map_first_entry(old_map); - old_entry != vm_map_to_entry(old_map); - ) { + next_entry = NULL; + next_map = NULL; + next_address = 0; + next_offset = 0; + next_skip = 0; + next_depth = 0; + next_max_above = (vm_map_offset_t) -1; + next_max_below = (vm_map_offset_t) -1; - entry_size = old_entry->vme_end - old_entry->vme_start; + for (;;) { + if (vm_map_lookup_entry(curr_map, + curr_address, + &tmp_entry)) { + /* tmp_entry contains the address we're looking for */ + curr_entry = tmp_entry; + } else { + vm_map_offset_t skip; + /* + * The address is not mapped. "tmp_entry" is the + * map entry preceding the address. We want the next + * one, if it exists. 
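vm_map_fork() above walks the parent's entries and applies their inheritance attribute: vm_map_fork_share() handles VM_INHERIT_SHARE and the copy_quickly/slow paths handle VM_INHERIT_COPY. From userspace that attribute is set with mach_vm_inherit(): a region marked VM_INHERIT_SHARE stays shared with the child across fork(), while the default copy inheritance gives the child a copy-on-write duplicate. A small demonstration:

/* fork_inherit.c - VM_INHERIT_SHARE vs. the default copy inheritance across fork() */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
	mach_vm_size_t size = (mach_vm_size_t)getpagesize();
	mach_vm_address_t shared = 0, copied = 0;

	mach_vm_allocate(mach_task_self(), &shared, size, VM_FLAGS_ANYWHERE);
	mach_vm_allocate(mach_task_self(), &copied, size, VM_FLAGS_ANYWHERE);
	mach_vm_inherit(mach_task_self(), shared, size, VM_INHERIT_SHARE);
	/* "copied" keeps the default inheritance (copy-on-write at fork) */

	volatile int *s = (volatile int *)(uintptr_t)shared;
	volatile int *c = (volatile int *)(uintptr_t)copied;
	*s = 0;
	*c = 0;

	pid_t pid = fork();
	if (pid == 0) {
		*s = 1;         /* same VM object: visible to the parent */
		*c = 1;         /* private: the child wrote its own COW copy */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("shared entry saw %d, copied entry saw %d\n", *s, *c);   /* 1 and 0 */
	return 0;
}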
+ */ + curr_entry = tmp_entry->vme_next; - switch (old_entry->inheritance) { - case VM_INHERIT_NONE: - break; + if (curr_entry == vm_map_to_entry(curr_map) || + (curr_entry->vme_start >= + curr_address + curr_max_above)) { + /* no next entry at this level: stop looking */ + if (not_in_kdp) { + vm_map_unlock_read(curr_map); + } + curr_entry = NULL; + curr_map = NULL; + curr_skip = 0; + curr_offset = 0; + curr_depth = 0; + curr_max_above = 0; + curr_max_below = 0; + break; + } - case VM_INHERIT_SHARE: - vm_map_fork_share(old_map, old_entry, new_map); - new_size += entry_size; - break; + /* adjust current address and offset */ + skip = curr_entry->vme_start - curr_address; + curr_address = curr_entry->vme_start; + curr_skip += skip; + curr_offset += skip; + curr_max_above -= skip; + curr_max_below = 0; + } - case VM_INHERIT_COPY: + /* + * Is the next entry at this level closer to the address (or + * deeper in the submap chain) than the one we had + * so far ? + */ + tmp_entry = curr_entry->vme_next; + if (tmp_entry == vm_map_to_entry(curr_map)) { + /* no next entry at this level */ + } else if (tmp_entry->vme_start >= + curr_address + curr_max_above) { + /* + * tmp_entry is beyond the scope of what we mapped of + * this submap in the upper level: ignore it. + */ + } else if ((next_entry == NULL) || + (tmp_entry->vme_start + curr_offset <= + next_entry->vme_start + next_offset)) { + /* + * We didn't have a "next_entry" or this one is + * closer to the address we're looking for: + * use this "tmp_entry" as the new "next_entry". + */ + if (next_entry != NULL) { + /* unlock the last "next_map" */ + if (next_map != curr_map && not_in_kdp) { + vm_map_unlock_read(next_map); + } + } + next_entry = tmp_entry; + next_map = curr_map; + next_depth = curr_depth; + next_address = next_entry->vme_start; + next_skip = curr_skip; + next_skip += (next_address - curr_address); + next_offset = curr_offset; + next_offset += (next_address - curr_address); + next_max_above = MIN(next_max_above, curr_max_above); + next_max_above = MIN(next_max_above, + next_entry->vme_end - next_address); + next_max_below = MIN(next_max_below, curr_max_below); + next_max_below = MIN(next_max_below, + next_address - next_entry->vme_start); + } + + /* + * "curr_max_{above,below}" allow us to keep track of the + * portion of the submap that is actually mapped at this level: + * the rest of that submap is irrelevant to us, since it's not + * mapped here. + * The relevant portion of the map starts at + * "VME_OFFSET(curr_entry)" up to the size of "curr_entry". + */ + curr_max_above = MIN(curr_max_above, + curr_entry->vme_end - curr_address); + curr_max_below = MIN(curr_max_below, + curr_address - curr_entry->vme_start); + if (!curr_entry->is_sub_map || + curr_depth >= user_max_depth) { /* - * Inline the copy_quickly case; - * upon failure, fall back on call - * to vm_map_fork_copy. + * We hit a leaf map or we reached the maximum depth + * we could, so stop looking. Keep the current map + * locked. */ + break; + } - if(old_entry->is_sub_map) - break; - if ((old_entry->wired_count != 0) || - ((old_entry->object.vm_object != NULL) && - (old_entry->object.vm_object->true_share))) { - goto slow_vm_map_fork_copy; - } + /* + * Get down to the next submap level. 
+ */ - new_entry = vm_map_entry_create(new_map, FALSE); /* never the kernel map or descendants */ - vm_map_entry_copy(new_entry, old_entry); - if (new_entry->is_sub_map) { - /* clear address space specifics */ - new_entry->use_pmap = FALSE; - } + if (curr_entry->needs_copy) { + /* everything below this is effectively copy-on-write */ + submap_needed_copy = TRUE; + } - if (! vm_object_copy_quickly( - &new_entry->object.vm_object, - old_entry->offset, - (old_entry->vme_end - - old_entry->vme_start), - &src_needs_copy, - &new_entry_needs_copy)) { - vm_map_entry_dispose(new_map, new_entry); - goto slow_vm_map_fork_copy; + /* + * Lock the next level and unlock the current level, + * unless we need to keep it locked to access the "next_entry" + * later. + */ + if (not_in_kdp) { + vm_map_lock_read(VME_SUBMAP(curr_entry)); + } + if (curr_map == next_map) { + /* keep "next_map" locked in case we need it */ + } else { + /* release this map */ + if (not_in_kdp) { + vm_map_unlock_read(curr_map); } + } - /* - * Handle copy-on-write obligations - */ - - if (src_needs_copy && !old_entry->needs_copy) { - vm_prot_t prot; + /* + * Adjust the offset. "curr_entry" maps the submap + * at relative address "curr_entry->vme_start" in the + * curr_map but skips the first "VME_OFFSET(curr_entry)" + * bytes of the submap. + * "curr_offset" always represents the offset of a virtual + * address in the curr_map relative to the absolute address + * space (i.e. the top-level VM map). + */ + curr_offset += + (VME_OFFSET(curr_entry) - curr_entry->vme_start); + curr_address = user_address + curr_offset; + /* switch to the submap */ + curr_map = VME_SUBMAP(curr_entry); + curr_depth++; + curr_entry = NULL; + } - prot = old_entry->protection & ~VM_PROT_WRITE; +// LP64todo: all the current tools are 32bit, obviously never worked for 64b +// so probably should be a real 32b ID vs. ptr. +// Current users just check for equality - if (override_nx(old_map, old_entry->alias) && prot) - prot |= VM_PROT_EXECUTE; + if (curr_entry == NULL) { + /* no VM region contains the address... */ - vm_object_pmap_protect( - old_entry->object.vm_object, - old_entry->offset, - (old_entry->vme_end - - old_entry->vme_start), - ((old_entry->is_shared - || old_map->mapped_in_other_pmaps) - ? PMAP_NULL : - old_map->pmap), - old_entry->vme_start, - prot); + if (do_region_footprint && /* we want footprint numbers */ + next_entry == NULL && /* & there are no more regions */ + /* & we haven't already provided our fake region: */ + user_address <= vm_map_last_entry(map)->vme_end) { + ledger_amount_t ledger_resident, ledger_compressed; - old_entry->needs_copy = TRUE; - } - new_entry->needs_copy = new_entry_needs_copy; - /* - * Insert the entry at the end - * of the map. + * Add a fake memory region to account for + * purgeable and/or ledger-tagged memory that + * counts towards this task's memory footprint, + * i.e. the resident/compressed pages of non-volatile + * objects owned by that task. 
*/ - - vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), - new_entry); - new_size += entry_size; - break; + task_ledgers_footprint(map->pmap->ledger, + &ledger_resident, + &ledger_compressed); + if (ledger_resident + ledger_compressed == 0) { + /* no purgeable memory usage to report */ + return KERN_INVALID_ADDRESS; + } + /* fake region to show nonvolatile footprint */ + if (look_for_pages) { + submap_info->protection = VM_PROT_DEFAULT; + submap_info->max_protection = VM_PROT_DEFAULT; + submap_info->inheritance = VM_INHERIT_DEFAULT; + submap_info->offset = 0; + submap_info->user_tag = -1; + submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size); + submap_info->pages_shared_now_private = 0; + submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size); + submap_info->pages_dirtied = submap_info->pages_resident; + submap_info->ref_count = 1; + submap_info->shadow_depth = 0; + submap_info->external_pager = 0; + submap_info->share_mode = SM_PRIVATE; + if (submap_needed_copy) { + submap_info->share_mode = SM_COW; + } + submap_info->is_submap = 0; + submap_info->behavior = VM_BEHAVIOR_DEFAULT; + submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); + submap_info->user_wired_count = 0; + submap_info->pages_reusable = 0; + } else { + short_info->user_tag = -1; + short_info->offset = 0; + short_info->protection = VM_PROT_DEFAULT; + short_info->inheritance = VM_INHERIT_DEFAULT; + short_info->max_protection = VM_PROT_DEFAULT; + short_info->behavior = VM_BEHAVIOR_DEFAULT; + short_info->user_wired_count = 0; + short_info->is_submap = 0; + short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); + short_info->external_pager = 0; + short_info->shadow_depth = 0; + short_info->share_mode = SM_PRIVATE; + if (submap_needed_copy) { + short_info->share_mode = SM_COW; + } + short_info->ref_count = 1; + } + *nesting_depth = 0; + *size = (vm_map_size_t) (ledger_resident + ledger_compressed); +// *address = user_address; + *address = vm_map_last_entry(map)->vme_end; + return KERN_SUCCESS; + } - slow_vm_map_fork_copy: - if (vm_map_fork_copy(old_map, &old_entry, new_map)) { - new_size += entry_size; + if (next_entry == NULL) { + /* ... and no VM region follows it either */ + return KERN_INVALID_ADDRESS; + } + /* ... gather info about the next VM region */ + curr_entry = next_entry; + curr_map = next_map; /* still locked ... */ + curr_address = next_address; + curr_skip = next_skip; + curr_offset = next_offset; + curr_depth = next_depth; + curr_max_above = next_max_above; + curr_max_below = next_max_below; + } else { + /* we won't need "next_entry" after all */ + if (next_entry != NULL) { + /* release "next_map" */ + if (next_map != curr_map && not_in_kdp) { + vm_map_unlock_read(next_map); } - continue; } - old_entry = old_entry->vme_next; } + next_entry = NULL; + next_map = NULL; + next_offset = 0; + next_skip = 0; + next_depth = 0; + next_max_below = -1; + next_max_above = -1; - - new_map->size = new_size; - vm_map_unlock(old_map); - vm_map_deallocate(old_map); - - return(new_map); -} - -/* - * vm_map_exec: - * - * Setup the "new_map" with the proper execution environment according - * to the type of executable (platform, 64bit, chroot environment). - * Map the comm page and shared region, etc... 
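The synthetic region filled in above exists so that footprint tools still see ledger-tagged (purgeable/nonvolatile) memory even though no real map entry covers it. The aggregate those ledgers feed is also exposed directly as phys_footprint through task_info(TASK_VM_INFO); the field only exists in newer revisions of the structure, so the count check below is a defensive pattern and an assumption about the SDK's revision macros:

/* footprint.c - query our own physical footprint via TASK_VM_INFO */
#include <mach/mach.h>
#include <stdio.h>

int
main(void)
{
	task_vm_info_data_t info;
	mach_msg_type_number_t count = TASK_VM_INFO_COUNT;

	kern_return_t kr = task_info(mach_task_self(), TASK_VM_INFO,
	    (task_info_t)&info, &count);
	if (kr != KERN_SUCCESS) {
		fprintf(stderr, "task_info: %s\n", mach_error_string(kr));
		return 1;
	}
	printf("resident: %llu bytes\n", (unsigned long long)info.resident_size);
	if (count >= TASK_VM_INFO_REV1_COUNT) {
		/* phys_footprint is only reported by rev1 and later of the struct */
		printf("phys_footprint: %llu bytes\n",
		    (unsigned long long)info.phys_footprint);
	}
	return 0;
}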
- */ -kern_return_t -vm_map_exec( - vm_map_t new_map, - task_t task, - void *fsroot, - cpu_type_t cpu) -{ - SHARED_REGION_TRACE_DEBUG( - ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): ->\n", - (void *)VM_KERNEL_ADDRPERM(current_task()), - (void *)VM_KERNEL_ADDRPERM(new_map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu)); - (void) vm_commpage_enter(new_map, task); - (void) vm_shared_region_enter(new_map, task, fsroot, cpu); - SHARED_REGION_TRACE_DEBUG( - ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): <-\n", - (void *)VM_KERNEL_ADDRPERM(current_task()), - (void *)VM_KERNEL_ADDRPERM(new_map), - (void *)VM_KERNEL_ADDRPERM(task), - (void *)VM_KERNEL_ADDRPERM(fsroot), - cpu)); - return KERN_SUCCESS; -} - -/* - * vm_map_lookup_locked: - * - * Finds the VM object, offset, and - * protection for a given virtual address in the - * specified map, assuming a page fault of the - * type specified. - * - * Returns the (object, offset, protection) for - * this address, whether it is wired down, and whether - * this map has the only reference to the data in question. - * In order to later verify this lookup, a "version" - * is returned. - * - * The map MUST be locked by the caller and WILL be - * locked on exit. In order to guarantee the - * existence of the returned object, it is returned - * locked. - * - * If a lookup is requested with "write protection" - * specified, the map may be changed to perform virtual - * copying operations, although the data referenced will - * remain the same. - */ -kern_return_t -vm_map_lookup_locked( - vm_map_t *var_map, /* IN/OUT */ - vm_map_offset_t vaddr, - vm_prot_t fault_type, - int object_lock_type, - vm_map_version_t *out_version, /* OUT */ - vm_object_t *object, /* OUT */ - vm_object_offset_t *offset, /* OUT */ - vm_prot_t *out_prot, /* OUT */ - boolean_t *wired, /* OUT */ - vm_object_fault_info_t fault_info, /* OUT */ - vm_map_t *real_map) -{ - vm_map_entry_t entry; - register vm_map_t map = *var_map; - vm_map_t old_map = *var_map; - vm_map_t cow_sub_map_parent = VM_MAP_NULL; - vm_map_offset_t cow_parent_vaddr = 0; - vm_map_offset_t old_start = 0; - vm_map_offset_t old_end = 0; - register vm_prot_t prot; - boolean_t mask_protections; - boolean_t force_copy; - vm_prot_t original_fault_type; - - /* - * VM_PROT_MASK means that the caller wants us to use "fault_type" - * as a mask against the mapping's actual protections, not as an - * absolute value. - */ - mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE; - force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE; - fault_type &= VM_PROT_ALL; - original_fault_type = fault_type; - - *real_map = map; - -RetryLookup: - fault_type = original_fault_type; - - /* - * If the map has an interesting hint, try it before calling - * full blown lookup routine. - */ - entry = map->hint; - - if ((entry == vm_map_to_entry(map)) || - (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) { - vm_map_entry_t tmp_entry; - + if (curr_entry->is_sub_map && + curr_depth < user_max_depth) { /* - * Entry was either not a valid hint, or the vaddr - * was not contained in the entry, so do a full lookup. + * We're not as deep as we could be: we must have + * gone back up after not finding anything mapped + * below the original top-level map entry's. + * Let's move "curr_address" forward and recurse again. 
*/ - if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) { - if((cow_sub_map_parent) && (cow_sub_map_parent != map)) - vm_map_unlock(cow_sub_map_parent); - if((*real_map != map) - && (*real_map != cow_sub_map_parent)) - vm_map_unlock(*real_map); - return KERN_INVALID_ADDRESS; - } - - entry = tmp_entry; - } - if(map == old_map) { - old_start = entry->vme_start; - old_end = entry->vme_end; + user_address = curr_address; + goto recurse_again; } - /* - * Handle submaps. Drop lock on upper map, submap is - * returned locked. - */ - -submap_recurse: - if (entry->is_sub_map) { - vm_map_offset_t local_vaddr; - vm_map_offset_t end_delta; - vm_map_offset_t start_delta; - vm_map_entry_t submap_entry; - boolean_t mapped_needs_copy=FALSE; + *nesting_depth = curr_depth; + *size = curr_max_above + curr_max_below; + *address = user_address + curr_skip - curr_max_below; - local_vaddr = vaddr; + if (look_for_pages) { + submap_info->user_tag = VME_ALIAS(curr_entry); + submap_info->offset = VME_OFFSET(curr_entry); + submap_info->protection = curr_entry->protection; + submap_info->inheritance = curr_entry->inheritance; + submap_info->max_protection = curr_entry->max_protection; + submap_info->behavior = curr_entry->behavior; + submap_info->user_wired_count = curr_entry->user_wired_count; + submap_info->is_submap = curr_entry->is_sub_map; + submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry)); + } else { + short_info->user_tag = VME_ALIAS(curr_entry); + short_info->offset = VME_OFFSET(curr_entry); + short_info->protection = curr_entry->protection; + short_info->inheritance = curr_entry->inheritance; + short_info->max_protection = curr_entry->max_protection; + short_info->behavior = curr_entry->behavior; + short_info->user_wired_count = curr_entry->user_wired_count; + short_info->is_submap = curr_entry->is_sub_map; + short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry)); + } - if ((entry->use_pmap && !(fault_type & VM_PROT_WRITE))) { - /* if real_map equals map we unlock below */ - if ((*real_map != map) && - (*real_map != cow_sub_map_parent)) - vm_map_unlock(*real_map); - *real_map = entry->object.sub_map; - } + extended.pages_resident = 0; + extended.pages_swapped_out = 0; + extended.pages_shared_now_private = 0; + extended.pages_dirtied = 0; + extended.pages_reusable = 0; + extended.external_pager = 0; + extended.shadow_depth = 0; + extended.share_mode = SM_EMPTY; + extended.ref_count = 0; - if(entry->needs_copy && (fault_type & VM_PROT_WRITE)) { - if (!mapped_needs_copy) { - if (vm_map_lock_read_to_write(map)) { - vm_map_lock_read(map); - *real_map = map; - goto RetryLookup; - } - vm_map_lock_read(entry->object.sub_map); - *var_map = entry->object.sub_map; - cow_sub_map_parent = map; - /* reset base to map before cow object */ - /* this is the map which will accept */ - /* the new cow object */ - old_start = entry->vme_start; - old_end = entry->vme_end; - cow_parent_vaddr = vaddr; - mapped_needs_copy = TRUE; - } else { - vm_map_lock_read(entry->object.sub_map); - *var_map = entry->object.sub_map; - if((cow_sub_map_parent != map) && - (*real_map != map)) - vm_map_unlock(map); + if (not_in_kdp) { + if (!curr_entry->is_sub_map) { + vm_map_offset_t range_start, range_end; + range_start = MAX((curr_address - curr_max_below), + curr_entry->vme_start); + range_end = MIN((curr_address + curr_max_above), + curr_entry->vme_end); + vm_map_region_walk(curr_map, + range_start, + curr_entry, + (VME_OFFSET(curr_entry) + + (range_start - + curr_entry->vme_start)), + range_end - range_start, + &extended, + 
look_for_pages, VM_REGION_EXTENDED_INFO_COUNT); + if (extended.external_pager && + extended.ref_count == 2 && + extended.share_mode == SM_SHARED) { + extended.share_mode = SM_PRIVATE; + } + if (submap_needed_copy) { + extended.share_mode = SM_COW; } } else { - vm_map_lock_read(entry->object.sub_map); - *var_map = entry->object.sub_map; - /* leave map locked if it is a target */ - /* cow sub_map above otherwise, just */ - /* follow the maps down to the object */ - /* here we unlock knowing we are not */ - /* revisiting the map. */ - if((*real_map != map) && (map != cow_sub_map_parent)) - vm_map_unlock_read(map); + if (curr_entry->use_pmap) { + extended.share_mode = SM_TRUESHARED; + } else { + extended.share_mode = SM_PRIVATE; + } + extended.ref_count = os_ref_get_count(&VME_SUBMAP(curr_entry)->map_refcnt); } + } - map = *var_map; - - /* calculate the offset in the submap for vaddr */ - local_vaddr = (local_vaddr - entry->vme_start) + entry->offset; + if (look_for_pages) { + submap_info->pages_resident = extended.pages_resident; + submap_info->pages_swapped_out = extended.pages_swapped_out; + submap_info->pages_shared_now_private = + extended.pages_shared_now_private; + submap_info->pages_dirtied = extended.pages_dirtied; + submap_info->external_pager = extended.external_pager; + submap_info->shadow_depth = extended.shadow_depth; + submap_info->share_mode = extended.share_mode; + submap_info->ref_count = extended.ref_count; - RetrySubMap: - if(!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) { - if((cow_sub_map_parent) && (cow_sub_map_parent != map)){ - vm_map_unlock(cow_sub_map_parent); - } - if((*real_map != map) - && (*real_map != cow_sub_map_parent)) { - vm_map_unlock(*real_map); - } - *real_map = map; - return KERN_INVALID_ADDRESS; + if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { + submap_info->pages_reusable = extended.pages_reusable; } + if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) { + submap_info->object_id_full = (vm_object_id_t) (VME_OBJECT(curr_entry) != NULL) ? VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry)) : 0ULL; + } + } else { + short_info->external_pager = extended.external_pager; + short_info->shadow_depth = extended.shadow_depth; + short_info->share_mode = extended.share_mode; + short_info->ref_count = extended.ref_count; + } - /* find the attenuated shadow of the underlying object */ - /* on our target map */ + if (not_in_kdp) { + vm_map_unlock_read(curr_map); + } - /* in english the submap object may extend beyond the */ - /* region mapped by the entry or, may only fill a portion */ - /* of it. For our purposes, we only care if the object */ - /* doesn't fill. In this case the area which will */ - /* ultimately be clipped in the top map will only need */ - /* to be as big as the portion of the underlying entry */ - /* which is mapped */ - start_delta = submap_entry->vme_start > entry->offset ? - submap_entry->vme_start - entry->offset : 0; + return KERN_SUCCESS; +} - end_delta = - (entry->offset + start_delta + (old_end - old_start)) <= - submap_entry->vme_end ? - 0 : (entry->offset + - (old_end - old_start)) - - submap_entry->vme_end; +/* + * vm_region: + * + * User call to obtain information about a region in + * a task's address map. Currently, only one flavor is + * supported. + * + * XXX The reserved and behavior fields cannot be filled + * in until the vm merge from the IK is completed, and + * vm_reserve is implemented. 
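vm_map_region_recurse_64() above is the kernel half of mach_vm_region_recurse(), the call that vmmap-style tools use to walk a task's address space while descending into nested submaps (the shared region, for example) up to the requested depth. A minimal walk of our own map:

/* region_walk.c - enumerate our own VM regions via mach_vm_region_recurse() */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	natural_t depth = 0;

	for (;;) {
		mach_vm_size_t size = 0;
		vm_region_submap_info_data_64_t info;
		mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;

		kern_return_t kr = mach_vm_region_recurse(mach_task_self(),
		    &addr, &size, &depth, (vm_region_recurse_info_t)&info, &count);
		if (kr != KERN_SUCCESS) {
			break;          /* KERN_INVALID_ADDRESS: past the last region */
		}
		if (info.is_submap) {
			depth++;        /* re-query the same address one level deeper */
			continue;
		}
		printf("%#018llx-%#018llx depth=%u prot=%d/%d resident=%u\n",
		    (unsigned long long)addr, (unsigned long long)(addr + size),
		    depth, info.protection, info.max_protection,
		    info.pages_resident);
		addr += size;
	}
	return 0;
}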
+ */ - old_start += start_delta; - old_end -= end_delta; +kern_return_t +vm_map_region( + vm_map_t map, + vm_map_offset_t *address, /* IN/OUT */ + vm_map_size_t *size, /* OUT */ + vm_region_flavor_t flavor, /* IN */ + vm_region_info_t info, /* OUT */ + mach_msg_type_number_t *count, /* IN/OUT */ + mach_port_t *object_name) /* OUT */ +{ + vm_map_entry_t tmp_entry; + vm_map_entry_t entry; + vm_map_offset_t start; - if(submap_entry->is_sub_map) { - entry = submap_entry; - vaddr = local_vaddr; - goto submap_recurse; - } + if (map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } - if(((fault_type & VM_PROT_WRITE) && cow_sub_map_parent)) { + switch (flavor) { + case VM_REGION_BASIC_INFO: + /* legacy for old 32-bit objects info */ + { + vm_region_basic_info_t basic; - vm_object_t sub_object, copy_object; - vm_object_offset_t copy_offset; - vm_map_offset_t local_start; - vm_map_offset_t local_end; - boolean_t copied_slowly = FALSE; + if (*count < VM_REGION_BASIC_INFO_COUNT) { + return KERN_INVALID_ARGUMENT; + } - if (vm_map_lock_read_to_write(map)) { - vm_map_lock_read(map); - old_start -= start_delta; - old_end += end_delta; - goto RetrySubMap; - } + basic = (vm_region_basic_info_t) info; + *count = VM_REGION_BASIC_INFO_COUNT; + vm_map_lock_read(map); - sub_object = submap_entry->object.vm_object; - if (sub_object == VM_OBJECT_NULL) { - sub_object = - vm_object_allocate( - (vm_map_size_t) - (submap_entry->vme_end - - submap_entry->vme_start)); - submap_entry->object.vm_object = sub_object; - submap_entry->offset = 0; - } - local_start = local_vaddr - - (cow_parent_vaddr - old_start); - local_end = local_vaddr + - (old_end - cow_parent_vaddr); - vm_map_clip_start(map, submap_entry, local_start); - vm_map_clip_end(map, submap_entry, local_end); - if (submap_entry->is_sub_map) { - /* unnesting was done when clipping */ - assert(!submap_entry->use_pmap); + start = *address; + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; } + } else { + entry = tmp_entry; + } - /* This is the COW case, lets connect */ - /* an entry in our space to the underlying */ - /* object in the submap, bypassing the */ - /* submap. 
*/ + start = entry->vme_start; + basic->offset = (uint32_t)VME_OFFSET(entry); + basic->protection = entry->protection; + basic->inheritance = entry->inheritance; + basic->max_protection = entry->max_protection; + basic->behavior = entry->behavior; + basic->user_wired_count = entry->user_wired_count; + basic->reserved = entry->is_sub_map; + *address = start; + *size = (entry->vme_end - start); - if(submap_entry->wired_count != 0 || - (sub_object->copy_strategy == - MEMORY_OBJECT_COPY_NONE)) { - vm_object_lock(sub_object); - vm_object_copy_slowly(sub_object, - submap_entry->offset, - (submap_entry->vme_end - - submap_entry->vme_start), - FALSE, - ©_object); - copied_slowly = TRUE; - } else { - - /* set up shadow object */ - copy_object = sub_object; - vm_object_reference(copy_object); - sub_object->shadowed = TRUE; - submap_entry->needs_copy = TRUE; + if (object_name) { + *object_name = IP_NULL; + } + if (entry->is_sub_map) { + basic->shared = FALSE; + } else { + basic->shared = entry->is_shared; + } - prot = submap_entry->protection & ~VM_PROT_WRITE; + vm_map_unlock_read(map); + return KERN_SUCCESS; + } - if (override_nx(old_map, submap_entry->alias) && prot) - prot |= VM_PROT_EXECUTE; + case VM_REGION_BASIC_INFO_64: + { + vm_region_basic_info_64_t basic; - vm_object_pmap_protect( - sub_object, - submap_entry->offset, - submap_entry->vme_end - - submap_entry->vme_start, - (submap_entry->is_shared - || map->mapped_in_other_pmaps) ? - PMAP_NULL : map->pmap, - submap_entry->vme_start, - prot); - } - - /* - * Adjust the fault offset to the submap entry. - */ - copy_offset = (local_vaddr - - submap_entry->vme_start + - submap_entry->offset); + if (*count < VM_REGION_BASIC_INFO_COUNT_64) { + return KERN_INVALID_ARGUMENT; + } - /* This works diffently than the */ - /* normal submap case. We go back */ - /* to the parent of the cow map and*/ - /* clip out the target portion of */ - /* the sub_map, substituting the */ - /* new copy object, */ + basic = (vm_region_basic_info_64_t) info; + *count = VM_REGION_BASIC_INFO_COUNT_64; - vm_map_unlock(map); - local_start = old_start; - local_end = old_end; - map = cow_sub_map_parent; - *var_map = cow_sub_map_parent; - vaddr = cow_parent_vaddr; - cow_sub_map_parent = NULL; + vm_map_lock_read(map); - if(!vm_map_lookup_entry(map, - vaddr, &entry)) { - vm_object_deallocate( - copy_object); - vm_map_lock_write_to_read(map); + start = *address; + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + vm_map_unlock_read(map); return KERN_INVALID_ADDRESS; } - - /* clip out the portion of space */ - /* mapped by the sub map which */ - /* corresponds to the underlying */ - /* object */ - - /* - * Clip (and unnest) the smallest nested chunk - * possible around the faulting address... - */ - local_start = vaddr & ~(pmap_nesting_size_min - 1); - local_end = local_start + pmap_nesting_size_min; - /* - * ... but don't go beyond the "old_start" to "old_end" - * range, to avoid spanning over another VM region - * with a possibly different VM object and/or offset. - */ - if (local_start < old_start) { - local_start = old_start; - } - if (local_end > old_end) { - local_end = old_end; - } - /* - * Adjust copy_offset to the start of the range. 
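The flavors dispatched by vm_map_region() are reached from userspace through mach_vm_region(); VM_REGION_BASIC_INFO_64, being filled in here, reports the protections and inheritance of whichever entry contains (or follows) the address passed in, with the address and size rounded to the entry's bounds. For example, probing the region that holds a stack variable:

/* region_basic.c - query one region with VM_REGION_BASIC_INFO_64 */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int probe = 0;                               /* something on the stack */
	mach_vm_address_t addr = (mach_vm_address_t)(uintptr_t)&probe;
	mach_vm_size_t size = 0;
	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_port_t object_name = MACH_PORT_NULL;    /* always IP_NULL, per the code above */

	kern_return_t kr = mach_vm_region(mach_task_self(), &addr, &size,
	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info, &count, &object_name);
	if (kr != KERN_SUCCESS) {
		fprintf(stderr, "mach_vm_region: %s\n", mach_error_string(kr));
		return 1;
	}
	printf("region %#llx..%#llx prot=%d max=%d inherit=%d shared=%d\n",
	    (unsigned long long)addr, (unsigned long long)(addr + size),
	    info.protection, info.max_protection, info.inheritance, info.shared);
	return 0;
}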
- */ - copy_offset -= (vaddr - local_start); - - vm_map_clip_start(map, entry, local_start); - vm_map_clip_end(map, entry, local_end); - if (entry->is_sub_map) { - /* unnesting was done when clipping */ - assert(!entry->use_pmap); - } - - /* substitute copy object for */ - /* shared map entry */ - vm_map_deallocate(entry->object.sub_map); - assert(!entry->iokit_acct); - entry->is_sub_map = FALSE; - entry->use_pmap = TRUE; - entry->object.vm_object = copy_object; + } else { + entry = tmp_entry; + } - /* propagate the submap entry's protections */ - entry->protection |= submap_entry->protection; - entry->max_protection |= submap_entry->max_protection; + start = entry->vme_start; - if(copied_slowly) { - entry->offset = local_start - old_start; - entry->needs_copy = FALSE; - entry->is_shared = FALSE; - } else { - entry->offset = copy_offset; - entry->needs_copy = TRUE; - if(entry->inheritance == VM_INHERIT_SHARE) - entry->inheritance = VM_INHERIT_COPY; - if (map != old_map) - entry->is_shared = TRUE; - } - if(entry->inheritance == VM_INHERIT_SHARE) - entry->inheritance = VM_INHERIT_COPY; + basic->offset = VME_OFFSET(entry); + basic->protection = entry->protection; + basic->inheritance = entry->inheritance; + basic->max_protection = entry->max_protection; + basic->behavior = entry->behavior; + basic->user_wired_count = entry->user_wired_count; + basic->reserved = entry->is_sub_map; + *address = start; + *size = (entry->vme_end - start); - vm_map_lock_write_to_read(map); - } else { - if((cow_sub_map_parent) - && (cow_sub_map_parent != *real_map) - && (cow_sub_map_parent != map)) { - vm_map_unlock(cow_sub_map_parent); - } - entry = submap_entry; - vaddr = local_vaddr; + if (object_name) { + *object_name = IP_NULL; + } + if (entry->is_sub_map) { + basic->shared = FALSE; + } else { + basic->shared = entry->is_shared; } - } - - /* - * Check whether this task is allowed to have - * this page. - */ - - prot = entry->protection; - if (override_nx(old_map, entry->alias) && prot) { - /* - * HACK -- if not a stack, then allow execution - */ - prot |= VM_PROT_EXECUTE; + vm_map_unlock_read(map); + return KERN_SUCCESS; } - - if (mask_protections) { - fault_type &= prot; - if (fault_type == VM_PROT_NONE) { - goto protection_failure; + case VM_REGION_EXTENDED_INFO: + if (*count < VM_REGION_EXTENDED_INFO_COUNT) { + return KERN_INVALID_ARGUMENT; } - } - if ((fault_type & (prot)) != fault_type) { - protection_failure: - if (*real_map != map) { - vm_map_unlock(*real_map); + OS_FALLTHROUGH; + case VM_REGION_EXTENDED_INFO__legacy: + if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) { + return KERN_INVALID_ARGUMENT; } - *real_map = map; - if ((fault_type & VM_PROT_EXECUTE) && prot) - log_stack_execution_failure((addr64_t)vaddr, prot); + { + vm_region_extended_info_t extended; + mach_msg_type_number_t original_count; + int effective_page_size, effective_page_shift; - DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL); - return KERN_PROTECTION_FAILURE; - } + extended = (vm_region_extended_info_t) info; - /* - * If this page is not pageable, we have to get - * it for all possible accesses. - */ + effective_page_shift = vm_self_region_page_shift(map); + effective_page_size = (1 << effective_page_shift); - *wired = (entry->wired_count != 0); - if (*wired) - fault_type = prot; + vm_map_lock_read(map); - /* - * If the entry was copy-on-write, we either ... 
- */ + start = *address; + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; + } + } else { + entry = tmp_entry; + } + start = entry->vme_start; - if (entry->needs_copy) { - /* - * If we want to write the page, we may as well - * handle that now since we've got the map locked. - * - * If we don't need to write the page, we just - * demote the permissions allowed. - */ + extended->protection = entry->protection; + extended->user_tag = VME_ALIAS(entry); + extended->pages_resident = 0; + extended->pages_swapped_out = 0; + extended->pages_shared_now_private = 0; + extended->pages_dirtied = 0; + extended->external_pager = 0; + extended->shadow_depth = 0; + + original_count = *count; + if (flavor == VM_REGION_EXTENDED_INFO__legacy) { + *count = VM_REGION_EXTENDED_INFO_COUNT__legacy; + } else { + extended->pages_reusable = 0; + *count = VM_REGION_EXTENDED_INFO_COUNT; + } - if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) { - /* - * Make a new object, and place it in the - * object chain. Note that no new references - * have appeared -- one just moved from the - * map to the new object. - */ + vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count); - if (vm_map_lock_read_to_write(map)) { - vm_map_lock_read(map); - goto RetryLookup; + if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) { + extended->share_mode = SM_PRIVATE; } - vm_object_shadow(&entry->object.vm_object, - &entry->offset, - (vm_map_size_t) (entry->vme_end - - entry->vme_start)); - entry->object.vm_object->shadowed = TRUE; - entry->needs_copy = FALSE; - vm_map_lock_write_to_read(map); + if (object_name) { + *object_name = IP_NULL; + } + *address = start; + *size = (entry->vme_end - start); + + vm_map_unlock_read(map); + return KERN_SUCCESS; } - else { - /* - * We're attempting to read a copy-on-write - * page -- don't allow writes. - */ + case VM_REGION_TOP_INFO: + { + vm_region_top_info_t top; - prot &= (~VM_PROT_WRITE); + if (*count < VM_REGION_TOP_INFO_COUNT) { + return KERN_INVALID_ARGUMENT; } - } - /* - * Create an object if necessary. - */ - if (entry->object.vm_object == VM_OBJECT_NULL) { + top = (vm_region_top_info_t) info; + *count = VM_REGION_TOP_INFO_COUNT; - if (vm_map_lock_read_to_write(map)) { - vm_map_lock_read(map); - goto RetryLookup; - } + vm_map_lock_read(map); - entry->object.vm_object = vm_object_allocate( - (vm_map_size_t)(entry->vme_end - entry->vme_start)); - entry->offset = 0; - vm_map_lock_write_to_read(map); - } + start = *address; + if (!vm_map_lookup_entry(map, start, &tmp_entry)) { + if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; + } + } else { + entry = tmp_entry; + } + start = entry->vme_start; - /* - * Return the object/offset from this entry. If the entry - * was copy-on-write or empty, it has been fixed up. Also - * return the protection. - */ + top->private_pages_resident = 0; + top->shared_pages_resident = 0; - *offset = (vaddr - entry->vme_start) + entry->offset; - *object = entry->object.vm_object; - *out_prot = prot; + vm_map_region_top_walk(entry, top); - if (fault_info) { - fault_info->interruptible = THREAD_UNINT; /* for now... */ - /* ... 
the caller will change "interruptible" if needed */ - fault_info->cluster_size = 0; - fault_info->user_tag = entry->alias; - fault_info->pmap_options = 0; - if (entry->iokit_acct || - (!entry->is_sub_map && !entry->use_pmap)) { - fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT; + if (object_name) { + *object_name = IP_NULL; } - fault_info->behavior = entry->behavior; - fault_info->lo_offset = entry->offset; - fault_info->hi_offset = (entry->vme_end - entry->vme_start) + entry->offset; - fault_info->no_cache = entry->no_cache; - fault_info->stealth = FALSE; - fault_info->io_sync = FALSE; - fault_info->cs_bypass = (entry->used_for_jit)? TRUE : FALSE; - fault_info->mark_zf_absent = FALSE; - fault_info->batch_pmap_op = FALSE; + *address = start; + *size = (entry->vme_end - start); + + vm_map_unlock_read(map); + return KERN_SUCCESS; + } + default: + return KERN_INVALID_ARGUMENT; } +} - /* - * Lock the object to prevent it from disappearing - */ - if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) - vm_object_lock(*object); - else - vm_object_lock_shared(*object); - - /* - * Save the version number - */ +#define OBJ_RESIDENT_COUNT(obj, entry_size) \ + MIN((entry_size), \ + ((obj)->all_reusable ? \ + (obj)->wired_page_count : \ + (obj)->resident_page_count - (obj)->reusable_page_count)) - out_version->main_timestamp = map->timestamp; +void +vm_map_region_top_walk( + vm_map_entry_t entry, + vm_region_top_info_t top) +{ + if (VME_OBJECT(entry) == 0 || entry->is_sub_map) { + top->share_mode = SM_EMPTY; + top->ref_count = 0; + top->obj_id = 0; + return; + } - return KERN_SUCCESS; -} + { + struct vm_object *obj, *tmp_obj; + int ref_count; + uint32_t entry_size; + entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64); -/* - * vm_map_verify: - * - * Verifies that the map in question has not changed - * since the given version. If successful, the map - * will not change until vm_map_verify_done() is called. - */ -boolean_t -vm_map_verify( - register vm_map_t map, - register vm_map_version_t *version) /* REF */ -{ - boolean_t result; + obj = VME_OBJECT(entry); - vm_map_lock_read(map); - result = (map->timestamp == version->main_timestamp); + vm_object_lock(obj); - if (!result) - vm_map_unlock_read(map); + if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) { + ref_count--; + } - return(result); -} + assert(obj->reusable_page_count <= obj->resident_page_count); + if (obj->shadow) { + if (ref_count == 1) { + top->private_pages_resident = + OBJ_RESIDENT_COUNT(obj, entry_size); + } else { + top->shared_pages_resident = + OBJ_RESIDENT_COUNT(obj, entry_size); + } + top->ref_count = ref_count; + top->share_mode = SM_COW; -/* - * vm_map_verify_done: - * - * Releases locks acquired by a vm_map_verify. - * - * This is now a macro in vm/vm_map.h. It does a - * vm_map_unlock_read on the map. 
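
/*
 * For reference: the flavor switch in vm_map_region() above is what backs the
 * user-space mach_vm_region() call.  A minimal sketch of a caller asking for
 * VM_REGION_BASIC_INFO_64 (ordinary Mach userland API, assumed available via
 * <mach/mach_vm.h>; nothing here is part of this file):
 */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>

static kern_return_t
print_region_basic_info(mach_vm_address_t addr)
{
	mach_vm_address_t              address = addr;
	mach_vm_size_t                 size = 0;
	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t         count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_port_t                    object_name = MACH_PORT_NULL; /* kernel always returns IP_NULL, see above */
	kern_return_t                  kr;

	kr = mach_vm_region(mach_task_self(), &address, &size,
	    VM_REGION_BASIC_INFO_64,
	    (vm_region_info_t)&info, &count, &object_name);
	if (kr == KERN_SUCCESS) {
		printf("0x%llx-0x%llx prot 0x%x/0x%x shared %d\n",
		    (unsigned long long)address,
		    (unsigned long long)(address + size),
		    info.protection, info.max_protection, info.shared);
	}
	return kr;
}
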
- */ + while ((tmp_obj = obj->shadow)) { + vm_object_lock(tmp_obj); + vm_object_unlock(obj); + obj = tmp_obj; + if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) { + ref_count--; + } -/* - * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY - * Goes away after regular vm_region_recurse function migrates to - * 64 bits - * vm_region_recurse: A form of vm_region which follows the - * submaps in a target map - * - */ + assert(obj->reusable_page_count <= obj->resident_page_count); + top->shared_pages_resident += + OBJ_RESIDENT_COUNT(obj, entry_size); + top->ref_count += ref_count - 1; + } + } else { + if (entry->superpage_size) { + top->share_mode = SM_LARGE_PAGE; + top->shared_pages_resident = 0; + top->private_pages_resident = entry_size; + } else if (entry->needs_copy) { + top->share_mode = SM_COW; + top->shared_pages_resident = + OBJ_RESIDENT_COUNT(obj, entry_size); + } else { + if (ref_count == 1 || + (ref_count == 2 && obj->named)) { + top->share_mode = SM_PRIVATE; + top->private_pages_resident = + OBJ_RESIDENT_COUNT(obj, + entry_size); + } else { + top->share_mode = SM_SHARED; + top->shared_pages_resident = + OBJ_RESIDENT_COUNT(obj, + entry_size); + } + } + top->ref_count = ref_count; + } + /* XXX K64: obj_id will be truncated */ + top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj); -kern_return_t -vm_map_region_recurse_64( - vm_map_t map, - vm_map_offset_t *address, /* IN/OUT */ - vm_map_size_t *size, /* OUT */ - natural_t *nesting_depth, /* IN/OUT */ - vm_region_submap_info_64_t submap_info, /* IN/OUT */ - mach_msg_type_number_t *count) /* IN/OUT */ -{ - mach_msg_type_number_t original_count; - vm_region_extended_info_data_t extended; - vm_map_entry_t tmp_entry; - vm_map_offset_t user_address; - unsigned int user_max_depth; + vm_object_unlock(obj); + } +} - /* - * "curr_entry" is the VM map entry preceding or including the - * address we're looking for. - * "curr_map" is the map or sub-map containing "curr_entry". - * "curr_address" is the equivalent of the top map's "user_address" - * in the current map. - * "curr_offset" is the cumulated offset of "curr_map" in the - * target task's address space. - * "curr_depth" is the depth of "curr_map" in the chain of - * sub-maps. - * - * "curr_max_below" and "curr_max_above" limit the range (around - * "curr_address") we should take into account in the current (sub)map. - * They limit the range to what's visible through the map entries - * we've traversed from the top map to the current map. 
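
/*
 * For reference: the submap traversal described in the comment above
 * (vm_map_region_recurse_64) is reachable from user space through
 * mach_vm_region_recurse().  A sketch of the common user-space idiom,
 * assuming the standard Mach userland headers: bump *nesting_depth and
 * retry whenever a submap entry is reported.
 */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>

static void
dump_region_at(mach_vm_address_t addr)
{
	mach_vm_address_t               address = addr;
	mach_vm_size_t                  size = 0;
	natural_t                       depth = 0;      /* start in the top map */
	vm_region_submap_info_data_64_t info;
	mach_msg_type_number_t          count;
	kern_return_t                   kr;

	for (;;) {
		count = VM_REGION_SUBMAP_INFO_COUNT_64;
		kr = mach_vm_region_recurse(mach_task_self(), &address, &size,
		    &depth, (vm_region_recurse_info_t)&info, &count);
		if (kr != KERN_SUCCESS) {
			return;
		}
		if (info.is_submap) {
			depth++;        /* descend into the nested map and retry */
			continue;
		}
		break;
	}
	printf("0x%llx-0x%llx depth %u tag %u resident %u\n",
	    (unsigned long long)address,
	    (unsigned long long)(address + size),
	    depth, info.user_tag, info.pages_resident);
}
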
+void +vm_map_region_walk( + vm_map_t map, + vm_map_offset_t va, + vm_map_entry_t entry, + vm_object_offset_t offset, + vm_object_size_t range, + vm_region_extended_info_t extended, + boolean_t look_for_pages, + mach_msg_type_number_t count) +{ + struct vm_object *obj, *tmp_obj; + vm_map_offset_t last_offset; + int i; + int ref_count; + struct vm_object *shadow_object; + unsigned short shadow_depth; + boolean_t do_region_footprint; + int effective_page_size, effective_page_shift; + vm_map_offset_t effective_page_mask; + + do_region_footprint = task_self_region_footprint(); + + if ((VME_OBJECT(entry) == 0) || + (entry->is_sub_map) || + (VME_OBJECT(entry)->phys_contiguous && + !entry->superpage_size)) { + extended->share_mode = SM_EMPTY; + extended->ref_count = 0; + return; + } - */ - vm_map_entry_t curr_entry; - vm_map_address_t curr_address; - vm_map_offset_t curr_offset; - vm_map_t curr_map; - unsigned int curr_depth; - vm_map_offset_t curr_max_below, curr_max_above; - vm_map_offset_t curr_skip; + if (entry->superpage_size) { + extended->shadow_depth = 0; + extended->share_mode = SM_LARGE_PAGE; + extended->ref_count = 1; + extended->external_pager = 0; - /* - * "next_" is the same as "curr_" but for the VM region immediately - * after the address we're looking for. We need to keep track of this - * too because we want to return info about that region if the - * address we're looking for is not mapped. - */ - vm_map_entry_t next_entry; - vm_map_offset_t next_offset; - vm_map_offset_t next_address; - vm_map_t next_map; - unsigned int next_depth; - vm_map_offset_t next_max_below, next_max_above; - vm_map_offset_t next_skip; + /* TODO4K: Superpage in 4k mode? */ + extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT); + extended->shadow_depth = 0; + return; + } - boolean_t look_for_pages; - vm_region_submap_short_info_64_t short_info; + effective_page_shift = vm_self_region_page_shift(map); + effective_page_size = (1 << effective_page_shift); + effective_page_mask = effective_page_size - 1; - if (map == VM_MAP_NULL) { - /* no address space to work on */ - return KERN_INVALID_ARGUMENT; - } + offset = vm_map_trunc_page(offset, effective_page_mask); - - if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) { - /* - * "info" structure is not big enough and - * would overflow - */ - return KERN_INVALID_ARGUMENT; - } - - original_count = *count; - - if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) { - *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; - look_for_pages = FALSE; - short_info = (vm_region_submap_short_info_64_t) submap_info; - submap_info = NULL; - } else { - look_for_pages = TRUE; - *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64; - short_info = NULL; - - if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { - *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64; - } - } - - user_address = *address; - user_max_depth = *nesting_depth; - - curr_entry = NULL; - curr_map = map; - curr_address = user_address; - curr_offset = 0; - curr_skip = 0; - curr_depth = 0; - curr_max_above = ((vm_map_offset_t) -1) - curr_address; - curr_max_below = curr_address; + obj = VME_OBJECT(entry); - next_entry = NULL; - next_map = NULL; - next_address = 0; - next_offset = 0; - next_skip = 0; - next_depth = 0; - next_max_above = (vm_map_offset_t) -1; - next_max_below = (vm_map_offset_t) -1; + vm_object_lock(obj); - if (not_in_kdp) { - vm_map_lock_read(curr_map); + if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) { + ref_count--; } - for (;;) { - if (vm_map_lookup_entry(curr_map, - curr_address, 
- &tmp_entry)) { - /* tmp_entry contains the address we're looking for */ - curr_entry = tmp_entry; - } else { - vm_map_offset_t skip; - /* - * The address is not mapped. "tmp_entry" is the - * map entry preceding the address. We want the next - * one, if it exists. - */ - curr_entry = tmp_entry->vme_next; - - if (curr_entry == vm_map_to_entry(curr_map) || - (curr_entry->vme_start >= - curr_address + curr_max_above)) { - /* no next entry at this level: stop looking */ - if (not_in_kdp) { - vm_map_unlock_read(curr_map); + if (look_for_pages) { + for (last_offset = offset + range; + offset < last_offset; + offset += effective_page_size, va += effective_page_size) { + if (do_region_footprint) { + int disp; + + disp = 0; + if (map->has_corpse_footprint) { + /* + * Query the page info data we saved + * while forking the corpse. + */ + vm_map_corpse_footprint_query_page_info( + map, + va, + &disp); + } else { + /* + * Query the pmap. + */ + vm_map_footprint_query_page_info( + map, + entry, + va, + &disp); } - curr_entry = NULL; - curr_map = NULL; - curr_offset = 0; - curr_depth = 0; - curr_max_above = 0; - curr_max_below = 0; - break; + if (disp & VM_PAGE_QUERY_PAGE_PRESENT) { + extended->pages_resident++; + } + if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) { + extended->pages_reusable++; + } + if (disp & VM_PAGE_QUERY_PAGE_DIRTY) { + extended->pages_dirtied++; + } + if (disp & PMAP_QUERY_PAGE_COMPRESSED) { + extended->pages_swapped_out++; + } + continue; } - /* adjust current address and offset */ - skip = curr_entry->vme_start - curr_address; - curr_address = curr_entry->vme_start; - curr_skip = skip; - curr_offset += skip; - curr_max_above -= skip; - curr_max_below = 0; + vm_map_region_look_for_page(map, va, obj, + vm_object_trunc_page(offset), ref_count, + 0, extended, count); } - /* - * Is the next entry at this level closer to the address (or - * deeper in the submap chain) than the one we had - * so far ? - */ - tmp_entry = curr_entry->vme_next; - if (tmp_entry == vm_map_to_entry(curr_map)) { - /* no next entry at this level */ - } else if (tmp_entry->vme_start >= - curr_address + curr_max_above) { - /* - * tmp_entry is beyond the scope of what we mapped of - * this submap in the upper level: ignore it. - */ - } else if ((next_entry == NULL) || - (tmp_entry->vme_start + curr_offset <= - next_entry->vme_start + next_offset)) { - /* - * We didn't have a "next_entry" or this one is - * closer to the address we're looking for: - * use this "tmp_entry" as the new "next_entry". 
- */ - if (next_entry != NULL) { - /* unlock the last "next_map" */ - if (next_map != curr_map && not_in_kdp) { - vm_map_unlock_read(next_map); + if (do_region_footprint) { + goto collect_object_info; + } + } else { +collect_object_info: + shadow_object = obj->shadow; + shadow_depth = 0; + + if (!(obj->internal)) { + extended->external_pager = 1; + } + + if (shadow_object != VM_OBJECT_NULL) { + vm_object_lock(shadow_object); + for (; + shadow_object != VM_OBJECT_NULL; + shadow_depth++) { + vm_object_t next_shadow; + + if (!(shadow_object->internal)) { + extended->external_pager = 1; + } + + next_shadow = shadow_object->shadow; + if (next_shadow) { + vm_object_lock(next_shadow); } + vm_object_unlock(shadow_object); + shadow_object = next_shadow; } - next_entry = tmp_entry; - next_map = curr_map; - next_depth = curr_depth; - next_address = next_entry->vme_start; - next_skip = curr_skip; - next_offset = curr_offset; - next_offset += (next_address - curr_address); - next_max_above = MIN(next_max_above, curr_max_above); - next_max_above = MIN(next_max_above, - next_entry->vme_end - next_address); - next_max_below = MIN(next_max_below, curr_max_below); - next_max_below = MIN(next_max_below, - next_address - next_entry->vme_start); } + extended->shadow_depth = shadow_depth; + } - /* - * "curr_max_{above,below}" allow us to keep track of the - * portion of the submap that is actually mapped at this level: - * the rest of that submap is irrelevant to us, since it's not - * mapped here. - * The relevant portion of the map starts at - * "curr_entry->offset" up to the size of "curr_entry". - */ - curr_max_above = MIN(curr_max_above, - curr_entry->vme_end - curr_address); - curr_max_below = MIN(curr_max_below, - curr_address - curr_entry->vme_start); + if (extended->shadow_depth || entry->needs_copy) { + extended->share_mode = SM_COW; + } else { + if (ref_count == 1) { + extended->share_mode = SM_PRIVATE; + } else { + if (obj->true_share) { + extended->share_mode = SM_TRUESHARED; + } else { + extended->share_mode = SM_SHARED; + } + } + } + extended->ref_count = ref_count - extended->shadow_depth; - if (!curr_entry->is_sub_map || - curr_depth >= user_max_depth) { - /* - * We hit a leaf map or we reached the maximum depth - * we could, so stop looking. Keep the current map - * locked. - */ + for (i = 0; i < extended->shadow_depth; i++) { + if ((tmp_obj = obj->shadow) == 0) { break; } + vm_object_lock(tmp_obj); + vm_object_unlock(obj); - /* - * Get down to the next submap level. - */ + if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) { + ref_count--; + } - /* - * Lock the next level and unlock the current level, - * unless we need to keep it locked to access the "next_entry" - * later. - */ - if (not_in_kdp) { - vm_map_lock_read(curr_entry->object.sub_map); + extended->ref_count += ref_count; + obj = tmp_obj; + } + vm_object_unlock(obj); + + if (extended->share_mode == SM_SHARED) { + vm_map_entry_t cur; + vm_map_entry_t last; + int my_refs; + + obj = VME_OBJECT(entry); + last = vm_map_to_entry(map); + my_refs = 0; + + if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) { + ref_count--; } - if (curr_map == next_map) { - /* keep "next_map" locked in case we need it */ - } else { - /* release this map */ - if (not_in_kdp) - vm_map_unlock_read(curr_map); + for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) { + my_refs += vm_map_region_count_obj_refs(cur, obj); } - /* - * Adjust the offset. 
"curr_entry" maps the submap - * at relative address "curr_entry->vme_start" in the - * curr_map but skips the first "curr_entry->offset" - * bytes of the submap. - * "curr_offset" always represents the offset of a virtual - * address in the curr_map relative to the absolute address - * space (i.e. the top-level VM map). - */ - curr_offset += - (curr_entry->offset - curr_entry->vme_start); - curr_address = user_address + curr_offset; - /* switch to the submap */ - curr_map = curr_entry->object.sub_map; - curr_depth++; - curr_entry = NULL; + if (my_refs == ref_count) { + extended->share_mode = SM_PRIVATE_ALIASED; + } else if (my_refs > 1) { + extended->share_mode = SM_SHARED_ALIASED; + } } +} - if (curr_entry == NULL) { - /* no VM region contains the address... */ - if (next_entry == NULL) { - /* ... and no VM region follows it either */ - return KERN_INVALID_ADDRESS; + +/* object is locked on entry and locked on return */ + + +static void +vm_map_region_look_for_page( + __unused vm_map_t map, + __unused vm_map_offset_t va, + vm_object_t object, + vm_object_offset_t offset, + int max_refcnt, + unsigned short depth, + vm_region_extended_info_t extended, + mach_msg_type_number_t count) +{ + vm_page_t p; + vm_object_t shadow; + int ref_count; + vm_object_t caller_object; + + shadow = object->shadow; + caller_object = object; + + + while (TRUE) { + if (!(object->internal)) { + extended->external_pager = 1; } - /* ... gather info about the next VM region */ - curr_entry = next_entry; - curr_map = next_map; /* still locked ... */ - curr_address = next_address; - curr_skip = next_skip; - curr_offset = next_offset; - curr_depth = next_depth; - curr_max_above = next_max_above; - curr_max_below = next_max_below; - if (curr_map == map) { - user_address = curr_address; + + if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { + if (shadow && (max_refcnt == 1)) { + extended->pages_shared_now_private++; + } + + if (!p->vmp_fictitious && + (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) { + extended->pages_dirtied++; + } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) { + if (p->vmp_reusable || object->all_reusable) { + extended->pages_reusable++; + } + } + + extended->pages_resident++; + + if (object != caller_object) { + vm_object_unlock(object); + } + + return; } - } else { - /* we won't need "next_entry" after all */ - if (next_entry != NULL) { - /* release "next_map" */ - if (next_map != curr_map && not_in_kdp) { - vm_map_unlock_read(next_map); + if (object->internal && + object->alive && + !object->terminating && + object->pager_ready) { + if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) + == VM_EXTERNAL_STATE_EXISTS) { + /* the pager has that page */ + extended->pages_swapped_out++; + if (object != caller_object) { + vm_object_unlock(object); + } + return; + } + } + + if (shadow) { + vm_object_lock(shadow); + + if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) { + ref_count--; + } + + if (++depth > extended->shadow_depth) { + extended->shadow_depth = depth; + } + + if (ref_count > max_refcnt) { + max_refcnt = ref_count; + } + + if (object != caller_object) { + vm_object_unlock(object); } + + offset = offset + object->vo_shadow_offset; + object = shadow; + shadow = object->shadow; + continue; + } + if (object != caller_object) { + vm_object_unlock(object); } + break; } - next_entry = NULL; - next_map = NULL; - next_offset = 0; - next_skip = 0; - next_depth = 0; - next_max_below = -1; - next_max_above = -1; +} - *nesting_depth = curr_depth; - *size = 
curr_max_above + curr_max_below; - *address = user_address + curr_skip - curr_max_below; +static int +vm_map_region_count_obj_refs( + vm_map_entry_t entry, + vm_object_t object) +{ + int ref_count; + vm_object_t chk_obj; + vm_object_t tmp_obj; -// LP64todo: all the current tools are 32bit, obviously never worked for 64b -// so probably should be a real 32b ID vs. ptr. -// Current users just check for equality -#define INFO_MAKE_OBJECT_ID(p) ((uint32_t)(uintptr_t)VM_KERNEL_ADDRPERM(p)) + if (VME_OBJECT(entry) == 0) { + return 0; + } - if (look_for_pages) { - submap_info->user_tag = curr_entry->alias; - submap_info->offset = curr_entry->offset; - submap_info->protection = curr_entry->protection; - submap_info->inheritance = curr_entry->inheritance; - submap_info->max_protection = curr_entry->max_protection; - submap_info->behavior = curr_entry->behavior; - submap_info->user_wired_count = curr_entry->user_wired_count; - submap_info->is_submap = curr_entry->is_sub_map; - submap_info->object_id = INFO_MAKE_OBJECT_ID(curr_entry->object.vm_object); + if (entry->is_sub_map) { + return 0; } else { - short_info->user_tag = curr_entry->alias; - short_info->offset = curr_entry->offset; - short_info->protection = curr_entry->protection; - short_info->inheritance = curr_entry->inheritance; - short_info->max_protection = curr_entry->max_protection; - short_info->behavior = curr_entry->behavior; - short_info->user_wired_count = curr_entry->user_wired_count; - short_info->is_submap = curr_entry->is_sub_map; - short_info->object_id = INFO_MAKE_OBJECT_ID(curr_entry->object.vm_object); - } + ref_count = 0; - extended.pages_resident = 0; - extended.pages_swapped_out = 0; - extended.pages_shared_now_private = 0; - extended.pages_dirtied = 0; - extended.pages_reusable = 0; - extended.external_pager = 0; - extended.shadow_depth = 0; + chk_obj = VME_OBJECT(entry); + vm_object_lock(chk_obj); - if (not_in_kdp) { - if (!curr_entry->is_sub_map) { - vm_map_offset_t range_start, range_end; - range_start = MAX((curr_address - curr_max_below), - curr_entry->vme_start); - range_end = MIN((curr_address + curr_max_above), - curr_entry->vme_end); - vm_map_region_walk(curr_map, - range_start, - curr_entry, - (curr_entry->offset + - (range_start - - curr_entry->vme_start)), - range_end - range_start, - &extended, - look_for_pages, VM_REGION_EXTENDED_INFO_COUNT); - if (extended.external_pager && - extended.ref_count == 2 && - extended.share_mode == SM_SHARED) { - extended.share_mode = SM_PRIVATE; + while (chk_obj) { + if (chk_obj == object) { + ref_count++; } - } else { - if (curr_entry->use_pmap) { - extended.share_mode = SM_TRUESHARED; - } else { - extended.share_mode = SM_PRIVATE; + tmp_obj = chk_obj->shadow; + if (tmp_obj) { + vm_object_lock(tmp_obj); } - extended.ref_count = - curr_entry->object.sub_map->ref_count; - } - } + vm_object_unlock(chk_obj); - if (look_for_pages) { - submap_info->pages_resident = extended.pages_resident; - submap_info->pages_swapped_out = extended.pages_swapped_out; - submap_info->pages_shared_now_private = - extended.pages_shared_now_private; - submap_info->pages_dirtied = extended.pages_dirtied; - submap_info->external_pager = extended.external_pager; - submap_info->shadow_depth = extended.shadow_depth; - submap_info->share_mode = extended.share_mode; - submap_info->ref_count = extended.ref_count; - - if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) { - submap_info->pages_reusable = extended.pages_reusable; + chk_obj = tmp_obj; } - } else { - short_info->external_pager = 
extended.external_pager; - short_info->shadow_depth = extended.shadow_depth; - short_info->share_mode = extended.share_mode; - short_info->ref_count = extended.ref_count; - } - - if (not_in_kdp) { - vm_map_unlock_read(curr_map); } - - return KERN_SUCCESS; + return ref_count; } + /* - * vm_region: - * - * User call to obtain information about a region in - * a task's address map. Currently, only one flavor is - * supported. + * Routine: vm_map_simplify * - * XXX The reserved and behavior fields cannot be filled - * in until the vm merge from the IK is completed, and - * vm_reserve is implemented. + * Description: + * Attempt to simplify the map representation in + * the vicinity of the given starting address. + * Note: + * This routine is intended primarily to keep the + * kernel maps more compact -- they generally don't + * benefit from the "expand a map entry" technology + * at allocation time because the adjacent entry + * is often wired down. */ - -kern_return_t -vm_map_region( - vm_map_t map, - vm_map_offset_t *address, /* IN/OUT */ - vm_map_size_t *size, /* OUT */ - vm_region_flavor_t flavor, /* IN */ - vm_region_info_t info, /* OUT */ - mach_msg_type_number_t *count, /* IN/OUT */ - mach_port_t *object_name) /* OUT */ +void +vm_map_simplify_entry( + vm_map_t map, + vm_map_entry_t this_entry) { - vm_map_entry_t tmp_entry; - vm_map_entry_t entry; - vm_map_offset_t start; + vm_map_entry_t prev_entry; - if (map == VM_MAP_NULL) - return(KERN_INVALID_ARGUMENT); + prev_entry = this_entry->vme_prev; - switch (flavor) { + if ((this_entry != vm_map_to_entry(map)) && + (prev_entry != vm_map_to_entry(map)) && - case VM_REGION_BASIC_INFO: - /* legacy for old 32-bit objects info */ - { - vm_region_basic_info_t basic; + (prev_entry->vme_end == this_entry->vme_start) && - if (*count < VM_REGION_BASIC_INFO_COUNT) - return(KERN_INVALID_ARGUMENT); + (prev_entry->is_sub_map == this_entry->is_sub_map) && + (VME_OBJECT(prev_entry) == VME_OBJECT(this_entry)) && + ((VME_OFFSET(prev_entry) + (prev_entry->vme_end - + prev_entry->vme_start)) + == VME_OFFSET(this_entry)) && - basic = (vm_region_basic_info_t) info; - *count = VM_REGION_BASIC_INFO_COUNT; + (prev_entry->behavior == this_entry->behavior) && + (prev_entry->needs_copy == this_entry->needs_copy) && + (prev_entry->protection == this_entry->protection) && + (prev_entry->max_protection == this_entry->max_protection) && + (prev_entry->inheritance == this_entry->inheritance) && + (prev_entry->use_pmap == this_entry->use_pmap) && + (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) && + (prev_entry->no_cache == this_entry->no_cache) && + (prev_entry->permanent == this_entry->permanent) && + (prev_entry->map_aligned == this_entry->map_aligned) && + (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) && + (prev_entry->used_for_jit == this_entry->used_for_jit) && + (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) && + /* from_reserved_zone: OK if that field doesn't match */ + (prev_entry->iokit_acct == this_entry->iokit_acct) && + (prev_entry->vme_resilient_codesign == + this_entry->vme_resilient_codesign) && + (prev_entry->vme_resilient_media == + this_entry->vme_resilient_media) && + (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) && - vm_map_lock_read(map); + (prev_entry->wired_count == this_entry->wired_count) && + (prev_entry->user_wired_count == this_entry->user_wired_count) && - start = *address; - if (!vm_map_lookup_entry(map, start, &tmp_entry)) { - if ((entry = tmp_entry->vme_next) == 
vm_map_to_entry(map)) { - vm_map_unlock_read(map); - return(KERN_INVALID_ADDRESS); - } + ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) && + (prev_entry->in_transition == FALSE) && + (this_entry->in_transition == FALSE) && + (prev_entry->needs_wakeup == FALSE) && + (this_entry->needs_wakeup == FALSE) && + (prev_entry->is_shared == this_entry->is_shared) && + (prev_entry->superpage_size == FALSE) && + (this_entry->superpage_size == FALSE) + ) { + vm_map_store_entry_unlink(map, prev_entry); + assert(prev_entry->vme_start < this_entry->vme_end); + if (prev_entry->map_aligned) { + assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start, + VM_MAP_PAGE_MASK(map))); + } + this_entry->vme_start = prev_entry->vme_start; + VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry)); + + if (map->holelistenabled) { + vm_map_store_update_first_free(map, this_entry, TRUE); + } + + if (prev_entry->is_sub_map) { + vm_map_deallocate(VME_SUBMAP(prev_entry)); } else { - entry = tmp_entry; + vm_object_deallocate(VME_OBJECT(prev_entry)); } + vm_map_entry_dispose(map, prev_entry); + SAVE_HINT_MAP_WRITE(map, this_entry); + } +} - start = entry->vme_start; +void +vm_map_simplify( + vm_map_t map, + vm_map_offset_t start) +{ + vm_map_entry_t this_entry; - basic->offset = (uint32_t)entry->offset; - basic->protection = entry->protection; - basic->inheritance = entry->inheritance; - basic->max_protection = entry->max_protection; - basic->behavior = entry->behavior; - basic->user_wired_count = entry->user_wired_count; - basic->reserved = entry->is_sub_map; - *address = start; - *size = (entry->vme_end - start); + vm_map_lock(map); + if (vm_map_lookup_entry(map, start, &this_entry)) { + vm_map_simplify_entry(map, this_entry); + vm_map_simplify_entry(map, this_entry->vme_next); + } + vm_map_unlock(map); +} - if (object_name) *object_name = IP_NULL; - if (entry->is_sub_map) { - basic->shared = FALSE; +static void +vm_map_simplify_range( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) +{ + vm_map_entry_t entry; + + /* + * The map should be locked (for "write") by the caller. + */ + + if (start >= end) { + /* invalid address range */ + return; + } + + start = vm_map_trunc_page(start, + VM_MAP_PAGE_MASK(map)); + end = vm_map_round_page(end, + VM_MAP_PAGE_MASK(map)); + + if (!vm_map_lookup_entry(map, start, &entry)) { + /* "start" is not mapped and "entry" ends before "start" */ + if (entry == vm_map_to_entry(map)) { + /* start with first entry in the map */ + entry = vm_map_first_entry(map); } else { - basic->shared = entry->is_shared; + /* start with next entry */ + entry = entry->vme_next; } + } - vm_map_unlock_read(map); - return(KERN_SUCCESS); + while (entry != vm_map_to_entry(map) && + entry->vme_start <= end) { + /* try and coalesce "entry" with its previous entry */ + vm_map_simplify_entry(map, entry); + entry = entry->vme_next; + } +} + + +/* + * Routine: vm_map_machine_attribute + * Purpose: + * Provide machine-specific attributes to mappings, + * such as cachability etc. for machines that provide + * them. NUMA architectures and machines with big/strange + * caches will use this. + * Note: + * Responsibilities for locking and checking are handled here, + * everything else in the pmap module. If any non-volatile + * information must be kept, the pmap module should handle + * it itself. 
[This assumes that attributes do not + * need to be inherited, which seems ok to me] + */ +kern_return_t +vm_map_machine_attribute( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_machine_attribute_t attribute, + vm_machine_attribute_val_t* value) /* IN/OUT */ +{ + kern_return_t ret; + vm_map_size_t sync_size; + vm_map_entry_t entry; + + if (start < vm_map_min(map) || end > vm_map_max(map)) { + return KERN_INVALID_ADDRESS; } - case VM_REGION_BASIC_INFO_64: - { - vm_region_basic_info_64_t basic; + /* Figure how much memory we need to flush (in page increments) */ + sync_size = end - start; - if (*count < VM_REGION_BASIC_INFO_COUNT_64) - return(KERN_INVALID_ARGUMENT); + vm_map_lock(map); - basic = (vm_region_basic_info_64_t) info; - *count = VM_REGION_BASIC_INFO_COUNT_64; + if (attribute != MATTR_CACHE) { + /* If we don't have to find physical addresses, we */ + /* don't have to do an explicit traversal here. */ + ret = pmap_attribute(map->pmap, start, end - start, + attribute, value); + vm_map_unlock(map); + return ret; + } - vm_map_lock_read(map); + ret = KERN_SUCCESS; /* Assume it all worked */ - start = *address; - if (!vm_map_lookup_entry(map, start, &tmp_entry)) { - if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { - vm_map_unlock_read(map); - return(KERN_INVALID_ADDRESS); + while (sync_size) { + if (vm_map_lookup_entry(map, start, &entry)) { + vm_map_size_t sub_size; + if ((entry->vme_end - start) > sync_size) { + sub_size = sync_size; + sync_size = 0; + } else { + sub_size = entry->vme_end - start; + sync_size -= sub_size; } - } else { - entry = tmp_entry; - } - - start = entry->vme_start; - - basic->offset = entry->offset; - basic->protection = entry->protection; - basic->inheritance = entry->inheritance; - basic->max_protection = entry->max_protection; - basic->behavior = entry->behavior; - basic->user_wired_count = entry->user_wired_count; - basic->reserved = entry->is_sub_map; - *address = start; - *size = (entry->vme_end - start); - - if (object_name) *object_name = IP_NULL; - if (entry->is_sub_map) { - basic->shared = FALSE; - } else { - basic->shared = entry->is_shared; - } + if (entry->is_sub_map) { + vm_map_offset_t sub_start; + vm_map_offset_t sub_end; - vm_map_unlock_read(map); - return(KERN_SUCCESS); - } - case VM_REGION_EXTENDED_INFO: - if (*count < VM_REGION_EXTENDED_INFO_COUNT) - return(KERN_INVALID_ARGUMENT); - /*fallthru*/ - case VM_REGION_EXTENDED_INFO__legacy: - if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) - return KERN_INVALID_ARGUMENT; + sub_start = (start - entry->vme_start) + + VME_OFFSET(entry); + sub_end = sub_start + sub_size; + vm_map_machine_attribute( + VME_SUBMAP(entry), + sub_start, + sub_end, + attribute, value); + } else { + if (VME_OBJECT(entry)) { + vm_page_t m; + vm_object_t object; + vm_object_t base_object; + vm_object_t last_object; + vm_object_offset_t offset; + vm_object_offset_t base_offset; + vm_map_size_t range; + range = sub_size; + offset = (start - entry->vme_start) + + VME_OFFSET(entry); + offset = vm_object_trunc_page(offset); + base_offset = offset; + object = VME_OBJECT(entry); + base_object = object; + last_object = NULL; - { - vm_region_extended_info_t extended; - mach_msg_type_number_t original_count; + vm_object_lock(object); - extended = (vm_region_extended_info_t) info; + while (range) { + m = vm_page_lookup( + object, offset); - vm_map_lock_read(map); + if (m && !m->vmp_fictitious) { + ret = + pmap_attribute_cache_sync( + VM_PAGE_GET_PHYS_PAGE(m), + PAGE_SIZE, + attribute, value); + } 
else if (object->shadow) { + offset = offset + object->vo_shadow_offset; + last_object = object; + object = object->shadow; + vm_object_lock(last_object->shadow); + vm_object_unlock(last_object); + continue; + } + if (range < PAGE_SIZE) { + range = 0; + } else { + range -= PAGE_SIZE; + } - start = *address; - if (!vm_map_lookup_entry(map, start, &tmp_entry)) { - if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { - vm_map_unlock_read(map); - return(KERN_INVALID_ADDRESS); + if (base_object != object) { + vm_object_unlock(object); + vm_object_lock(base_object); + object = base_object; + } + /* Bump to the next page */ + base_offset += PAGE_SIZE; + offset = base_offset; + } + vm_object_unlock(object); + } } + start += sub_size; } else { - entry = tmp_entry; - } - start = entry->vme_start; - - extended->protection = entry->protection; - extended->user_tag = entry->alias; - extended->pages_resident = 0; - extended->pages_swapped_out = 0; - extended->pages_shared_now_private = 0; - extended->pages_dirtied = 0; - extended->external_pager = 0; - extended->shadow_depth = 0; - - original_count = *count; - if (flavor == VM_REGION_EXTENDED_INFO__legacy) { - *count = VM_REGION_EXTENDED_INFO_COUNT__legacy; - } else { - extended->pages_reusable = 0; - *count = VM_REGION_EXTENDED_INFO_COUNT; + vm_map_unlock(map); + return KERN_FAILURE; } + } - vm_map_region_walk(map, start, entry, entry->offset, entry->vme_end - start, extended, TRUE, *count); + vm_map_unlock(map); - if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) - extended->share_mode = SM_PRIVATE; + return ret; +} - if (object_name) - *object_name = IP_NULL; - *address = start; - *size = (entry->vme_end - start); +/* + * vm_map_behavior_set: + * + * Sets the paging reference behavior of the specified address + * range in the target map. Paging reference behavior affects + * how pagein operations resulting from faults on the map will be + * clustered. + */ +kern_return_t +vm_map_behavior_set( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end, + vm_behavior_t new_behavior) +{ + vm_map_entry_t entry; + vm_map_entry_t temp_entry; - vm_map_unlock_read(map); - return(KERN_SUCCESS); + if (start > end || + start < vm_map_min(map) || + end > vm_map_max(map)) { + return KERN_NO_SPACE; } - case VM_REGION_TOP_INFO: - { - vm_region_top_info_t top; - if (*count < VM_REGION_TOP_INFO_COUNT) - return(KERN_INVALID_ARGUMENT); - - top = (vm_region_top_info_t) info; - *count = VM_REGION_TOP_INFO_COUNT; + switch (new_behavior) { + /* + * This first block of behaviors all set a persistent state on the specified + * memory range. All we have to do here is to record the desired behavior + * in the vm_map_entry_t's. + */ - vm_map_lock_read(map); + case VM_BEHAVIOR_DEFAULT: + case VM_BEHAVIOR_RANDOM: + case VM_BEHAVIOR_SEQUENTIAL: + case VM_BEHAVIOR_RSEQNTL: + case VM_BEHAVIOR_ZERO_WIRED_PAGES: + vm_map_lock(map); - start = *address; - if (!vm_map_lookup_entry(map, start, &tmp_entry)) { - if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) { - vm_map_unlock_read(map); - return(KERN_INVALID_ADDRESS); - } + /* + * The entire address range must be valid for the map. + * Note that vm_map_range_check() does a + * vm_map_lookup_entry() internally and returns the + * entry containing the start of the address range if + * the entire range is valid. 
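
/*
 * For reference: vm_map_machine_attribute() above is exposed to user space as
 * the Mach routine mach_vm_machine_attribute() (and the older
 * vm_machine_attribute()).  A sketch of the classic use -- synchronizing the
 * caches after writing generated code -- assuming the MATTR_* definitions
 * from <mach/vm_attributes.h>:
 */
#include <mach/mach.h>
#include <mach/mach_vm.h>

static kern_return_t
sync_cache_range(mach_vm_address_t addr, mach_vm_size_t len)
{
	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_SYNC;

	/* MATTR_CACHE takes the explicit object/page walk shown above */
	return mach_vm_machine_attribute(mach_task_self(), addr, len,
	           MATTR_CACHE, &value);
}
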
+ */ + if (vm_map_range_check(map, start, end, &temp_entry)) { + entry = temp_entry; + vm_map_clip_start(map, entry, start); } else { - entry = tmp_entry; - + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; } - start = entry->vme_start; - - top->private_pages_resident = 0; - top->shared_pages_resident = 0; - - vm_map_region_top_walk(entry, top); - - if (object_name) - *object_name = IP_NULL; - *address = start; - *size = (entry->vme_end - start); - - vm_map_unlock_read(map); - return(KERN_SUCCESS); - } - default: - return(KERN_INVALID_ARGUMENT); - } -} -#define OBJ_RESIDENT_COUNT(obj, entry_size) \ - MIN((entry_size), \ - ((obj)->all_reusable ? \ - (obj)->wired_page_count : \ - (obj)->resident_page_count - (obj)->reusable_page_count)) + while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { + vm_map_clip_end(map, entry, end); + if (entry->is_sub_map) { + assert(!entry->use_pmap); + } -void -vm_map_region_top_walk( - vm_map_entry_t entry, - vm_region_top_info_t top) -{ + if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) { + entry->zero_wired_pages = TRUE; + } else { + entry->behavior = new_behavior; + } + entry = entry->vme_next; + } - if (entry->object.vm_object == 0 || entry->is_sub_map) { - top->share_mode = SM_EMPTY; - top->ref_count = 0; - top->obj_id = 0; - return; - } + vm_map_unlock(map); + break; - { - struct vm_object *obj, *tmp_obj; - int ref_count; - uint32_t entry_size; + /* + * The rest of these are different from the above in that they cause + * an immediate action to take place as opposed to setting a behavior that + * affects future actions. + */ - entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64); + case VM_BEHAVIOR_WILLNEED: + return vm_map_willneed(map, start, end); - obj = entry->object.vm_object; + case VM_BEHAVIOR_DONTNEED: + return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS); - vm_object_lock(obj); + case VM_BEHAVIOR_FREE: + return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS); - if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) - ref_count--; + case VM_BEHAVIOR_REUSABLE: + return vm_map_reusable_pages(map, start, end); - assert(obj->reusable_page_count <= obj->resident_page_count); - if (obj->shadow) { - if (ref_count == 1) - top->private_pages_resident = - OBJ_RESIDENT_COUNT(obj, entry_size); - else - top->shared_pages_resident = - OBJ_RESIDENT_COUNT(obj, entry_size); - top->ref_count = ref_count; - top->share_mode = SM_COW; - - while ((tmp_obj = obj->shadow)) { - vm_object_lock(tmp_obj); - vm_object_unlock(obj); - obj = tmp_obj; + case VM_BEHAVIOR_REUSE: + return vm_map_reuse_pages(map, start, end); - if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) - ref_count--; + case VM_BEHAVIOR_CAN_REUSE: + return vm_map_can_reuse(map, start, end); - assert(obj->reusable_page_count <= obj->resident_page_count); - top->shared_pages_resident += - OBJ_RESIDENT_COUNT(obj, entry_size); - top->ref_count += ref_count - 1; - } - } else { - if (entry->superpage_size) { - top->share_mode = SM_LARGE_PAGE; - top->shared_pages_resident = 0; - top->private_pages_resident = entry_size; - } else if (entry->needs_copy) { - top->share_mode = SM_COW; - top->shared_pages_resident = - OBJ_RESIDENT_COUNT(obj, entry_size); - } else { - if (ref_count == 1 || - (ref_count == 2 && !(obj->pager_trusted) && !(obj->internal))) { - top->share_mode = SM_PRIVATE; - top->private_pages_resident = - OBJ_RESIDENT_COUNT(obj, - entry_size); - } else { - top->share_mode = 
SM_SHARED; - top->shared_pages_resident = - OBJ_RESIDENT_COUNT(obj, - entry_size); - } - } - top->ref_count = ref_count; - } - /* XXX K64: obj_id will be truncated */ - top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj); +#if MACH_ASSERT + case VM_BEHAVIOR_PAGEOUT: + return vm_map_pageout(map, start, end); +#endif /* MACH_ASSERT */ - vm_object_unlock(obj); + default: + return KERN_INVALID_ARGUMENT; } + + return KERN_SUCCESS; } -void -vm_map_region_walk( - vm_map_t map, - vm_map_offset_t va, - vm_map_entry_t entry, - vm_object_offset_t offset, - vm_object_size_t range, - vm_region_extended_info_t extended, - boolean_t look_for_pages, - mach_msg_type_number_t count) -{ - register struct vm_object *obj, *tmp_obj; - register vm_map_offset_t last_offset; - register int i; - register int ref_count; - struct vm_object *shadow_object; - int shadow_depth; - if ((entry->object.vm_object == 0) || - (entry->is_sub_map) || - (entry->object.vm_object->phys_contiguous && - !entry->superpage_size)) { - extended->share_mode = SM_EMPTY; - extended->ref_count = 0; - return; - } +/* + * Internals for madvise(MADV_WILLNEED) system call. + * + * The implementation is to do:- + * a) read-ahead if the mapping corresponds to a mapped regular file + * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping + */ - if (entry->superpage_size) { - extended->shadow_depth = 0; - extended->share_mode = SM_LARGE_PAGE; - extended->ref_count = 1; - extended->external_pager = 0; - extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT); - extended->shadow_depth = 0; - return; - } - { - obj = entry->object.vm_object; +static kern_return_t +vm_map_willneed( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end + ) +{ + vm_map_entry_t entry; + vm_object_t object; + memory_object_t pager; + struct vm_object_fault_info fault_info = {}; + kern_return_t kr; + vm_object_size_t len; + vm_object_offset_t offset; + + fault_info.interruptible = THREAD_UNINT; /* ignored value */ + fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; + fault_info.stealth = TRUE; - vm_object_lock(obj); + /* + * The MADV_WILLNEED operation doesn't require any changes to the + * vm_map_entry_t's, so the read lock is sufficient. + */ - if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) - ref_count--; + vm_map_lock_read(map); - if (look_for_pages) { - for (last_offset = offset + range; - offset < last_offset; - offset += PAGE_SIZE_64, va += PAGE_SIZE) { - vm_map_region_look_for_page(map, va, obj, - offset, ref_count, - 0, extended, count); - } - } else { - shadow_object = obj->shadow; - shadow_depth = 0; + /* + * The madvise semantics require that the address range be fully + * allocated with no holes. Otherwise, we're required to return + * an error. + */ - if ( !(obj->pager_trusted) && !(obj->internal)) - extended->external_pager = 1; + if (!vm_map_range_check(map, start, end, &entry)) { + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; + } - if (shadow_object != VM_OBJECT_NULL) { - vm_object_lock(shadow_object); - for (; - shadow_object != VM_OBJECT_NULL; - shadow_depth++) { - vm_object_t next_shadow; - - if ( !(shadow_object->pager_trusted) && - !(shadow_object->internal)) - extended->external_pager = 1; - - next_shadow = shadow_object->shadow; - if (next_shadow) { - vm_object_lock(next_shadow); - } - vm_object_unlock(shadow_object); - shadow_object = next_shadow; - } - } - extended->shadow_depth = shadow_depth; + /* + * Examine each vm_map_entry_t in the range. 
+ */ + for (; entry != vm_map_to_entry(map) && start < end;) { + /* + * The first time through, the start address could be anywhere + * within the vm_map_entry we found. So adjust the offset to + * correspond. After that, the offset will always be zero to + * correspond to the beginning of the current vm_map_entry. + */ + offset = (start - entry->vme_start) + VME_OFFSET(entry); + + /* + * Set the length so we don't go beyond the end of the + * map_entry or beyond the end of the range we were given. + * This range could span also multiple map entries all of which + * map different files, so make sure we only do the right amount + * of I/O for each object. Note that it's possible for there + * to be multiple map entries all referring to the same object + * but with different page permissions, but it's not worth + * trying to optimize that case. + */ + len = MIN(entry->vme_end - start, end - start); + + if ((vm_size_t) len != len) { + /* 32-bit overflow */ + len = (vm_size_t) (0 - PAGE_SIZE); + } + fault_info.cluster_size = (vm_size_t) len; + fault_info.lo_offset = offset; + fault_info.hi_offset = offset + len; + fault_info.user_tag = VME_ALIAS(entry); + fault_info.pmap_options = 0; + if (entry->iokit_acct || + (!entry->is_sub_map && !entry->use_pmap)) { + fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; } - if (extended->shadow_depth || entry->needs_copy) - extended->share_mode = SM_COW; - else { - if (ref_count == 1) - extended->share_mode = SM_PRIVATE; - else { - if (obj->true_share) - extended->share_mode = SM_TRUESHARED; - else - extended->share_mode = SM_SHARED; - } + /* + * If the entry is a submap OR there's no read permission + * to this mapping, then just skip it. + */ + if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) { + entry = entry->vme_next; + start = entry->vme_start; + continue; } - extended->ref_count = ref_count - extended->shadow_depth; - - for (i = 0; i < extended->shadow_depth; i++) { - if ((tmp_obj = obj->shadow) == 0) - break; - vm_object_lock(tmp_obj); - vm_object_unlock(obj); - if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) - ref_count--; + object = VME_OBJECT(entry); - extended->ref_count += ref_count; - obj = tmp_obj; - } - vm_object_unlock(obj); + if (object == NULL || + (object && object->internal)) { + /* + * Memory range backed by anonymous memory. + */ + vm_size_t region_size = 0, effective_page_size = 0; + vm_map_offset_t addr = 0, effective_page_mask = 0; - if (extended->share_mode == SM_SHARED) { - register vm_map_entry_t cur; - register vm_map_entry_t last; - int my_refs; + region_size = len; + addr = start; - obj = entry->object.vm_object; - last = vm_map_to_entry(map); - my_refs = 0; + effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK); + effective_page_size = effective_page_mask + 1; - if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) - ref_count--; - for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) - my_refs += vm_map_region_count_obj_refs(cur, obj); + vm_map_unlock_read(map); - if (my_refs == ref_count) - extended->share_mode = SM_PRIVATE_ALIASED; - else if (my_refs > 1) - extended->share_mode = SM_SHARED_ALIASED; - } - } -} + while (region_size) { + vm_pre_fault( + vm_map_trunc_page(addr, effective_page_mask), + VM_PROT_READ | VM_PROT_WRITE); + region_size -= effective_page_size; + addr += effective_page_size; + } + } else { + /* + * Find the file object backing this map entry. 
If there is + * none, then we simply ignore the "will need" advice for this + * entry and go on to the next one. + */ + if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { + entry = entry->vme_next; + start = entry->vme_start; + continue; + } -/* object is locked on entry and locked on return */ + vm_object_paging_begin(object); + pager = object->pager; + vm_object_unlock(object); + /* + * The data_request() could take a long time, so let's + * release the map lock to avoid blocking other threads. + */ + vm_map_unlock_read(map); -static void -vm_map_region_look_for_page( - __unused vm_map_t map, - __unused vm_map_offset_t va, - vm_object_t object, - vm_object_offset_t offset, - int max_refcnt, - int depth, - vm_region_extended_info_t extended, - mach_msg_type_number_t count) -{ - register vm_page_t p; - register vm_object_t shadow; - register int ref_count; - vm_object_t caller_object; - kern_return_t kr; - shadow = object->shadow; - caller_object = object; + /* + * Get the data from the object asynchronously. + * + * Note that memory_object_data_request() places limits on the + * amount of I/O it will do. Regardless of the len we + * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it + * silently truncates the len to that size. This isn't + * necessarily bad since madvise shouldn't really be used to + * page in unlimited amounts of data. Other Unix variants + * limit the willneed case as well. If this turns out to be an + * issue for developers, then we can always adjust the policy + * here and still be backwards compatible since this is all + * just "advice". + */ + kr = memory_object_data_request( + pager, + vm_object_trunc_page(offset) + object->paging_offset, + 0, /* ignored */ + VM_PROT_READ, + (memory_object_fault_info_t)&fault_info); - - while (TRUE) { + vm_object_lock(object); + vm_object_paging_end(object); + vm_object_unlock(object); - if ( !(object->pager_trusted) && !(object->internal)) - extended->external_pager = 1; + /* + * If we couldn't do the I/O for some reason, just give up on + * the madvise. We still return success to the user since + * madvise isn't supposed to fail when the advice can't be + * taken. + */ - if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) { - if (shadow && (max_refcnt == 1)) - extended->pages_shared_now_private++; - - if (!p->fictitious && - (p->dirty || pmap_is_modified(p->phys_page))) - extended->pages_dirtied++; - else if (count >= VM_REGION_EXTENDED_INFO_COUNT) { - if (p->reusable || p->object->all_reusable) { - extended->pages_reusable++; - } + if (kr != KERN_SUCCESS) { + return KERN_SUCCESS; } + } - extended->pages_resident++; - - if(object != caller_object) - vm_object_unlock(object); + start += len; + if (start >= end) { + /* done */ + return KERN_SUCCESS; + } - return; + /* look up next entry */ + vm_map_lock_read(map); + if (!vm_map_lookup_entry(map, start, &entry)) { + /* + * There's a new hole in the address range. 
+ */ + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; } -#if MACH_PAGEMAP - if (object->existence_map) { - if (vm_external_state_get(object->existence_map, offset) == VM_EXTERNAL_STATE_EXISTS) { + } - extended->pages_swapped_out++; + vm_map_unlock_read(map); + return KERN_SUCCESS; +} - if(object != caller_object) - vm_object_unlock(object); +static boolean_t +vm_map_entry_is_reusable( + vm_map_entry_t entry) +{ + /* Only user map entries */ - return; - } - } else -#endif /* MACH_PAGEMAP */ - if (object->internal && - object->alive && - !object->terminating && - object->pager_ready) { + vm_object_t object; - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - if (VM_COMPRESSOR_PAGER_STATE_GET(object, - offset) - == VM_EXTERNAL_STATE_EXISTS) { - /* the pager has that page */ - extended->pages_swapped_out++; - if (object != caller_object) - vm_object_unlock(object); - return; - } - } else { - memory_object_t pager; + if (entry->is_sub_map) { + return FALSE; + } - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); + switch (VME_ALIAS(entry)) { + case VM_MEMORY_MALLOC: + case VM_MEMORY_MALLOC_SMALL: + case VM_MEMORY_MALLOC_LARGE: + case VM_MEMORY_REALLOC: + case VM_MEMORY_MALLOC_TINY: + case VM_MEMORY_MALLOC_LARGE_REUSABLE: + case VM_MEMORY_MALLOC_LARGE_REUSED: + /* + * This is a malloc() memory region: check if it's still + * in its original state and can be re-used for more + * malloc() allocations. + */ + break; + default: + /* + * Not a malloc() memory region: let the caller decide if + * it's re-usable. + */ + return TRUE; + } - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* just poke the pager */ - VM_PROT_READ, - NULL); + if (/*entry->is_shared ||*/ + entry->is_sub_map || + entry->in_transition || + entry->protection != VM_PROT_DEFAULT || + entry->max_protection != VM_PROT_ALL || + entry->inheritance != VM_INHERIT_DEFAULT || + entry->no_cache || + entry->permanent || + entry->superpage_size != FALSE || + entry->zero_wired_pages || + entry->wired_count != 0 || + entry->user_wired_count != 0) { + return FALSE; + } - vm_object_lock(object); - vm_object_paging_end(object); + object = VME_OBJECT(entry); + if (object == VM_OBJECT_NULL) { + return TRUE; + } + if ( +#if 0 + /* + * Let's proceed even if the VM object is potentially + * shared. + * We check for this later when processing the actual + * VM pages, so the contents will be safe if shared. + * + * But we can still mark this memory region as "reusable" to + * acknowledge that the caller did let us know that the memory + * could be re-used and should not be penalized for holding + * on to it. This allows its "resident size" to not include + * the reusable range. 
+ */ + object->ref_count == 1 && +#endif + object->wired_page_count == 0 && + object->copy == VM_OBJECT_NULL && + object->shadow == VM_OBJECT_NULL && + object->internal && + object->purgable == VM_PURGABLE_DENY && + object->wimg_bits == VM_WIMG_USE_DEFAULT && + !object->code_signed) { + return TRUE; + } + return FALSE; +} - if (kr == KERN_SUCCESS) { - /* the pager has that page */ - extended->pages_swapped_out++; - if (object != caller_object) - vm_object_unlock(object); - return; - } - } - } +static kern_return_t +vm_map_reuse_pages( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) +{ + vm_map_entry_t entry; + vm_object_t object; + vm_object_offset_t start_offset, end_offset; - if (shadow) { - vm_object_lock(shadow); + /* + * The MADV_REUSE operation doesn't require any changes to the + * vm_map_entry_t's, so the read lock is sufficient. + */ + + if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { + /* + * XXX TODO4K + * need to figure out what reusable means for a + * portion of a native page. + */ + return KERN_SUCCESS; + } + + vm_map_lock_read(map); + assert(map->pmap != kernel_pmap); /* protect alias access */ + + /* + * The madvise semantics require that the address range be fully + * allocated with no holes. Otherwise, we're required to return + * an error. + */ + + if (!vm_map_range_check(map, start, end, &entry)) { + vm_map_unlock_read(map); + vm_page_stats_reusable.reuse_pages_failure++; + return KERN_INVALID_ADDRESS; + } - if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) - ref_count--; + /* + * Examine each vm_map_entry_t in the range. + */ + for (; entry != vm_map_to_entry(map) && entry->vme_start < end; + entry = entry->vme_next) { + /* + * Sanity check on the VM map entry. + */ + if (!vm_map_entry_is_reusable(entry)) { + vm_map_unlock_read(map); + vm_page_stats_reusable.reuse_pages_failure++; + return KERN_INVALID_ADDRESS; + } - if (++depth > extended->shadow_depth) - extended->shadow_depth = depth; + /* + * The first time through, the start address could be anywhere + * within the vm_map_entry we found. So adjust the offset to + * correspond. + */ + if (entry->vme_start < start) { + start_offset = start - entry->vme_start; + } else { + start_offset = 0; + } + end_offset = MIN(end, entry->vme_end) - entry->vme_start; + start_offset += VME_OFFSET(entry); + end_offset += VME_OFFSET(entry); - if (ref_count > max_refcnt) - max_refcnt = ref_count; - - if(object != caller_object) - vm_object_unlock(object); + assert(!entry->is_sub_map); + object = VME_OBJECT(entry); + if (object != VM_OBJECT_NULL) { + vm_object_lock(object); + vm_object_reuse_pages(object, start_offset, end_offset, + TRUE); + vm_object_unlock(object); + } - offset = offset + object->vo_shadow_offset; - object = shadow; - shadow = object->shadow; - continue; + if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) { + /* + * XXX + * We do not hold the VM map exclusively here. + * The "alias" field is not that critical, so it's + * safe to update it here, as long as it is the only + * one that can be modified while holding the VM map + * "shared". 
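
/*
 * For reference: vm_map_reuse_pages() above and vm_map_reusable_pages() below
 * back Darwin's MADV_FREE_REUSE / MADV_FREE_REUSABLE advice, which allocators
 * use to hand idle pages back without unmapping them.  A sketch, assuming the
 * MADV_FREE_REUSABLE and MADV_FREE_REUSE definitions in <sys/mman.h>:
 */
#include <sys/mman.h>

static void
mark_idle_then_reclaim(void *buf, size_t len)
{
	/* pages become reclaimable; their contents may be discarded */
	(void)madvise(buf, len, MADV_FREE_REUSABLE);

	/* ... later, before touching the memory again ... */
	(void)madvise(buf, len, MADV_FREE_REUSE);
}
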
+ */ + VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED); } - if(object != caller_object) - vm_object_unlock(object); - break; } + + vm_map_unlock_read(map); + vm_page_stats_reusable.reuse_pages_success++; + return KERN_SUCCESS; } -static int -vm_map_region_count_obj_refs( - vm_map_entry_t entry, - vm_object_t object) + +static kern_return_t +vm_map_reusable_pages( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) { - register int ref_count; - register vm_object_t chk_obj; - register vm_object_t tmp_obj; + vm_map_entry_t entry; + vm_object_t object; + vm_object_offset_t start_offset, end_offset; + vm_map_offset_t pmap_offset; - if (entry->object.vm_object == 0) - return(0); + if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { + /* + * XXX TODO4K + * need to figure out what reusable means for a portion + * of a native page. + */ + return KERN_SUCCESS; + } - if (entry->is_sub_map) - return(0); - else { - ref_count = 0; + /* + * The MADV_REUSABLE operation doesn't require any changes to the + * vm_map_entry_t's, so the read lock is sufficient. + */ - chk_obj = entry->object.vm_object; - vm_object_lock(chk_obj); + vm_map_lock_read(map); + assert(map->pmap != kernel_pmap); /* protect alias access */ - while (chk_obj) { - if (chk_obj == object) - ref_count++; - tmp_obj = chk_obj->shadow; - if (tmp_obj) - vm_object_lock(tmp_obj); - vm_object_unlock(chk_obj); + /* + * The madvise semantics require that the address range be fully + * allocated with no holes. Otherwise, we're required to return + * an error. + */ - chk_obj = tmp_obj; - } + if (!vm_map_range_check(map, start, end, &entry)) { + vm_map_unlock_read(map); + vm_page_stats_reusable.reusable_pages_failure++; + return KERN_INVALID_ADDRESS; } - return(ref_count); -} - - -/* - * Routine: vm_map_simplify - * - * Description: - * Attempt to simplify the map representation in - * the vicinity of the given starting address. - * Note: - * This routine is intended primarily to keep the - * kernel maps more compact -- they generally don't - * benefit from the "expand a map entry" technology - * at allocation time because the adjacent entry - * is often wired down. - */ -void -vm_map_simplify_entry( - vm_map_t map, - vm_map_entry_t this_entry) -{ - vm_map_entry_t prev_entry; - - counter(c_vm_map_simplify_entry_called++); - prev_entry = this_entry->vme_prev; + /* + * Examine each vm_map_entry_t in the range. + */ + for (; entry != vm_map_to_entry(map) && entry->vme_start < end; + entry = entry->vme_next) { + int kill_pages = 0; - if ((this_entry != vm_map_to_entry(map)) && - (prev_entry != vm_map_to_entry(map)) && + /* + * Sanity check on the VM map entry. + */ + if (!vm_map_entry_is_reusable(entry)) { + vm_map_unlock_read(map); + vm_page_stats_reusable.reusable_pages_failure++; + return KERN_INVALID_ADDRESS; + } - (prev_entry->vme_end == this_entry->vme_start) && + if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) { + /* not writable: can't discard contents */ + vm_map_unlock_read(map); + vm_page_stats_reusable.reusable_nonwritable++; + vm_page_stats_reusable.reusable_pages_failure++; + return KERN_PROTECTION_FAILURE; + } - (prev_entry->is_sub_map == this_entry->is_sub_map) && - (prev_entry->object.vm_object == this_entry->object.vm_object) && - ((prev_entry->offset + (prev_entry->vme_end - - prev_entry->vme_start)) - == this_entry->offset) && + /* + * The first time through, the start address could be anywhere + * within the vm_map_entry we found. So adjust the offset to + * correspond. 
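vm_map_reuse_pages() above pairs with vm_map_reusable_pages() further down: the second hands a range back to the system lazily, the first re-activates it before the caller touches it again. A user-space sketch of that round trip, assuming Darwin's MADV_FREE_REUSABLE / MADV_FREE_REUSE advice values feed the VM_BEHAVIOR_REUSABLE / VM_BEHAVIOR_REUSE cases visible in the removed vm_map_behavior_set() hunk below:

/* Sketch only: mark an anonymous range reusable, then reclaim it. */
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

int
main(void)
{
	size_t sz = 16 * 4096;
	void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		return 1;
	}
	memset(p, 0xAB, sz);

	/* Hand the pages back lazily (vm_map_reusable_pages() path). */
	if (madvise(p, sz, MADV_FREE_REUSABLE) != 0) {
		perror("MADV_FREE_REUSABLE");
	}

	/*
	 * Reclaim the range before touching it again (vm_map_reuse_pages()
	 * path). Contents are undefined at this point; the kernel was free
	 * to discard them while the range was marked reusable.
	 */
	if (madvise(p, sz, MADV_FREE_REUSE) != 0) {
		perror("MADV_FREE_REUSE");
	}
	memset(p, 0xCD, sz);

	munmap(p, sz);
	return 0;
}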
+ */ + if (entry->vme_start < start) { + start_offset = start - entry->vme_start; + pmap_offset = start; + } else { + start_offset = 0; + pmap_offset = entry->vme_start; + } + end_offset = MIN(end, entry->vme_end) - entry->vme_start; + start_offset += VME_OFFSET(entry); + end_offset += VME_OFFSET(entry); - (prev_entry->behavior == this_entry->behavior) && - (prev_entry->needs_copy == this_entry->needs_copy) && - (prev_entry->protection == this_entry->protection) && - (prev_entry->max_protection == this_entry->max_protection) && - (prev_entry->inheritance == this_entry->inheritance) && - (prev_entry->use_pmap == this_entry->use_pmap) && - (prev_entry->alias == this_entry->alias) && - (prev_entry->no_cache == this_entry->no_cache) && - (prev_entry->permanent == this_entry->permanent) && - (prev_entry->map_aligned == this_entry->map_aligned) && - (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) && - (prev_entry->used_for_jit == this_entry->used_for_jit) && - /* from_reserved_zone: OK if that field doesn't match */ - (prev_entry->iokit_acct == this_entry->iokit_acct) && + assert(!entry->is_sub_map); + object = VME_OBJECT(entry); + if (object == VM_OBJECT_NULL) { + continue; + } - (prev_entry->wired_count == this_entry->wired_count) && - (prev_entry->user_wired_count == this_entry->user_wired_count) && - (prev_entry->in_transition == FALSE) && - (this_entry->in_transition == FALSE) && - (prev_entry->needs_wakeup == FALSE) && - (this_entry->needs_wakeup == FALSE) && - (prev_entry->is_shared == FALSE) && - (this_entry->is_shared == FALSE) && - (prev_entry->superpage_size == FALSE) && - (this_entry->superpage_size == FALSE) - ) { - vm_map_store_entry_unlink(map, prev_entry); - assert(prev_entry->vme_start < this_entry->vme_end); - if (prev_entry->map_aligned) - assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start, - VM_MAP_PAGE_MASK(map))); - this_entry->vme_start = prev_entry->vme_start; - this_entry->offset = prev_entry->offset; - if (prev_entry->is_sub_map) { - vm_map_deallocate(prev_entry->object.sub_map); + vm_object_lock(object); + if (((object->ref_count == 1) || + (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC && + object->copy == VM_OBJECT_NULL)) && + object->shadow == VM_OBJECT_NULL && + /* + * "iokit_acct" entries are billed for their virtual size + * (rather than for their resident pages only), so they + * wouldn't benefit from making pages reusable, and it + * would be hard to keep track of pages that are both + * "iokit_acct" and "reusable" in the pmap stats and + * ledgers. + */ + !(entry->iokit_acct || + (!entry->is_sub_map && !entry->use_pmap))) { + if (object->ref_count != 1) { + vm_page_stats_reusable.reusable_shared++; + } + kill_pages = 1; } else { - vm_object_deallocate(prev_entry->object.vm_object); + kill_pages = -1; + } + if (kill_pages != -1) { + vm_object_deactivate_pages(object, + start_offset, + end_offset - start_offset, + kill_pages, + TRUE /*reusable_pages*/, + map->pmap, + pmap_offset); + } else { + vm_page_stats_reusable.reusable_pages_shared++; + } + vm_object_unlock(object); + + if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE || + VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) { + /* + * XXX + * We do not hold the VM map exclusively here. + * The "alias" field is not that critical, so it's + * safe to update it here, as long as it is the only + * one that can be modified while holding the VM map + * "shared". 
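One consequence of the checks in vm_map_reusable_pages() above: a range that is not writable (and not used_for_jit) is rejected with KERN_PROTECTION_FAILURE rather than silently skipped. A sketch of that failure mode, with the exact errno surfaced by madvise() treated as an assumption:

/* Sketch: a read-only range should make MADV_FREE_REUSABLE fail. */
#include <sys/mman.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

int
main(void)
{
	size_t sz = 4096;
	void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		return 1;
	}
	memset(p, 0x55, sz);

	/* Drop write permission: entry->protection loses VM_PROT_WRITE. */
	mprotect(p, sz, PROT_READ);

	if (madvise(p, sz, MADV_FREE_REUSABLE) != 0) {
		/* Expected: the KERN_PROTECTION_FAILURE case above. */
		printf("MADV_FREE_REUSABLE rejected: %s\n", strerror(errno));
	}

	munmap(p, sz);
	return 0;
}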
+ */ + VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE); } - vm_map_entry_dispose(map, prev_entry); - SAVE_HINT_MAP_WRITE(map, this_entry); - counter(c_vm_map_simplified++); } + + vm_map_unlock_read(map); + vm_page_stats_reusable.reusable_pages_success++; + return KERN_SUCCESS; } -void -vm_map_simplify( - vm_map_t map, - vm_map_offset_t start) + +static kern_return_t +vm_map_can_reuse( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) { - vm_map_entry_t this_entry; + vm_map_entry_t entry; - vm_map_lock(map); - if (vm_map_lookup_entry(map, start, &this_entry)) { - vm_map_simplify_entry(map, this_entry); - vm_map_simplify_entry(map, this_entry->vme_next); - } - counter(c_vm_map_simplify_called++); - vm_map_unlock(map); -} + /* + * The MADV_REUSABLE operation doesn't require any changes to the + * vm_map_entry_t's, so the read lock is sufficient. + */ -static void -vm_map_simplify_range( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end) -{ - vm_map_entry_t entry; + vm_map_lock_read(map); + assert(map->pmap != kernel_pmap); /* protect alias access */ /* - * The map should be locked (for "write") by the caller. + * The madvise semantics require that the address range be fully + * allocated with no holes. Otherwise, we're required to return + * an error. */ - if (start >= end) { - /* invalid address range */ - return; + if (!vm_map_range_check(map, start, end, &entry)) { + vm_map_unlock_read(map); + vm_page_stats_reusable.can_reuse_failure++; + return KERN_INVALID_ADDRESS; } - start = vm_map_trunc_page(start, - VM_MAP_PAGE_MASK(map)); - end = vm_map_round_page(end, - VM_MAP_PAGE_MASK(map)); - - if (!vm_map_lookup_entry(map, start, &entry)) { - /* "start" is not mapped and "entry" ends before "start" */ - if (entry == vm_map_to_entry(map)) { - /* start with first entry in the map */ - entry = vm_map_first_entry(map); - } else { - /* start with next entry */ - entry = entry->vme_next; + /* + * Examine each vm_map_entry_t in the range. + */ + for (; entry != vm_map_to_entry(map) && entry->vme_start < end; + entry = entry->vme_next) { + /* + * Sanity check on the VM map entry. + */ + if (!vm_map_entry_is_reusable(entry)) { + vm_map_unlock_read(map); + vm_page_stats_reusable.can_reuse_failure++; + return KERN_INVALID_ADDRESS; } } - - while (entry != vm_map_to_entry(map) && - entry->vme_start <= end) { - /* try and coalesce "entry" with its previous entry */ - vm_map_simplify_entry(map, entry); - entry = entry->vme_next; - } + + vm_map_unlock_read(map); + vm_page_stats_reusable.can_reuse_success++; + return KERN_SUCCESS; } -/* - * Routine: vm_map_machine_attribute - * Purpose: - * Provide machine-specific attributes to mappings, - * such as cachability etc. for machines that provide - * them. NUMA architectures and machines with big/strange - * caches will use this. - * Note: - * Responsibilities for locking and checking are handled here, - * everything else in the pmap module. If any non-volatile - * information must be kept, the pmap module should handle - * it itself. 
[This assumes that attributes do not - * need to be inherited, which seems ok to me] - */ -kern_return_t -vm_map_machine_attribute( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - vm_machine_attribute_t attribute, - vm_machine_attribute_val_t* value) /* IN/OUT */ +#if MACH_ASSERT +static kern_return_t +vm_map_pageout( + vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) { - kern_return_t ret; - vm_map_size_t sync_size; - vm_map_entry_t entry; - - if (start < vm_map_min(map) || end > vm_map_max(map)) - return KERN_INVALID_ADDRESS; + vm_map_entry_t entry; - /* Figure how much memory we need to flush (in page increments) */ - sync_size = end - start; + /* + * The MADV_PAGEOUT operation doesn't require any changes to the + * vm_map_entry_t's, so the read lock is sufficient. + */ - vm_map_lock(map); - - if (attribute != MATTR_CACHE) { - /* If we don't have to find physical addresses, we */ - /* don't have to do an explicit traversal here. */ - ret = pmap_attribute(map->pmap, start, end-start, - attribute, value); - vm_map_unlock(map); - return ret; + vm_map_lock_read(map); + + /* + * The madvise semantics require that the address range be fully + * allocated with no holes. Otherwise, we're required to return + * an error. + */ + + if (!vm_map_range_check(map, start, end, &entry)) { + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; } - ret = KERN_SUCCESS; /* Assume it all worked */ + /* + * Examine each vm_map_entry_t in the range. + */ + for (; entry != vm_map_to_entry(map) && entry->vme_start < end; + entry = entry->vme_next) { + vm_object_t object; - while(sync_size) { - if (vm_map_lookup_entry(map, start, &entry)) { - vm_map_size_t sub_size; - if((entry->vme_end - start) > sync_size) { - sub_size = sync_size; - sync_size = 0; - } else { - sub_size = entry->vme_end - start; - sync_size -= sub_size; + /* + * Sanity check on the VM map entry. 
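vm_map_pageout(), whose per-entry walk continues below, is compiled only on kernels built with MACH_ASSERT; on such kernels it is reachable through the MADV_PAGEOUT advice named in its comment. A sketch, hedged accordingly (both the constant's availability and the behavior on a RELEASE kernel are assumptions; the call is expected to fail there):

/* Sketch only: meaningful on DEVELOPMENT/DEBUG (MACH_ASSERT) kernels. */
#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

int
main(void)
{
	size_t sz = 64 * 4096;
	void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED) {
		return 1;
	}
	memset(p, 1, sz);              /* make the pages resident and dirty */

#ifdef MADV_PAGEOUT
	/* Ask the kernel to push the anonymous pages out (vm_object_pageout()). */
	if (madvise(p, sz, MADV_PAGEOUT) != 0) {
		perror("MADV_PAGEOUT");    /* expected on RELEASE kernels */
	}
#endif

	munmap(p, sz);
	return 0;
}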
+ */ + if (entry->is_sub_map) { + vm_map_t submap; + vm_map_offset_t submap_start; + vm_map_offset_t submap_end; + vm_map_entry_t submap_entry; + + submap = VME_SUBMAP(entry); + submap_start = VME_OFFSET(entry); + submap_end = submap_start + (entry->vme_end - + entry->vme_start); + + vm_map_lock_read(submap); + + if (!vm_map_range_check(submap, + submap_start, + submap_end, + &submap_entry)) { + vm_map_unlock_read(submap); + vm_map_unlock_read(map); + return KERN_INVALID_ADDRESS; } - if(entry->is_sub_map) { - vm_map_offset_t sub_start; - vm_map_offset_t sub_end; - - sub_start = (start - entry->vme_start) - + entry->offset; - sub_end = sub_start + sub_size; - vm_map_machine_attribute( - entry->object.sub_map, - sub_start, - sub_end, - attribute, value); - } else { - if(entry->object.vm_object) { - vm_page_t m; - vm_object_t object; - vm_object_t base_object; - vm_object_t last_object; - vm_object_offset_t offset; - vm_object_offset_t base_offset; - vm_map_size_t range; - range = sub_size; - offset = (start - entry->vme_start) - + entry->offset; - base_offset = offset; - object = entry->object.vm_object; - base_object = object; - last_object = NULL; - vm_object_lock(object); + object = VME_OBJECT(submap_entry); + if (submap_entry->is_sub_map || + object == VM_OBJECT_NULL || + !object->internal) { + vm_map_unlock_read(submap); + continue; + } - while (range) { - m = vm_page_lookup( - object, offset); + vm_object_pageout(object); - if (m && !m->fictitious) { - ret = - pmap_attribute_cache_sync( - m->phys_page, - PAGE_SIZE, - attribute, value); - - } else if (object->shadow) { - offset = offset + object->vo_shadow_offset; - last_object = object; - object = object->shadow; - vm_object_lock(last_object->shadow); - vm_object_unlock(last_object); - continue; - } - range -= PAGE_SIZE; + vm_map_unlock_read(submap); + submap = VM_MAP_NULL; + submap_entry = VM_MAP_ENTRY_NULL; + continue; + } - if (base_object != object) { - vm_object_unlock(object); - vm_object_lock(base_object); - object = base_object; - } - /* Bump to the next page */ - base_offset += PAGE_SIZE; - offset = base_offset; - } - vm_object_unlock(object); - } - } - start += sub_size; - } else { - vm_map_unlock(map); - return KERN_FAILURE; + object = VME_OBJECT(entry); + if (entry->is_sub_map || + object == VM_OBJECT_NULL || + !object->internal) { + continue; } - - } - vm_map_unlock(map); + vm_object_pageout(object); + } - return ret; + vm_map_unlock_read(map); + return KERN_SUCCESS; } +#endif /* MACH_ASSERT */ + /* - * vm_map_behavior_set: + * Routine: vm_map_entry_insert * - * Sets the paging reference behavior of the specified address - * range in the target map. Paging reference behavior affects - * how pagein operations resulting from faults on the map will be - * clustered. + * Description: This routine inserts a new vm_entry in a locked map. 
*/ -kern_return_t -vm_map_behavior_set( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end, - vm_behavior_t new_behavior) +vm_map_entry_t +vm_map_entry_insert( + vm_map_t map, + vm_map_entry_t insp_entry, + vm_map_offset_t start, + vm_map_offset_t end, + vm_object_t object, + vm_object_offset_t offset, + vm_map_kernel_flags_t vmk_flags, + boolean_t needs_copy, + boolean_t is_shared, + boolean_t in_transition, + vm_prot_t cur_protection, + vm_prot_t max_protection, + vm_behavior_t behavior, + vm_inherit_t inheritance, + unsigned short wired_count, + boolean_t no_cache, + boolean_t permanent, + boolean_t no_copy_on_read, + unsigned int superpage_size, + boolean_t clear_map_aligned, + boolean_t is_submap, + boolean_t used_for_jit, + int alias, + boolean_t translated_allow_execute) { - register vm_map_entry_t entry; - vm_map_entry_t temp_entry; + vm_map_entry_t new_entry; - XPR(XPR_VM_MAP, - "vm_map_behavior_set, 0x%X start 0x%X end 0x%X behavior %d", - map, start, end, new_behavior, 0); + assert(insp_entry != (vm_map_entry_t)0); + vm_map_lock_assert_exclusive(map); - if (start > end || - start < vm_map_min(map) || - end > vm_map_max(map)) { - return KERN_NO_SPACE; - } +#if DEVELOPMENT || DEBUG + vm_object_offset_t end_offset = 0; + assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset); +#endif /* DEVELOPMENT || DEBUG */ - switch (new_behavior) { + new_entry = vm_map_entry_create(map, !map->hdr.entries_pageable); - /* - * This first block of behaviors all set a persistent state on the specified - * memory range. All we have to do here is to record the desired behavior - * in the vm_map_entry_t's. - */ + if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) { + new_entry->map_aligned = TRUE; + } else { + new_entry->map_aligned = FALSE; + } + if (clear_map_aligned && + (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) || + !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) { + new_entry->map_aligned = FALSE; + } - case VM_BEHAVIOR_DEFAULT: - case VM_BEHAVIOR_RANDOM: - case VM_BEHAVIOR_SEQUENTIAL: - case VM_BEHAVIOR_RSEQNTL: - case VM_BEHAVIOR_ZERO_WIRED_PAGES: - vm_map_lock(map); - + new_entry->vme_start = start; + new_entry->vme_end = end; + if (new_entry->map_aligned) { + assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, + VM_MAP_PAGE_MASK(map))); + assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, + VM_MAP_PAGE_MASK(map))); + } else { + assert(page_aligned(new_entry->vme_start)); + assert(page_aligned(new_entry->vme_end)); + } + assert(new_entry->vme_start < new_entry->vme_end); + + VME_OBJECT_SET(new_entry, object); + VME_OFFSET_SET(new_entry, offset); + new_entry->is_shared = is_shared; + new_entry->is_sub_map = is_submap; + new_entry->needs_copy = needs_copy; + new_entry->in_transition = in_transition; + new_entry->needs_wakeup = FALSE; + new_entry->inheritance = inheritance; + new_entry->protection = cur_protection; + new_entry->max_protection = max_protection; + new_entry->behavior = behavior; + new_entry->wired_count = wired_count; + new_entry->user_wired_count = 0; + if (is_submap) { /* - * The entire address range must be valid for the map. - * Note that vm_map_range_check() does a - * vm_map_lookup_entry() internally and returns the - * entry containing the start of the address range if - * the entire range is valid. + * submap: "use_pmap" means "nested". + * default: false. 
*/ - if (vm_map_range_check(map, start, end, &temp_entry)) { - entry = temp_entry; - vm_map_clip_start(map, entry, start); - } - else { - vm_map_unlock(map); - return(KERN_INVALID_ADDRESS); - } - - while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) { - vm_map_clip_end(map, entry, end); - if (entry->is_sub_map) { - assert(!entry->use_pmap); - } - - if( new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES ) { - entry->zero_wired_pages = TRUE; - } else { - entry->behavior = new_behavior; - } - entry = entry->vme_next; + new_entry->use_pmap = FALSE; + } else { + /* + * object: "use_pmap" means "use pmap accounting" for footprint. + * default: true. + */ + new_entry->use_pmap = TRUE; + } + VME_ALIAS_SET(new_entry, alias); + new_entry->zero_wired_pages = FALSE; + new_entry->no_cache = no_cache; + new_entry->permanent = permanent; + if (superpage_size) { + new_entry->superpage_size = TRUE; + } else { + new_entry->superpage_size = FALSE; + } + if (used_for_jit) { + if (!(map->jit_entry_exists) || + VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) { + new_entry->used_for_jit = TRUE; + map->jit_entry_exists = TRUE; } - - vm_map_unlock(map); - break; + } else { + new_entry->used_for_jit = FALSE; + } + if (translated_allow_execute) { + new_entry->translated_allow_execute = TRUE; + } else { + new_entry->translated_allow_execute = FALSE; + } + new_entry->pmap_cs_associated = FALSE; + new_entry->iokit_acct = FALSE; + new_entry->vme_resilient_codesign = FALSE; + new_entry->vme_resilient_media = FALSE; + new_entry->vme_atomic = FALSE; + new_entry->vme_no_copy_on_read = no_copy_on_read; /* - * The rest of these are different from the above in that they cause - * an immediate action to take place as opposed to setting a behavior that - * affects future actions. + * Insert the new entry into the list. */ - case VM_BEHAVIOR_WILLNEED: - return vm_map_willneed(map, start, end); - - case VM_BEHAVIOR_DONTNEED: - return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS); - - case VM_BEHAVIOR_FREE: - return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS); - - case VM_BEHAVIOR_REUSABLE: - return vm_map_reusable_pages(map, start, end); - - case VM_BEHAVIOR_REUSE: - return vm_map_reuse_pages(map, start, end); - - case VM_BEHAVIOR_CAN_REUSE: - return vm_map_can_reuse(map, start, end); + vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags); + map->size += end - start; - default: - return(KERN_INVALID_ARGUMENT); - } + /* + * Update the free space hint and the lookup hint. + */ - return(KERN_SUCCESS); + SAVE_HINT_MAP_WRITE(map, new_entry); + return new_entry; } - /* - * Internals for madvise(MADV_WILLNEED) system call. + * Routine: vm_map_remap_extract * - * The present implementation is to do a read-ahead if the mapping corresponds - * to a mapped regular file. If it's an anonymous mapping, then we do nothing - * and basically ignore the "advice" (which we are always free to do). + * Description: This routine returns a vm_entry list from a map. */ - - static kern_return_t -vm_map_willneed( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end -) +vm_map_remap_extract( + vm_map_t map, + vm_map_offset_t addr, + vm_map_size_t size, + boolean_t copy, + struct vm_map_header *map_header, + vm_prot_t *cur_protection, /* IN/OUT */ + vm_prot_t *max_protection, /* IN/OUT */ + /* What, no behavior? 
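Stepping back to vm_map_entry_insert() above: the protection, max protection, inheritance and behavior it records for a freshly linked entry are observable from user space through mach_vm_region(). A small sketch, assuming the standard VM_REGION_BASIC_INFO_64 flavor:

/* Sketch: read back the fields vm_map_entry_insert() populated. */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_region.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t    size = 4096;

	if (mach_vm_allocate(mach_task_self(), &addr, size,
	    VM_FLAGS_ANYWHERE) != KERN_SUCCESS) {
		return 1;
	}

	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_vm_address_t q_addr = addr;
	mach_vm_size_t    q_size = 0;
	mach_port_t       object_name = MACH_PORT_NULL;

	if (mach_vm_region(mach_task_self(), &q_addr, &q_size,
	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
	    &count, &object_name) == KERN_SUCCESS) {
		printf("prot=0x%x max=0x%x inheritance=%u behavior=%d\n",
		    info.protection, info.max_protection,
		    info.inheritance, info.behavior);
	}

	mach_vm_deallocate(mach_task_self(), addr, size);
	return 0;
}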
*/ + vm_inherit_t inheritance, + vm_map_kernel_flags_t vmk_flags) { - vm_map_entry_t entry; - vm_object_t object; - memory_object_t pager; - struct vm_object_fault_info fault_info; - kern_return_t kr; - vm_object_size_t len; - vm_object_offset_t offset; - - /* - * Fill in static values in fault_info. Several fields get ignored by the code - * we call, but we'll fill them in anyway since uninitialized fields are bad - * when it comes to future backwards compatibility. - */ + kern_return_t result; + vm_map_size_t mapped_size; + vm_map_size_t tmp_size; + vm_map_entry_t src_entry; /* result of last map lookup */ + vm_map_entry_t new_entry; + vm_object_offset_t offset; + vm_map_offset_t map_address; + vm_map_offset_t src_start; /* start of entry to map */ + vm_map_offset_t src_end; /* end of region to be mapped */ + vm_object_t object; + vm_map_version_t version; + boolean_t src_needs_copy; + boolean_t new_entry_needs_copy; + vm_map_entry_t saved_src_entry; + boolean_t src_entry_was_wired; + vm_prot_t max_prot_for_prot_copy; + vm_map_offset_t effective_page_mask; + boolean_t pageable, same_map; + boolean_t vm_remap_legacy; + vm_prot_t required_cur_prot, required_max_prot; + + pageable = vmk_flags.vmkf_copy_pageable; + same_map = vmk_flags.vmkf_copy_same_map; + + effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map)); - fault_info.interruptible = THREAD_UNINT; /* ignored value */ - fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; - fault_info.no_cache = FALSE; /* ignored value */ - fault_info.stealth = TRUE; - fault_info.io_sync = FALSE; - fault_info.cs_bypass = FALSE; - fault_info.mark_zf_absent = FALSE; - fault_info.batch_pmap_op = FALSE; + assert(map != VM_MAP_NULL); + assert(size != 0); + assert(size == vm_map_round_page(size, effective_page_mask)); + assert(inheritance == VM_INHERIT_NONE || + inheritance == VM_INHERIT_COPY || + inheritance == VM_INHERIT_SHARE); + assert(!(*cur_protection & ~VM_PROT_ALL)); + assert(!(*max_protection & ~VM_PROT_ALL)); + assert((*cur_protection & *max_protection) == *cur_protection); /* - * The MADV_WILLNEED operation doesn't require any changes to the - * vm_map_entry_t's, so the read lock is sufficient. + * Compute start and end of region. */ - - vm_map_lock_read(map); + src_start = vm_map_trunc_page(addr, effective_page_mask); + src_end = vm_map_round_page(src_start + size, effective_page_mask); /* - * The madvise semantics require that the address range be fully - * allocated with no holes. Otherwise, we're required to return - * an error. + * Initialize map_header. */ + map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links); + map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links); + map_header->nentries = 0; + map_header->entries_pageable = pageable; +// map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT); + map_header->page_shift = VM_MAP_PAGE_SHIFT(map); + map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE; - if (! vm_map_range_check(map, start, end, &entry)) { - vm_map_unlock_read(map); - return KERN_INVALID_ADDRESS; - } + vm_map_store_init( map_header ); - /* - * Examine each vm_map_entry_t in the range. - */ - for (; entry != vm_map_to_entry(map) && start < end; ) { - + if (copy && vmk_flags.vmkf_remap_prot_copy) { /* - * The first time through, the start address could be anywhere - * within the vm_map_entry we found. So adjust the offset to - * correspond. After that, the offset will always be zero to - * correspond to the beginning of the current vm_map_entry. 
+ * Special case for vm_map_protect(VM_PROT_COPY): + * we want to set the new mappings' max protection to the + * specified *max_protection... */ - offset = (start - entry->vme_start) + entry->offset; + max_prot_for_prot_copy = *max_protection & VM_PROT_ALL; + /* ... but we want to use the vm_remap() legacy mode */ + *max_protection = VM_PROT_NONE; + *cur_protection = VM_PROT_NONE; + } else { + max_prot_for_prot_copy = VM_PROT_NONE; + } + if (*cur_protection == VM_PROT_NONE && + *max_protection == VM_PROT_NONE) { /* - * Set the length so we don't go beyond the end of the - * map_entry or beyond the end of the range we were given. - * This range could span also multiple map entries all of which - * map different files, so make sure we only do the right amount - * of I/O for each object. Note that it's possible for there - * to be multiple map entries all referring to the same object - * but with different page permissions, but it's not worth - * trying to optimize that case. + * vm_remap() legacy mode: + * Extract all memory regions in the specified range and + * collect the strictest set of protections allowed on the + * entire range, so the caller knows what they can do with + * the remapped range. + * We start with VM_PROT_ALL and we'll remove the protections + * missing from each memory region. */ - len = MIN(entry->vme_end - start, end - start); - - if ((vm_size_t) len != len) { - /* 32-bit overflow */ - len = (vm_size_t) (0 - PAGE_SIZE); - } - fault_info.cluster_size = (vm_size_t) len; - fault_info.lo_offset = offset; - fault_info.hi_offset = offset + len; - fault_info.user_tag = entry->alias; - fault_info.pmap_options = 0; - if (entry->iokit_acct || - (!entry->is_sub_map && !entry->use_pmap)) { - fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT; - } - + vm_remap_legacy = TRUE; + *cur_protection = VM_PROT_ALL; + *max_protection = VM_PROT_ALL; + required_cur_prot = VM_PROT_NONE; + required_max_prot = VM_PROT_NONE; + } else { /* - * If there's no read permission to this mapping, then just - * skip it. + * vm_remap_new() mode: + * Extract all memory regions in the specified range and + * ensure that they have at least the protections specified + * by the caller via *cur_protection and *max_protection. + * The resulting mapping should have these protections. */ - if ((entry->protection & VM_PROT_READ) == 0) { - entry = entry->vme_next; - start = entry->vme_start; - continue; + vm_remap_legacy = FALSE; + if (copy) { + required_cur_prot = VM_PROT_NONE; + required_max_prot = VM_PROT_READ; + } else { + required_cur_prot = *cur_protection; + required_max_prot = *max_protection; } + } + + map_address = 0; + mapped_size = 0; + result = KERN_SUCCESS; + /* + * The specified source virtual space might correspond to + * multiple map entries, need to loop on them. + */ + vm_map_lock(map); + if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { /* - * Find the file object backing this map entry. If there is - * none, then we simply ignore the "will need" advice for this - * entry and go on to the next one. + * This address space uses sub-pages so the range might + * not be re-mappable in an address space with larger + * pages. Re-assemble any broken-up VM map entries to + * improve our chances of making it work. 
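The legacy mode selected above (both incoming protections VM_PROT_NONE) starts from VM_PROT_ALL and intersects away whatever each extracted entry does not allow, then reports the result to the caller. From user space that corresponds to mach_vm_remap(), whose protection parameters come back filled in; a sketch, assuming a same-task remap:

/* Sketch: remap a private page within the same task and observe the
 * cur/max protections reported back by the legacy extraction mode. */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t src = 0, dst = 0;
	mach_vm_size_t    size = 4096;
	vm_prot_t         cur = VM_PROT_NONE, max = VM_PROT_NONE;
	kern_return_t     kr;

	mach_vm_allocate(mach_task_self(), &src, size, VM_FLAGS_ANYWHERE);

	kr = mach_vm_remap(mach_task_self(), &dst, size, 0,
	    VM_FLAGS_ANYWHERE,
	    mach_task_self(), src,
	    FALSE,               /* share rather than copy */
	    &cur, &max, VM_INHERIT_DEFAULT);

	if (kr == KERN_SUCCESS) {
		/* Strictest protections found across the extracted range. */
		printf("remapped at 0x%llx cur=0x%x max=0x%x\n",
		    (unsigned long long)dst, cur, max);
	}
	return 0;
}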
*/ - if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) { - entry = entry->vme_next; - start = entry->vme_start; - continue; - } + vm_map_simplify_range(map, src_start, src_end); + } + while (mapped_size != size) { + vm_map_size_t entry_size; /* - * The data_request() could take a long time, so let's - * release the map lock to avoid blocking other threads. + * Find the beginning of the region. */ - vm_map_unlock_read(map); + if (!vm_map_lookup_entry(map, src_start, &src_entry)) { + result = KERN_INVALID_ADDRESS; + break; + } - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); + if (src_start < src_entry->vme_start || + (mapped_size && src_start != src_entry->vme_start)) { + result = KERN_INVALID_ADDRESS; + break; + } - /* - * Get the data from the object asynchronously. - * - * Note that memory_object_data_request() places limits on the - * amount of I/O it will do. Regardless of the len we - * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it - * silently truncates the len to that size. This isn't - * necessarily bad since madvise shouldn't really be used to - * page in unlimited amounts of data. Other Unix variants - * limit the willneed case as well. If this turns out to be an - * issue for developers, then we can always adjust the policy - * here and still be backwards compatible since this is all - * just "advice". - */ - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* ignored */ - VM_PROT_READ, - (memory_object_fault_info_t)&fault_info); + tmp_size = size - mapped_size; + if (src_end > src_entry->vme_end) { + tmp_size -= (src_end - src_entry->vme_end); + } - vm_object_lock(object); - vm_object_paging_end(object); - vm_object_unlock(object); + entry_size = (vm_map_size_t)(src_entry->vme_end - + src_entry->vme_start); - /* - * If we couldn't do the I/O for some reason, just give up on - * the madvise. We still return success to the user since - * madvise isn't supposed to fail when the advice can't be - * taken. - */ - if (kr != KERN_SUCCESS) { - return KERN_SUCCESS; + if (src_entry->is_sub_map && + vmk_flags.vmkf_copy_single_object) { + vm_map_t submap; + vm_map_offset_t submap_start; + vm_map_size_t submap_size; + boolean_t submap_needs_copy; + + /* + * No check for "required protection" on "src_entry" + * because the protections that matter are the ones + * on the submap's VM map entry, which will be checked + * during the call to vm_map_remap_extract() below. + */ + submap_size = src_entry->vme_end - src_start; + if (submap_size > size) { + submap_size = size; + } + submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start; + submap = VME_SUBMAP(src_entry); + if (copy) { + /* + * The caller wants a copy-on-write re-mapping, + * so let's extract from the submap accordingly. + */ + submap_needs_copy = TRUE; + } else if (src_entry->needs_copy) { + /* + * The caller wants a shared re-mapping but the + * submap is mapped with "needs_copy", so its + * contents can't be shared as is. Extract the + * contents of the submap as "copy-on-write". + * The re-mapping won't be shared with the + * original mapping but this is equivalent to + * what happened with the original "remap from + * submap" code. + * The shared region is mapped "needs_copy", for + * example. + */ + submap_needs_copy = TRUE; + } else { + /* + * The caller wants a shared re-mapping and + * this mapping can be shared (no "needs_copy"), + * so let's extract from the submap accordingly. 
+ * Kernel submaps are mapped without + * "needs_copy", for example. + */ + submap_needs_copy = FALSE; + } + vm_map_reference(submap); + vm_map_unlock(map); + src_entry = NULL; + if (vm_remap_legacy) { + *cur_protection = VM_PROT_NONE; + *max_protection = VM_PROT_NONE; + } + + DTRACE_VM7(remap_submap_recurse, + vm_map_t, map, + vm_map_offset_t, addr, + vm_map_size_t, size, + boolean_t, copy, + vm_map_offset_t, submap_start, + vm_map_size_t, submap_size, + boolean_t, submap_needs_copy); + + result = vm_map_remap_extract(submap, + submap_start, + submap_size, + submap_needs_copy, + map_header, + cur_protection, + max_protection, + inheritance, + vmk_flags); + vm_map_deallocate(submap); + return result; } - start += len; - if (start >= end) { - /* done */ - return KERN_SUCCESS; + if (src_entry->is_sub_map) { + /* protections for submap mapping are irrelevant here */ + } else if (((src_entry->protection & required_cur_prot) != + required_cur_prot) || + ((src_entry->max_protection & required_max_prot) != + required_max_prot)) { + if (vmk_flags.vmkf_copy_single_object && + mapped_size != 0) { + /* + * Single object extraction. + * We can't extract more with the required + * protection but we've extracted some, so + * stop there and declare success. + * The caller should check the size of + * the copy entry we've extracted. + */ + result = KERN_SUCCESS; + } else { + /* + * VM range extraction. + * Required proctection is not available + * for this part of the range: fail. + */ + result = KERN_PROTECTION_FAILURE; + } + break; } - /* look up next entry */ - vm_map_lock_read(map); - if (! vm_map_lookup_entry(map, start, &entry)) { + if (src_entry->is_sub_map) { + vm_map_t submap; + vm_map_offset_t submap_start; + vm_map_size_t submap_size; + vm_map_copy_t submap_copy; + vm_prot_t submap_curprot, submap_maxprot; + boolean_t submap_needs_copy; + /* - * There's a new hole in the address range. + * No check for "required protection" on "src_entry" + * because the protections that matter are the ones + * on the submap's VM map entry, which will be checked + * during the call to vm_map_copy_extract() below. */ - vm_map_unlock_read(map); - return KERN_INVALID_ADDRESS; - } - } + object = VM_OBJECT_NULL; + submap_copy = VM_MAP_COPY_NULL; - vm_map_unlock_read(map); - return KERN_SUCCESS; -} + /* find equivalent range in the submap */ + submap = VME_SUBMAP(src_entry); + submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start; + submap_size = tmp_size; + if (copy) { + /* + * The caller wants a copy-on-write re-mapping, + * so let's extract from the submap accordingly. + */ + submap_needs_copy = TRUE; + } else if (src_entry->needs_copy) { + /* + * The caller wants a shared re-mapping but the + * submap is mapped with "needs_copy", so its + * contents can't be shared as is. Extract the + * contents of the submap as "copy-on-write". + * The re-mapping won't be shared with the + * original mapping but this is equivalent to + * what happened with the original "remap from + * submap" code. + * The shared region is mapped "needs_copy", for + * example. + */ + submap_needs_copy = TRUE; + } else { + /* + * The caller wants a shared re-mapping and + * this mapping can be shared (no "needs_copy"), + * so let's extract from the submap accordingly. + * Kernel submaps are mapped without + * "needs_copy", for example. 
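The "shared region is mapped needs_copy" case discussed above can be exercised by remapping a page that lives in the dyld shared cache: even a copy=FALSE request degrades to a copy-on-write extraction of the submap's contents. A sketch; treating getpid()'s text page as a shared-cache resident is an assumption:

/* Sketch: remap the page holding getpid()'s code and compare the bytes. */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t src, dst = 0;
	mach_vm_size_t    size = (mach_vm_size_t)getpagesize();
	vm_prot_t         cur = VM_PROT_NONE, max = VM_PROT_NONE;

	/* Page containing library text, expected to come from a submap. */
	src = ((mach_vm_address_t)(uintptr_t)&getpid) & ~(size - 1);

	if (mach_vm_remap(mach_task_self(), &dst, size, 0,
	    VM_FLAGS_ANYWHERE, mach_task_self(), src,
	    FALSE,                     /* shared request ... */
	    &cur, &max, VM_INHERIT_DEFAULT) != KERN_SUCCESS) {
		return 1;
	}

	/* ... but per the needs_copy logic the result is effectively COW. */
	printf("bytes identical: %d (cur=0x%x max=0x%x)\n",
	    memcmp((void *)(uintptr_t)src, (void *)(uintptr_t)dst, 16) == 0,
	    cur, max);
	return 0;
}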
+ */ + submap_needs_copy = FALSE; + } + /* extra ref to keep submap alive */ + vm_map_reference(submap); -static boolean_t -vm_map_entry_is_reusable( - vm_map_entry_t entry) -{ - vm_object_t object; + DTRACE_VM7(remap_submap_recurse, + vm_map_t, map, + vm_map_offset_t, addr, + vm_map_size_t, size, + boolean_t, copy, + vm_map_offset_t, submap_start, + vm_map_size_t, submap_size, + boolean_t, submap_needs_copy); - switch (entry->alias) { - case VM_MEMORY_MALLOC: - case VM_MEMORY_MALLOC_SMALL: - case VM_MEMORY_MALLOC_LARGE: - case VM_MEMORY_REALLOC: - case VM_MEMORY_MALLOC_TINY: - case VM_MEMORY_MALLOC_LARGE_REUSABLE: - case VM_MEMORY_MALLOC_LARGE_REUSED: - /* - * This is a malloc() memory region: check if it's still - * in its original state and can be re-used for more - * malloc() allocations. - */ - break; - default: - /* - * Not a malloc() memory region: let the caller decide if - * it's re-usable. - */ - return TRUE; - } + /* + * The map can be safely unlocked since we + * already hold a reference on the submap. + * + * No timestamp since we don't care if the map + * gets modified while we're down in the submap. + * We'll resume the extraction at src_start + tmp_size + * anyway. + */ + vm_map_unlock(map); + src_entry = NULL; /* not valid once map is unlocked */ - if (entry->is_shared || - entry->is_sub_map || - entry->in_transition || - entry->protection != VM_PROT_DEFAULT || - entry->max_protection != VM_PROT_ALL || - entry->inheritance != VM_INHERIT_DEFAULT || - entry->no_cache || - entry->permanent || - entry->superpage_size != FALSE || - entry->zero_wired_pages || - entry->wired_count != 0 || - entry->user_wired_count != 0) { - return FALSE; - } + if (vm_remap_legacy) { + submap_curprot = VM_PROT_NONE; + submap_maxprot = VM_PROT_NONE; + if (max_prot_for_prot_copy) { + submap_maxprot = max_prot_for_prot_copy; + } + } else { + assert(!max_prot_for_prot_copy); + submap_curprot = *cur_protection; + submap_maxprot = *max_protection; + } + result = vm_map_copy_extract(submap, + submap_start, + submap_size, + submap_needs_copy, + &submap_copy, + &submap_curprot, + &submap_maxprot, + inheritance, + vmk_flags); + + /* release extra ref on submap */ + vm_map_deallocate(submap); + submap = VM_MAP_NULL; - object = entry->object.vm_object; - if (object == VM_OBJECT_NULL) { - return TRUE; - } - if ( -#if 0 - /* - * Let's proceed even if the VM object is potentially - * shared. - * We check for this later when processing the actual - * VM pages, so the contents will be safe if shared. - * - * But we can still mark this memory region as "reusable" to - * acknowledge that the caller did let us know that the memory - * could be re-used and should not be penalized for holding - * on to it. This allows its "resident size" to not include - * the reusable range. 
- */ - object->ref_count == 1 && -#endif - object->wired_page_count == 0 && - object->copy == VM_OBJECT_NULL && - object->shadow == VM_OBJECT_NULL && - object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC && - object->internal && - !object->true_share && - object->wimg_bits == VM_WIMG_USE_DEFAULT && - !object->code_signed) { - return TRUE; - } - return FALSE; - - -} + if (result != KERN_SUCCESS) { + vm_map_lock(map); + break; + } -static kern_return_t -vm_map_reuse_pages( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end) -{ - vm_map_entry_t entry; - vm_object_t object; - vm_object_offset_t start_offset, end_offset; + /* transfer submap_copy entries to map_header */ + while (vm_map_copy_first_entry(submap_copy) != + vm_map_copy_to_entry(submap_copy)) { + vm_map_entry_t copy_entry; + vm_map_size_t copy_entry_size; + + copy_entry = vm_map_copy_first_entry(submap_copy); + assert(!copy_entry->is_sub_map); + object = VME_OBJECT(copy_entry); + + /* + * Prevent kernel_object from being exposed to + * user space. + */ + if (__improbable(object == kernel_object)) { + printf("%d[%s]: rejecting attempt to extract from kernel_object\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?")); + DTRACE_VM(extract_kernel_only); + result = KERN_INVALID_RIGHT; + vm_map_copy_discard(submap_copy); + submap_copy = VM_MAP_COPY_NULL; + vm_map_lock(map); + break; + } + + vm_map_copy_entry_unlink(submap_copy, copy_entry); + copy_entry_size = copy_entry->vme_end - copy_entry->vme_start; + copy_entry->vme_start = map_address; + copy_entry->vme_end = map_address + copy_entry_size; + map_address += copy_entry_size; + mapped_size += copy_entry_size; + src_start += copy_entry_size; + assert(src_start <= src_end); + _vm_map_store_entry_link(map_header, + map_header->links.prev, + copy_entry); + } + /* done with submap_copy */ + vm_map_copy_discard(submap_copy); + + if (vm_remap_legacy) { + *cur_protection &= submap_curprot; + *max_protection &= submap_maxprot; + } + + /* re-acquire the map lock and continue to next entry */ + vm_map_lock(map); + continue; + } else { + object = VME_OBJECT(src_entry); + + /* + * Prevent kernel_object from being exposed to + * user space. + */ + if (__improbable(object == kernel_object)) { + printf("%d[%s]: rejecting attempt to extract from kernel_object\n", + proc_selfpid(), + (current_task()->bsd_info + ? proc_name_address(current_task()->bsd_info) + : "?")); + DTRACE_VM(extract_kernel_only); + result = KERN_INVALID_RIGHT; + break; + } + + if (src_entry->iokit_acct) { + /* + * This entry uses "IOKit accounting". + */ + } else if (object != VM_OBJECT_NULL && + (object->purgable != VM_PURGABLE_DENY || + object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) { + /* + * Purgeable objects have their own accounting: + * no pmap accounting for them. + */ + assertf(!src_entry->use_pmap, + "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d", + map, + src_entry, + (uint64_t)src_entry->vme_start, + (uint64_t)src_entry->vme_end, + src_entry->protection, + src_entry->max_protection, + VME_ALIAS(src_entry)); + } else { + /* + * Not IOKit or purgeable: + * must be accounted by pmap stats. 
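The accounting split above (IOKit-accounted, purgeable with its own ledger, or plain pmap-accounted) is visible from user space in that purgeable allocations are ledgered separately and can be emptied under pressure. A sketch, assuming the VM_FLAGS_PURGABLE allocation flag and the mach_vm_purgable_control() interface:

/* Sketch: purgeable memory, which the extraction code above exempts from
 * pmap accounting (object->purgable != VM_PURGABLE_DENY). */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_purgable.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t    size = 1024 * 1024;
	int               state;
	kern_return_t     kr;

	kr = mach_vm_allocate(mach_task_self(), &addr, size,
	    VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
	if (kr != KERN_SUCCESS) {
		return 1;
	}

	/* Let the kernel reclaim the contents under memory pressure. */
	state = VM_PURGABLE_VOLATILE;
	mach_vm_purgable_control(mach_task_self(), addr,
	    VM_PURGABLE_SET_STATE, &state);

	/* Take it back for use; the old state reports whether it was emptied. */
	state = VM_PURGABLE_NONVOLATILE;
	kr = mach_vm_purgable_control(mach_task_self(), addr,
	    VM_PURGABLE_SET_STATE, &state);
	printf("kr=%d previously %s\n", kr,
	    (state == VM_PURGABLE_EMPTY) ? "purged" : "intact");

	mach_vm_deallocate(mach_task_self(), addr, size);
	return 0;
}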
+ */ + assertf(src_entry->use_pmap, + "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d", + map, + src_entry, + (uint64_t)src_entry->vme_start, + (uint64_t)src_entry->vme_end, + src_entry->protection, + src_entry->max_protection, + VME_ALIAS(src_entry)); + } + + if (object == VM_OBJECT_NULL) { + assert(!src_entry->needs_copy); + object = vm_object_allocate(entry_size); + VME_OFFSET_SET(src_entry, 0); + VME_OBJECT_SET(src_entry, object); + assert(src_entry->use_pmap); + assert(!map->mapped_in_other_pmaps); + } else if (src_entry->wired_count || + object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) { + /* + * A wired memory region should not have + * any pending copy-on-write and needs to + * keep pointing at the VM object that + * contains the wired pages. + * If we're sharing this memory (copy=false), + * we'll share this VM object. + * If we're copying this memory (copy=true), + * we'll call vm_object_copy_slowly() below + * and use the new VM object for the remapping. + * + * Or, we are already using an asymmetric + * copy, and therefore we already have + * the right object. + */ + assert(!src_entry->needs_copy); + } else if (src_entry->needs_copy || object->shadowed || + (object->internal && !object->true_share && + !src_entry->is_shared && + object->vo_size > entry_size)) { + VME_OBJECT_SHADOW(src_entry, entry_size); + assert(src_entry->use_pmap); + + if (!src_entry->needs_copy && + (src_entry->protection & VM_PROT_WRITE)) { + vm_prot_t prot; - /* - * The MADV_REUSE operation doesn't require any changes to the - * vm_map_entry_t's, so the read lock is sufficient. - */ + assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)); - vm_map_lock_read(map); + prot = src_entry->protection & ~VM_PROT_WRITE; - /* - * The madvise semantics require that the address range be fully - * allocated with no holes. Otherwise, we're required to return - * an error. - */ + if (override_nx(map, + VME_ALIAS(src_entry)) + && prot) { + prot |= VM_PROT_EXECUTE; + } - if (!vm_map_range_check(map, start, end, &entry)) { - vm_map_unlock_read(map); - vm_page_stats_reusable.reuse_pages_failure++; - return KERN_INVALID_ADDRESS; - } + assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)); - /* - * Examine each vm_map_entry_t in the range. - */ - for (; entry != vm_map_to_entry(map) && entry->vme_start < end; - entry = entry->vme_next) { - /* - * Sanity check on the VM map entry. - */ - if (! vm_map_entry_is_reusable(entry)) { - vm_map_unlock_read(map); - vm_page_stats_reusable.reuse_pages_failure++; - return KERN_INVALID_ADDRESS; - } + if (map->mapped_in_other_pmaps) { + vm_object_pmap_protect( + VME_OBJECT(src_entry), + VME_OFFSET(src_entry), + entry_size, + PMAP_NULL, + PAGE_SIZE, + src_entry->vme_start, + prot); +#if MACH_ASSERT + } else if (__improbable(map->pmap == PMAP_NULL)) { + extern boolean_t vm_tests_in_progress; + assert(vm_tests_in_progress); + /* + * Some VM tests (in vm_tests.c) + * sometimes want to use a VM + * map without a pmap. + * Otherwise, this should never + * happen. + */ +#endif /* MACH_ASSERT */ + } else { + pmap_protect(vm_map_pmap(map), + src_entry->vme_start, + src_entry->vme_end, + prot); + } + } + + object = VME_OBJECT(src_entry); + src_entry->needs_copy = FALSE; + } - /* - * The first time through, the start address could be anywhere - * within the vm_map_entry we found. So adjust the offset to - * correspond. 
- */ - if (entry->vme_start < start) { - start_offset = start - entry->vme_start; - } else { - start_offset = 0; - } - end_offset = MIN(end, entry->vme_end) - entry->vme_start; - start_offset += entry->offset; - end_offset += entry->offset; - object = entry->object.vm_object; - if (object != VM_OBJECT_NULL) { vm_object_lock(object); - vm_object_reuse_pages(object, start_offset, end_offset, - TRUE); + vm_object_reference_locked(object); /* object ref. for new entry */ + assert(!src_entry->needs_copy); + if (object->copy_strategy == + MEMORY_OBJECT_COPY_SYMMETRIC) { + /* + * If we want to share this object (copy==0), + * it needs to be COPY_DELAY. + * If we want to copy this object (copy==1), + * we can't just set "needs_copy" on our side + * and expect the other side to do the same + * (symmetrically), so we can't let the object + * stay COPY_SYMMETRIC. + * So we always switch from COPY_SYMMETRIC to + * COPY_DELAY. + */ + object->copy_strategy = + MEMORY_OBJECT_COPY_DELAY; + object->true_share = TRUE; + } vm_object_unlock(object); } - if (entry->alias == VM_MEMORY_MALLOC_LARGE_REUSABLE) { + offset = (VME_OFFSET(src_entry) + + (src_start - src_entry->vme_start)); + + new_entry = _vm_map_entry_create(map_header, !map_header->entries_pageable); + vm_map_entry_copy(map, new_entry, src_entry); + if (new_entry->is_sub_map) { + /* clr address space specifics */ + new_entry->use_pmap = FALSE; + } else if (copy) { /* - * XXX - * We do not hold the VM map exclusively here. - * The "alias" field is not that critical, so it's - * safe to update it here, as long as it is the only - * one that can be modified while holding the VM map - * "shared". + * We're dealing with a copy-on-write operation, + * so the resulting mapping should not inherit the + * original mapping's accounting settings. + * "use_pmap" should be reset to its default (TRUE) + * so that the new mapping gets accounted for in + * the task's memory footprint. */ - entry->alias = VM_MEMORY_MALLOC_LARGE_REUSED; + new_entry->use_pmap = TRUE; } - } - - vm_map_unlock_read(map); - vm_page_stats_reusable.reuse_pages_success++; - return KERN_SUCCESS; -} + /* "iokit_acct" was cleared in vm_map_entry_copy() */ + assert(!new_entry->iokit_acct); + new_entry->map_aligned = FALSE; -static kern_return_t -vm_map_reusable_pages( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end) -{ - vm_map_entry_t entry; - vm_object_t object; - vm_object_offset_t start_offset, end_offset; + new_entry->vme_start = map_address; + new_entry->vme_end = map_address + tmp_size; + assert(new_entry->vme_start < new_entry->vme_end); + if (copy && vmk_flags.vmkf_remap_prot_copy) { + /* + * Remapping for vm_map_protect(VM_PROT_COPY) + * to convert a read-only mapping into a + * copy-on-write version of itself but + * with write access: + * keep the original inheritance and add + * VM_PROT_WRITE to the max protection. + */ + new_entry->inheritance = src_entry->inheritance; + new_entry->protection &= max_prot_for_prot_copy; + new_entry->max_protection |= VM_PROT_WRITE; + } else { + new_entry->inheritance = inheritance; + if (!vm_remap_legacy) { + new_entry->protection = *cur_protection; + new_entry->max_protection = *max_protection; + } + } + VME_OFFSET_SET(new_entry, offset); - /* - * The MADV_REUSABLE operation doesn't require any changes to the - * vm_map_entry_t's, so the read lock is sufficient. - */ + /* + * The new region has to be copied now if required. 
+ */ +RestartCopy: + if (!copy) { + if (src_entry->used_for_jit == TRUE) { + if (same_map) { + } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) { + /* + * Cannot allow an entry describing a JIT + * region to be shared across address spaces. + */ + result = KERN_INVALID_ARGUMENT; + break; + } + } - vm_map_lock_read(map); + src_entry->is_shared = TRUE; + new_entry->is_shared = TRUE; + if (!(new_entry->is_sub_map)) { + new_entry->needs_copy = FALSE; + } + } else if (src_entry->is_sub_map) { + /* make this a COW sub_map if not already */ + assert(new_entry->wired_count == 0); + new_entry->needs_copy = TRUE; + object = VM_OBJECT_NULL; + } else if (src_entry->wired_count == 0 && + !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) && + vm_object_copy_quickly(VME_OBJECT_PTR(new_entry), + VME_OFFSET(new_entry), + (new_entry->vme_end - + new_entry->vme_start), + &src_needs_copy, + &new_entry_needs_copy)) { + new_entry->needs_copy = new_entry_needs_copy; + new_entry->is_shared = FALSE; + assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry); - /* - * The madvise semantics require that the address range be fully - * allocated with no holes. Otherwise, we're required to return - * an error. - */ + /* + * Handle copy_on_write semantics. + */ + if (src_needs_copy && !src_entry->needs_copy) { + vm_prot_t prot; - if (!vm_map_range_check(map, start, end, &entry)) { - vm_map_unlock_read(map); - vm_page_stats_reusable.reusable_pages_failure++; - return KERN_INVALID_ADDRESS; - } + assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)); - /* - * Examine each vm_map_entry_t in the range. - */ - for (; entry != vm_map_to_entry(map) && entry->vme_start < end; - entry = entry->vme_next) { - int kill_pages = 0; + prot = src_entry->protection & ~VM_PROT_WRITE; - /* - * Sanity check on the VM map entry. - */ - if (! vm_map_entry_is_reusable(entry)) { - vm_map_unlock_read(map); - vm_page_stats_reusable.reusable_pages_failure++; - return KERN_INVALID_ADDRESS; - } + if (override_nx(map, + VME_ALIAS(src_entry)) + && prot) { + prot |= VM_PROT_EXECUTE; + } - /* - * The first time through, the start address could be anywhere - * within the vm_map_entry we found. So adjust the offset to - * correspond. - */ - if (entry->vme_start < start) { - start_offset = start - entry->vme_start; + assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)); + + vm_object_pmap_protect(object, + offset, + entry_size, + ((src_entry->is_shared + || map->mapped_in_other_pmaps) ? + PMAP_NULL : map->pmap), + VM_MAP_PAGE_SIZE(map), + src_entry->vme_start, + prot); + + assert(src_entry->wired_count == 0); + src_entry->needs_copy = TRUE; + } + /* + * Throw away the old object reference of the new entry. + */ + vm_object_deallocate(object); } else { - start_offset = 0; - } - end_offset = MIN(end, entry->vme_end) - entry->vme_start; - start_offset += entry->offset; - end_offset += entry->offset; + new_entry->is_shared = FALSE; + assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry); - object = entry->object.vm_object; - if (object == VM_OBJECT_NULL) - continue; + src_entry_was_wired = (src_entry->wired_count > 0); + saved_src_entry = src_entry; + src_entry = VM_MAP_ENTRY_NULL; + /* + * The map can be safely unlocked since we + * already hold a reference on the object. + * + * Record the timestamp of the map for later + * verification, and unlock the map. 
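The copy path above (vm_object_copy_quickly() plus the needs_copy / pmap_protect() dance) is what gives a copy=TRUE remap its copy-on-write semantics: writes to the source after the remap are not visible through the new mapping. A same-task sketch:

/* Sketch: copy=TRUE remap; later writes to the source stay out of the copy. */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t src = 0, dst = 0;
	mach_vm_size_t    size = 4096;
	vm_prot_t         cur = VM_PROT_NONE, max = VM_PROT_NONE;

	mach_vm_allocate(mach_task_self(), &src, size, VM_FLAGS_ANYWHERE);
	*(volatile uint32_t *)(uintptr_t)src = 0x11111111;

	if (mach_vm_remap(mach_task_self(), &dst, size, 0, VM_FLAGS_ANYWHERE,
	    mach_task_self(), src, TRUE /* copy */,
	    &cur, &max, VM_INHERIT_DEFAULT) != KERN_SUCCESS) {
		return 1;
	}

	/* Source write after the remap: the COW copy keeps the old value. */
	*(volatile uint32_t *)(uintptr_t)src = 0x22222222;
	printf("copy sees 0x%x\n", *(volatile uint32_t *)(uintptr_t)dst);
	return 0;
}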
+ */ + version.main_timestamp = map->timestamp; + vm_map_unlock(map); /* Increments timestamp once! */ - vm_object_lock(object); - if (object->ref_count == 1 && - !object->shadow && - /* - * "iokit_acct" entries are billed for their virtual size - * (rather than for their resident pages only), so they - * wouldn't benefit from making pages reusable, and it - * would be hard to keep track of pages that are both - * "iokit_acct" and "reusable" in the pmap stats and ledgers. - */ - !(entry->iokit_acct || - (!entry->is_sub_map && !entry->use_pmap))) - kill_pages = 1; - else - kill_pages = -1; - if (kill_pages != -1) { - vm_object_deactivate_pages(object, - start_offset, - end_offset - start_offset, - kill_pages, - TRUE /*reusable_pages*/); - } else { - vm_page_stats_reusable.reusable_pages_shared++; - } - vm_object_unlock(object); + /* + * Perform the copy. + */ + if (src_entry_was_wired > 0 || + (debug4k_no_cow_copyin && + VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) { + vm_object_lock(object); + result = vm_object_copy_slowly( + object, + offset, + (new_entry->vme_end - + new_entry->vme_start), + THREAD_UNINT, + VME_OBJECT_PTR(new_entry)); + + VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset)); + new_entry->needs_copy = FALSE; + } else { + vm_object_offset_t new_offset; + + new_offset = VME_OFFSET(new_entry); + result = vm_object_copy_strategically( + object, + offset, + (new_entry->vme_end - + new_entry->vme_start), + VME_OBJECT_PTR(new_entry), + &new_offset, + &new_entry_needs_copy); + if (new_offset != VME_OFFSET(new_entry)) { + VME_OFFSET_SET(new_entry, new_offset); + } + + new_entry->needs_copy = new_entry_needs_copy; + } - if (entry->alias == VM_MEMORY_MALLOC_LARGE || - entry->alias == VM_MEMORY_MALLOC_LARGE_REUSED) { /* - * XXX - * We do not hold the VM map exclusively here. - * The "alias" field is not that critical, so it's - * safe to update it here, as long as it is the only - * one that can be modified while holding the VM map - * "shared". + * Throw away the old object reference of the new entry. */ - entry->alias = VM_MEMORY_MALLOC_LARGE_REUSABLE; - } - } - - vm_map_unlock_read(map); - vm_page_stats_reusable.reusable_pages_success++; - return KERN_SUCCESS; -} + vm_object_deallocate(object); + if (result != KERN_SUCCESS && + result != KERN_MEMORY_RESTART_COPY) { + _vm_map_entry_dispose(map_header, new_entry); + vm_map_lock(map); + break; + } -static kern_return_t -vm_map_can_reuse( - vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end) -{ - vm_map_entry_t entry; + /* + * Verify that the map has not substantially + * changed while the copy was being made. + */ + + vm_map_lock(map); + if (version.main_timestamp + 1 != map->timestamp) { + /* + * Simple version comparison failed. + * + * Retry the lookup and verify that the + * same object/offset are still present. + */ + saved_src_entry = VM_MAP_ENTRY_NULL; + vm_object_deallocate(VME_OBJECT(new_entry)); + _vm_map_entry_dispose(map_header, new_entry); + if (result == KERN_MEMORY_RESTART_COPY) { + result = KERN_SUCCESS; + } + continue; + } + /* map hasn't changed: src_entry is still valid */ + src_entry = saved_src_entry; + saved_src_entry = VM_MAP_ENTRY_NULL; + + if (result == KERN_MEMORY_RESTART_COPY) { + vm_object_reference(object); + goto RestartCopy; + } + } - /* - * The MADV_REUSABLE operation doesn't require any changes to the - * vm_map_entry_t's, so the read lock is sufficient. 
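The timestamp check above is an instance of a generic optimistic pattern: record the map version, drop the lock for the slow copy, then relock and verify that exactly the one expected unlock happened, otherwise redo the lookup. A stand-alone fragment illustrating the same idea (not kernel code), assuming a simple mutex-plus-counter wrapper:

/* Sketch of the "record version, unlock, work, relock, compare" pattern. */
#include <pthread.h>
#include <stdbool.h>

struct vmap {
	pthread_mutex_t lock;
	unsigned        timestamp;   /* bumped on every unlock, like map->timestamp */
};

static void
vmap_unlock(struct vmap *m)
{
	m->timestamp++;              /* "Increments timestamp once!" */
	pthread_mutex_unlock(&m->lock);
}

/* Returns true if no other writer slipped in while we worked unlocked. */
static bool
do_slow_work_revalidated(struct vmap *m, void (*slow_work)(void))
{
	pthread_mutex_lock(&m->lock);
	unsigned version = m->timestamp;   /* version.main_timestamp analogue */
	vmap_unlock(m);                    /* bumps the timestamp by one */

	slow_work();                       /* e.g. the slow object copy */

	pthread_mutex_lock(&m->lock);
	bool unchanged = (version + 1 == m->timestamp);
	vmap_unlock(m);
	return unchanged;                  /* false => caller re-looks-up and retries */
}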
- */ + _vm_map_store_entry_link(map_header, + map_header->links.prev, new_entry); - vm_map_lock_read(map); + /* protections for submap mapping are irrelevant here */ + if (vm_remap_legacy && !src_entry->is_sub_map) { + *cur_protection &= src_entry->protection; + *max_protection &= src_entry->max_protection; + } - /* - * The madvise semantics require that the address range be fully - * allocated with no holes. Otherwise, we're required to return - * an error. - */ + map_address += tmp_size; + mapped_size += tmp_size; + src_start += tmp_size; - if (!vm_map_range_check(map, start, end, &entry)) { - vm_map_unlock_read(map); - vm_page_stats_reusable.can_reuse_failure++; - return KERN_INVALID_ADDRESS; - } + if (vmk_flags.vmkf_copy_single_object) { + if (mapped_size != size) { + DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n", map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size); + if (src_entry->vme_next != vm_map_to_entry(map) && + VME_OBJECT(src_entry->vme_next) == VME_OBJECT(src_entry)) { + /* XXX TODO4K */ + DEBUG4K_ERROR("could have extended copy to next entry...\n"); + } + } + break; + } + } /* end while */ - /* - * Examine each vm_map_entry_t in the range. - */ - for (; entry != vm_map_to_entry(map) && entry->vme_start < end; - entry = entry->vme_next) { + vm_map_unlock(map); + if (result != KERN_SUCCESS) { /* - * Sanity check on the VM map entry. + * Free all allocated elements. */ - if (! vm_map_entry_is_reusable(entry)) { - vm_map_unlock_read(map); - vm_page_stats_reusable.can_reuse_failure++; - return KERN_INVALID_ADDRESS; + for (src_entry = map_header->links.next; + src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links); + src_entry = new_entry) { + new_entry = src_entry->vme_next; + _vm_map_store_entry_unlink(map_header, src_entry); + if (src_entry->is_sub_map) { + vm_map_deallocate(VME_SUBMAP(src_entry)); + } else { + vm_object_deallocate(VME_OBJECT(src_entry)); + } + _vm_map_entry_dispose(map_header, src_entry); } } - - vm_map_unlock_read(map); - vm_page_stats_reusable.can_reuse_success++; - return KERN_SUCCESS; + return result; } +bool +vm_map_is_exotic( + vm_map_t map) +{ + return VM_MAP_IS_EXOTIC(map); +} -/* - * Routine: vm_map_entry_insert - * - * Descritpion: This routine inserts a new vm_entry in a locked map. - */ -vm_map_entry_t -vm_map_entry_insert( - vm_map_t map, - vm_map_entry_t insp_entry, - vm_map_offset_t start, - vm_map_offset_t end, - vm_object_t object, - vm_object_offset_t offset, - boolean_t needs_copy, - boolean_t is_shared, - boolean_t in_transition, - vm_prot_t cur_protection, - vm_prot_t max_protection, - vm_behavior_t behavior, - vm_inherit_t inheritance, - unsigned wired_count, - boolean_t no_cache, - boolean_t permanent, - unsigned int superpage_size, - boolean_t clear_map_aligned, - boolean_t is_submap) -{ - vm_map_entry_t new_entry; - - assert(insp_entry != (vm_map_entry_t)0); - - new_entry = vm_map_entry_create(map, !map->hdr.entries_pageable); - - if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) { - new_entry->map_aligned = TRUE; - } else { - new_entry->map_aligned = FALSE; - } - if (clear_map_aligned && - (! VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) || - ! 
VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) { - new_entry->map_aligned = FALSE; - } - - new_entry->vme_start = start; - new_entry->vme_end = end; - assert(page_aligned(new_entry->vme_start)); - assert(page_aligned(new_entry->vme_end)); - if (new_entry->map_aligned) { - assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, - VM_MAP_PAGE_MASK(map))); - assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, - VM_MAP_PAGE_MASK(map))); - } - assert(new_entry->vme_start < new_entry->vme_end); +bool +vm_map_is_alien( + vm_map_t map) +{ + return VM_MAP_IS_ALIEN(map); +} - new_entry->object.vm_object = object; - new_entry->offset = offset; - new_entry->is_shared = is_shared; - new_entry->is_sub_map = is_submap; - new_entry->needs_copy = needs_copy; - new_entry->in_transition = in_transition; - new_entry->needs_wakeup = FALSE; - new_entry->inheritance = inheritance; - new_entry->protection = cur_protection; - new_entry->max_protection = max_protection; - new_entry->behavior = behavior; - new_entry->wired_count = wired_count; - new_entry->user_wired_count = 0; - if (is_submap) { - /* - * submap: "use_pmap" means "nested". - * default: false. - */ - new_entry->use_pmap = FALSE; - } else { - /* - * object: "use_pmap" means "use pmap accounting" for footprint. - * default: true. - */ - new_entry->use_pmap = TRUE; - } - new_entry->alias = 0; - new_entry->zero_wired_pages = FALSE; - new_entry->no_cache = no_cache; - new_entry->permanent = permanent; - if (superpage_size) - new_entry->superpage_size = TRUE; - else - new_entry->superpage_size = FALSE; - new_entry->used_for_jit = FALSE; - new_entry->iokit_acct = FALSE; +#if XNU_TARGET_OS_OSX +void +vm_map_mark_alien( + vm_map_t map) +{ + vm_map_lock(map); + map->is_alien = true; + vm_map_unlock(map); +} - /* - * Insert the new entry into the list. - */ +void +vm_map_single_jit( + vm_map_t map) +{ + vm_map_lock(map); + map->single_jit = true; + vm_map_unlock(map); +} +#endif /* XNU_TARGET_OS_OSX */ - vm_map_store_entry_link(map, insp_entry, new_entry); - map->size += end - start; +void vm_map_copy_to_physcopy(vm_map_copy_t copy_map, vm_map_t target_map); +void +vm_map_copy_to_physcopy( + vm_map_copy_t copy_map, + vm_map_t target_map) +{ + vm_map_size_t size; + vm_map_entry_t entry; + vm_map_entry_t new_entry; + vm_object_t new_object; + unsigned int pmap_flags; + pmap_t new_pmap; + vm_map_t new_map; + vm_map_address_t src_start, src_end, src_cur; + vm_map_address_t dst_start, dst_end, dst_cur; + kern_return_t kr; + void *kbuf; /* - * Update the free space hint and the lookup hint. + * Perform the equivalent of vm_allocate() and memcpy(). + * Replace the mappings in "copy_map" with the newly allocated mapping. */ + DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size); - SAVE_HINT_MAP_WRITE(map, new_entry); - return new_entry; -} - -/* - * Routine: vm_map_remap_extract - * - * Descritpion: This routine returns a vm_entry list from a map. - */ -static kern_return_t -vm_map_remap_extract( - vm_map_t map, - vm_map_offset_t addr, - vm_map_size_t size, - boolean_t copy, - struct vm_map_header *map_header, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, - /* What, no behavior? 
*/ - vm_inherit_t inheritance, - boolean_t pageable) -{ - kern_return_t result; - vm_map_size_t mapped_size; - vm_map_size_t tmp_size; - vm_map_entry_t src_entry; /* result of last map lookup */ - vm_map_entry_t new_entry; - vm_object_offset_t offset; - vm_map_offset_t map_address; - vm_map_offset_t src_start; /* start of entry to map */ - vm_map_offset_t src_end; /* end of region to be mapped */ - vm_object_t object; - vm_map_version_t version; - boolean_t src_needs_copy; - boolean_t new_entry_needs_copy; + assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map)); - assert(map != VM_MAP_NULL); - assert(size != 0); - assert(size == vm_map_round_page(size, PAGE_MASK)); - assert(inheritance == VM_INHERIT_NONE || - inheritance == VM_INHERIT_COPY || - inheritance == VM_INHERIT_SHARE); + /* allocate new VM object */ + size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK); + new_object = vm_object_allocate(size); + assert(new_object); - /* - * Compute start and end of region. - */ - src_start = vm_map_trunc_page(addr, PAGE_MASK); - src_end = vm_map_round_page(src_start + size, PAGE_MASK); + /* allocate new VM map entry */ + new_entry = vm_map_copy_entry_create(copy_map, FALSE); + assert(new_entry); + /* finish initializing new VM map entry */ + new_entry->protection = VM_PROT_DEFAULT; + new_entry->max_protection = VM_PROT_DEFAULT; + new_entry->use_pmap = TRUE; - /* - * Initialize map_header. - */ - map_header->links.next = (struct vm_map_entry *)&map_header->links; - map_header->links.prev = (struct vm_map_entry *)&map_header->links; - map_header->nentries = 0; - map_header->entries_pageable = pageable; - map_header->page_shift = PAGE_SHIFT; + /* make new VM map entry point to new VM object */ + new_entry->vme_start = 0; + new_entry->vme_end = size; + VME_OBJECT_SET(new_entry, new_object); + VME_OFFSET_SET(new_entry, 0); + + /* create a new pmap to map "copy_map" */ + pmap_flags = 0; + assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT); +#if PMAP_CREATE_FORCE_4K_PAGES + pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES; +#endif /* PMAP_CREATE_FORCE_4K_PAGES */ + pmap_flags |= PMAP_CREATE_64BIT; + new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags); + assert(new_pmap); + + /* create a new pageable VM map to map "copy_map" */ + new_map = vm_map_create(new_pmap, 0, MACH_VM_MAX_ADDRESS, TRUE); + assert(new_map); + vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift); + + /* map "copy_map" in the new VM map */ + src_start = 0; + kr = vm_map_copyout_internal( + new_map, + &src_start, + copy_map, + copy_map->size, + FALSE, /* consume_on_success */ + VM_PROT_DEFAULT, + VM_PROT_DEFAULT, + VM_INHERIT_DEFAULT); + assert(kr == KERN_SUCCESS); + src_end = src_start + copy_map->size; + + /* map "new_object" in the new VM map */ + vm_object_reference(new_object); + dst_start = 0; + kr = vm_map_enter(new_map, + &dst_start, + size, + 0, /* mask */ + VM_FLAGS_ANYWHERE, + VM_MAP_KERNEL_FLAGS_NONE, + VM_KERN_MEMORY_OSFMK, + new_object, + 0, /* offset */ + FALSE, /* needs copy */ + VM_PROT_DEFAULT, + VM_PROT_DEFAULT, + VM_INHERIT_DEFAULT); + assert(kr == KERN_SUCCESS); + dst_end = dst_start + size; + + /* get a kernel buffer */ + kbuf = kheap_alloc(KHEAP_TEMP, PAGE_SIZE, Z_WAITOK); + assert(kbuf); + + /* physically copy "copy_map" mappings to new VM object */ + for (src_cur = src_start, dst_cur = dst_start; + src_cur < src_end; + src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) { + vm_size_t bytes; + + bytes = PAGE_SIZE; + if (src_cur + PAGE_SIZE > src_end) { + /* partial copy for last page 
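 *
 * (a worked sketch with assumed, illustrative values: with PAGE_SIZE
 *  0x4000 and a 4K-grained copy_map->size of 0x9000, the loop runs at
 *  src_cur = src_start, src_start + 0x4000 and src_start + 0x8000; on
 *  the last pass bytes = 0x9000 - 0x8000 = 0x1000, and the remaining
 *  0x3000 of the destination page keeps the zero fill it was given
 *  when "new_object" was allocated)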
*/ + bytes = src_end - src_cur; + assert(bytes > 0 && bytes < PAGE_SIZE); + /* rest of dst page should be zero-filled */ + } + /* get bytes from src mapping */ + kr = copyinmap(new_map, src_cur, kbuf, bytes); + if (kr != KERN_SUCCESS) { + DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr); + } + /* put bytes in dst mapping */ + assert(dst_cur < dst_end); + assert(dst_cur + bytes <= dst_end); + kr = copyoutmap(new_map, kbuf, dst_cur, bytes); + if (kr != KERN_SUCCESS) { + DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr); + } + } - vm_map_store_init( map_header ); + /* free kernel buffer */ + kheap_free(KHEAP_TEMP, kbuf, PAGE_SIZE); + kbuf = NULL; - *cur_protection = VM_PROT_ALL; - *max_protection = VM_PROT_ALL; + /* destroy new map */ + vm_map_destroy(new_map, VM_MAP_REMOVE_NO_FLAGS); + new_map = VM_MAP_NULL; - map_address = 0; - mapped_size = 0; - result = KERN_SUCCESS; + /* dispose of the old map entries in "copy_map" */ + while (vm_map_copy_first_entry(copy_map) != + vm_map_copy_to_entry(copy_map)) { + entry = vm_map_copy_first_entry(copy_map); + vm_map_copy_entry_unlink(copy_map, entry); + if (entry->is_sub_map) { + vm_map_deallocate(VME_SUBMAP(entry)); + } else { + vm_object_deallocate(VME_OBJECT(entry)); + } + vm_map_copy_entry_dispose(copy_map, entry); + } - /* - * The specified source virtual space might correspond to - * multiple map entries, need to loop on them. - */ - vm_map_lock(map); - while (mapped_size != size) { - vm_map_size_t entry_size; + /* change "copy_map"'s page_size to match "target_map" */ + copy_map->cpy_hdr.page_shift = VM_MAP_PAGE_SHIFT(target_map); + copy_map->offset = 0; + copy_map->size = size; - /* - * Find the beginning of the region. - */ - if (! 
vm_map_lookup_entry(map, src_start, &src_entry)) { - result = KERN_INVALID_ADDRESS; - break; - } + /* insert new map entry in "copy_map" */ + assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map)); + vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry); - if (src_start < src_entry->vme_start || - (mapped_size && src_start != src_entry->vme_start)) { - result = KERN_INVALID_ADDRESS; - break; - } + DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size); +} - tmp_size = size - mapped_size; - if (src_end > src_entry->vme_end) - tmp_size -= (src_end - src_entry->vme_end); +void +vm_map_copy_adjust_get_target_copy_map( + vm_map_copy_t copy_map, + vm_map_copy_t *target_copy_map_p); +void +vm_map_copy_adjust_get_target_copy_map( + vm_map_copy_t copy_map, + vm_map_copy_t *target_copy_map_p) +{ + vm_map_copy_t target_copy_map; + vm_map_entry_t entry, target_entry; - entry_size = (vm_map_size_t)(src_entry->vme_end - - src_entry->vme_start); + if (*target_copy_map_p != VM_MAP_COPY_NULL) { + /* the caller already has a "target_copy_map": use it */ + return; + } - if(src_entry->is_sub_map) { - vm_map_reference(src_entry->object.sub_map); - object = VM_OBJECT_NULL; + /* the caller wants us to create a new copy of "copy_map" */ + target_copy_map = vm_map_copy_allocate(); + target_copy_map->type = copy_map->type; + assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST); + target_copy_map->offset = copy_map->offset; + target_copy_map->size = copy_map->size; + target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift; + vm_map_store_init(&target_copy_map->cpy_hdr); + for (entry = vm_map_copy_first_entry(copy_map); + entry != vm_map_copy_to_entry(copy_map); + entry = entry->vme_next) { + target_entry = vm_map_copy_entry_create(target_copy_map, FALSE); + vm_map_entry_copy_full(target_entry, entry); + if (target_entry->is_sub_map) { + vm_map_reference(VME_SUBMAP(target_entry)); } else { - object = src_entry->object.vm_object; - if (src_entry->iokit_acct) { - /* - * This entry uses "IOKit accounting". - */ - } else if (object != VM_OBJECT_NULL && - object->purgable != VM_PURGABLE_DENY) { - /* - * Purgeable objects have their own accounting: - * no pmap accounting for them. - */ - assert(!src_entry->use_pmap); - } else { - /* - * Not IOKit or purgeable: - * must be accounted by pmap stats. - */ - assert(src_entry->use_pmap); - } + vm_object_reference(VME_OBJECT(target_entry)); + } + vm_map_copy_entry_link( + target_copy_map, + vm_map_copy_last_entry(target_copy_map), + target_entry); + } + entry = VM_MAP_ENTRY_NULL; + *target_copy_map_p = target_copy_map; +} - if (object == VM_OBJECT_NULL) { - object = vm_object_allocate(entry_size); - src_entry->offset = 0; - src_entry->object.vm_object = object; - } else if (object->copy_strategy != - MEMORY_OBJECT_COPY_SYMMETRIC) { - /* - * We are already using an asymmetric - * copy, and therefore we already have - * the right object. 
- */ - assert(!src_entry->needs_copy); - } else if (src_entry->needs_copy || object->shadowed || - (object->internal && !object->true_share && - !src_entry->is_shared && - object->vo_size > entry_size)) { +void +vm_map_copy_trim( + vm_map_copy_t copy_map, + int new_page_shift, + vm_map_offset_t trim_start, + vm_map_offset_t trim_end); +void +vm_map_copy_trim( + vm_map_copy_t copy_map, + int new_page_shift, + vm_map_offset_t trim_start, + vm_map_offset_t trim_end) +{ + int copy_page_shift; + vm_map_entry_t entry, next_entry; - vm_object_shadow(&src_entry->object.vm_object, - &src_entry->offset, - entry_size); + assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST); + assert(copy_map->cpy_hdr.nentries > 0); - if (!src_entry->needs_copy && - (src_entry->protection & VM_PROT_WRITE)) { - vm_prot_t prot; + trim_start += vm_map_copy_first_entry(copy_map)->vme_start; + trim_end += vm_map_copy_first_entry(copy_map)->vme_start; - prot = src_entry->protection & ~VM_PROT_WRITE; + /* use the new page_shift to do the clipping */ + copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map); + copy_map->cpy_hdr.page_shift = new_page_shift; - if (override_nx(map, src_entry->alias) && prot) - prot |= VM_PROT_EXECUTE; + for (entry = vm_map_copy_first_entry(copy_map); + entry != vm_map_copy_to_entry(copy_map); + entry = next_entry) { + next_entry = entry->vme_next; + if (entry->vme_end <= trim_start) { + /* entry fully before trim range: skip */ + continue; + } + if (entry->vme_start >= trim_end) { + /* entry fully after trim range: done */ + break; + } + /* clip entry if needed */ + vm_map_copy_clip_start(copy_map, entry, trim_start); + vm_map_copy_clip_end(copy_map, entry, trim_end); + /* dispose of entry */ + copy_map->size -= entry->vme_end - entry->vme_start; + vm_map_copy_entry_unlink(copy_map, entry); + if (entry->is_sub_map) { + vm_map_deallocate(VME_SUBMAP(entry)); + } else { + vm_object_deallocate(VME_OBJECT(entry)); + } + vm_map_copy_entry_dispose(copy_map, entry); + entry = VM_MAP_ENTRY_NULL; + } - if(map->mapped_in_other_pmaps) { - vm_object_pmap_protect( - src_entry->object.vm_object, - src_entry->offset, - entry_size, - PMAP_NULL, - src_entry->vme_start, - prot); - } else { - pmap_protect(vm_map_pmap(map), - src_entry->vme_start, - src_entry->vme_end, - prot); - } - } + /* restore copy_map's original page_shift */ + copy_map->cpy_hdr.page_shift = copy_page_shift; +} - object = src_entry->object.vm_object; - src_entry->needs_copy = FALSE; - } +/* + * Make any necessary adjustments to "copy_map" to allow it to be + * mapped into "target_map". + * If no changes were necessary, "target_copy_map" points to the + * untouched "copy_map". + * If changes are necessary, changes will be made to "target_copy_map". + * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and + * copy the original "copy_map" to it before applying the changes. + * The caller should discard "target_copy_map" if it's not the same as + * the original "copy_map". + */ +/* TODO4K: also adjust to sub-range in the copy_map -> add start&end? 
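 *
 * A minimal usage sketch (hypothetical caller, local names assumed;
 * the FALSE argument is the "copy" flag), following the IN/OUT
 * convention described above:
 *
 *	target_copy = VM_MAP_COPY_NULL;
 *	kr = vm_map_copy_adjust_to_target(copy_map, 0, copy_map->size,
 *	    dst_map, FALSE, &target_copy, &overmap_start,
 *	    &overmap_end, &trimmed_start);
 *	if (kr == KERN_SUCCESS && target_copy != copy_map) {
 *		... map "target_copy", then discard it separately ...
 *	}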
*/ +kern_return_t +vm_map_copy_adjust_to_target( + vm_map_copy_t src_copy_map, + vm_map_offset_t offset, + vm_map_size_t size, + vm_map_t target_map, + boolean_t copy, + vm_map_copy_t *target_copy_map_p, + vm_map_offset_t *overmap_start_p, + vm_map_offset_t *overmap_end_p, + vm_map_offset_t *trimmed_start_p) +{ + vm_map_copy_t copy_map, target_copy_map; + vm_map_size_t target_size; + vm_map_size_t src_copy_map_size; + vm_map_size_t overmap_start, overmap_end; + int misalignments; + vm_map_entry_t entry, target_entry; + vm_map_offset_t addr_adjustment; + vm_map_offset_t new_start, new_end; + int copy_page_mask, target_page_mask; + int copy_page_shift, target_page_shift; + vm_map_offset_t trimmed_end; + /* + * Assert that the vm_map_copy is coming from the right + * zone and hasn't been forged + */ + vm_map_copy_require(src_copy_map); + assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST); - vm_object_lock(object); - vm_object_reference_locked(object); /* object ref. for new entry */ - if (object->copy_strategy == - MEMORY_OBJECT_COPY_SYMMETRIC) { - object->copy_strategy = - MEMORY_OBJECT_COPY_DELAY; - } - vm_object_unlock(object); - } + /* + * Start working with "src_copy_map" but we'll switch + * to "target_copy_map" as soon as we start making adjustments. + */ + copy_map = src_copy_map; + src_copy_map_size = src_copy_map->size; - offset = src_entry->offset + (src_start - src_entry->vme_start); + copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map); + copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map); + target_page_shift = VM_MAP_PAGE_SHIFT(target_map); + target_page_mask = VM_MAP_PAGE_MASK(target_map); - new_entry = _vm_map_entry_create(map_header, !map_header->entries_pageable); - vm_map_entry_copy(new_entry, src_entry); - if (new_entry->is_sub_map) { - /* clr address space specifics */ - new_entry->use_pmap = FALSE; - } + DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p); - new_entry->map_aligned = FALSE; + target_copy_map = *target_copy_map_p; + if (target_copy_map != VM_MAP_COPY_NULL) { + vm_map_copy_require(target_copy_map); + } - new_entry->vme_start = map_address; - new_entry->vme_end = map_address + tmp_size; - assert(new_entry->vme_start < new_entry->vme_end); - new_entry->inheritance = inheritance; - new_entry->offset = offset; + if (offset + size > copy_map->size) { + DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size); + return KERN_INVALID_ARGUMENT; + } + /* trim the end */ + trimmed_end = 0; + new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask); + if (new_end < copy_map->size) { + trimmed_end = src_copy_map_size - new_end; + DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... 
trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size); + /* get "target_copy_map" if needed and adjust it */ + vm_map_copy_adjust_get_target_copy_map(copy_map, + &target_copy_map); + copy_map = target_copy_map; + vm_map_copy_trim(target_copy_map, target_page_shift, + new_end, copy_map->size); + } + + /* trim the start */ + new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask); + if (new_start != 0) { + DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start); + /* get "target_copy_map" if needed and adjust it */ + vm_map_copy_adjust_get_target_copy_map(copy_map, + &target_copy_map); + copy_map = target_copy_map; + vm_map_copy_trim(target_copy_map, target_page_shift, + 0, new_start); + } + *trimmed_start_p = new_start; + + /* target_size starts with what's left after trimming */ + target_size = copy_map->size; + assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end, + "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n", + (uint64_t)target_size, (uint64_t)src_copy_map_size, + (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end); + + /* check for misalignments but don't adjust yet */ + misalignments = 0; + overmap_start = 0; + overmap_end = 0; + if (copy_page_shift < target_page_shift) { /* - * The new region has to be copied now if required. + * Remapping from 4K to 16K: check the VM object alignments + * throughout the range. + * If the start and end of the range are mis-aligned, we can + * over-map to re-align, and adjust the "overmap" start/end + * and "target_size" of the range accordingly. + * If there is any mis-alignment within the range: + * if "copy": + * we can do immediate-copy instead of copy-on-write, + * else: + * no way to remap and share; fail. */ - RestartCopy: - if (!copy) { - /* - * Cannot allow an entry describing a JIT - * region to be shared across address spaces. 
- */ - if (src_entry->used_for_jit == TRUE) { - result = KERN_INVALID_ARGUMENT; - break; + for (entry = vm_map_copy_first_entry(copy_map); + entry != vm_map_copy_to_entry(copy_map); + entry = entry->vme_next) { + vm_object_offset_t object_offset_start, object_offset_end; + + object_offset_start = VME_OFFSET(entry); + object_offset_end = object_offset_start; + object_offset_end += entry->vme_end - entry->vme_start; + if (object_offset_start & target_page_mask) { + if (entry == vm_map_copy_first_entry(copy_map) && !copy) { + overmap_start++; + } else { + misalignments++; + } } - src_entry->is_shared = TRUE; - new_entry->is_shared = TRUE; - if (!(new_entry->is_sub_map)) - new_entry->needs_copy = FALSE; + if (object_offset_end & target_page_mask) { + if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) { + overmap_end++; + } else { + misalignments++; + } + } + } + } + entry = VM_MAP_ENTRY_NULL; - } else if (src_entry->is_sub_map) { - /* make this a COW sub_map if not already */ - new_entry->needs_copy = TRUE; - object = VM_OBJECT_NULL; - } else if (src_entry->wired_count == 0 && - vm_object_copy_quickly(&new_entry->object.vm_object, - new_entry->offset, - (new_entry->vme_end - - new_entry->vme_start), - &src_needs_copy, - &new_entry_needs_copy)) { + /* decide how to deal with misalignments */ + assert(overmap_start <= 1); + assert(overmap_end <= 1); + if (!overmap_start && !overmap_end && !misalignments) { + /* copy_map is properly aligned for target_map ... */ + if (*trimmed_start_p) { + /* ... but we trimmed it, so still need to adjust */ + } else { + /* ... and we didn't trim anything: we're done */ + if (target_copy_map == VM_MAP_COPY_NULL) { + target_copy_map = copy_map; + } + *target_copy_map_p = target_copy_map; + *overmap_start_p = 0; + *overmap_end_p = 0; + DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p); + return KERN_SUCCESS; + } + } else if (misalignments && !copy) { + /* can't "share" if misaligned */ + DEBUG4K_ADJUST("unsupported sharing\n"); +#if MACH_ASSERT + if (debug4k_panic_on_misaligned_sharing) { + panic("DEBUG4k %s:%d unsupported sharing\n", __FUNCTION__, __LINE__); + } +#endif /* MACH_ASSERT */ + DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p); + return KERN_NOT_SUPPORTED; + } else { + /* can't virtual-copy if misaligned (but can physical-copy) */ + DEBUG4K_ADJUST("mis-aligned copying\n"); + } - new_entry->needs_copy = new_entry_needs_copy; - new_entry->is_shared = FALSE; + /* get a "target_copy_map" if needed and switch to it */ + vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map); + copy_map = target_copy_map; - /* - * Handle copy_on_write semantics. - */ - if (src_needs_copy && !src_entry->needs_copy) { - vm_prot_t prot; + if (misalignments && copy) { + vm_map_size_t target_copy_map_size; - prot = src_entry->protection & ~VM_PROT_WRITE; + /* + * Can't do copy-on-write with misaligned mappings. 
+ * Replace the mappings with a physical copy of the original + * mappings' contents. + */ + target_copy_map_size = target_copy_map->size; + vm_map_copy_to_physcopy(target_copy_map, target_map); + *target_copy_map_p = target_copy_map; + *overmap_start_p = 0; + *overmap_end_p = target_copy_map->size - target_copy_map_size; + DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p); + return KERN_SUCCESS; + } - if (override_nx(map, src_entry->alias) && prot) - prot |= VM_PROT_EXECUTE; + /* apply the adjustments */ + misalignments = 0; + overmap_start = 0; + overmap_end = 0; + /* remove copy_map->offset, so that everything starts at offset 0 */ + addr_adjustment = copy_map->offset; + /* also remove whatever we trimmed from the start */ + addr_adjustment += *trimmed_start_p; + for (target_entry = vm_map_copy_first_entry(target_copy_map); + target_entry != vm_map_copy_to_entry(target_copy_map); + target_entry = target_entry->vme_next) { + vm_object_offset_t object_offset_start, object_offset_end; + + DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); + object_offset_start = VME_OFFSET(target_entry); + if (object_offset_start & target_page_mask) { + DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); + if (target_entry == vm_map_copy_first_entry(target_copy_map)) { + /* + * start of 1st entry is mis-aligned: + * re-adjust by over-mapping. + */ + overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask); + DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start); + VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start); + } else { + misalignments++; + DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments); + assert(copy); + } + } - vm_object_pmap_protect(object, - offset, - entry_size, - ((src_entry->is_shared - || map->mapped_in_other_pmaps) ? 
- PMAP_NULL : map->pmap), - src_entry->vme_start, - prot); + if (target_entry == vm_map_copy_first_entry(target_copy_map)) { + target_size += overmap_start; + } else { + target_entry->vme_start += overmap_start; + } + target_entry->vme_end += overmap_start; - src_entry->needs_copy = TRUE; + object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start; + if (object_offset_end & target_page_mask) { + DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); + if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) { + /* + * end of last entry is mis-aligned: re-adjust by over-mapping. + */ + overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end; + DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end); + target_entry->vme_end += overmap_end; + target_size += overmap_end; + } else { + misalignments++; + DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments); + assert(copy); } - /* - * Throw away the old object reference of the new entry. - */ - vm_object_deallocate(object); + } + target_entry->vme_start -= addr_adjustment; + target_entry->vme_end -= addr_adjustment; + DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry)); + } - } else { - new_entry->is_shared = FALSE; + target_copy_map->size = target_size; + target_copy_map->offset += overmap_start; + target_copy_map->offset -= addr_adjustment; + target_copy_map->cpy_hdr.page_shift = target_page_shift; - /* - * The map can be safely unlocked since we - * already hold a reference on the object. - * - * Record the timestamp of the map for later - * verification, and unlock the map. - */ - version.main_timestamp = map->timestamp; - vm_map_unlock(map); /* Increments timestamp once! */ +// assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask)); +// assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK)); + assert(overmap_start < VM_MAP_PAGE_SIZE(target_map)); + assert(overmap_end < VM_MAP_PAGE_SIZE(target_map)); - /* - * Perform the copy. 
- */ - if (src_entry->wired_count > 0) { - vm_object_lock(object); - result = vm_object_copy_slowly( - object, - offset, - entry_size, - THREAD_UNINT, - &new_entry->object.vm_object); + *target_copy_map_p = target_copy_map; + *overmap_start_p = overmap_start; + *overmap_end_p = overmap_end; - new_entry->offset = 0; - new_entry->needs_copy = FALSE; - } else { - result = vm_object_copy_strategically( - object, - offset, - entry_size, - &new_entry->object.vm_object, - &new_entry->offset, - &new_entry_needs_copy); + DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p); + return KERN_SUCCESS; +} - new_entry->needs_copy = new_entry_needs_copy; - } +kern_return_t +vm_map_range_physical_size( + vm_map_t map, + vm_map_address_t start, + mach_vm_size_t size, + mach_vm_size_t * phys_size) +{ + kern_return_t kr; + vm_map_copy_t copy_map, target_copy_map; + vm_map_offset_t adjusted_start, adjusted_end; + vm_map_size_t adjusted_size; + vm_prot_t cur_prot, max_prot; + vm_map_offset_t overmap_start, overmap_end, trimmed_start; + vm_map_kernel_flags_t vmk_flags; + + adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)); + adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map)); + adjusted_size = adjusted_end - adjusted_start; + *phys_size = adjusted_size; + if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) { + return KERN_SUCCESS; + } + if (start == 0) { + adjusted_start = vm_map_trunc_page(start, PAGE_MASK); + adjusted_end = vm_map_round_page(start + size, PAGE_MASK); + adjusted_size = adjusted_end - adjusted_start; + *phys_size = adjusted_size; + return KERN_SUCCESS; + } + if (adjusted_size == 0) { + DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx adjusted 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_size); + *phys_size = 0; + return KERN_SUCCESS; + } - /* - * Throw away the old object reference of the new entry. 
- */
- vm_object_deallocate(object);
+ vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+ vmk_flags.vmkf_copy_pageable = TRUE;
+ vmk_flags.vmkf_copy_same_map = TRUE;
+ assert(adjusted_size != 0);
+ cur_prot = VM_PROT_NONE; /* legacy mode */
+ max_prot = VM_PROT_NONE; /* legacy mode */
+ kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
+ FALSE /* copy */,
+ &copy_map,
+ &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
+ vmk_flags);
+ if (kr != KERN_SUCCESS) {
+ DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
+ //assert(0);
+ *phys_size = 0;
+ return kr;
+ }
+ assert(copy_map != VM_MAP_COPY_NULL);
+ target_copy_map = copy_map;
+ DEBUG4K_ADJUST("adjusting...\n");
+ kr = vm_map_copy_adjust_to_target(
+ copy_map,
+ start - adjusted_start, /* offset */
+ size, /* size */
+ kernel_map,
+ FALSE, /* copy */
+ &target_copy_map,
+ &overmap_start,
+ &overmap_end,
+ &trimmed_start);
+ if (kr == KERN_SUCCESS) {
+ if (target_copy_map->size != *phys_size) {
+ DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
+ }
+ *phys_size = target_copy_map->size;
+ } else {
+ DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
+ //assert(0);
+ *phys_size = 0;
+ }
+ vm_map_copy_discard(copy_map);
+ copy_map = VM_MAP_COPY_NULL;
- if (result != KERN_SUCCESS &&
- result != KERN_MEMORY_RESTART_COPY) {
- _vm_map_entry_dispose(map_header, new_entry);
- break;
- }
- /*
- * Verify that the map has not substantially
- * changed while the copy was being made.
- */
- vm_map_lock(map);
- if (version.main_timestamp + 1 != map->timestamp) {
- /*
- * Simple version comparison failed.
- *
- * Retry the lookup and verify that the
- * same object/offset are still present.
- */ - vm_object_deallocate(new_entry-> - object.vm_object); - _vm_map_entry_dispose(map_header, new_entry); - if (result == KERN_MEMORY_RESTART_COPY) - result = KERN_SUCCESS; - continue; - } +kern_return_t +memory_entry_check_for_adjustment( + vm_map_t src_map, + ipc_port_t port, + vm_map_offset_t *overmap_start, + vm_map_offset_t *overmap_end) +{ + kern_return_t kr = KERN_SUCCESS; + vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL; - if (result == KERN_MEMORY_RESTART_COPY) { - vm_object_reference(object); - goto RestartCopy; - } - } + assert(port); + assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port)); - _vm_map_store_entry_link(map_header, - map_header->links.prev, new_entry); + vm_named_entry_t named_entry; - /*Protections for submap mapping are irrelevant here*/ - if( !src_entry->is_sub_map ) { - *cur_protection &= src_entry->protection; - *max_protection &= src_entry->max_protection; - } - map_address += tmp_size; - mapped_size += tmp_size; - src_start += tmp_size; + named_entry = (vm_named_entry_t) ipc_kobject_get(port); + named_entry_lock(named_entry); + copy_map = named_entry->backing.copy; + target_copy_map = copy_map; - } /* end while */ + if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) { + vm_map_offset_t trimmed_start; - vm_map_unlock(map); - if (result != KERN_SUCCESS) { - /* - * Free all allocated elements. - */ - for (src_entry = map_header->links.next; - src_entry != (struct vm_map_entry *)&map_header->links; - src_entry = new_entry) { - new_entry = src_entry->vme_next; - _vm_map_store_entry_unlink(map_header, src_entry); - if (src_entry->is_sub_map) { - vm_map_deallocate(src_entry->object.sub_map); - } else { - vm_object_deallocate(src_entry->object.vm_object); - } - _vm_map_entry_dispose(map_header, src_entry); - } + trimmed_start = 0; + DEBUG4K_ADJUST("adjusting...\n"); + kr = vm_map_copy_adjust_to_target( + copy_map, + 0, /* offset */ + copy_map->size, /* size */ + src_map, + FALSE, /* copy */ + &target_copy_map, + overmap_start, + overmap_end, + &trimmed_start); + assert(trimmed_start == 0); } - return result; + named_entry_unlock(named_entry); + + return kr; } + /* * Routine: vm_remap * @@ -12887,132 +17766,266 @@ vm_map_remap_extract( */ kern_return_t vm_map_remap( - vm_map_t target_map, - vm_map_address_t *address, - vm_map_size_t size, - vm_map_offset_t mask, - int flags, - vm_map_t src_map, - vm_map_offset_t memory_address, - boolean_t copy, - vm_prot_t *cur_protection, - vm_prot_t *max_protection, - vm_inherit_t inheritance) -{ - kern_return_t result; - vm_map_entry_t entry; - vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL; - vm_map_entry_t new_entry; - struct vm_map_header map_header; - vm_map_offset_t offset_in_mapping; - - if (target_map == VM_MAP_NULL) + vm_map_t target_map, + vm_map_address_t *address, + vm_map_size_t size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + vm_tag_t tag, + vm_map_t src_map, + vm_map_offset_t memory_address, + boolean_t copy, + vm_prot_t *cur_protection, /* IN/OUT */ + vm_prot_t *max_protection, /* IN/OUT */ + vm_inherit_t inheritance) +{ + kern_return_t result; + vm_map_entry_t entry; + vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL; + vm_map_entry_t new_entry; + vm_map_copy_t copy_map; + vm_map_offset_t offset_in_mapping; + vm_map_size_t target_size = 0; + vm_map_size_t src_page_mask, target_page_mask; + vm_map_offset_t overmap_start, overmap_end, trimmed_start; + vm_map_offset_t 
initial_memory_address; + vm_map_size_t initial_size; + + if (target_map == VM_MAP_NULL) { return KERN_INVALID_ARGUMENT; + } + + initial_memory_address = memory_address; + initial_size = size; + src_page_mask = VM_MAP_PAGE_MASK(src_map); + target_page_mask = VM_MAP_PAGE_MASK(target_map); switch (inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: - if (size != 0 && src_map != VM_MAP_NULL) + if (size != 0 && src_map != VM_MAP_NULL) { break; - /*FALL THRU*/ + } + OS_FALLTHROUGH; default: return KERN_INVALID_ARGUMENT; } - /* - * If the user is requesting that we return the address of the - * first byte of the data (rather than the base of the page), - * then we use different rounding semantics: specifically, + if (src_page_mask != target_page_mask) { + if (copy) { + DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map)); + } else { + DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map)); + } + } + + /* + * If the user is requesting that we return the address of the + * first byte of the data (rather than the base of the page), + * then we use different rounding semantics: specifically, * we assume that (memory_address, size) describes a region * all of whose pages we must cover, rather than a base to be truncated * down and a size to be added to that base. So we figure out * the highest page that the requested region includes and make * sure that the size will cover it. - * - * The key example we're worried about it is of the form: * - * memory_address = 0x1ff0, size = 0x20 - * - * With the old semantics, we round down the memory_address to 0x1000 + * The key example we're worried about it is of the form: + * + * memory_address = 0x1ff0, size = 0x20 + * + * With the old semantics, we round down the memory_address to 0x1000 * and round up the size to 0x1000, resulting in our covering *only* * page 0x1000. With the new semantics, we'd realize that the region covers - * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page + * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page * 0x1000 and page 0x2000 in the region we remap. */ if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { - offset_in_mapping = memory_address - vm_map_trunc_page(memory_address, PAGE_MASK); - size = vm_map_round_page(memory_address + size - vm_map_trunc_page(memory_address, PAGE_MASK), PAGE_MASK); + vm_map_offset_t range_start, range_end; + + range_start = vm_map_trunc_page(memory_address, src_page_mask); + range_end = vm_map_round_page(memory_address + size, src_page_mask); + memory_address = range_start; + size = range_end - range_start; + offset_in_mapping = initial_memory_address - memory_address; } else { - size = vm_map_round_page(size, PAGE_MASK); - } + /* + * IMPORTANT: + * This legacy code path is broken: for the range mentioned + * above [ memory_address = 0x1ff0,size = 0x20 ], which spans + * two 4k pages, it yields [ memory_address = 0x1000, + * size = 0x1000 ], which covers only the first 4k page. + * BUT some code unfortunately depends on this bug, so we + * can't fix it without breaking something. + * New code should get automatically opted in the new + * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags. 
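 *
 * A worked example of the two paths, assuming a 4K source page
 * mask (src_page_mask == 0xfff) and the [ 0x1ff0, 0x20 ] request
 * discussed above:
 *
 *	VM_FLAGS_RETURN_DATA_ADDR:  range_start = 0x1000,
 *	    range_end = 0x3000, size = 0x2000 (pages 0x1000 and
 *	    0x2000 both covered), offset_in_mapping = 0xff0
 *	legacy path:  memory_address = 0x1000, size = 0x1000
 *	    (only page 0x1000 covered), offset_in_mapping = 0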
 */
+ offset_in_mapping = 0;
+ memory_address = vm_map_trunc_page(memory_address, src_page_mask);
+ size = vm_map_round_page(size, src_page_mask);
+ initial_memory_address = memory_address;
+ initial_size = size;
+ }
+
+
+ if (size == 0) {
+ return KERN_INVALID_ARGUMENT;
+ }
+
+ if (flags & VM_FLAGS_RESILIENT_MEDIA) {
+ /* must be copy-on-write to be "media resilient" */
+ if (!copy) {
+ return KERN_INVALID_ARGUMENT;
+ }
+ }
- result = vm_map_remap_extract(src_map, memory_address,
- size, copy, &map_header,
- cur_protection,
- max_protection,
- inheritance,
- target_map->hdr.entries_pageable);
+ vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
+ vmk_flags.vmkf_copy_same_map = (src_map == target_map);
+ assert(size != 0);
+ result = vm_map_copy_extract(src_map,
+ memory_address,
+ size,
+ copy, &copy_map,
+ cur_protection, /* IN/OUT */
+ max_protection, /* IN/OUT */
+ inheritance,
+ vmk_flags);
 if (result != KERN_SUCCESS) {
 return result;
 }
+ assert(copy_map != VM_MAP_COPY_NULL);
+
+ overmap_start = 0;
+ overmap_end = 0;
+ trimmed_start = 0;
+ target_size = size;
+ if (src_page_mask != target_page_mask) {
+ vm_map_copy_t target_copy_map;
+
+ target_copy_map = copy_map; /* can modify "copy_map" itself */
+ DEBUG4K_ADJUST("adjusting...\n");
+ result = vm_map_copy_adjust_to_target(
+ copy_map,
+ offset_in_mapping, /* offset */
+ initial_size,
+ target_map,
+ copy,
+ &target_copy_map,
+ &overmap_start,
+ &overmap_end,
+ &trimmed_start);
+ if (result != KERN_SUCCESS) {
+ DEBUG4K_COPY("failed to adjust 0x%x\n", result);
+ vm_map_copy_discard(copy_map);
+ return result;
+ }
+ if (trimmed_start == 0) {
+ /* nothing trimmed: no adjustment needed */
+ } else if (trimmed_start >= offset_in_mapping) {
+ /* trimmed more than offset_in_mapping: nothing left */
+ assert(overmap_start == 0);
+ assert(overmap_end == 0);
+ offset_in_mapping = 0;
+ } else {
+ /* trimmed some of offset_in_mapping: adjust */
+ assert(overmap_start == 0);
+ assert(overmap_end == 0);
+ offset_in_mapping -= trimmed_start;
+ }
+ offset_in_mapping += overmap_start;
+ target_size = target_copy_map->size;
+ }
 /*
 * Allocate/check a range of free virtual address
 * space for the target
 */
- *address = vm_map_trunc_page(*address,
- VM_MAP_PAGE_MASK(target_map));
+ *address = vm_map_trunc_page(*address, target_page_mask);
 vm_map_lock(target_map);
- result = vm_map_remap_range_allocate(target_map, address, size,
- mask, flags, &insp_entry);
-
- for (entry = map_header.links.next;
- entry != (struct vm_map_entry *)&map_header.links;
- entry = new_entry) {
+ target_size = vm_map_round_page(target_size, target_page_mask);
+ result = vm_map_remap_range_allocate(target_map, address,
+ target_size,
+ mask, flags, vmk_flags, tag,
+ &insp_entry);
+
+ for (entry = vm_map_copy_first_entry(copy_map);
+ entry != vm_map_copy_to_entry(copy_map);
+ entry = new_entry) {
 new_entry = entry->vme_next;
- _vm_map_store_entry_unlink(&map_header, entry);
+ vm_map_copy_entry_unlink(copy_map, entry);
 if (result == KERN_SUCCESS) {
+ if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
+ /* no codesigning -> read-only access */
+ entry->max_protection = VM_PROT_READ;
+ entry->protection = VM_PROT_READ;
+ entry->vme_resilient_codesign = TRUE;
+ }
 entry->vme_start += *address;
 entry->vme_end += *address;
 assert(!entry->map_aligned);
+ if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
+ !entry->is_sub_map &&
+ (VME_OBJECT(entry) == VM_OBJECT_NULL ||
+ VME_OBJECT(entry)->internal)) {
+ entry->vme_resilient_media =
TRUE; + } + assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK))); + assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK))); + assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK))); + vm_map_store_entry_link(target_map, insp_entry, entry, + vmk_flags); insp_entry = entry; } else { if (!entry->is_sub_map) { - vm_object_deallocate(entry->object.vm_object); + vm_object_deallocate(VME_OBJECT(entry)); } else { - vm_map_deallocate(entry->object.sub_map); + vm_map_deallocate(VME_SUBMAP(entry)); } - _vm_map_entry_dispose(&map_header, entry); + vm_map_copy_entry_dispose(copy_map, entry); } } - if( target_map->disable_vmentry_reuse == TRUE) { - if( target_map->highest_entry_end < insp_entry->vme_end ){ + if (flags & VM_FLAGS_RESILIENT_CODESIGN) { + *cur_protection = VM_PROT_READ; + *max_protection = VM_PROT_READ; + } + + if (target_map->disable_vmentry_reuse == TRUE) { + assert(!target_map->is_nested_map); + if (target_map->highest_entry_end < insp_entry->vme_end) { target_map->highest_entry_end = insp_entry->vme_end; } } if (result == KERN_SUCCESS) { - target_map->size += size; + target_map->size += target_size; SAVE_HINT_MAP_WRITE(target_map, insp_entry); + } vm_map_unlock(target_map); - if (result == KERN_SUCCESS && target_map->wiring_required) - result = vm_map_wire(target_map, *address, - *address + size, *cur_protection, TRUE); + if (result == KERN_SUCCESS && target_map->wiring_required) { + result = vm_map_wire_kernel(target_map, *address, + *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK, + TRUE); + } - /* - * If requested, return the address of the data pointed to by the + /* + * If requested, return the address of the data pointed to by the * request, rather than the base of the resulting page. */ if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) { *address += offset_in_mapping; } + if (src_page_mask != target_page_mask) { + DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result); + } + vm_map_copy_discard(copy_map); + copy_map = VM_MAP_COPY_NULL; + return result; } @@ -13029,56 +18042,107 @@ vm_map_remap( static kern_return_t vm_map_remap_range_allocate( - vm_map_t map, - vm_map_address_t *address, /* IN/OUT */ - vm_map_size_t size, - vm_map_offset_t mask, - int flags, - vm_map_entry_t *map_entry) /* OUT */ + vm_map_t map, + vm_map_address_t *address, /* IN/OUT */ + vm_map_size_t size, + vm_map_offset_t mask, + int flags, + vm_map_kernel_flags_t vmk_flags, + __unused vm_tag_t tag, + vm_map_entry_t *map_entry) /* OUT */ { - vm_map_entry_t entry; - vm_map_offset_t start; - vm_map_offset_t end; - kern_return_t kr; + vm_map_entry_t entry; + vm_map_offset_t start; + vm_map_offset_t end; + vm_map_offset_t desired_empty_end; + kern_return_t kr; + vm_map_entry_t hole_entry; -StartAgain: ; +StartAgain:; start = *address; - if (flags & VM_FLAGS_ANYWHERE) - { + if (flags & VM_FLAGS_ANYWHERE) { + if (flags & VM_FLAGS_RANDOM_ADDR) { + /* + * Get a random start address. + */ + kr = vm_map_random_address_for_size(map, address, size); + if (kr != KERN_SUCCESS) { + return kr; + } + start = *address; + } + /* * Calculate the first possible address. 
*/ - if (start < map->min_offset) + if (start < map->min_offset) { start = map->min_offset; - if (start > map->max_offset) - return(KERN_NO_SPACE); - + } + if (start > map->max_offset) { + return KERN_NO_SPACE; + } + /* * Look for the first possible address; * if there's already something at this * address, we have to start after it. */ - if( map->disable_vmentry_reuse == TRUE) { + if (map->disable_vmentry_reuse == TRUE) { VM_MAP_HIGHEST_ENTRY(map, entry, start); } else { - assert(first_free_is_valid(map)); - if (start == map->min_offset) { - if ((entry = map->first_free) != vm_map_to_entry(map)) - start = entry->vme_end; + if (map->holelistenabled) { + hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list); + + if (hole_entry == NULL) { + /* + * No more space in the map? + */ + return KERN_NO_SPACE; + } else { + boolean_t found_hole = FALSE; + + do { + if (hole_entry->vme_start >= start) { + start = hole_entry->vme_start; + found_hole = TRUE; + break; + } + + if (hole_entry->vme_end > start) { + found_hole = TRUE; + break; + } + hole_entry = hole_entry->vme_next; + } while (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)); + + if (found_hole == FALSE) { + return KERN_NO_SPACE; + } + + entry = hole_entry; + } } else { - vm_map_entry_t tmp_entry; - if (vm_map_lookup_entry(map, start, &tmp_entry)) - start = tmp_entry->vme_end; - entry = tmp_entry; + assert(first_free_is_valid(map)); + if (start == map->min_offset) { + if ((entry = map->first_free) != vm_map_to_entry(map)) { + start = entry->vme_end; + } + } else { + vm_map_entry_t tmp_entry; + if (vm_map_lookup_entry(map, start, &tmp_entry)) { + start = tmp_entry->vme_end; + } + entry = tmp_entry; + } } start = vm_map_round_page(start, - VM_MAP_PAGE_MASK(map)); + VM_MAP_PAGE_MASK(map)); } - + /* * In any case, the "entry" always precedes * the proposed new region throughout the @@ -13086,7 +18150,7 @@ StartAgain: ; */ while (TRUE) { - register vm_map_entry_t next; + vm_map_entry_t next; /* * Find the end of the proposed new region. @@ -13096,16 +18160,20 @@ StartAgain: ; end = ((start + mask) & ~mask); end = vm_map_round_page(end, - VM_MAP_PAGE_MASK(map)); - if (end < start) - return(KERN_NO_SPACE); + VM_MAP_PAGE_MASK(map)); + if (end < start) { + return KERN_NO_SPACE; + } start = end; end += size; - if ((end > map->max_offset) || (end < start)) { + /* We want an entire page of empty space, but don't increase the allocation size. */ + desired_empty_end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map)); + + if ((desired_empty_end > map->max_offset) || (desired_empty_end < start)) { if (map->wait_for_space) { if (size <= (map->max_offset - - map->min_offset)) { + map->min_offset)) { assert_wait((event_t) map, THREAD_INTERRUPTIBLE); vm_map_unlock(map); thread_block(THREAD_CONTINUE_NULL); @@ -13113,45 +18181,73 @@ StartAgain: ; goto StartAgain; } } - - return(KERN_NO_SPACE); - } - /* - * If there are no more entries, we must win. - */ + return KERN_NO_SPACE; + } next = entry->vme_next; - if (next == vm_map_to_entry(map)) - break; - /* - * If there is another entry, it must be - * after the end of the potential new region. - */ + if (map->holelistenabled) { + if (entry->vme_end >= desired_empty_end) { + break; + } + } else { + /* + * If there are no more entries, we must win. + * + * OR + * + * If there is another entry, it must be + * after the end of the potential new region. 
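 *
 * (In the map->holelistenabled case handled just above, the same
 *  test is expressed as entry->vme_end >= desired_empty_end,
 *  since each hole entry directly describes a free range.)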
+ */ - if (next->vme_start >= end) - break; + if (next == vm_map_to_entry(map)) { + break; + } + + if (next->vme_start >= desired_empty_end) { + break; + } + } /* * Didn't fit -- move to the next entry. */ entry = next; - start = entry->vme_end; + + if (map->holelistenabled) { + if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) { + /* + * Wrapped around + */ + return KERN_NO_SPACE; + } + start = entry->vme_start; + } else { + start = entry->vme_end; + } } + + if (map->holelistenabled) { + if (vm_map_lookup_entry(map, entry->vme_start, &entry)) { + panic("Found an existing entry (%p) instead of potential hole at address: 0x%llx.\n", entry, (unsigned long long)entry->vme_start); + } + } + *address = start; } else { - vm_map_entry_t temp_entry; - + vm_map_entry_t temp_entry; + /* * Verify that: * the address doesn't itself violate * the mask requirement. */ - if ((start & mask) != 0) - return(KERN_NO_SPACE); + if ((start & mask) != 0) { + return KERN_NO_SPACE; + } /* @@ -13163,7 +18259,7 @@ StartAgain: ; if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) { - return(KERN_INVALID_ADDRESS); + return KERN_INVALID_ADDRESS; } /* @@ -13172,6 +18268,7 @@ StartAgain: ; */ if (flags & VM_FLAGS_OVERWRITE) { vm_map_t zap_map; + int remove_flags = VM_MAP_REMOVE_SAVE_ENTRIES | VM_MAP_REMOVE_NO_MAP_ALIGN; /* * We use a "zap_map" to avoid having to unlock @@ -13180,21 +18277,24 @@ StartAgain: ; * combination. */ zap_map = vm_map_create(PMAP_NULL, - start, - end, - map->hdr.entries_pageable); + start, + end, + map->hdr.entries_pageable); if (zap_map == VM_MAP_NULL) { return KERN_RESOURCE_SHORTAGE; } vm_map_set_page_shift(zap_map, VM_MAP_PAGE_SHIFT(map)); + vm_map_disable_hole_optimization(zap_map); + if (vmk_flags.vmkf_overwrite_immutable) { + remove_flags |= VM_MAP_REMOVE_IMMUTABLE; + } kr = vm_map_delete(map, start, end, - (VM_MAP_REMOVE_SAVE_ENTRIES | - VM_MAP_REMOVE_NO_MAP_ALIGN), - zap_map); + remove_flags, + zap_map); if (kr == KERN_SUCCESS) { vm_map_destroy(zap_map, - VM_MAP_REMOVE_NO_PMAP_CLEANUP); + VM_MAP_REMOVE_NO_PMAP_CLEANUP); zap_map = VM_MAP_NULL; } } @@ -13203,8 +18303,9 @@ StartAgain: ; * ... 
the starting address isn't allocated */ - if (vm_map_lookup_entry(map, start, &temp_entry)) - return(KERN_NO_SPACE); + if (vm_map_lookup_entry(map, start, &temp_entry)) { + return KERN_NO_SPACE; + } entry = temp_entry; @@ -13214,11 +18315,12 @@ StartAgain: ; */ if ((entry->vme_next != vm_map_to_entry(map)) && - (entry->vme_next->vme_start < end)) - return(KERN_NO_SPACE); + (entry->vme_next->vme_start < end)) { + return KERN_NO_SPACE; + } } *map_entry = entry; - return(KERN_SUCCESS); + return KERN_SUCCESS; } /* @@ -13229,11 +18331,11 @@ StartAgain: ; vm_map_t vm_map_switch( - vm_map_t map) + vm_map_t map) { - int mycpu; - thread_t thread = current_thread(); - vm_map_t oldmap = thread->map; + int mycpu; + thread_t thread = current_thread(); + vm_map_t oldmap = thread->map; mp_disable_preemption(); mycpu = cpu_number(); @@ -13244,7 +18346,7 @@ vm_map_switch( PMAP_SWITCH_USER(thread, map, mycpu); mp_enable_preemption(); - return(oldmap); + return oldmap; } @@ -13262,19 +18364,19 @@ vm_map_switch( */ kern_return_t vm_map_write_user( - vm_map_t map, - void *src_p, - vm_map_address_t dst_addr, - vm_size_t size) + vm_map_t map, + void *src_p, + vm_map_address_t dst_addr, + vm_size_t size) { - kern_return_t kr = KERN_SUCCESS; + kern_return_t kr = KERN_SUCCESS; - if(current_map() == map) { + if (current_map() == map) { if (copyout(src_p, dst_addr, size)) { kr = KERN_INVALID_ADDRESS; } } else { - vm_map_t oldmap; + vm_map_t oldmap; /* take on the identity of the target map while doing */ /* the transfer */ @@ -13304,19 +18406,19 @@ vm_map_write_user( */ kern_return_t vm_map_read_user( - vm_map_t map, - vm_map_address_t src_addr, - void *dst_p, - vm_size_t size) + vm_map_t map, + vm_map_address_t src_addr, + void *dst_p, + vm_size_t size) { - kern_return_t kr = KERN_SUCCESS; + kern_return_t kr = KERN_SUCCESS; - if(current_map() == map) { + if (current_map() == map) { if (copyin(src_addr, dst_p, size)) { kr = KERN_INVALID_ADDRESS; } } else { - vm_map_t oldmap; + vm_map_t oldmap; /* take on the identity of the target map while doing */ /* the transfer */ @@ -13342,22 +18444,21 @@ vm_map_read_user( */ boolean_t vm_map_check_protection(vm_map_t map, vm_map_offset_t start, - vm_map_offset_t end, vm_prot_t protection) + vm_map_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; vm_map_lock(map); - if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) - { + if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) { vm_map_unlock(map); - return (FALSE); + return FALSE; } if (!vm_map_lookup_entry(map, start, &tmp_entry)) { vm_map_unlock(map); - return(FALSE); + return FALSE; } entry = tmp_entry; @@ -13365,7 +18466,7 @@ vm_map_check_protection(vm_map_t map, vm_map_offset_t start, while (start < end) { if (entry == vm_map_to_entry(map)) { vm_map_unlock(map); - return(FALSE); + return FALSE; } /* @@ -13374,7 +18475,7 @@ vm_map_check_protection(vm_map_t map, vm_map_offset_t start, if (start < entry->vme_start) { vm_map_unlock(map); - return(FALSE); + return FALSE; } /* @@ -13383,7 +18484,7 @@ vm_map_check_protection(vm_map_t map, vm_map_offset_t start, if ((entry->protection & protection) != protection) { vm_map_unlock(map); - return(FALSE); + return FALSE; } /* go to next entry */ @@ -13392,52 +18493,56 @@ vm_map_check_protection(vm_map_t map, vm_map_offset_t start, entry = entry->vme_next; } vm_map_unlock(map); - return(TRUE); + return TRUE; } kern_return_t vm_map_purgable_control( - vm_map_t map, - vm_map_offset_t address, - vm_purgable_t control, - 
int *state) + vm_map_t map, + vm_map_offset_t address, + vm_purgable_t control, + int *state) { - vm_map_entry_t entry; - vm_object_t object; - kern_return_t kr; - boolean_t was_nonvolatile; + vm_map_entry_t entry; + vm_object_t object; + kern_return_t kr; + boolean_t was_nonvolatile; /* * Vet all the input parameters and current type and state of the * underlaying object. Return with an error if anything is amiss. */ - if (map == VM_MAP_NULL) - return(KERN_INVALID_ARGUMENT); + if (map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } if (control != VM_PURGABLE_SET_STATE && control != VM_PURGABLE_GET_STATE && - control != VM_PURGABLE_PURGE_ALL) - return(KERN_INVALID_ARGUMENT); + control != VM_PURGABLE_PURGE_ALL && + control != VM_PURGABLE_SET_STATE_FROM_KERNEL) { + return KERN_INVALID_ARGUMENT; + } if (control == VM_PURGABLE_PURGE_ALL) { vm_purgeable_object_purge_all(); return KERN_SUCCESS; } - if (control == VM_PURGABLE_SET_STATE && + if ((control == VM_PURGABLE_SET_STATE || + control == VM_PURGABLE_SET_STATE_FROM_KERNEL) && (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) || - ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) - return(KERN_INVALID_ARGUMENT); + ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) { + return KERN_INVALID_ARGUMENT; + } vm_map_lock_read(map); if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) { - /* * Must pass a valid non-submap address. */ vm_map_unlock_read(map); - return(KERN_INVALID_ADDRESS); + return KERN_INVALID_ADDRESS; } if ((entry->protection & VM_PROT_WRITE) == 0) { @@ -13445,10 +18550,10 @@ vm_map_purgable_control( * Can't apply purgable controls to something you can't write. */ vm_map_unlock_read(map); - return(KERN_PROTECTION_FAILURE); + return KERN_PROTECTION_FAILURE; } - object = entry->object.vm_object; + object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL || object->purgable == VM_PURGABLE_DENY) { /* @@ -13457,11 +18562,11 @@ vm_map_purgable_control( vm_map_unlock_read(map); return KERN_INVALID_ARGUMENT; } - + vm_object_lock(object); #if 00 - if (entry->offset != 0 || + if (VME_OFFSET(entry) != 0 || entry->vme_end - entry->vme_start != object->vo_size) { /* * Can only apply purgable controls to the whole (existing) @@ -13473,286 +18578,691 @@ vm_map_purgable_control( } #endif - assert(!entry->is_sub_map); - assert(!entry->use_pmap); /* purgeable has its own accounting */ + assert(!entry->is_sub_map); + assert(!entry->use_pmap); /* purgeable has its own accounting */ + + vm_map_unlock_read(map); + + was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE); + + kr = vm_object_purgable_control(object, control, state); + + if (was_nonvolatile && + object->purgable != VM_PURGABLE_NONVOLATILE && + map->pmap == kernel_pmap) { +#if DEBUG + object->vo_purgeable_volatilizer = kernel_task; +#endif /* DEBUG */ + } + + vm_object_unlock(object); + + return kr; +} + +void +vm_map_footprint_query_page_info( + vm_map_t map, + vm_map_entry_t map_entry, + vm_map_offset_t curr_s_offset, + int *disposition_p) +{ + int pmap_disp; + vm_object_t object; + int disposition; + int effective_page_size; + + vm_map_lock_assert_held(map); + assert(!map->has_corpse_footprint); + assert(curr_s_offset >= map_entry->vme_start); + assert(curr_s_offset < map_entry->vme_end); + + object = VME_OBJECT(map_entry); + if (object == VM_OBJECT_NULL) { + *disposition_p = 0; + return; + } + + effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map)); + + pmap_disp = 0; + if (object == VM_OBJECT_NULL) { + /* nothing mapped here: no 
need to ask */ + *disposition_p = 0; + return; + } else if (map_entry->is_sub_map && + !map_entry->use_pmap) { + /* nested pmap: no footprint */ + *disposition_p = 0; + return; + } + + /* + * Query the pmap. + */ + pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp); + + /* + * Compute this page's disposition. + */ + disposition = 0; + + /* deal with "alternate accounting" first */ + if (!map_entry->is_sub_map && + object->vo_no_footprint) { + /* does not count in footprint */ + assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + } else if (!map_entry->is_sub_map && + (object->purgable == VM_PURGABLE_NONVOLATILE || + (object->purgable == VM_PURGABLE_DENY && + object->vo_ledger_tag)) && + VM_OBJECT_OWNER(object) != NULL && + VM_OBJECT_OWNER(object)->map == map) { + assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + if ((((curr_s_offset + - map_entry->vme_start + + VME_OFFSET(map_entry)) + / effective_page_size) < + (object->resident_page_count + + vm_compressor_pager_get_count(object->pager)))) { + /* + * Non-volatile purgeable object owned + * by this task: report the first + * "#resident + #compressed" pages as + * "resident" (to show that they + * contribute to the footprint) but not + * "dirty" (to avoid double-counting + * with the fake "non-volatile" region + * we'll report at the end of the + * address space to account for all + * (mapped or not) non-volatile memory + * owned by this task. + */ + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + } + } else if (!map_entry->is_sub_map && + (object->purgable == VM_PURGABLE_VOLATILE || + object->purgable == VM_PURGABLE_EMPTY) && + VM_OBJECT_OWNER(object) != NULL && + VM_OBJECT_OWNER(object)->map == map) { + assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + if ((((curr_s_offset + - map_entry->vme_start + + VME_OFFSET(map_entry)) + / effective_page_size) < + object->wired_page_count)) { + /* + * Volatile|empty purgeable object owned + * by this task: report the first + * "#wired" pages as "resident" (to + * show that they contribute to the + * footprint) but not "dirty" (to avoid + * double-counting with the fake + * "non-volatile" region we'll report + * at the end of the address space to + * account for all (mapped or not) + * non-volatile memory owned by this + * task. + */ + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + } + } else if (!map_entry->is_sub_map && + map_entry->iokit_acct && + object->internal && + object->purgable == VM_PURGABLE_DENY) { + /* + * Non-purgeable IOKit memory: phys_footprint + * includes the entire virtual mapping. + */ + assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + disposition |= VM_PAGE_QUERY_PAGE_DIRTY; + } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT | + PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) { + /* alternate accounting */ +#if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) + if (map->pmap->footprint_was_suspended) { + /* + * The assertion below can fail if dyld + * suspended footprint accounting + * while doing some adjustments to + * this page; the mapping would say + * "use pmap accounting" but the page + * would be marked "alternate + * accounting". 
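A minimal user-space sketch of the purgeable path handled by vm_map_purgable_control() above, using the public mach_vm_purgable_control() and VM_FLAGS_PURGABLE interfaces (error handling trimmed; the allocation size is arbitrary):

#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_purgable.h>
#include <mach/vm_statistics.h>
#include <stdio.h>

int
main(void)
{
	mach_vm_address_t addr = 0;
	mach_vm_size_t size = 4 * 1024 * 1024;
	int state;
	kern_return_t kr;

	/* Back the region with a purgeable VM object. */
	kr = mach_vm_allocate(mach_task_self(), &addr, size,
	    VM_FLAGS_ANYWHERE | VM_FLAGS_PURGABLE);
	if (kr != KERN_SUCCESS) {
		return 1;
	}

	/* Volatile: the kernel may reclaim these pages under memory pressure. */
	state = VM_PURGABLE_VOLATILE;
	(void)mach_vm_purgable_control(mach_task_self(), addr,
	    VM_PURGABLE_SET_STATE, &state);

	/*
	 * Take it back for use.  On return, "state" holds the previous state;
	 * VM_PURGABLE_EMPTY means the contents were purged in the meantime.
	 */
	state = VM_PURGABLE_NONVOLATILE;
	kr = mach_vm_purgable_control(mach_task_self(), addr,
	    VM_PURGABLE_SET_STATE, &state);
	if (kr == KERN_SUCCESS && state == VM_PURGABLE_EMPTY) {
		printf("contents were purged while volatile\n");
	}

	mach_vm_deallocate(mach_task_self(), addr, size);
	return 0;
}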
+ */ + } else +#endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */ + { + assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + } + disposition = 0; + } else { + if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) { + assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + disposition |= VM_PAGE_QUERY_PAGE_REF; + if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) { + disposition |= VM_PAGE_QUERY_PAGE_DIRTY; + } else { + disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL; + } + if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) { + disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; + } + } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) { + assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry); + disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; + } + } + + *disposition_p = disposition; +} + +kern_return_t +vm_map_page_query_internal( + vm_map_t target_map, + vm_map_offset_t offset, + int *disposition, + int *ref_count) +{ + kern_return_t kr; + vm_page_info_basic_data_t info; + mach_msg_type_number_t count; + + count = VM_PAGE_INFO_BASIC_COUNT; + kr = vm_map_page_info(target_map, + offset, + VM_PAGE_INFO_BASIC, + (vm_page_info_t) &info, + &count); + if (kr == KERN_SUCCESS) { + *disposition = info.disposition; + *ref_count = info.ref_count; + } else { + *disposition = 0; + *ref_count = 0; + } + + return kr; +} + +kern_return_t +vm_map_page_info( + vm_map_t map, + vm_map_offset_t offset, + vm_page_info_flavor_t flavor, + vm_page_info_t info, + mach_msg_type_number_t *count) +{ + return vm_map_page_range_info_internal(map, + offset, /* start of range */ + (offset + 1), /* this will get rounded in the call to the page boundary */ + (int)-1, /* effective_page_shift: unspecified */ + flavor, + info, + count); +} + +kern_return_t +vm_map_page_range_info_internal( + vm_map_t map, + vm_map_offset_t start_offset, + vm_map_offset_t end_offset, + int effective_page_shift, + vm_page_info_flavor_t flavor, + vm_page_info_t info, + mach_msg_type_number_t *count) +{ + vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL; + vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL; + vm_page_t m = VM_PAGE_NULL; + kern_return_t retval = KERN_SUCCESS; + int disposition = 0; + int ref_count = 0; + int depth = 0, info_idx = 0; + vm_page_info_basic_t basic_info = 0; + vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0; + vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0; + boolean_t do_region_footprint; + ledger_amount_t ledger_resident, ledger_compressed; + int effective_page_size; + vm_map_offset_t effective_page_mask; + + switch (flavor) { + case VM_PAGE_INFO_BASIC: + if (*count != VM_PAGE_INFO_BASIC_COUNT) { + /* + * The "vm_page_info_basic_data" structure was not + * properly padded, so allow the size to be off by + * one to maintain backwards binary compatibility... 
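The vm_map_page_query_internal()/vm_map_page_info() pair above backs the page-query interface visible to user space; a minimal sketch of that side, assuming the standard mach_vm_page_query() call (error handling trimmed):

#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_statistics.h>
#include <stdio.h>

static void
report_page(mach_vm_address_t addr)
{
	integer_t disposition = 0, ref_count = 0;
	kern_return_t kr;

	kr = mach_vm_page_query(mach_task_self(), addr, &disposition, &ref_count);
	if (kr != KERN_SUCCESS) {
		printf("0x%llx: kr=%d\n", (unsigned long long)addr, kr);
		return;
	}
	printf("0x%llx: present=%d dirty=%d paged_out=%d refs=%d\n",
	    (unsigned long long)addr,
	    !!(disposition & VM_PAGE_QUERY_PAGE_PRESENT),
	    !!(disposition & VM_PAGE_QUERY_PAGE_DIRTY),
	    !!(disposition & VM_PAGE_QUERY_PAGE_PAGED_OUT),
	    (int)ref_count);
}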
+ */ + if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) { + return KERN_INVALID_ARGUMENT; + } + } + break; + default: + return KERN_INVALID_ARGUMENT; + } + + if (effective_page_shift == -1) { + effective_page_shift = vm_self_region_page_shift_safely(map); + if (effective_page_shift == -1) { + return KERN_INVALID_ARGUMENT; + } + } + effective_page_size = (1 << effective_page_shift); + effective_page_mask = effective_page_size - 1; + + do_region_footprint = task_self_region_footprint(); + disposition = 0; + ref_count = 0; + depth = 0; + info_idx = 0; /* Tracks the next index within the info structure to be filled.*/ + retval = KERN_SUCCESS; + + offset_in_page = start_offset & effective_page_mask; + start = vm_map_trunc_page(start_offset, effective_page_mask); + end = vm_map_round_page(end_offset, effective_page_mask); + + if (end < start) { + return KERN_INVALID_ARGUMENT; + } + + assert((end - start) <= MAX_PAGE_RANGE_QUERY); + + vm_map_lock_read(map); + + task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed); + + for (curr_s_offset = start; curr_s_offset < end;) { + /* + * New lookup needs reset of these variables. + */ + curr_object = object = VM_OBJECT_NULL; + offset_in_object = 0; + ref_count = 0; + depth = 0; + + if (do_region_footprint && + curr_s_offset >= vm_map_last_entry(map)->vme_end) { + /* + * Request for "footprint" info about a page beyond + * the end of address space: this must be for + * the fake region vm_map_region_recurse_64() + * reported to account for non-volatile purgeable + * memory owned by this task. + */ + disposition = 0; + + if (curr_s_offset - vm_map_last_entry(map)->vme_end <= + (unsigned) ledger_compressed) { + /* + * We haven't reported all the "non-volatile + * compressed" pages yet, so report this fake + * page as "compressed". + */ + disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; + } else { + /* + * We've reported all the non-volatile + * compressed page but not all the non-volatile + * pages , so report this fake page as + * "resident dirty". + */ + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; + disposition |= VM_PAGE_QUERY_PAGE_DIRTY; + disposition |= VM_PAGE_QUERY_PAGE_REF; + } + switch (flavor) { + case VM_PAGE_INFO_BASIC: + basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); + basic_info->disposition = disposition; + basic_info->ref_count = 1; + basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); + basic_info->offset = 0; + basic_info->depth = 0; + + info_idx++; + break; + } + curr_s_offset += effective_page_size; + continue; + } + + /* + * First, find the map entry covering "curr_s_offset", going down + * submaps if necessary. + */ + if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) { + /* no entry -> no object -> no page */ + + if (curr_s_offset < vm_map_min(map)) { + /* + * Illegal address that falls below map min. + */ + curr_e_offset = MIN(end, vm_map_min(map)); + } else if (curr_s_offset >= vm_map_max(map)) { + /* + * Illegal address that falls on/after map max. + */ + curr_e_offset = end; + } else if (map_entry == vm_map_to_entry(map)) { + /* + * Hit a hole. + */ + if (map_entry->vme_next == vm_map_to_entry(map)) { + /* + * Empty map. + */ + curr_e_offset = MIN(map->max_offset, end); + } else { + /* + * Hole at start of the map. + */ + curr_e_offset = MIN(map_entry->vme_next->vme_start, end); + } + } else { + if (map_entry->vme_next == vm_map_to_entry(map)) { + /* + * Hole at the end of the map. 
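A small stand-alone model of the classification just above, for pages of the fake "non-volatile purgeable" region reported past the end of the address space (the function name and parameters are illustrative; offsets and ledger amounts are in bytes, as in the code):

#include <mach/vm_statistics.h>
#include <stdint.h>

static int
fake_region_disposition(uint64_t offset_past_last_entry,
    uint64_t ledger_compressed_bytes)
{
	if (offset_past_last_entry <= ledger_compressed_bytes) {
		/* still within the "non-volatile compressed" portion */
		return VM_PAGE_QUERY_PAGE_PAGED_OUT;
	}
	/* the rest of the owned non-volatile memory: resident and dirty */
	return VM_PAGE_QUERY_PAGE_PRESENT |
	       VM_PAGE_QUERY_PAGE_DIRTY |
	       VM_PAGE_QUERY_PAGE_REF;
}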
+ */ + curr_e_offset = MIN(map->max_offset, end); + } else { + curr_e_offset = MIN(map_entry->vme_next->vme_start, end); + } + } + + assert(curr_e_offset >= curr_s_offset); + + uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift; + + void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); + + bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic)); + + curr_s_offset = curr_e_offset; + + info_idx += num_pages; + + continue; + } + + /* compute offset from this map entry's start */ + offset_in_object = curr_s_offset - map_entry->vme_start; + + /* compute offset into this map entry's object (or submap) */ + offset_in_object += VME_OFFSET(map_entry); + + if (map_entry->is_sub_map) { + vm_map_t sub_map = VM_MAP_NULL; + vm_page_info_t submap_info = 0; + vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0; + + range_len = MIN(map_entry->vme_end, end) - curr_s_offset; + + submap_s_offset = offset_in_object; + submap_e_offset = submap_s_offset + range_len; + + sub_map = VME_SUBMAP(map_entry); + + vm_map_reference(sub_map); + vm_map_unlock_read(map); - vm_map_unlock_read(map); + submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); - was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE); + assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map), + "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map)); - kr = vm_object_purgable_control(object, control, state); + retval = vm_map_page_range_info_internal(sub_map, + submap_s_offset, + submap_e_offset, + effective_page_shift, + VM_PAGE_INFO_BASIC, + (vm_page_info_t) submap_info, + count); - if (was_nonvolatile && - object->purgable != VM_PURGABLE_NONVOLATILE && - map->pmap == kernel_pmap) { -#if DEBUG - object->vo_purgeable_volatilizer = kernel_task; -#endif /* DEBUG */ - } + assert(retval == KERN_SUCCESS); - vm_object_unlock(object); + vm_map_lock_read(map); + vm_map_deallocate(sub_map); - return kr; -} + /* Move the "info" index by the number of pages we inspected.*/ + info_idx += range_len >> effective_page_shift; -kern_return_t -vm_map_page_query_internal( - vm_map_t target_map, - vm_map_offset_t offset, - int *disposition, - int *ref_count) -{ - kern_return_t kr; - vm_page_info_basic_data_t info; - mach_msg_type_number_t count; + /* Move our current offset by the size of the range we inspected.*/ + curr_s_offset += range_len; - count = VM_PAGE_INFO_BASIC_COUNT; - kr = vm_map_page_info(target_map, - offset, - VM_PAGE_INFO_BASIC, - (vm_page_info_t) &info, - &count); - if (kr == KERN_SUCCESS) { - *disposition = info.disposition; - *ref_count = info.ref_count; - } else { - *disposition = 0; - *ref_count = 0; - } + continue; + } - return kr; -} - -kern_return_t -vm_map_page_info( - vm_map_t map, - vm_map_offset_t offset, - vm_page_info_flavor_t flavor, - vm_page_info_t info, - mach_msg_type_number_t *count) -{ - vm_map_entry_t map_entry; - vm_object_t object; - vm_page_t m; - kern_return_t kr; - kern_return_t retval = KERN_SUCCESS; - boolean_t top_object; - int disposition; - int ref_count; - vm_page_info_basic_t basic_info; - int depth; - vm_map_offset_t offset_in_page; + object = VME_OBJECT(map_entry); - switch (flavor) { - case VM_PAGE_INFO_BASIC: - if (*count != VM_PAGE_INFO_BASIC_COUNT) { + if (object == VM_OBJECT_NULL) { /* - * The "vm_page_info_basic_data" structure was not - * properly padded, so allow the size to be off by - * one to maintain 
backwards binary compatibility... + * We don't have an object here and, hence, + * no pages to inspect. We'll fill up the + * info structure appropriately. */ - if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) - return KERN_INVALID_ARGUMENT; - } - break; - default: - return KERN_INVALID_ARGUMENT; - } - disposition = 0; - ref_count = 0; - top_object = TRUE; - depth = 0; + curr_e_offset = MIN(map_entry->vme_end, end); - retval = KERN_SUCCESS; - offset_in_page = offset & PAGE_MASK; - offset = vm_map_trunc_page(offset, PAGE_MASK); + uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift; - vm_map_lock_read(map); + void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); - /* - * First, find the map entry covering "offset", going down - * submaps if necessary. - */ - for (;;) { - if (!vm_map_lookup_entry(map, offset, &map_entry)) { - vm_map_unlock_read(map); - return KERN_INVALID_ADDRESS; - } - /* compute offset from this map entry's start */ - offset -= map_entry->vme_start; - /* compute offset into this map entry's object (or submap) */ - offset += map_entry->offset; + bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic)); - if (map_entry->is_sub_map) { - vm_map_t sub_map; + curr_s_offset = curr_e_offset; - sub_map = map_entry->object.sub_map; - vm_map_lock_read(sub_map); - vm_map_unlock_read(map); + info_idx += num_pages; - map = sub_map; + continue; + } - ref_count = MAX(ref_count, map->ref_count); + if (do_region_footprint) { + disposition = 0; + if (map->has_corpse_footprint) { + /* + * Query the page info data we saved + * while forking the corpse. + */ + vm_map_corpse_footprint_query_page_info( + map, + curr_s_offset, + &disposition); + } else { + /* + * Query the live pmap for footprint info + * about this page. + */ + vm_map_footprint_query_page_info( + map, + map_entry, + curr_s_offset, + &disposition); + } + switch (flavor) { + case VM_PAGE_INFO_BASIC: + basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); + basic_info->disposition = disposition; + basic_info->ref_count = 1; + basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile); + basic_info->offset = 0; + basic_info->depth = 0; + + info_idx++; + break; + } + curr_s_offset += effective_page_size; continue; } - break; - } - object = map_entry->object.vm_object; - if (object == VM_OBJECT_NULL) { - /* no object -> no page */ + vm_object_reference(object); + /* + * Shared mode -- so we can allow other readers + * to grab the lock too. + */ + vm_object_lock_shared(object); + + curr_e_offset = MIN(map_entry->vme_end, end); + vm_map_unlock_read(map); - goto done; - } - vm_object_lock(object); - vm_map_unlock_read(map); + map_entry = NULL; /* map is unlocked, the entry is no longer valid. */ - /* - * Go down the VM object shadow chain until we find the page - * we're looking for. - */ - for (;;) { - ref_count = MAX(ref_count, object->ref_count); + curr_object = object; - m = vm_page_lookup(object, offset); + for (; curr_s_offset < curr_e_offset;) { + if (object == curr_object) { + ref_count = curr_object->ref_count - 1; /* account for our object reference above. 
*/ + } else { + ref_count = curr_object->ref_count; + } - if (m != VM_PAGE_NULL) { - disposition |= VM_PAGE_QUERY_PAGE_PRESENT; - break; - } else { -#if MACH_PAGEMAP - if (object->existence_map) { - if (vm_external_state_get(object->existence_map, - offset) == - VM_EXTERNAL_STATE_EXISTS) { - /* - * this page has been paged out - */ - disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; + curr_offset_in_object = offset_in_object; + + for (;;) { + m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object)); + + if (m != VM_PAGE_NULL) { + disposition |= VM_PAGE_QUERY_PAGE_PRESENT; break; - } - } else -#endif - if (object->internal && - object->alive && - !object->terminating && - object->pager_ready) { - - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - if (VM_COMPRESSOR_PAGER_STATE_GET( - object, - offset) - == VM_EXTERNAL_STATE_EXISTS) { - /* the pager has that page */ - disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; - break; - } } else { - memory_object_t pager; - - vm_object_paging_begin(object); - pager = object->pager; - vm_object_unlock(object); + if (curr_object->internal && + curr_object->alive && + !curr_object->terminating && + curr_object->pager_ready) { + if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object)) + == VM_EXTERNAL_STATE_EXISTS) { + /* the pager has that page */ + disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; + break; + } + } /* - * Ask the default pager if - * it has this page. + * Go down the VM object shadow chain until we find the page + * we're looking for. */ - kr = memory_object_data_request( - pager, - offset + object->paging_offset, - 0, /* just poke the pager */ - VM_PROT_READ, - NULL); - vm_object_lock(object); - vm_object_paging_end(object); + if (curr_object->shadow != VM_OBJECT_NULL) { + vm_object_t shadow = VM_OBJECT_NULL; + + curr_offset_in_object += curr_object->vo_shadow_offset; + shadow = curr_object->shadow; - if (kr == KERN_SUCCESS) { - /* the default pager has it */ - disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT; + vm_object_lock_shared(shadow); + vm_object_unlock(curr_object); + + curr_object = shadow; + depth++; + continue; + } else { break; } } } - if (object->shadow != VM_OBJECT_NULL) { - vm_object_t shadow; + /* The ref_count is not strictly accurate, it measures the number */ + /* of entities holding a ref on the object, they may not be mapping */ + /* the object or may not be mapping the section holding the */ + /* target page but its still a ball park number and though an over- */ + /* count, it picks up the copy-on-write cases */ - offset += object->vo_shadow_offset; - shadow = object->shadow; - - vm_object_lock(shadow); - vm_object_unlock(object); + /* We could also get a picture of page sharing from pmap_attributes */ + /* but this would under count as only faulted-in mappings would */ + /* show up. 
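The loop above descends the VM object shadow chain, translating the offset at each hop until it finds an object that has the page (resident or held by the compressor pager). A stand-alone model of that walk, with toy types standing in for vm_object_t:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct toy_object {
	struct toy_object *shadow;          /* backing object, or NULL */
	uint64_t           shadow_offset;   /* this object's offset within it */
	bool             (*has_page)(const struct toy_object *, uint64_t);
};

static const struct toy_object *
shadow_chain_lookup(const struct toy_object *obj, uint64_t offset,
    uint64_t *offset_in_hit, unsigned *depth)
{
	*depth = 0;
	while (obj != NULL) {
		if (obj->has_page(obj, offset)) {
			*offset_in_hit = offset;
			return obj;             /* page found at this level */
		}
		offset += obj->shadow_offset;   /* translate into the backing object */
		obj = obj->shadow;
		(*depth)++;
	}
	return NULL;                            /* nothing anywhere in the chain */
}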
*/ - object = shadow; - top_object = FALSE; - depth++; - } else { -// if (!object->internal) -// break; -// retval = KERN_FAILURE; -// goto done_with_object; - break; + if ((curr_object == object) && curr_object->shadow) { + disposition |= VM_PAGE_QUERY_PAGE_COPIED; } - } - } - /* The ref_count is not strictly accurate, it measures the number */ - /* of entities holding a ref on the object, they may not be mapping */ - /* the object or may not be mapping the section holding the */ - /* target page but its still a ball park number and though an over- */ - /* count, it picks up the copy-on-write cases */ - /* We could also get a picture of page sharing from pmap_attributes */ - /* but this would under count as only faulted-in mappings would */ - /* show up. */ + if (!curr_object->internal) { + disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL; + } - if (top_object == TRUE && object->shadow) - disposition |= VM_PAGE_QUERY_PAGE_COPIED; + if (m != VM_PAGE_NULL) { + if (m->vmp_fictitious) { + disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; + } else { + if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) { + disposition |= VM_PAGE_QUERY_PAGE_DIRTY; + } - if (! object->internal) - disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL; + if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) { + disposition |= VM_PAGE_QUERY_PAGE_REF; + } - if (m == VM_PAGE_NULL) - goto done_with_object; + if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) { + disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE; + } - if (m->fictitious) { - disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; - goto done_with_object; - } - if (m->dirty || pmap_is_modified(m->phys_page)) - disposition |= VM_PAGE_QUERY_PAGE_DIRTY; + /* + * XXX TODO4K: + * when this routine deals with 4k + * pages, check the appropriate CS bit + * here. + */ + if (m->vmp_cs_validated) { + disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED; + } + if (m->vmp_cs_tainted) { + disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED; + } + if (m->vmp_cs_nx) { + disposition |= VM_PAGE_QUERY_PAGE_CS_NX; + } + if (m->vmp_reusable || curr_object->all_reusable) { + disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; + } + } + } - if (m->reference || pmap_is_referenced(m->phys_page)) - disposition |= VM_PAGE_QUERY_PAGE_REF; + switch (flavor) { + case VM_PAGE_INFO_BASIC: + basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic))); + basic_info->disposition = disposition; + basic_info->ref_count = ref_count; + basic_info->object_id = (vm_object_id_t) (uintptr_t) + VM_KERNEL_ADDRPERM(curr_object); + basic_info->offset = + (memory_object_offset_t) curr_offset_in_object + offset_in_page; + basic_info->depth = depth; + + info_idx++; + break; + } - if (m->speculative) - disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE; + disposition = 0; + offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset. - if (m->cs_validated) - disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED; - if (m->cs_tainted) - disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED; + /* + * Move to next offset in the range and in our object. 
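The per-page flags assembled above reach user space as a disposition bitmask; a small decoder, assuming the VM_PAGE_QUERY_* flags exported through <mach/vm_statistics.h> (only a subset of the flags is shown):

#include <mach/vm_statistics.h>
#include <stdio.h>

static void
print_disposition(int d)
{
	printf("%s%s%s%s%s%s%s%s\n",
	    (d & VM_PAGE_QUERY_PAGE_PRESENT)      ? " present"      : "",
	    (d & VM_PAGE_QUERY_PAGE_DIRTY)        ? " dirty"        : "",
	    (d & VM_PAGE_QUERY_PAGE_REF)          ? " referenced"   : "",
	    (d & VM_PAGE_QUERY_PAGE_PAGED_OUT)    ? " paged-out"    : "",
	    (d & VM_PAGE_QUERY_PAGE_COPIED)       ? " copied"       : "",
	    (d & VM_PAGE_QUERY_PAGE_SPECULATIVE)  ? " speculative"  : "",
	    (d & VM_PAGE_QUERY_PAGE_CS_VALIDATED) ? " cs-validated" : "",
	    (d & VM_PAGE_QUERY_PAGE_REUSABLE)     ? " reusable"     : "");
}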
+ */ + curr_s_offset += effective_page_size; + offset_in_object += effective_page_size; + curr_offset_in_object = offset_in_object; -done_with_object: - vm_object_unlock(object); -done: + if (curr_object != object) { + vm_object_unlock(curr_object); - switch (flavor) { - case VM_PAGE_INFO_BASIC: - basic_info = (vm_page_info_basic_t) info; - basic_info->disposition = disposition; - basic_info->ref_count = ref_count; - basic_info->object_id = (vm_object_id_t) (uintptr_t) - VM_KERNEL_ADDRPERM(object); - basic_info->offset = - (memory_object_offset_t) offset + offset_in_page; - basic_info->depth = depth; - break; + curr_object = object; + + vm_object_lock_shared(curr_object); + } else { + vm_object_lock_yield_shared(curr_object); + } + } + + vm_object_unlock(curr_object); + vm_object_deallocate(curr_object); + + vm_map_lock_read(map); } + vm_map_unlock_read(map); return retval; } @@ -13796,62 +19306,75 @@ done: kern_return_t vm_map_msync( - vm_map_t map, - vm_map_address_t address, - vm_map_size_t size, - vm_sync_t sync_flags) -{ - msync_req_t msr; - msync_req_t new_msr; - queue_chain_t req_q; /* queue of requests for this msync */ - vm_map_entry_t entry; - vm_map_size_t amount_left; - vm_object_offset_t offset; - boolean_t do_sync_req; - boolean_t had_hole = FALSE; - memory_object_t pager; - + vm_map_t map, + vm_map_address_t address, + vm_map_size_t size, + vm_sync_t sync_flags) +{ + vm_map_entry_t entry; + vm_map_size_t amount_left; + vm_object_offset_t offset; + vm_object_offset_t start_offset, end_offset; + boolean_t do_sync_req; + boolean_t had_hole = FALSE; + vm_map_offset_t pmap_offset; + if ((sync_flags & VM_SYNC_ASYNCHRONOUS) && - (sync_flags & VM_SYNC_SYNCHRONOUS)) - return(KERN_INVALID_ARGUMENT); + (sync_flags & VM_SYNC_SYNCHRONOUS)) { + return KERN_INVALID_ARGUMENT; + } + + if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { + DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags); + } /* * align address and size on page boundaries */ size = (vm_map_round_page(address + size, - VM_MAP_PAGE_MASK(map)) - - vm_map_trunc_page(address, - VM_MAP_PAGE_MASK(map))); + VM_MAP_PAGE_MASK(map)) - + vm_map_trunc_page(address, + VM_MAP_PAGE_MASK(map))); address = vm_map_trunc_page(address, - VM_MAP_PAGE_MASK(map)); + VM_MAP_PAGE_MASK(map)); - if (map == VM_MAP_NULL) - return(KERN_INVALID_TASK); + if (map == VM_MAP_NULL) { + return KERN_INVALID_TASK; + } - if (size == 0) - return(KERN_SUCCESS); + if (size == 0) { + return KERN_SUCCESS; + } - queue_init(&req_q); amount_left = size; while (amount_left > 0) { - vm_object_size_t flush_size; - vm_object_t object; + vm_object_size_t flush_size; + vm_object_t object; vm_map_lock(map); if (!vm_map_lookup_entry(map, - vm_map_trunc_page( - address, - VM_MAP_PAGE_MASK(map)), - &entry)) { - - vm_map_size_t skip; + address, + &entry)) { + vm_map_size_t skip; /* * hole in the address map. */ had_hole = TRUE; + if (sync_flags & VM_SYNC_KILLPAGES) { + /* + * For VM_SYNC_KILLPAGES, there should be + * no holes in the range, since we couldn't + * prevent someone else from allocating in + * that hole and we wouldn't want to "kill" + * their pages. + */ + vm_map_unlock(map); + break; + } + /* * Check for empty map. 
*/ @@ -13873,16 +19396,18 @@ vm_map_msync( * Move up to the next entry if needed */ skip = (entry->vme_next->vme_start - address); - if (skip >= amount_left) + if (skip >= amount_left) { amount_left = 0; - else + } else { amount_left -= skip; + } address = entry->vme_next->vme_start; vm_map_unlock(map); continue; } offset = address - entry->vme_start; + pmap_offset = address; /* * do we have more to flush than is contained in this @@ -13890,7 +19415,7 @@ vm_map_msync( */ if (amount_left + entry->vme_start + offset > entry->vme_end) { flush_size = entry->vme_end - - (entry->vme_start + offset); + (entry->vme_start + offset); } else { flush_size = amount_left; } @@ -13898,11 +19423,12 @@ vm_map_msync( address += flush_size; if (entry->is_sub_map == TRUE) { - vm_map_t local_map; - vm_map_offset_t local_offset; + vm_map_t local_map; + vm_map_offset_t local_offset; - local_map = entry->object.sub_map; - local_offset = entry->offset; + local_map = VME_SUBMAP(entry); + local_offset = VME_OFFSET(entry); + vm_map_reference(local_map); vm_map_unlock(map); if (vm_map_msync( local_map, @@ -13911,9 +19437,10 @@ vm_map_msync( sync_flags) == KERN_INVALID_ADDRESS) { had_hole = TRUE; } + vm_map_deallocate(local_map); continue; } - object = entry->object.vm_object; + object = VME_OBJECT(entry); /* * We can't sync this object if the object has not been @@ -13923,23 +19450,59 @@ vm_map_msync( vm_map_unlock(map); continue; } - offset += entry->offset; + offset += VME_OFFSET(entry); - vm_object_lock(object); + vm_object_lock(object); if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) { - int kill_pages = 0; + int kill_pages = 0; boolean_t reusable_pages = FALSE; + if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { + /* + * This is a destructive operation and so we + * err on the side of limiting the range of + * the operation. 
+ */ + start_offset = vm_object_round_page(offset); + end_offset = vm_object_trunc_page(offset + flush_size); + + if (end_offset <= start_offset) { + vm_object_unlock(object); + vm_map_unlock(map); + continue; + } + + pmap_offset += start_offset - offset;; + } else { + start_offset = offset; + end_offset = offset + flush_size; + } + if (sync_flags & VM_SYNC_KILLPAGES) { - if (object->ref_count == 1 && !object->shadow) - kill_pages = 1; - else - kill_pages = -1; - } - if (kill_pages != -1) - vm_object_deactivate_pages(object, offset, - (vm_object_size_t)flush_size, kill_pages, reusable_pages); + if (((object->ref_count == 1) || + ((object->copy_strategy != + MEMORY_OBJECT_COPY_SYMMETRIC) && + (object->copy == VM_OBJECT_NULL))) && + (object->shadow == VM_OBJECT_NULL)) { + if (object->ref_count != 1) { + vm_page_stats_reusable.free_shared++; + } + kill_pages = 1; + } else { + kill_pages = -1; + } + } + if (kill_pages != -1) { + vm_object_deactivate_pages( + object, + start_offset, + (vm_object_size_t) (end_offset - start_offset), + kill_pages, + reusable_pages, + map->pmap, + pmap_offset); + } vm_object_unlock(object); vm_map_unlock(map); continue; @@ -13963,126 +19526,113 @@ vm_map_msync( vm_map_unlock(map); - do_sync_req = vm_object_sync(object, - offset, - flush_size, - sync_flags & VM_SYNC_INVALIDATE, - ((sync_flags & VM_SYNC_SYNCHRONOUS) || - (sync_flags & VM_SYNC_ASYNCHRONOUS)), - sync_flags & VM_SYNC_SYNCHRONOUS); - /* - * only send a m_o_s if we returned pages or if the entry - * is writable (ie dirty pages may have already been sent back) - */ - if (!do_sync_req) { - if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) { - /* - * clear out the clustering and read-ahead hints - */ - vm_object_lock(object); - - object->pages_created = 0; - object->pages_used = 0; - object->sequential = 0; - object->last_alloc = 0; - - vm_object_unlock(object); - } - vm_object_deallocate(object); - continue; + if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { + start_offset = vm_object_trunc_page(offset); + end_offset = vm_object_round_page(offset + flush_size); + } else { + start_offset = offset; + end_offset = offset + flush_size; } - msync_req_alloc(new_msr); - - vm_object_lock(object); - offset += object->paging_offset; - - new_msr->offset = offset; - new_msr->length = flush_size; - new_msr->object = object; - new_msr->flag = VM_MSYNC_SYNCHRONIZING; - re_iterate: - /* - * We can't sync this object if there isn't a pager. The - * pager can disappear anytime we're not holding the object - * lock. So this has to be checked anytime we goto re_iterate. 
- */ + do_sync_req = vm_object_sync(object, + start_offset, + (end_offset - start_offset), + sync_flags & VM_SYNC_INVALIDATE, + ((sync_flags & VM_SYNC_SYNCHRONOUS) || + (sync_flags & VM_SYNC_ASYNCHRONOUS)), + sync_flags & VM_SYNC_SYNCHRONOUS); + + if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) { + /* + * clear out the clustering and read-ahead hints + */ + vm_object_lock(object); - pager = object->pager; + object->pages_created = 0; + object->pages_used = 0; + object->sequential = 0; + object->last_alloc = 0; - if (pager == MEMORY_OBJECT_NULL) { vm_object_unlock(object); - vm_object_deallocate(object); - msync_req_free(new_msr); - new_msr = NULL; - continue; } + vm_object_deallocate(object); + } /* while */ - queue_iterate(&object->msr_q, msr, msync_req_t, msr_q) { - /* - * need to check for overlapping entry, if found, wait - * on overlapping msr to be done, then reiterate - */ - msr_lock(msr); - if (msr->flag == VM_MSYNC_SYNCHRONIZING && - ((offset >= msr->offset && - offset < (msr->offset + msr->length)) || - (msr->offset >= offset && - msr->offset < (offset + flush_size)))) - { - assert_wait((event_t) msr,THREAD_INTERRUPTIBLE); - msr_unlock(msr); - vm_object_unlock(object); - thread_block(THREAD_CONTINUE_NULL); - vm_object_lock(object); - goto re_iterate; - } - msr_unlock(msr); - }/* queue_iterate */ + /* for proper msync() behaviour */ + if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) { + return KERN_INVALID_ADDRESS; + } + + return KERN_SUCCESS; +}/* vm_msync */ - queue_enter(&object->msr_q, new_msr, msync_req_t, msr_q); +kern_return_t +vm_named_entry_from_vm_object( + vm_named_entry_t named_entry, + vm_object_t object, + vm_object_offset_t offset, + vm_object_size_t size, + vm_prot_t prot) +{ + vm_map_copy_t copy; + vm_map_entry_t copy_entry; - vm_object_paging_begin(object); - vm_object_unlock(object); + assert(!named_entry->is_sub_map); + assert(!named_entry->is_copy); + assert(!named_entry->is_object); + assert(!named_entry->internal); + assert(named_entry->backing.copy == VM_MAP_COPY_NULL); - queue_enter(&req_q, new_msr, msync_req_t, req_q); + copy = vm_map_copy_allocate(); + copy->type = VM_MAP_COPY_ENTRY_LIST; + copy->offset = offset; + copy->size = size; + copy->cpy_hdr.page_shift = PAGE_SHIFT; + vm_map_store_init(©->cpy_hdr); - (void) memory_object_synchronize( - pager, - offset, - flush_size, - sync_flags & ~VM_SYNC_CONTIGUOUS); + copy_entry = vm_map_copy_entry_create(copy, FALSE); + copy_entry->protection = prot; + copy_entry->max_protection = prot; + copy_entry->use_pmap = TRUE; + copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK); + copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK); + VME_OBJECT_SET(copy_entry, object); + VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset)); + vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry); - vm_object_lock(object); - vm_object_paging_end(object); - vm_object_unlock(object); - }/* while */ + named_entry->backing.copy = copy; + named_entry->is_object = TRUE; + if (object->internal) { + named_entry->internal = TRUE; + } - /* - * wait for memory_object_sychronize_completed messages from pager(s) - */ + DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, object, offset, size, prot); - while (!queue_empty(&req_q)) { - msr = (msync_req_t)queue_first(&req_q); - msr_lock(msr); - while(msr->flag != VM_MSYNC_DONE) { - assert_wait((event_t) msr, THREAD_INTERRUPTIBLE); - msr_unlock(msr); - 
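For reference, the flag semantics enforced by vm_map_msync() above as seen from user space; a minimal sketch using the public mach_vm_msync() call (the helper name is illustrative):

#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <mach/vm_sync.h>

static kern_return_t
flush_range(mach_vm_address_t addr, mach_vm_size_t size)
{
	/*
	 * VM_SYNC_SYNCHRONOUS and VM_SYNC_ASYNCHRONOUS are mutually exclusive
	 * (KERN_INVALID_ARGUMENT if both are set), and VM_SYNC_CONTIGUOUS
	 * makes a hole anywhere in the range report KERN_INVALID_ADDRESS.
	 */
	return mach_vm_msync(mach_task_self(), addr, size,
	    VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
}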
thread_block(THREAD_CONTINUE_NULL); - msr_lock(msr); - }/* while */ - queue_remove(&req_q, msr, msync_req_t, req_q); - msr_unlock(msr); - vm_object_deallocate(msr->object); - msync_req_free(msr); - }/* queue_iterate */ + return KERN_SUCCESS; +} - /* for proper msync() behaviour */ - if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) - return(KERN_INVALID_ADDRESS); +vm_object_t +vm_named_entry_to_vm_object( + vm_named_entry_t named_entry) +{ + vm_map_copy_t copy; + vm_map_entry_t copy_entry; + vm_object_t object; + + assert(!named_entry->is_sub_map); + assert(!named_entry->is_copy); + assert(named_entry->is_object); + copy = named_entry->backing.copy; + assert(copy != VM_MAP_COPY_NULL); + assert(copy->cpy_hdr.nentries == 1); + copy_entry = vm_map_copy_first_entry(copy); + assert(!copy_entry->is_sub_map); + object = VME_OBJECT(copy_entry); - return(KERN_SUCCESS); -}/* vm_msync */ + DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection); + + return object; +} /* * Routine: convert_port_entry_to_map @@ -14098,47 +19648,54 @@ vm_map_msync( vm_map_t convert_port_entry_to_map( - ipc_port_t port) + ipc_port_t port) { vm_map_t map; - vm_named_entry_t named_entry; - uint32_t try_failed_count = 0; + vm_named_entry_t named_entry; + uint32_t try_failed_count = 0; - if(IP_VALID(port) && (ip_kotype(port) == IKOT_NAMED_ENTRY)) { - while(TRUE) { + if (IP_VALID(port) && (ip_kotype(port) == IKOT_NAMED_ENTRY)) { + while (TRUE) { ip_lock(port); - if(ip_active(port) && (ip_kotype(port) - == IKOT_NAMED_ENTRY)) { + if (ip_active(port) && (ip_kotype(port) + == IKOT_NAMED_ENTRY)) { named_entry = - (vm_named_entry_t)port->ip_kobject; + (vm_named_entry_t) ip_get_kobject(port); if (!(lck_mtx_try_lock(&(named_entry)->Lock))) { - ip_unlock(port); + ip_unlock(port); try_failed_count++; - mutex_pause(try_failed_count); - continue; - } + mutex_pause(try_failed_count); + continue; + } named_entry->ref_count++; lck_mtx_unlock(&(named_entry)->Lock); ip_unlock(port); if ((named_entry->is_sub_map) && - (named_entry->protection - & VM_PROT_WRITE)) { + (named_entry->protection + & VM_PROT_WRITE)) { map = named_entry->backing.map; + if (map->pmap != PMAP_NULL) { + if (map->pmap == kernel_pmap) { + panic("userspace has access " + "to a kernel map %p", map); + } + pmap_require(map->pmap); + } } else { mach_destroy_memory_entry(port); return VM_MAP_NULL; } - vm_map_reference_swap(map); + vm_map_reference(map); mach_destroy_memory_entry(port); break; - } - else + } else { return VM_MAP_NULL; + } } - } - else + } else { map = convert_port_to_map(port); + } return map; } @@ -14148,7 +19705,7 @@ convert_port_entry_to_map( * Purpose: * Convert from a port specifying a named entry to an * object. Doesn't consume the port ref; produces a map ref, - * which may be null. + * which may be null. * Conditions: * Nothing locked. 
*/ @@ -14156,33 +19713,41 @@ convert_port_entry_to_map( vm_object_t convert_port_entry_to_object( - ipc_port_t port) + ipc_port_t port) { - vm_object_t object = VM_OBJECT_NULL; - vm_named_entry_t named_entry; - uint32_t try_failed_count = 0; + vm_object_t object = VM_OBJECT_NULL; + vm_named_entry_t named_entry; + uint32_t try_failed_count = 0; if (IP_VALID(port) && (ip_kotype(port) == IKOT_NAMED_ENTRY)) { - try_again: +try_again: ip_lock(port); if (ip_active(port) && (ip_kotype(port) == IKOT_NAMED_ENTRY)) { - named_entry = (vm_named_entry_t)port->ip_kobject; + named_entry = (vm_named_entry_t) ip_get_kobject(port); if (!(lck_mtx_try_lock(&(named_entry)->Lock))) { ip_unlock(port); try_failed_count++; mutex_pause(try_failed_count); - goto try_again; + goto try_again; } named_entry->ref_count++; lck_mtx_unlock(&(named_entry)->Lock); ip_unlock(port); if (!(named_entry->is_sub_map) && - !(named_entry->is_pager) && !(named_entry->is_copy) && + (named_entry->is_object) && (named_entry->protection & VM_PROT_WRITE)) { - object = named_entry->backing.object; + vm_map_copy_t copy; + vm_map_entry_t copy_entry; + + copy = named_entry->backing.copy; + assert(copy->cpy_hdr.nentries == 1); + copy_entry = vm_map_copy_first_entry(copy); + assert(!copy_entry->is_sub_map); + object = VME_OBJECT(copy_entry); + assert(object != VM_OBJECT_NULL); vm_object_reference(object); } mach_destroy_memory_entry(port); @@ -14200,32 +19765,22 @@ convert_port_entry_to_object( vm_map_t current_map(void) { - return (current_map_fast()); + return current_map_fast(); } /* * vm_map_reference: * - * Most code internal to the osfmk will go through a - * macro defining this. This is always here for the - * use of other kernel components. + * Takes a reference on the specified map. */ -#undef vm_map_reference void vm_map_reference( - register vm_map_t map) + vm_map_t map) { - if (map == VM_MAP_NULL) - return; - - lck_mtx_lock(&map->s_lock); -#if TASK_SWAPPER - assert(map->res_count > 0); - assert(map->ref_count >= map->res_count); - map->res_count++; -#endif - map->ref_count++; - lck_mtx_unlock(&map->s_lock); + if (__probable(map != VM_MAP_NULL)) { + vm_map_require(map); + os_ref_retain(&map->map_refcnt); + } } /* @@ -14237,54 +19792,52 @@ vm_map_reference( */ void vm_map_deallocate( - register vm_map_t map) + vm_map_t map) { - unsigned int ref; - - if (map == VM_MAP_NULL) - return; - - lck_mtx_lock(&map->s_lock); - ref = --map->ref_count; - if (ref > 0) { - vm_map_res_deallocate(map); - lck_mtx_unlock(&map->s_lock); - return; + if (__probable(map != VM_MAP_NULL)) { + vm_map_require(map); + if (os_ref_release(&map->map_refcnt) == 0) { + vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS); + } } - assert(map->ref_count == 0); - lck_mtx_unlock(&map->s_lock); +} -#if TASK_SWAPPER - /* - * The map residence count isn't decremented here because - * the vm_map_delete below will traverse the entire map, - * deleting entries, and the residence counts on objects - * and sharing maps will go away then. 
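The reference-counting contract above (retain in vm_map_reference(), destroy when the last vm_map_deallocate() drops the count to zero) in a stand-alone model using C11 atomics; toy_map is purely illustrative:

#include <stdatomic.h>
#include <stdlib.h>

struct toy_map {
	atomic_int refcnt;   /* plays the role of map->map_refcnt */
};

static void
toy_map_reference(struct toy_map *map)
{
	if (map != NULL) {
		atomic_fetch_add(&map->refcnt, 1);
	}
}

static void
toy_map_deallocate(struct toy_map *map)
{
	if (map != NULL && atomic_fetch_sub(&map->refcnt, 1) == 1) {
		/* last reference gone: tear the map down */
		free(map);
	}
}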
- */ -#endif +void +vm_map_inspect_deallocate( + vm_map_inspect_t map) +{ + vm_map_deallocate((vm_map_t)map); +} - vm_map_destroy(map, VM_MAP_NO_FLAGS); +void +vm_map_read_deallocate( + vm_map_read_t map) +{ + vm_map_deallocate((vm_map_t)map); } void vm_map_disable_NX(vm_map_t map) { - if (map == NULL) - return; - if (map->pmap == NULL) - return; + if (map == NULL) { + return; + } + if (map->pmap == NULL) { + return; + } - pmap_disable_NX(map->pmap); + pmap_disable_NX(map->pmap); } void vm_map_disallow_data_exec(vm_map_t map) { - if (map == NULL) - return; + if (map == NULL) { + return; + } - map->map_disallow_data_exec = TRUE; + map->map_disallow_data_exec = TRUE; } /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS) @@ -14293,39 +19846,163 @@ vm_map_disallow_data_exec(vm_map_t map) void vm_map_set_32bit(vm_map_t map) { +#if defined(__arm__) || defined(__arm64__) + map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE); +#else map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS; +#endif } void vm_map_set_64bit(vm_map_t map) { +#if defined(__arm__) || defined(__arm64__) + map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE); +#else map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS; +#endif +} + +/* + * Expand the maximum size of an existing map to the maximum supported. + */ +void +vm_map_set_jumbo(vm_map_t map) +{ +#if defined (__arm64__) && !defined(CONFIG_ARROW) + vm_map_set_max_addr(map, ~0); +#else /* arm64 */ + (void) map; +#endif +} + +/* + * This map has a JIT entitlement + */ +void +vm_map_set_jit_entitled(vm_map_t map) +{ +#if defined (__arm64__) + pmap_set_jit_entitled(map->pmap); +#else /* arm64 */ + (void) map; +#endif +} + +/* + * Expand the maximum size of an existing map. + */ +void +vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset) +{ +#if defined(__arm64__) + vm_map_offset_t max_supported_offset = 0; + vm_map_offset_t old_max_offset = map->max_offset; + max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO); + + new_max_offset = trunc_page(new_max_offset); + + /* The address space cannot be shrunk using this routine. */ + if (old_max_offset >= new_max_offset) { + return; + } + + if (max_supported_offset < new_max_offset) { + new_max_offset = max_supported_offset; + } + + map->max_offset = new_max_offset; + + if (map->holes_list->prev->vme_end == old_max_offset) { + /* + * There is already a hole at the end of the map; simply make it bigger. + */ + map->holes_list->prev->vme_end = map->max_offset; + } else { + /* + * There is no hole at the end, so we need to create a new hole + * for the new empty space we're creating. + */ + struct vm_map_links *new_hole = zalloc(vm_map_holes_zone); + new_hole->start = old_max_offset; + new_hole->end = map->max_offset; + new_hole->prev = map->holes_list->prev; + new_hole->next = (struct vm_map_entry *)map->holes_list; + map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole; + map->holes_list->prev = (struct vm_map_entry *)new_hole; + } +#else + (void)map; + (void)new_max_offset; +#endif } vm_map_offset_t -vm_compute_max_offset(unsigned is64) +vm_compute_max_offset(boolean_t is64) +{ +#if defined(__arm__) || defined(__arm64__) + return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE); +#else + return is64 ? 
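A stand-alone model of the vm_map_set_max_addr() adjustment above: truncate to a page boundary, never shrink, and clamp to what the pmap layer supports (the 16KB page mask is an assumption for the example):

#include <stdint.h>

#define TOY_PAGE_MASK 0x3fffULL   /* assume 16KB pages */

static uint64_t
adjust_max_offset(uint64_t old_max, uint64_t requested, uint64_t pmap_max)
{
	uint64_t new_max = requested & ~TOY_PAGE_MASK;   /* trunc_page() */

	if (old_max >= new_max) {
		return old_max;          /* the address space cannot be shrunk */
	}
	if (new_max > pmap_max) {
		new_max = pmap_max;      /* clamp to the pmap's supported maximum */
	}
	return new_max;
}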
(vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS; +#endif +} + +void +vm_map_get_max_aslr_slide_section( + vm_map_t map __unused, + int64_t *max_sections, + int64_t *section_size) +{ +#if defined(__arm64__) + *max_sections = 3; + *section_size = ARM_TT_TWIG_SIZE; +#else + *max_sections = 1; + *section_size = 0; +#endif +} + +uint64_t +vm_map_get_max_aslr_slide_pages(vm_map_t map) { - return (is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS); +#if defined(__arm64__) + /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more + * limited embedded address space; this is also meant to minimize pmap + * memory usage on 16KB page systems. + */ + return 1 << (24 - VM_MAP_PAGE_SHIFT(map)); +#else + return 1 << (vm_map_is_64bit(map) ? 16 : 8); +#endif } uint64_t -vm_map_get_max_aslr_slide_pages(vm_map_t map) +vm_map_get_max_loader_aslr_slide_pages(vm_map_t map) { - return (1 << (vm_map_is_64bit(map) ? 16 : 8)); +#if defined(__arm64__) + /* We limit the loader slide to 4MB, in order to ensure at least 8 bits + * of independent entropy on 16KB page systems. + */ + return 1 << (22 - VM_MAP_PAGE_SHIFT(map)); +#else + return 1 << (vm_map_is_64bit(map) ? 16 : 8); +#endif } +#ifndef __arm__ boolean_t vm_map_is_64bit( - vm_map_t map) + vm_map_t map) { return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS); } +#endif boolean_t vm_map_has_hard_pagezero( - vm_map_t map, - vm_map_offset_t pagezero_size) + vm_map_t map, + vm_map_offset_t pagezero_size) { /* * XXX FBDP @@ -14337,7 +20014,7 @@ vm_map_has_hard_pagezero( * VM map is being torn down, and when a new map is created via * load_machfile()/execve(). */ - return (map->min_offset >= pagezero_size); + return map->min_offset >= pagezero_size; } /* @@ -14345,16 +20022,16 @@ vm_map_has_hard_pagezero( */ kern_return_t vm_map_raise_max_offset( - vm_map_t map, - vm_map_offset_t new_max_offset) + vm_map_t map, + vm_map_offset_t new_max_offset) { - kern_return_t ret; + kern_return_t ret; vm_map_lock(map); ret = KERN_INVALID_ADDRESS; if (new_max_offset >= map->max_offset) { - if (!vm_map_is_64bit(map)) { + if (!vm_map_is_64bit(map)) { if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) { map->max_offset = new_max_offset; ret = KERN_SUCCESS; @@ -14378,13 +20055,13 @@ vm_map_raise_max_offset( */ kern_return_t vm_map_raise_min_offset( - vm_map_t map, - vm_map_offset_t new_min_offset) + vm_map_t map, + vm_map_offset_t new_min_offset) { - vm_map_entry_t first_entry; + vm_map_entry_t first_entry; new_min_offset = vm_map_round_page(new_min_offset, - VM_MAP_PAGE_MASK(map)); + VM_MAP_PAGE_MASK(map)); vm_map_lock(map); @@ -14397,6 +20074,11 @@ vm_map_raise_min_offset( vm_map_unlock(map); return KERN_INVALID_ADDRESS; } + if (new_min_offset >= map->max_offset) { + /* can't go beyond the end of the address space */ + vm_map_unlock(map); + return KERN_INVALID_ADDRESS; + } first_entry = vm_map_first_entry(map); if (first_entry != vm_map_to_entry(map) && @@ -14411,6 +20093,10 @@ vm_map_raise_min_offset( map->min_offset = new_min_offset; + assert(map->holes_list); + map->holes_list->start = new_min_offset; + assert(new_min_offset < map->holes_list->end); + vm_map_unlock(map); return KERN_SUCCESS; @@ -14424,18 +20110,48 @@ vm_map_raise_min_offset( */ void -vm_map_set_user_wire_limit(vm_map_t map, - vm_size_t limit) +vm_map_set_user_wire_limit(vm_map_t map, + vm_size_t limit) { map->user_wire_limit = limit; } -void vm_map_switch_protect(vm_map_t map, - boolean_t val) +void +vm_map_switch_protect(vm_map_t map, + 
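The slide budgets above work out to fixed byte sizes divided by the map's page size (16MB for the main ASLR slide and 4MB for the loader slide on the arm64 branch); a quick stand-alone check of that arithmetic:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	for (int page_shift = 12; page_shift <= 14; page_shift += 2) {
		uint64_t slide_pages = 1ULL << (24 - page_shift);    /* 16 MB of slide */
		uint64_t loader_pages = 1ULL << (22 - page_shift);   /* 4 MB for the loader */
		printf("page size %5u: %4llu slide pages, %4llu loader slide pages\n",
		    1u << page_shift,
		    (unsigned long long)slide_pages,
		    (unsigned long long)loader_pages);
	}
	return 0;
}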
boolean_t val) +{ + vm_map_lock(map); + map->switch_protect = val; + vm_map_unlock(map); +} + +extern int cs_process_enforcement_enable; +boolean_t +vm_map_cs_enforcement( + vm_map_t map) +{ + if (cs_process_enforcement_enable) { + return TRUE; + } + return map->cs_enforcement; +} + +kern_return_t +vm_map_cs_wx_enable( + vm_map_t map) +{ + return pmap_cs_allow_invalid(vm_map_pmap(map)); +} + +void +vm_map_cs_enforcement_set( + vm_map_t map, + boolean_t val) { vm_map_lock(map); - map->switch_protect=val; + map->cs_enforcement = val; + pmap_set_vm_map_cs_enforced(map->pmap, val); vm_map_unlock(map); } @@ -14450,7 +20166,7 @@ vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes) pmap_t pmap = vm_map_pmap(map); ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes); - ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes); + ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes); } void @@ -14459,46 +20175,48 @@ vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes) pmap_t pmap = vm_map_pmap(map); ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes); - ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes); + ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes); } /* Add (generate) code signature for memory range */ #if CONFIG_DYNAMIC_CODE_SIGNING -kern_return_t vm_map_sign(vm_map_t map, - vm_map_offset_t start, - vm_map_offset_t end) +kern_return_t +vm_map_sign(vm_map_t map, + vm_map_offset_t start, + vm_map_offset_t end) { vm_map_entry_t entry; vm_page_t m; vm_object_t object; - + /* * Vet all the input parameters and current type and state of the * underlaying object. Return with an error if anything is amiss. */ - if (map == VM_MAP_NULL) - return(KERN_INVALID_ARGUMENT); - + if (map == VM_MAP_NULL) { + return KERN_INVALID_ARGUMENT; + } + vm_map_lock_read(map); - + if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) { /* * Must pass a valid non-submap address. */ vm_map_unlock_read(map); - return(KERN_INVALID_ADDRESS); + return KERN_INVALID_ADDRESS; } - - if((entry->vme_start > start) || (entry->vme_end < end)) { + + if ((entry->vme_start > start) || (entry->vme_end < end)) { /* * Map entry doesn't cover the requested range. Not handling * this situation currently. */ vm_map_unlock_read(map); - return(KERN_INVALID_ARGUMENT); + return KERN_INVALID_ARGUMENT; } - - object = entry->object.vm_object; + + object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { /* * Object must already be present or we can't sign. @@ -14506,60 +20224,63 @@ kern_return_t vm_map_sign(vm_map_t map, vm_map_unlock_read(map); return KERN_INVALID_ARGUMENT; } - + vm_object_lock(object); vm_map_unlock_read(map); - - while(start < end) { + + while (start < end) { uint32_t refmod; - - m = vm_page_lookup(object, start - entry->vme_start + entry->offset ); - if (m==VM_PAGE_NULL) { - /* shoud we try to fault a page here? we can probably + + m = vm_page_lookup(object, + start - entry->vme_start + VME_OFFSET(entry)); + if (m == VM_PAGE_NULL) { + /* shoud we try to fault a page here? we can probably * demand it exists and is locked for this request */ vm_object_unlock(object); return KERN_FAILURE; } /* deal with special page status */ - if (m->busy || - (m->unusual && (m->error || m->restart || m->private || m->absent))) { + if (m->vmp_busy || + (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) { vm_object_unlock(object); return KERN_FAILURE; } - + /* Page is OK... 
now "validate" it */ - /* This is the place where we'll call out to create a code + /* This is the place where we'll call out to create a code * directory, later */ - m->cs_validated = TRUE; + /* XXX TODO4K: deal with 4k subpages individually? */ + m->vmp_cs_validated = VMP_CS_ALL_TRUE; /* The page is now "clean" for codesigning purposes. That means - * we don't consider it as modified (wpmapped) anymore. But + * we don't consider it as modified (wpmapped) anymore. But * we'll disconnect the page so we note any future modification * attempts. */ - m->wpmapped = FALSE; - refmod = pmap_disconnect(m->phys_page); - - /* Pull the dirty status from the pmap, since we cleared the + m->vmp_wpmapped = FALSE; + refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); + + /* Pull the dirty status from the pmap, since we cleared the * wpmapped bit */ - if ((refmod & VM_MEM_MODIFIED) && !m->dirty) { + if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) { SET_PAGE_DIRTY(m, FALSE); } - + /* On to the next page */ start += PAGE_SIZE; } vm_object_unlock(object); - + return KERN_SUCCESS; } #endif -kern_return_t vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed) -{ - vm_map_entry_t entry = VM_MAP_ENTRY_NULL; +kern_return_t +vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed) +{ + vm_map_entry_t entry = VM_MAP_ENTRY_NULL; vm_map_entry_t next_entry; - kern_return_t kr = KERN_SUCCESS; - vm_map_t zap_map; + kern_return_t kr = KERN_SUCCESS; + vm_map_t zap_map; vm_map_lock(map); @@ -14568,229 +20289,370 @@ kern_return_t vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident * the "map" in vm_map_delete(). */ zap_map = vm_map_create(PMAP_NULL, - map->min_offset, - map->max_offset, - map->hdr.entries_pageable); + map->min_offset, + map->max_offset, + map->hdr.entries_pageable); if (zap_map == VM_MAP_NULL) { return KERN_RESOURCE_SHORTAGE; } - vm_map_set_page_shift(zap_map, - VM_MAP_PAGE_SHIFT(map)); + vm_map_set_page_shift(zap_map, + VM_MAP_PAGE_SHIFT(map)); + vm_map_disable_hole_optimization(zap_map); for (entry = vm_map_first_entry(map); - entry != vm_map_to_entry(map); - entry = next_entry) { + entry != vm_map_to_entry(map); + entry = next_entry) { next_entry = entry->vme_next; - - if (entry->object.vm_object && !entry->is_sub_map && (entry->object.vm_object->internal == TRUE) - && (entry->object.vm_object->ref_count == 1)) { - *reclaimed_resident += entry->object.vm_object->resident_page_count; - *reclaimed_compressed += vm_compressor_pager_get_count(entry->object.vm_object->pager); + if (VME_OBJECT(entry) && + !entry->is_sub_map && + (VME_OBJECT(entry)->internal == TRUE) && + (VME_OBJECT(entry)->ref_count == 1)) { + *reclaimed_resident += VME_OBJECT(entry)->resident_page_count; + *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager); - (void)vm_map_delete(map, - entry->vme_start, - entry->vme_end, - VM_MAP_REMOVE_SAVE_ENTRIES, - zap_map); + (void)vm_map_delete(map, + entry->vme_start, + entry->vme_end, + VM_MAP_REMOVE_SAVE_ENTRIES, + zap_map); } } vm_map_unlock(map); - /* + /* * Get rid of the "zap_maps" and all the map entries that - * they may still contain. - */ - if (zap_map != VM_MAP_NULL) { - vm_map_destroy(zap_map, VM_MAP_REMOVE_NO_PMAP_CLEANUP); - zap_map = VM_MAP_NULL; - } + * they may still contain. 
+ */ + if (zap_map != VM_MAP_NULL) { + vm_map_destroy(zap_map, VM_MAP_REMOVE_NO_PMAP_CLEANUP); + zap_map = VM_MAP_NULL; + } return kr; } -#if CONFIG_FREEZE -kern_return_t vm_map_freeze_walk( - vm_map_t map, - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *has_shared) +#if DEVELOPMENT || DEBUG + +int +vm_map_disconnect_page_mappings( + vm_map_t map, + boolean_t do_unnest) { vm_map_entry_t entry; - + int page_count = 0; + + if (do_unnest == TRUE) { +#ifndef NO_NESTED_PMAP + vm_map_lock(map); + + for (entry = vm_map_first_entry(map); + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + if (entry->is_sub_map && entry->use_pmap) { + /* + * Make sure the range between the start of this entry and + * the end of this entry is no longer nested, so that + * we will only remove mappings from the pmap in use by this + * this task + */ + vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end); + } + } + vm_map_unlock(map); +#endif + } vm_map_lock_read(map); - - *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; - *has_shared = FALSE; - + + page_count = map->pmap->stats.resident_count; + for (entry = vm_map_first_entry(map); - entry != vm_map_to_entry(map); - entry = entry->vme_next) { - unsigned int purgeable, clean, dirty, wired; - boolean_t shared; - - if ((entry->object.vm_object == 0) || - (entry->is_sub_map) || - (entry->object.vm_object->phys_contiguous)) { + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) || + (VME_OBJECT(entry)->phys_contiguous))) { continue; } - - default_freezer_pack(&purgeable, &wired, &clean, &dirty, dirty_budget, &shared, entry->object.vm_object, NULL); - - *purgeable_count += purgeable; - *wired_count += wired; - *clean_count += clean; - *dirty_count += dirty; - - if (shared) { - *has_shared = TRUE; - } - - /* Adjust pageout budget and finish up if reached */ - if (dirty_budget) { - dirty_budget -= dirty; - if (dirty_budget == 0) { - break; - } + if (entry->is_sub_map) { + assert(!entry->use_pmap); } - } + pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0); + } vm_map_unlock_read(map); - return KERN_SUCCESS; + return page_count; +} + +kern_return_t +vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr) +{ + vm_object_t object = NULL; + vm_object_offset_t offset; + vm_prot_t prot; + boolean_t wired; + vm_map_version_t version; + vm_map_t real_map; + int result = KERN_FAILURE; + + vaddr = vm_map_trunc_page(vaddr, PAGE_MASK); + vm_map_lock(map); + + result = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ, + OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired, + NULL, &real_map, NULL); + if (object == NULL) { + result = KERN_MEMORY_ERROR; + } else if (object->pager) { + result = vm_compressor_pager_inject_error(object->pager, + offset); + } else { + result = KERN_MEMORY_PRESENT; + } + + if (object != NULL) { + vm_object_unlock(object); + } + + if (real_map != map) { + vm_map_unlock(real_map); + } + vm_map_unlock(map); + + return result; } -kern_return_t vm_map_freeze( - vm_map_t map, - unsigned int *purgeable_count, - unsigned int *wired_count, - unsigned int *clean_count, - unsigned int *dirty_count, - unsigned int dirty_budget, - boolean_t *has_shared) -{ - vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; - kern_return_t kr = KERN_SUCCESS; - boolean_t default_freezer_active = TRUE; +#endif + + +#if CONFIG_FREEZE + + +extern struct 
freezer_context freezer_context_global; +AbsoluteTime c_freezer_last_yield_ts = 0; + +extern unsigned int memorystatus_freeze_private_shared_pages_ratio; +extern unsigned int memorystatus_freeze_shared_mb_per_process_max; + +kern_return_t +vm_map_freeze( + task_t task, + unsigned int *purgeable_count, + unsigned int *wired_count, + unsigned int *clean_count, + unsigned int *dirty_count, + unsigned int dirty_budget, + unsigned int *shared_count, + int *freezer_error_code, + boolean_t eval_only) +{ + vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL; + kern_return_t kr = KERN_SUCCESS; + boolean_t evaluation_phase = TRUE; + vm_object_t cur_shared_object = NULL; + int cur_shared_obj_ref_cnt = 0; + unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0; - *purgeable_count = *wired_count = *clean_count = *dirty_count = 0; - *has_shared = FALSE; + *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0; /* * We need the exclusive lock here so that we can * block any page faults or lookups while we are * in the middle of freezing this vm map. */ + vm_map_t map = task->map; + vm_map_lock(map); - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - default_freezer_active = FALSE; - } - - if (default_freezer_active) { - if (map->default_freezer_handle == NULL) { - map->default_freezer_handle = default_freezer_handle_allocate(); + assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); + + if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + if (vm_compressor_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE; + } + + if (vm_swap_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE; } - - if ((kr = default_freezer_handle_init(map->default_freezer_handle)) != KERN_SUCCESS) { + + kr = KERN_NO_SPACE; + goto done; + } + + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) { + /* + * In-memory compressor backing the freezer. No disk. + * So no need to do the evaluation phase. + */ + evaluation_phase = FALSE; + + if (eval_only == TRUE) { /* - * Can happen if default_freezer_handle passed in is NULL - * Or, a table has already been allocated and associated - * with this handle, i.e. the map is already frozen. + * We don't support 'eval_only' mode + * in this non-swap config. 
*/ + *freezer_error_code = FREEZER_ERROR_GENERIC; + kr = KERN_INVALID_ARGUMENT; goto done; } + + freezer_context_global.freezer_ctx_uncompressed_pages = 0; + clock_get_uptime(&c_freezer_last_yield_ts); } - +again: + for (entry2 = vm_map_first_entry(map); - entry2 != vm_map_to_entry(map); - entry2 = entry2->vme_next) { - - vm_object_t src_object = entry2->object.vm_object; + entry2 != vm_map_to_entry(map); + entry2 = entry2->vme_next) { + vm_object_t src_object = VME_OBJECT(entry2); - if (entry2->object.vm_object && !entry2->is_sub_map && !entry2->object.vm_object->phys_contiguous) { + if (src_object && + !entry2->is_sub_map && + !src_object->phys_contiguous) { /* If eligible, scan the entry, moving eligible pages over to our parent object */ - if (default_freezer_active) { - unsigned int purgeable, clean, dirty, wired; - boolean_t shared; - - default_freezer_pack(&purgeable, &wired, &clean, &dirty, dirty_budget, &shared, - src_object, map->default_freezer_handle); - - *purgeable_count += purgeable; - *wired_count += wired; - *clean_count += clean; - *dirty_count += dirty; - - /* Adjust pageout budget and finish up if reached */ - if (dirty_budget) { - dirty_budget -= dirty; - if (dirty_budget == 0) { - break; + + if (src_object->internal == TRUE) { + if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) { + /* + * We skip purgeable objects during evaluation phase only. + * If we decide to freeze this process, we'll explicitly + * purge these objects before we go around again with + * 'evaluation_phase' set to FALSE. + */ + + if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) { + /* + * We want to purge objects that may not belong to this task but are mapped + * in this task alone. Since we already purged this task's purgeable memory + * at the end of a successful evaluation phase, we want to avoid doing no-op calls + * on this task's purgeable objects. Hence the check for only volatile objects. + */ + if (evaluation_phase == FALSE && + (src_object->purgable == VM_PURGABLE_VOLATILE) && + (src_object->ref_count == 1)) { + vm_object_lock(src_object); + vm_object_purge(src_object, 0); + vm_object_unlock(src_object); + } + continue; + } + + /* + * Pages belonging to this object could be swapped to disk. + * Make sure it's not a shared object because we could end + * up just bringing it back in again. + * + * We try to optimize somewhat by checking for objects that are mapped + * more than once within our own map. But we don't do full searches, + * we just look at the entries following our current entry. + */ + + if (src_object->ref_count > 1) { + if (src_object != cur_shared_object) { + obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); + dirty_shared_count += obj_pages_snapshot; + + cur_shared_object = src_object; + cur_shared_obj_ref_cnt = 1; + continue; + } else { + cur_shared_obj_ref_cnt++; + if (src_object->ref_count == cur_shared_obj_ref_cnt) { + /* + * Fall through to below and treat this object as private. + * So deduct its pages from our shared total and add it to the + * private total. 
+ */ + + dirty_shared_count -= obj_pages_snapshot; + dirty_private_count += obj_pages_snapshot; + } else { + continue; + } + } + } + + + if (src_object->ref_count == 1) { + dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager); + } + + if (evaluation_phase == TRUE) { + continue; } } - if (shared) { - *has_shared = TRUE; + uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget); + *wired_count += src_object->wired_page_count; + + if (vm_compressor_low_on_space() || vm_swap_low_on_space()) { + if (vm_compressor_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE; + } + + if (vm_swap_low_on_space()) { + *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE; + } + + kr = KERN_NO_SPACE; + break; } - } else { - /* - * To the compressor. - */ - if (entry2->object.vm_object->internal == TRUE) { - vm_object_pageout(entry2->object.vm_object); + if (paged_out_count >= dirty_budget) { + break; } + dirty_budget -= paged_out_count; } } } - if (default_freezer_active) { - /* Finally, throw out the pages to swap */ - default_freezer_pageout(map->default_freezer_handle); - } + *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL)); + if (evaluation_phase) { + unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64; -done: - vm_map_unlock(map); - - return kr; -} + if (dirty_shared_count > shared_pages_threshold) { + *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY; + kr = KERN_FAILURE; + goto done; + } -kern_return_t -vm_map_thaw( - vm_map_t map) -{ - kern_return_t kr = KERN_SUCCESS; + if (dirty_shared_count && + ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) { + *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO; + kr = KERN_FAILURE; + goto done; + } - if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) { - /* - * We will on-demand thaw in the presence of the compressed pager. - */ - return kr; - } + evaluation_phase = FALSE; + dirty_shared_count = dirty_private_count = 0; - vm_map_lock(map); + freezer_context_global.freezer_ctx_uncompressed_pages = 0; + clock_get_uptime(&c_freezer_last_yield_ts); - if (map->default_freezer_handle == NULL) { - /* - * This map is not in a frozen state. - */ - kr = KERN_FAILURE; - goto out; + if (eval_only) { + kr = KERN_SUCCESS; + goto done; + } + + vm_purgeable_purge_task_owned(task); + + goto again; + } else { + kr = KERN_SUCCESS; } - kr = default_freezer_unpack(map->default_freezer_handle); -out: +done: vm_map_unlock(map); - + + if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) { + vm_object_compressed_freezer_done(); + } return kr; } + #endif /* @@ -14803,19 +20665,21 @@ out: * For now, we target only the map entries created for the Objective C * Garbage Collector, which initially have the following properties: * - alias == VM_MEMORY_MALLOC - * - wired_count == 0 - * - !needs_copy + * - wired_count == 0 + * - !needs_copy * and a VM object with: - * - internal - * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC - * - !true_share - * - vo_size == ANON_CHUNK_SIZE + * - internal + * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC + * - !true_share + * - vo_size == ANON_CHUNK_SIZE + * + * Only non-kernel map entries. 
*/ boolean_t vm_map_entry_should_cow_for_true_share( - vm_map_entry_t entry) + vm_map_entry_t entry) { - vm_object_t object; + vm_object_t object; if (entry->is_sub_map) { /* entry does not point at a VM object */ @@ -14827,8 +20691,8 @@ vm_map_entry_should_cow_for_true_share( return FALSE; } - if (entry->alias != VM_MEMORY_MALLOC && - entry->alias != VM_MEMORY_MALLOC_SMALL) { + if (VME_ALIAS(entry) != VM_MEMORY_MALLOC && + VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) { /* not a malloc heap or Obj-C Garbage Collector heap */ return FALSE; } @@ -14839,7 +20703,7 @@ vm_map_entry_should_cow_for_true_share( return FALSE; } - object = entry->object.vm_object; + object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { /* no object yet... */ @@ -14861,13 +20725,13 @@ vm_map_entry_should_cow_for_true_share( return FALSE; } - if (entry->alias == VM_MEMORY_MALLOC && + if (VME_ALIAS(entry) == VM_MEMORY_MALLOC && object->vo_size != ANON_CHUNK_SIZE) { /* ... not an object created for the ObjC Garbage Collector */ return FALSE; } - if (entry->alias == VM_MEMORY_MALLOC_SMALL && + if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL && object->vo_size != 2048 * 4096) { /* ... not a "MALLOC_SMALL" heap */ return FALSE; @@ -14882,22 +20746,30 @@ vm_map_entry_should_cow_for_true_share( return TRUE; } -vm_map_offset_t +vm_map_offset_t vm_map_round_page_mask( - vm_map_offset_t offset, - vm_map_offset_t mask) + vm_map_offset_t offset, + vm_map_offset_t mask) { return VM_MAP_ROUND_PAGE(offset, mask); } -vm_map_offset_t +vm_map_offset_t vm_map_trunc_page_mask( - vm_map_offset_t offset, - vm_map_offset_t mask) + vm_map_offset_t offset, + vm_map_offset_t mask) { return VM_MAP_TRUNC_PAGE(offset, mask); } +boolean_t +vm_map_page_aligned( + vm_map_offset_t offset, + vm_map_offset_t mask) +{ + return ((offset) & mask) == 0; +} + int vm_map_page_shift( vm_map_t map) @@ -14912,7 +20784,7 @@ vm_map_page_size( return VM_MAP_PAGE_SIZE(map); } -int +vm_map_offset_t vm_map_page_mask( vm_map_t map) { @@ -14921,8 +20793,8 @@ vm_map_page_mask( kern_return_t vm_map_set_page_shift( - vm_map_t map, - int pageshift) + vm_map_t map, + int pageshift) { if (map->hdr.nentries != 0) { /* too late to change page size */ @@ -14934,104 +20806,52 @@ vm_map_set_page_shift( return KERN_SUCCESS; } -int -vm_map_purge( - vm_map_t map) -{ - int num_object_purged; - vm_map_entry_t entry; - vm_map_offset_t next_address; - vm_object_t object; - int state; - kern_return_t kr; - - num_object_purged = 0; - - vm_map_lock_read(map); - entry = vm_map_first_entry(map); - while (entry != vm_map_to_entry(map)) { - if (entry->is_sub_map) { - goto next; - } - if (! 
(entry->protection & VM_PROT_WRITE)) { - goto next; - } - object = entry->object.vm_object; - if (object == VM_OBJECT_NULL) { - goto next; - } - if (object->purgable != VM_PURGABLE_VOLATILE) { - goto next; - } - - vm_object_lock(object); -#if 00 - if (entry->offset != 0 || - (entry->vme_end - entry->vme_start) != object->vo_size) { - vm_object_unlock(object); - goto next; - } -#endif - next_address = entry->vme_end; - vm_map_unlock_read(map); - state = VM_PURGABLE_EMPTY; - kr = vm_object_purgable_control(object, - VM_PURGABLE_SET_STATE, - &state); - if (kr == KERN_SUCCESS) { - num_object_purged++; - } - vm_object_unlock(object); - - vm_map_lock_read(map); - if (vm_map_lookup_entry(map, next_address, &entry)) { - continue; - } - next: - entry = entry->vme_next; - } - vm_map_unlock_read(map); - - return num_object_purged; -} - kern_return_t vm_map_query_volatile( - vm_map_t map, - mach_vm_size_t *volatile_virtual_size_p, - mach_vm_size_t *volatile_resident_size_p, - mach_vm_size_t *volatile_pmap_size_p) -{ - mach_vm_size_t volatile_virtual_size; - mach_vm_size_t volatile_resident_count; - mach_vm_size_t volatile_pmap_count; - mach_vm_size_t resident_count; - vm_map_entry_t entry; - vm_object_t object; + vm_map_t map, + mach_vm_size_t *volatile_virtual_size_p, + mach_vm_size_t *volatile_resident_size_p, + mach_vm_size_t *volatile_compressed_size_p, + mach_vm_size_t *volatile_pmap_size_p, + mach_vm_size_t *volatile_compressed_pmap_size_p) +{ + mach_vm_size_t volatile_virtual_size; + mach_vm_size_t volatile_resident_count; + mach_vm_size_t volatile_compressed_count; + mach_vm_size_t volatile_pmap_count; + mach_vm_size_t volatile_compressed_pmap_count; + mach_vm_size_t resident_count; + vm_map_entry_t entry; + vm_object_t object; /* map should be locked by caller */ volatile_virtual_size = 0; volatile_resident_count = 0; + volatile_compressed_count = 0; volatile_pmap_count = 0; + volatile_compressed_pmap_count = 0; for (entry = vm_map_first_entry(map); - entry != vm_map_to_entry(map); - entry = entry->vme_next) { + entry != vm_map_to_entry(map); + entry = entry->vme_next) { + mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes; + if (entry->is_sub_map) { continue; } - if (! 
(entry->protection & VM_PROT_WRITE)) { + if (!(entry->protection & VM_PROT_WRITE)) { continue; } - object = entry->object.vm_object; + object = VME_OBJECT(entry); if (object == VM_OBJECT_NULL) { continue; } - if (object->purgable != VM_PURGABLE_VOLATILE) { + if (object->purgable != VM_PURGABLE_VOLATILE && + object->purgable != VM_PURGABLE_EMPTY) { continue; } - if (entry->offset != 0) { + if (VME_OFFSET(entry)) { /* * If the map entry has been split and the object now * appears several times in the VM map, we don't want @@ -15042,58 +20862,123 @@ vm_map_query_volatile( continue; } resident_count = object->resident_page_count; - if ((entry->offset / PAGE_SIZE) >= resident_count) { + if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) { resident_count = 0; } else { - resident_count -= (entry->offset / PAGE_SIZE); + resident_count -= (VME_OFFSET(entry) / PAGE_SIZE); } volatile_virtual_size += entry->vme_end - entry->vme_start; volatile_resident_count += resident_count; - volatile_pmap_count += pmap_query_resident(map->pmap, - entry->vme_start, - entry->vme_end); + if (object->pager) { + volatile_compressed_count += + vm_compressor_pager_get_count(object->pager); + } + pmap_compressed_bytes = 0; + pmap_resident_bytes = + pmap_query_resident(map->pmap, + entry->vme_start, + entry->vme_end, + &pmap_compressed_bytes); + volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE); + volatile_compressed_pmap_count += (pmap_compressed_bytes + / PAGE_SIZE); } /* map is still locked on return */ *volatile_virtual_size_p = volatile_virtual_size; *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE; + *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE; *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE; + *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE; return KERN_SUCCESS; } +void +vm_map_sizes(vm_map_t map, + vm_map_size_t * psize, + vm_map_size_t * pfree, + vm_map_size_t * plargest_free) +{ + vm_map_entry_t entry; + vm_map_offset_t prev; + vm_map_size_t free, total_free, largest_free; + boolean_t end; + + if (!map) { + *psize = *pfree = *plargest_free = 0; + return; + } + total_free = largest_free = 0; + + vm_map_lock_read(map); + if (psize) { + *psize = map->max_offset - map->min_offset; + } + + prev = map->min_offset; + for (entry = vm_map_first_entry(map);; entry = entry->vme_next) { + end = (entry == vm_map_to_entry(map)); + + if (end) { + free = entry->vme_end - prev; + } else { + free = entry->vme_start - prev; + } + + total_free += free; + if (free > largest_free) { + largest_free = free; + } + + if (end) { + break; + } + prev = entry->vme_end; + } + vm_map_unlock_read(map); + if (pfree) { + *pfree = total_free; + } + if (plargest_free) { + *plargest_free = largest_free; + } +} + #if VM_SCAN_FOR_SHADOW_CHAIN int vm_map_shadow_max(vm_map_t map); -int vm_map_shadow_max( +int +vm_map_shadow_max( vm_map_t map) { - int shadows, shadows_max; - vm_map_entry_t entry; - vm_object_t object, next_object; + int shadows, shadows_max; + vm_map_entry_t entry; + vm_object_t object, next_object; - if (map == NULL) + if (map == NULL) { return 0; + } shadows_max = 0; vm_map_lock_read(map); - + for (entry = vm_map_first_entry(map); - entry != vm_map_to_entry(map); - entry = entry->vme_next) { + entry != vm_map_to_entry(map); + entry = entry->vme_next) { if (entry->is_sub_map) { continue; } - object = entry->object.vm_object; + object = VME_OBJECT(entry); if (object == NULL) { continue; } vm_object_lock_shared(object); for (shadows = 0; - 
object->shadow != NULL; - shadows++, object = next_object) { + object->shadow != NULL; + shadows++, object = next_object) { next_object = object->shadow; vm_object_lock_shared(next_object); vm_object_unlock(object); @@ -15109,3 +20994,929 @@ int vm_map_shadow_max( return shadows_max; } #endif /* VM_SCAN_FOR_SHADOW_CHAIN */ + +void +vm_commit_pagezero_status(vm_map_t lmap) +{ + pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset); +} + +#if XNU_TARGET_OS_OSX +void +vm_map_set_high_start( + vm_map_t map, + vm_map_offset_t high_start) +{ + map->vmmap_high_start = high_start; +} +#endif /* XNU_TARGET_OS_OSX */ + + +/* + * FORKED CORPSE FOOTPRINT + * + * A forked corpse gets a copy of the original VM map but its pmap is mostly + * empty since it never ran and never got to fault in any pages. + * Collecting footprint info (via "sysctl vm.self_region_footprint") for + * a forked corpse would therefore return very little information. + * + * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option + * to vm_map_fork() to collect footprint information from the original VM map + * and its pmap, and store it in the forked corpse's VM map. That information + * is stored in place of the VM map's "hole list" since we'll never need to + * lookup for holes in the corpse's map. + * + * The corpse's footprint info looks like this: + * + * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out + * as follows: + * +---------------------------------------+ + * header-> | cf_size | + * +-------------------+-------------------+ + * | cf_last_region | cf_last_zeroes | + * +-------------------+-------------------+ + * region1-> | cfr_vaddr | + * +-------------------+-------------------+ + * | cfr_num_pages | d0 | d1 | d2 | d3 | + * +---------------------------------------+ + * | d4 | d5 | ... | + * +---------------------------------------+ + * | ... | + * +-------------------+-------------------+ + * | dy | dz | na | na | cfr_vaddr... | <-region2 + * +-------------------+-------------------+ + * | cfr_vaddr (ctd) | cfr_num_pages | + * +---------------------------------------+ + * | d0 | d1 ... | + * +---------------------------------------+ + * ... + * +---------------------------------------+ + * last region-> | cfr_vaddr | + * +---------------------------------------+ + * + cfr_num_pages | d0 | d1 | d2 | d3 | + * +---------------------------------------+ + * ... + * +---------------------------------------+ + * | dx | dy | dz | na | na | na | na | na | + * +---------------------------------------+ + * + * where: + * cf_size: total size of the buffer (rounded to page size) + * cf_last_region: offset in the buffer of the last "region" sub-header + * cf_last_zeroes: number of trailing "zero" dispositions at the end + * of last region + * cfr_vaddr: virtual address of the start of the covered "region" + * cfr_num_pages: number of pages in the covered "region" + * d*: disposition of the page at that virtual address + * Regions in the buffer are word-aligned. + * + * We estimate the size of the buffer based on the number of memory regions + * and the virtual size of the address space. 
While copying each memory region + * during vm_map_fork(), we also collect the footprint info for that region + * and store it in the buffer, packing it as much as possible (coalescing + * contiguous memory regions to avoid having too many region headers and + * avoiding long streaks of "zero" page dispositions by splitting footprint + * "regions", so the number of regions in the footprint buffer might not match + * the number of memory regions in the address space. + * + * We also have to copy the original task's "nonvolatile" ledgers since that's + * part of the footprint and will need to be reported to any tool asking for + * the footprint information of the forked corpse. + */ + +uint64_t vm_map_corpse_footprint_count = 0; +uint64_t vm_map_corpse_footprint_size_avg = 0; +uint64_t vm_map_corpse_footprint_size_max = 0; +uint64_t vm_map_corpse_footprint_full = 0; +uint64_t vm_map_corpse_footprint_no_buf = 0; + +struct vm_map_corpse_footprint_header { + vm_size_t cf_size; /* allocated buffer size */ + uint32_t cf_last_region; /* offset of last region in buffer */ + union { + uint32_t cfu_last_zeroes; /* during creation: + * number of "zero" dispositions at + * end of last region */ + uint32_t cfu_hint_region; /* during lookup: + * offset of last looked up region */ +#define cf_last_zeroes cfu.cfu_last_zeroes +#define cf_hint_region cfu.cfu_hint_region + } cfu; +}; +typedef uint8_t cf_disp_t; +struct vm_map_corpse_footprint_region { + vm_map_offset_t cfr_vaddr; /* region start virtual address */ + uint32_t cfr_num_pages; /* number of pages in this "region" */ + cf_disp_t cfr_disposition[0]; /* disposition of each page */ +} __attribute__((packed)); + +static cf_disp_t +vm_page_disposition_to_cf_disp( + int disposition) +{ + assert(sizeof(cf_disp_t) == 1); + /* relocate bits that don't fit in a "uint8_t" */ + if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) { + disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS; + } + /* cast gets rid of extra bits */ + return (cf_disp_t) disposition; +} + +static int +vm_page_cf_disp_to_disposition( + cf_disp_t cf_disp) +{ + int disposition; + + assert(sizeof(cf_disp_t) == 1); + disposition = (int) cf_disp; + /* move relocated bits back in place */ + if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) { + disposition |= VM_PAGE_QUERY_PAGE_REUSABLE; + disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS; + } + return disposition; +} + +/* + * vm_map_corpse_footprint_new_region: + * closes the current footprint "region" and creates a new one + * + * Returns NULL if there's not enough space in the buffer for a new region. 
+ */ +static struct vm_map_corpse_footprint_region * +vm_map_corpse_footprint_new_region( + struct vm_map_corpse_footprint_header *footprint_header) +{ + uintptr_t footprint_edge; + uint32_t new_region_offset; + struct vm_map_corpse_footprint_region *footprint_region; + struct vm_map_corpse_footprint_region *new_footprint_region; + + footprint_edge = ((uintptr_t)footprint_header + + footprint_header->cf_size); + footprint_region = ((struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region)); + assert((uintptr_t)footprint_region + sizeof(*footprint_region) <= + footprint_edge); + + /* get rid of trailing zeroes in the last region */ + assert(footprint_region->cfr_num_pages >= + footprint_header->cf_last_zeroes); + footprint_region->cfr_num_pages -= + footprint_header->cf_last_zeroes; + footprint_header->cf_last_zeroes = 0; + + /* reuse this region if it's now empty */ + if (footprint_region->cfr_num_pages == 0) { + return footprint_region; + } + + /* compute offset of new region */ + new_region_offset = footprint_header->cf_last_region; + new_region_offset += sizeof(*footprint_region); + new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t)); + new_region_offset = roundup(new_region_offset, sizeof(int)); + + /* check if we're going over the edge */ + if (((uintptr_t)footprint_header + + new_region_offset + + sizeof(*footprint_region)) >= + footprint_edge) { + /* over the edge: no new region */ + return NULL; + } + + /* adjust offset of last region in header */ + footprint_header->cf_last_region = new_region_offset; + + new_footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + new_footprint_region->cfr_vaddr = 0; + new_footprint_region->cfr_num_pages = 0; + /* caller needs to initialize new region */ + + return new_footprint_region; +} + +/* + * vm_map_corpse_footprint_collect: + * collect footprint information for "old_entry" in "old_map" and + * stores it in "new_map"'s vmmap_footprint_info. + */ +kern_return_t +vm_map_corpse_footprint_collect( + vm_map_t old_map, + vm_map_entry_t old_entry, + vm_map_t new_map) +{ + vm_map_offset_t va; + kern_return_t kr; + struct vm_map_corpse_footprint_header *footprint_header; + struct vm_map_corpse_footprint_region *footprint_region; + struct vm_map_corpse_footprint_region *new_footprint_region; + cf_disp_t *next_disp_p; + uintptr_t footprint_edge; + uint32_t num_pages_tmp; + int effective_page_size; + + effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map)); + + va = old_entry->vme_start; + + vm_map_lock_assert_exclusive(old_map); + vm_map_lock_assert_exclusive(new_map); + + assert(new_map->has_corpse_footprint); + assert(!old_map->has_corpse_footprint); + if (!new_map->has_corpse_footprint || + old_map->has_corpse_footprint) { + /* + * This can only transfer footprint info from a + * map with a live pmap to a map with a corpse footprint. 
+ */ + return KERN_NOT_SUPPORTED; + } + + if (new_map->vmmap_corpse_footprint == NULL) { + vm_offset_t buf; + vm_size_t buf_size; + + buf = 0; + buf_size = (sizeof(*footprint_header) + + (old_map->hdr.nentries + * + (sizeof(*footprint_region) + + +3)) /* potential alignment for each region */ + + + ((old_map->size / effective_page_size) + * + sizeof(cf_disp_t))); /* disposition for each page */ +// printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size); + buf_size = round_page(buf_size); + + /* limit buffer to 1 page to validate overflow detection */ +// buf_size = PAGE_SIZE; + + /* limit size to a somewhat sane amount */ +#if XNU_TARGET_OS_OSX +#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */ +#else /* XNU_TARGET_OS_OSX */ +#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */ +#endif /* XNU_TARGET_OS_OSX */ + if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) { + buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE; + } + + /* + * Allocate the pageable buffer (with a trailing guard page). + * It will be zero-filled on demand. + */ + kr = kernel_memory_allocate(kernel_map, + &buf, + (buf_size + + PAGE_SIZE), /* trailing guard page */ + 0, /* mask */ + KMA_PAGEABLE | KMA_GUARD_LAST, + VM_KERN_MEMORY_DIAG); + if (kr != KERN_SUCCESS) { + vm_map_corpse_footprint_no_buf++; + return kr; + } + + /* initialize header and 1st region */ + footprint_header = (struct vm_map_corpse_footprint_header *)buf; + new_map->vmmap_corpse_footprint = footprint_header; + + footprint_header->cf_size = buf_size; + footprint_header->cf_last_region = + sizeof(*footprint_header); + footprint_header->cf_last_zeroes = 0; + + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + footprint_region->cfr_vaddr = 0; + footprint_region->cfr_num_pages = 0; + } else { + /* retrieve header and last region */ + footprint_header = (struct vm_map_corpse_footprint_header *) + new_map->vmmap_corpse_footprint; + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + } + footprint_edge = ((uintptr_t)footprint_header + + footprint_header->cf_size); + + if ((footprint_region->cfr_vaddr + + (((vm_map_offset_t)footprint_region->cfr_num_pages) * + effective_page_size)) + != old_entry->vme_start) { + uint64_t num_pages_delta, num_pages_delta_size; + uint32_t region_offset_delta_size; + + /* + * Not the next contiguous virtual address: + * start a new region or store "zero" dispositions for + * the missing pages? 
+ */ + /* size of gap in actual page dispositions */ + num_pages_delta = ((old_entry->vme_start - + footprint_region->cfr_vaddr) / effective_page_size) + - footprint_region->cfr_num_pages; + num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t); + /* size of gap as a new footprint region header */ + region_offset_delta_size = + (sizeof(*footprint_region) + + roundup(((footprint_region->cfr_num_pages - + footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)), + sizeof(int)) - + ((footprint_region->cfr_num_pages - + footprint_header->cf_last_zeroes) * sizeof(cf_disp_t))); +// printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta); + if (region_offset_delta_size < num_pages_delta_size || + os_add3_overflow(footprint_region->cfr_num_pages, + (uint32_t) num_pages_delta, + 1, + &num_pages_tmp)) { + /* + * Storing data for this gap would take more space + * than inserting a new footprint region header: + * let's start a new region and save space. If it's a + * tie, let's avoid using a new region, since that + * would require more region hops to find the right + * range during lookups. + * + * If the current region's cfr_num_pages would overflow + * if we added "zero" page dispositions for the gap, + * no choice but to start a new region. + */ +// printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__); + new_footprint_region = + vm_map_corpse_footprint_new_region(footprint_header); + /* check that we're not going over the edge */ + if (new_footprint_region == NULL) { + goto over_the_edge; + } + footprint_region = new_footprint_region; + /* initialize new region as empty */ + footprint_region->cfr_vaddr = old_entry->vme_start; + footprint_region->cfr_num_pages = 0; + } else { + /* + * Store "zero" page dispositions for the missing + * pages. + */ +// printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__); + for (; num_pages_delta > 0; num_pages_delta--) { + next_disp_p = (cf_disp_t *) + ((uintptr_t) footprint_region + + sizeof(*footprint_region)); + next_disp_p += footprint_region->cfr_num_pages; + /* check that we're not going over the edge */ + if ((uintptr_t)next_disp_p >= footprint_edge) { + goto over_the_edge; + } + /* store "zero" disposition for this gap page */ + footprint_region->cfr_num_pages++; + *next_disp_p = (cf_disp_t) 0; + footprint_header->cf_last_zeroes++; + } + } + } + + for (va = old_entry->vme_start; + va < old_entry->vme_end; + va += effective_page_size) { + int disposition; + cf_disp_t cf_disp; + + vm_map_footprint_query_page_info(old_map, + old_entry, + va, + &disposition); + cf_disp = vm_page_disposition_to_cf_disp(disposition); + +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp); + + if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) { + /* + * Ignore "zero" dispositions at start of + * region: just move start of region. + */ + footprint_region->cfr_vaddr += effective_page_size; + continue; + } + + /* would region's cfr_num_pages overflow? 
*/ + if (os_add_overflow(footprint_region->cfr_num_pages, 1, + &num_pages_tmp)) { + /* overflow: create a new region */ + new_footprint_region = + vm_map_corpse_footprint_new_region( + footprint_header); + if (new_footprint_region == NULL) { + goto over_the_edge; + } + footprint_region = new_footprint_region; + footprint_region->cfr_vaddr = va; + footprint_region->cfr_num_pages = 0; + } + + next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region + + sizeof(*footprint_region)); + next_disp_p += footprint_region->cfr_num_pages; + /* check that we're not going over the edge */ + if ((uintptr_t)next_disp_p >= footprint_edge) { + goto over_the_edge; + } + /* store this dispostion */ + *next_disp_p = cf_disp; + footprint_region->cfr_num_pages++; + + if (cf_disp != 0) { + /* non-zero disp: break the current zero streak */ + footprint_header->cf_last_zeroes = 0; + /* done */ + continue; + } + + /* zero disp: add to the current streak of zeroes */ + footprint_header->cf_last_zeroes++; + if ((footprint_header->cf_last_zeroes + + roundup(((footprint_region->cfr_num_pages - + footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) & + (sizeof(int) - 1), + sizeof(int))) < + (sizeof(*footprint_header))) { + /* + * There are not enough trailing "zero" dispositions + * (+ the extra padding we would need for the previous + * region); creating a new region would not save space + * at this point, so let's keep this "zero" disposition + * in this region and reconsider later. + */ + continue; + } + /* + * Create a new region to avoid having too many consecutive + * "zero" dispositions. + */ + new_footprint_region = + vm_map_corpse_footprint_new_region(footprint_header); + if (new_footprint_region == NULL) { + goto over_the_edge; + } + footprint_region = new_footprint_region; + /* initialize the new region as empty ... */ + footprint_region->cfr_num_pages = 0; + /* ... 
and skip this "zero" disp */ + footprint_region->cfr_vaddr = va + effective_page_size; + } + + return KERN_SUCCESS; + +over_the_edge: +// printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va); + vm_map_corpse_footprint_full++; + return KERN_RESOURCE_SHORTAGE; +} + +/* + * vm_map_corpse_footprint_collect_done: + * completes the footprint collection by getting rid of any remaining + * trailing "zero" dispositions and trimming the unused part of the + * kernel buffer + */ +void +vm_map_corpse_footprint_collect_done( + vm_map_t new_map) +{ + struct vm_map_corpse_footprint_header *footprint_header; + struct vm_map_corpse_footprint_region *footprint_region; + vm_size_t buf_size, actual_size; + kern_return_t kr; + + assert(new_map->has_corpse_footprint); + if (!new_map->has_corpse_footprint || + new_map->vmmap_corpse_footprint == NULL) { + return; + } + + footprint_header = (struct vm_map_corpse_footprint_header *) + new_map->vmmap_corpse_footprint; + buf_size = footprint_header->cf_size; + + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + + footprint_header->cf_last_region); + + /* get rid of trailing zeroes in last region */ + assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes); + footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes; + footprint_header->cf_last_zeroes = 0; + + actual_size = (vm_size_t)(footprint_header->cf_last_region + + sizeof(*footprint_region) + + (footprint_region->cfr_num_pages * sizeof(cf_disp_t))); + +// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size); + vm_map_corpse_footprint_size_avg = + (((vm_map_corpse_footprint_size_avg * + vm_map_corpse_footprint_count) + + actual_size) / + (vm_map_corpse_footprint_count + 1)); + vm_map_corpse_footprint_count++; + if (actual_size > vm_map_corpse_footprint_size_max) { + vm_map_corpse_footprint_size_max = actual_size; + } + + actual_size = round_page(actual_size); + if (buf_size > actual_size) { + kr = vm_deallocate(kernel_map, + ((vm_address_t)footprint_header + + actual_size + + PAGE_SIZE), /* trailing guard page */ + (buf_size - actual_size)); + assertf(kr == KERN_SUCCESS, + "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n", + footprint_header, + (uint64_t) buf_size, + (uint64_t) actual_size, + kr); + kr = vm_protect(kernel_map, + ((vm_address_t)footprint_header + + actual_size), + PAGE_SIZE, + FALSE, /* set_maximum */ + VM_PROT_NONE); + assertf(kr == KERN_SUCCESS, + "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n", + footprint_header, + (uint64_t) buf_size, + (uint64_t) actual_size, + kr); + } + + footprint_header->cf_size = actual_size; +} + +/* + * vm_map_corpse_footprint_query_page_info: + * retrieves the disposition of the page at virtual address "vaddr" + * in the forked corpse's VM map + * + * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse. 
+ */ +kern_return_t +vm_map_corpse_footprint_query_page_info( + vm_map_t map, + vm_map_offset_t va, + int *disposition_p) +{ + struct vm_map_corpse_footprint_header *footprint_header; + struct vm_map_corpse_footprint_region *footprint_region; + uint32_t footprint_region_offset; + vm_map_offset_t region_start, region_end; + int disp_idx; + kern_return_t kr; + int effective_page_size; + cf_disp_t cf_disp; + + if (!map->has_corpse_footprint) { + *disposition_p = 0; + kr = KERN_INVALID_ARGUMENT; + goto done; + } + + footprint_header = map->vmmap_corpse_footprint; + if (footprint_header == NULL) { + *disposition_p = 0; +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p); + kr = KERN_INVALID_ARGUMENT; + goto done; + } + + /* start looking at the hint ("cf_hint_region") */ + footprint_region_offset = footprint_header->cf_hint_region; + + effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map)); + +lookup_again: + if (footprint_region_offset < sizeof(*footprint_header)) { + /* hint too low: start from 1st region */ + footprint_region_offset = sizeof(*footprint_header); + } + if (footprint_region_offset >= footprint_header->cf_last_region) { + /* hint too high: re-start from 1st region */ + footprint_region_offset = sizeof(*footprint_header); + } + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + footprint_region_offset); + region_start = footprint_region->cfr_vaddr; + region_end = (region_start + + ((vm_map_offset_t)(footprint_region->cfr_num_pages) * + effective_page_size)); + if (va < region_start && + footprint_region_offset != sizeof(*footprint_header)) { + /* our range starts before the hint region */ + + /* reset the hint (in a racy way...) */ + footprint_header->cf_hint_region = sizeof(*footprint_header); + /* lookup "va" again from 1st region */ + footprint_region_offset = sizeof(*footprint_header); + goto lookup_again; + } + + while (va >= region_end) { + if (footprint_region_offset >= footprint_header->cf_last_region) { + break; + } + /* skip the region's header */ + footprint_region_offset += sizeof(*footprint_region); + /* skip the region's page dispositions */ + footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t)); + /* align to next word boundary */ + footprint_region_offset = + roundup(footprint_region_offset, + sizeof(int)); + footprint_region = (struct vm_map_corpse_footprint_region *) + ((char *)footprint_header + footprint_region_offset); + region_start = footprint_region->cfr_vaddr; + region_end = (region_start + + ((vm_map_offset_t)(footprint_region->cfr_num_pages) * + effective_page_size)); + } + if (va < region_start || va >= region_end) { + /* page not found */ + *disposition_p = 0; +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p); + kr = KERN_SUCCESS; + goto done; + } + + /* "va" found: set the lookup hint for next lookup (in a racy way...) 
*/ + footprint_header->cf_hint_region = footprint_region_offset; + + /* get page disposition for "va" in this region */ + disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size); + cf_disp = footprint_region->cfr_disposition[disp_idx]; + *disposition_p = vm_page_cf_disp_to_disposition(cf_disp); + kr = KERN_SUCCESS; +done: +// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p); + /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */ + DTRACE_VM4(footprint_query_page_info, + vm_map_t, map, + vm_map_offset_t, va, + int, *disposition_p, + kern_return_t, kr); + + return kr; +} + +void +vm_map_corpse_footprint_destroy( + vm_map_t map) +{ + if (map->has_corpse_footprint && + map->vmmap_corpse_footprint != 0) { + struct vm_map_corpse_footprint_header *footprint_header; + vm_size_t buf_size; + kern_return_t kr; + + footprint_header = map->vmmap_corpse_footprint; + buf_size = footprint_header->cf_size; + kr = vm_deallocate(kernel_map, + (vm_offset_t) map->vmmap_corpse_footprint, + ((vm_size_t) buf_size + + PAGE_SIZE)); /* trailing guard page */ + assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr); + map->vmmap_corpse_footprint = 0; + map->has_corpse_footprint = FALSE; + } +} + +/* + * vm_map_copy_footprint_ledgers: + * copies any ledger that's relevant to the memory footprint of "old_task" + * into the forked corpse's task ("new_task") + */ +void +vm_map_copy_footprint_ledgers( + task_t old_task, + task_t new_task) +{ + vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile); + vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.internal); + vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped); + vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting); + vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table); + vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile); + vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint); + vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed); + vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem); +} + +/* + * vm_map_copy_ledger: + * copy a single ledger from "old_task" to "new_task" + */ +void +vm_map_copy_ledger( + task_t old_task, + task_t new_task, + int ledger_entry) +{ + ledger_amount_t old_balance, new_balance, delta; + + assert(new_task->map->has_corpse_footprint); + if (!new_task->map->has_corpse_footprint) { + return; + } + + /* turn off sanity checks for the ledger we're about to mess with */ + 
ledger_disable_panic_on_negative(new_task->ledger, + ledger_entry); + + /* adjust "new_task" to match "old_task" */ + ledger_get_balance(old_task->ledger, + ledger_entry, + &old_balance); + ledger_get_balance(new_task->ledger, + ledger_entry, + &new_balance); + if (new_balance == old_balance) { + /* new == old: done */ + } else if (new_balance > old_balance) { + /* new > old ==> new -= new - old */ + delta = new_balance - old_balance; + ledger_debit(new_task->ledger, + ledger_entry, + delta); + } else { + /* new < old ==> new += old - new */ + delta = old_balance - new_balance; + ledger_credit(new_task->ledger, + ledger_entry, + delta); + } +} + +#if MACH_ASSERT + +extern int pmap_ledgers_panic; +extern int pmap_ledgers_panic_leeway; + +#define LEDGER_DRIFT(__LEDGER) \ + int __LEDGER##_over; \ + ledger_amount_t __LEDGER##_over_total; \ + ledger_amount_t __LEDGER##_over_max; \ + int __LEDGER##_under; \ + ledger_amount_t __LEDGER##_under_total; \ + ledger_amount_t __LEDGER##_under_max + +struct { + uint64_t num_pmaps_checked; + + LEDGER_DRIFT(phys_footprint); + LEDGER_DRIFT(internal); + LEDGER_DRIFT(internal_compressed); + LEDGER_DRIFT(iokit_mapped); + LEDGER_DRIFT(alternate_accounting); + LEDGER_DRIFT(alternate_accounting_compressed); + LEDGER_DRIFT(page_table); + LEDGER_DRIFT(purgeable_volatile); + LEDGER_DRIFT(purgeable_nonvolatile); + LEDGER_DRIFT(purgeable_volatile_compressed); + LEDGER_DRIFT(purgeable_nonvolatile_compressed); + LEDGER_DRIFT(tagged_nofootprint); + LEDGER_DRIFT(tagged_footprint); + LEDGER_DRIFT(tagged_nofootprint_compressed); + LEDGER_DRIFT(tagged_footprint_compressed); + LEDGER_DRIFT(network_volatile); + LEDGER_DRIFT(network_nonvolatile); + LEDGER_DRIFT(network_volatile_compressed); + LEDGER_DRIFT(network_nonvolatile_compressed); + LEDGER_DRIFT(media_nofootprint); + LEDGER_DRIFT(media_footprint); + LEDGER_DRIFT(media_nofootprint_compressed); + LEDGER_DRIFT(media_footprint_compressed); + LEDGER_DRIFT(graphics_nofootprint); + LEDGER_DRIFT(graphics_footprint); + LEDGER_DRIFT(graphics_nofootprint_compressed); + LEDGER_DRIFT(graphics_footprint_compressed); + LEDGER_DRIFT(neural_nofootprint); + LEDGER_DRIFT(neural_footprint); + LEDGER_DRIFT(neural_nofootprint_compressed); + LEDGER_DRIFT(neural_footprint_compressed); +} pmap_ledgers_drift; + +void +vm_map_pmap_check_ledgers( + pmap_t pmap, + ledger_t ledger, + int pid, + char *procname) +{ + ledger_amount_t bal; + boolean_t do_panic; + + do_panic = FALSE; + + pmap_ledgers_drift.num_pmaps_checked++; + +#define LEDGER_CHECK_BALANCE(__LEDGER) \ +MACRO_BEGIN \ + int panic_on_negative = TRUE; \ + ledger_get_balance(ledger, \ + task_ledgers.__LEDGER, \ + &bal); \ + ledger_get_panic_on_negative(ledger, \ + task_ledgers.__LEDGER, \ + &panic_on_negative); \ + if (bal != 0) { \ + if (panic_on_negative || \ + (pmap_ledgers_panic && \ + pmap_ledgers_panic_leeway > 0 && \ + (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \ + bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \ + do_panic = TRUE; \ + } \ + printf("LEDGER BALANCE proc %d (%s) " \ + "\"%s\" = %lld\n", \ + pid, procname, #__LEDGER, bal); \ + if (bal > 0) { \ + pmap_ledgers_drift.__LEDGER##_over++; \ + pmap_ledgers_drift.__LEDGER##_over_total += bal; \ + if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \ + pmap_ledgers_drift.__LEDGER##_over_max = bal; \ + } \ + } else if (bal < 0) { \ + pmap_ledgers_drift.__LEDGER##_under++; \ + pmap_ledgers_drift.__LEDGER##_under_total += bal; \ + if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \ + 
pmap_ledgers_drift.__LEDGER##_under_max = bal; \ + } \ + } \ + } \ +MACRO_END + + LEDGER_CHECK_BALANCE(phys_footprint); + LEDGER_CHECK_BALANCE(internal); + LEDGER_CHECK_BALANCE(internal_compressed); + LEDGER_CHECK_BALANCE(iokit_mapped); + LEDGER_CHECK_BALANCE(alternate_accounting); + LEDGER_CHECK_BALANCE(alternate_accounting_compressed); + LEDGER_CHECK_BALANCE(page_table); + LEDGER_CHECK_BALANCE(purgeable_volatile); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile); + LEDGER_CHECK_BALANCE(purgeable_volatile_compressed); + LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed); + LEDGER_CHECK_BALANCE(tagged_nofootprint); + LEDGER_CHECK_BALANCE(tagged_footprint); + LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed); + LEDGER_CHECK_BALANCE(tagged_footprint_compressed); + LEDGER_CHECK_BALANCE(network_volatile); + LEDGER_CHECK_BALANCE(network_nonvolatile); + LEDGER_CHECK_BALANCE(network_volatile_compressed); + LEDGER_CHECK_BALANCE(network_nonvolatile_compressed); + LEDGER_CHECK_BALANCE(media_nofootprint); + LEDGER_CHECK_BALANCE(media_footprint); + LEDGER_CHECK_BALANCE(media_nofootprint_compressed); + LEDGER_CHECK_BALANCE(media_footprint_compressed); + LEDGER_CHECK_BALANCE(graphics_nofootprint); + LEDGER_CHECK_BALANCE(graphics_footprint); + LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed); + LEDGER_CHECK_BALANCE(graphics_footprint_compressed); + LEDGER_CHECK_BALANCE(neural_nofootprint); + LEDGER_CHECK_BALANCE(neural_footprint); + LEDGER_CHECK_BALANCE(neural_nofootprint_compressed); + LEDGER_CHECK_BALANCE(neural_footprint_compressed); + + if (do_panic) { + if (pmap_ledgers_panic) { + panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", + pmap, pid, procname); + } else { + printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n", + pmap, pid, procname); + } + } +} +#endif /* MACH_ASSERT */
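The evaluation phase added to vm_map_freeze() above reduces to two policy checks on the dirty page counts it accumulates: an absolute cap on dirty shared pages derived from memorystatus_freeze_shared_mb_per_process_max, and a minimum private-to-shared ratio taken from memorystatus_freeze_private_shared_pages_ratio. The user-space sketch below isolates just that decision; the parameter values, the freeze_verdict enum and the freeze_evaluate() name are illustrative stand-ins and not part of xnu.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel tunables and page size. */
#define PAGE_SIZE_64                        4096ULL
#define FREEZE_SHARED_MB_PER_PROCESS_MAX    128ULL /* illustrative value */
#define FREEZE_PRIVATE_SHARED_PAGES_RATIO   2ULL   /* illustrative value */

enum freeze_verdict {
	FREEZE_OK = 0,
	FREEZE_EXCESS_SHARED_MEMORY,
	FREEZE_LOW_PRIVATE_SHARED_RATIO,
};

/*
 * Mirrors the two evaluation-phase checks in vm_map_freeze(): reject a
 * candidate that maps too many dirty shared pages outright, or whose
 * dirty private/shared ratio is below the configured minimum.
 */
static enum freeze_verdict
freeze_evaluate(uint64_t dirty_private_count, uint64_t dirty_shared_count)
{
	uint64_t shared_pages_threshold =
	    (FREEZE_SHARED_MB_PER_PROCESS_MAX * 1024ULL * 1024ULL) / PAGE_SIZE_64;

	if (dirty_shared_count > shared_pages_threshold) {
		return FREEZE_EXCESS_SHARED_MEMORY;
	}
	if (dirty_shared_count != 0 &&
	    (dirty_private_count / dirty_shared_count) <
	    FREEZE_PRIVATE_SHARED_PAGES_RATIO) {
		return FREEZE_LOW_PRIVATE_SHARED_RATIO;
	}
	return FREEZE_OK;
}

int
main(void)
{
	/* threshold is 32768 pages with the values above */
	printf("%d\n", freeze_evaluate(100000, 20000)); /* ratio 5 >= 2: prints 0 (ok) */
	printf("%d\n", freeze_evaluate(10000, 20000));  /* ratio 0 < 2: prints 2 (low ratio) */
	return 0;
}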
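The corpse-footprint buffer described in the block comment above is a small header followed by word-aligned, variable-length regions, one disposition byte per page. The stand-alone sketch below walks such a buffer with the same region-advance step used by vm_map_corpse_footprint_query_page_info(); footprint_header and footprint_region are simplified copies of the structures in this diff (plain stdint types, no lookup-hint union), and the buffer in main() is built by hand purely for illustration.

#include <stdint.h>
#include <stdio.h>

/* Simplified copies of the corpse-footprint structures from this diff,
 * with generic integer types so the sketch builds in user space. */
struct footprint_header {
	size_t   cf_size;        /* allocated buffer size */
	uint32_t cf_last_region; /* offset of last region in buffer */
	uint32_t cf_last_zeroes; /* (real struct also overlays a lookup hint) */
};
typedef uint8_t cf_disp_t;
struct footprint_region {
	uint64_t  cfr_vaddr;         /* region start virtual address */
	uint32_t  cfr_num_pages;     /* pages covered by this region */
	cf_disp_t cfr_disposition[]; /* one disposition byte per page */
} __attribute__((packed));

#define WORD_ROUNDUP(x) (((x) + sizeof(int) - 1) & ~(sizeof(int) - 1))

/*
 * Visit every region: skip the region header, skip one byte per page,
 * round up to the next int boundary, and stop once the region located
 * at cf_last_region has been visited.
 */
static void
footprint_walk(const struct footprint_header *hdr)
{
	size_t off = sizeof(*hdr);

	for (;;) {
		const struct footprint_region *r =
		    (const struct footprint_region *)((const char *)hdr + off);
		printf("region @0x%zx: vaddr 0x%llx, %u pages\n",
		    off, (unsigned long long)r->cfr_vaddr, r->cfr_num_pages);
		if (off >= hdr->cf_last_region) {
			break; /* that was the last region */
		}
		off += sizeof(*r) + r->cfr_num_pages * sizeof(cf_disp_t);
		off = WORD_ROUNDUP(off);
	}
}

int
main(void)
{
	/* Hand-built buffer: 3 pages at 0x100000000, then 2 pages at 0x100010000. */
	static union {
		struct footprint_header hdr;
		unsigned char           bytes[256];
	} buf; /* static, so zero-initialized */
	size_t off = sizeof(buf.hdr);

	struct footprint_region *r1 = (struct footprint_region *)(buf.bytes + off);
	r1->cfr_vaddr = 0x100000000ULL;
	r1->cfr_num_pages = 3;
	off = WORD_ROUNDUP(off + sizeof(*r1) + 3 * sizeof(cf_disp_t));

	struct footprint_region *r2 = (struct footprint_region *)(buf.bytes + off);
	r2->cfr_vaddr = 0x100010000ULL;
	r2->cfr_num_pages = 2;

	buf.hdr.cf_size = sizeof(buf.bytes);
	buf.hdr.cf_last_region = (uint32_t)off;
	buf.hdr.cf_last_zeroes = 0;

	footprint_walk(&buf.hdr);
	return 0;
}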
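Finally, vm_map_copy_ledger() brings each of the corpse's footprint-related ledgers to the same balance as the original task's by applying a single credit or debit of the difference. A minimal sketch of that adjustment, using a plain int64_t in place of ledger_amount_t and a hypothetical match_balance() helper:

#include <stdint.h>
#include <stdio.h>

/* Debit the surplus when the copy is ahead of the original, credit the
 * shortfall when it is behind -- the same arithmetic vm_map_copy_ledger()
 * performs with ledger_debit()/ledger_credit(). */
static void
match_balance(int64_t old_balance, int64_t *new_balance)
{
	if (*new_balance > old_balance) {
		*new_balance -= *new_balance - old_balance; /* debit delta */
	} else if (*new_balance < old_balance) {
		*new_balance += old_balance - *new_balance; /* credit delta */
	}
}

int
main(void)
{
	int64_t corpse = 3;

	match_balance(7, &corpse); /* credit 4 */
	printf("%lld\n", (long long)corpse); /* 7 */
	match_balance(2, &corpse); /* debit 5 */
	printf("%lld\n", (long long)corpse); /* 2 */
	return 0;
}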