+ case VM_PURGABLE_VOLATILE:
+ if (object->volatile_fault) {
+ vm_page_t p;
+ int refmod;
+
+ vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
+ if (p->busy ||
+ VM_PAGE_WIRED(p) ||
+ p->fictitious) {
+ continue;
+ }
+ refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
+ if ((refmod & VM_MEM_MODIFIED) &&
+ !p->dirty) {
+ SET_PAGE_DIRTY(p, FALSE);
+ }
+ }
+ }
+
+ assert(old_state != VM_PURGABLE_EMPTY);
+
+ purgeable_q_t queue;
+
+ /* find the correct queue */
+ if ((*state&VM_PURGABLE_ORDERING_MASK) == VM_PURGABLE_ORDERING_OBSOLETE)
+ queue = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE];
+ else {
+ if ((*state&VM_PURGABLE_BEHAVIOR_MASK) == VM_PURGABLE_BEHAVIOR_FIFO)
+ queue = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
+ else
+ queue = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
+ }
+
+ if (old_state == VM_PURGABLE_NONVOLATILE ||
+ old_state == VM_PURGABLE_EMPTY) {
+ unsigned int delta;
+
+ if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
+ VM_PURGABLE_NO_AGING) {
+ object->purgeable_when_ripe = FALSE;
+ } else {
+ object->purgeable_when_ripe = TRUE;
+ }
+
+ if (object->purgeable_when_ripe) {
+ kern_return_t result;
+
+ /* try to add token... this can fail */
+ vm_page_lock_queues();
+
+ result = vm_purgeable_token_add(queue);
+ if (result != KERN_SUCCESS) {
+ vm_page_unlock_queues();
+ return result;
+ }
+ vm_page_unlock_queues();
+ }
+
+ assert(object->resident_page_count >=
+ object->wired_page_count);
+ delta = (object->resident_page_count -
+ object->wired_page_count);
+
+ if (delta != 0) {
+ OSAddAtomic(delta,
+ &vm_page_purgeable_count);
+ }
+ if (object->wired_page_count != 0) {
+ OSAddAtomic(object->wired_page_count,
+ &vm_page_purgeable_wired_count);
+ }
+
+ object->purgable = new_state;
+
+ /* object should be on "non-volatile" queue */
+ assert(object->objq.next != NULL);
+ assert(object->objq.prev != NULL);
+ }
+ else if (old_state == VM_PURGABLE_VOLATILE) {
+ purgeable_q_t old_queue;
+ boolean_t purgeable_when_ripe;
+
+ /*
+ * if reassigning priorities / purgeable groups, we don't change the
+ * token queue. So moving priorities will not make pages stay around longer.
+ * Reasoning is that the algorithm gives most priority to the most important
+ * object. If a new token is added, the most important object' priority is boosted.
+ * This biases the system already for purgeable queues that move a lot.
+ * It doesn't seem more biasing is neccessary in this case, where no new object is added.
+ */
+ assert(object->objq.next != NULL && object->objq.prev != NULL); /* object should be on a queue */
+
+ old_queue = vm_purgeable_object_remove(object);
+ assert(old_queue);
+
+ if ((*state & VM_PURGABLE_NO_AGING_MASK) ==
+ VM_PURGABLE_NO_AGING) {
+ purgeable_when_ripe = FALSE;
+ } else {
+ purgeable_when_ripe = TRUE;
+ }
+
+ if (old_queue != queue ||
+ (purgeable_when_ripe !=
+ object->purgeable_when_ripe)) {
+ kern_return_t result;
+
+ /* Changing queue. Have to move token. */
+ vm_page_lock_queues();
+ if (object->purgeable_when_ripe) {
+ vm_purgeable_token_delete_last(old_queue);
+ }
+ object->purgeable_when_ripe = purgeable_when_ripe;
+ if (object->purgeable_when_ripe) {
+ result = vm_purgeable_token_add(queue);
+ assert(result==KERN_SUCCESS); /* this should never fail since we just freed a token */
+ }
+ vm_page_unlock_queues();
+
+ }
+ };
+ vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT );
+ if (old_state == VM_PURGABLE_NONVOLATILE) {
+ vm_purgeable_accounting(object, VM_PURGABLE_NONVOLATILE,
+ FALSE);
+ }
+
+ assert(queue->debug_count_objects>=0);
+
+ break;
+
+
+ case VM_PURGABLE_EMPTY:
+ if (object->volatile_fault) {
+ vm_page_t p;
+ int refmod;
+
+ vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
+ if (p->busy ||
+ VM_PAGE_WIRED(p) ||
+ p->fictitious) {
+ continue;
+ }
+ refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
+ if ((refmod & VM_MEM_MODIFIED) &&
+ !p->dirty) {
+ SET_PAGE_DIRTY(p, FALSE);
+ }
+ }
+ }
+
+ if (old_state == VM_PURGABLE_VOLATILE) {
+ purgeable_q_t old_queue;
+
+ /* object should be on a queue */
+ assert(object->objq.next != NULL &&
+ object->objq.prev != NULL);
+
+ old_queue = vm_purgeable_object_remove(object);
+ assert(old_queue);
+ if (object->purgeable_when_ripe) {
+ vm_page_lock_queues();
+ vm_purgeable_token_delete_first(old_queue);
+ vm_page_unlock_queues();
+ }
+ }
+
+ if (old_state == VM_PURGABLE_NONVOLATILE) {
+ /*
+ * This object's pages were previously accounted as
+ * "non-volatile" and now need to be accounted as
+ * "volatile".
+ */
+ vm_purgeable_accounting(object, VM_PURGABLE_NONVOLATILE,
+ FALSE);
+ /*
+ * Set to VM_PURGABLE_EMPTY because the pages are no
+ * longer accounted in the "non-volatile" ledger
+ * and are also not accounted for in
+ * "vm_page_purgeable_count".
+ */
+ object->purgable = VM_PURGABLE_EMPTY;
+ }
+
+ (void) vm_object_purge(object, 0);
+ assert(object->purgable == VM_PURGABLE_EMPTY);
+
+ break;
+ }
+
+ *state = old_state;
+
+ vm_object_lock_assert_exclusive(object);
+
+ return KERN_SUCCESS;
+}
+
+/*
+ *	vm_object_get_page_counts
+ *
+ *	Count the pages of "object" that are resident within the range
+ *	[offset, offset + size) and, if requested, how many of those pages
+ *	are dirty.
+ *
+ *	A page is counted as dirty when its "dirty" bit is set, or when it
+ *	is "wpmapped" and pmap_is_modified() reports its physical page as
+ *	modified.
+ *
+ *	Either output pointer may be NULL, in which case that count is not
+ *	reported.  The object lock must be held exclusively.
+ *
+ *	Returns KERN_INVALID_ARGUMENT for a NULL object and KERN_SUCCESS
+ *	otherwise.
+ */
+kern_return_t
+vm_object_get_page_counts(
+	vm_object_t		object,
+	vm_object_offset_t	offset,
+	vm_object_size_t	size,
+	unsigned int		*resident_page_count,
+	unsigned int		*dirty_page_count)
+{
+	const boolean_t			want_dirty = (dirty_page_count != NULL) ? TRUE : FALSE;
+	const vm_object_offset_t	range_end = offset + size;
+	vm_object_offset_t		probe_offset;
+	unsigned int			resident = 0;
+	unsigned int			dirty = 0;
+	vm_page_t			page = VM_PAGE_NULL;
+
+	if (object == VM_OBJECT_NULL) {
+		return KERN_INVALID_ARGUMENT;
+	}
+
+	vm_object_lock_assert_exclusive(object);
+
+	/*
+	 * Fast path: only the resident count is wanted and the request
+	 * covers exactly the whole object, so the object's own counter
+	 * is the answer.
+	 */
+	if (!want_dirty &&
+	    resident_page_count != NULL &&
+	    offset == 0 &&
+	    size == object->vo_size) {
+		*resident_page_count = object->resident_page_count;
+		return KERN_SUCCESS;
+	}
+
+	if (object->resident_page_count > (size >> PAGE_SHIFT)) {
+		/*
+		 * The object holds more resident pages than the range has
+		 * page slots: look up each offset of the range.
+		 */
+		for (probe_offset = offset;
+		     probe_offset < range_end;
+		     probe_offset += PAGE_SIZE_64) {
+
+			page = vm_page_lookup(object, probe_offset);
+			if (page == VM_PAGE_NULL) {
+				continue;
+			}
+
+			resident++;
+
+			if (want_dirty &&
+			    (page->dirty ||
+			     (page->wpmapped &&
+			      pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(page))))) {
+				dirty++;
+			}
+		}
+	} else {
+		/*
+		 * Otherwise walk the object's resident page list and keep
+		 * the pages that fall inside the range.
+		 */
+		vm_page_queue_iterate(&object->memq, page, vm_page_t, listq) {
+
+			if (page->offset < offset || page->offset >= range_end) {
+				continue;
+			}
+
+			resident++;
+
+			if (want_dirty &&
+			    (page->dirty ||
+			     (page->wpmapped &&
+			      pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(page))))) {
+				dirty++;
+			}
+		}
+	}
+
+	if (resident_page_count != NULL) {
+		*resident_page_count = resident;
+	}
+
+	if (dirty_page_count != NULL) {
+		*dirty_page_count = dirty;
+	}
+
+	return KERN_SUCCESS;
+}
+
+
+#if TASK_SWAPPER
+/*
+ *	vm_object_res_deallocate
+ *
+ *	Drop one residence count on "object".  Each time a count reaches
+ *	zero, that object's pages are deactivated and the walk moves on to
+ *	its shadow, if it has one, to drop a residence count there too.
+ *
+ *	Locking: the caller holds the lock on "object" and still holds it
+ *	on return.  Every shadow visited is locked before the previous
+ *	intermediate object is unlocked; the last object visited is
+ *	unlocked before returning unless it is "object" itself.
+ *
+ *	Called from vm_object_deallocate and when swapping out an object.
+ *
+ *	NOTE: this is written as a loop; it used to be recursive.
+ */
+
+__private_extern__ void
+vm_object_res_deallocate(
+	vm_object_t	object)
+{
+	vm_object_t	cur = object;
+
+	assert(cur->res_count > 0);
+
+	for (;;) {
+		vm_object_t	next;
+
+		if (--cur->res_count != 0) {
+			/* still resident somewhere: nothing more to do */
+			break;
+		}
+
+		assert(cur->ref_count >= cur->res_count);
+		vm_object_deactivate_all_pages(cur);
+
+		next = cur->shadow;
+		if (next == VM_OBJECT_NULL) {
+			/* end of the shadow chain */
+			break;
+		}
+
+		/* take the shadow's lock before letting go of ours */
+		vm_object_lock(next);
+		if (cur != object) {
+			vm_object_unlock(cur);
+		}
+		cur = next;
+		assert(cur->res_count > 0);
+	}
+
+	/* the caller's object is never unlocked here */
+	if (cur != object) {
+		vm_object_unlock(cur);
+	}
+}
+
+/*
+ *	vm_object_res_reference
+ *
+ *	Add one residence count to "object".  Each time a count goes from
+ *	zero to one and the object has a shadow, the walk moves on to that
+ *	shadow to add a residence count there too.
+ *
+ *	Locking: the caller holds the lock on "object" and still holds it
+ *	on return.  Every shadow visited is locked before the previous
+ *	intermediate object is unlocked; the last object visited is
+ *	unlocked before returning unless it is "object" itself.
+ *
+ *	Called only from vm_object_reference, and when swapping in a
+ *	vm object, via vm_map_swap.
+ *
+ *	NOTE: this is written as a loop; it used to be recursive.
+ */
+
+__private_extern__ void
+vm_object_res_reference(
+	vm_object_t	object)
+{
+	vm_object_t	cur = object;
+
+	for (;;) {
+		vm_object_t	next;
+
+		if (++cur->res_count != 1) {
+			/* object was already resident: stop here */
+			break;
+		}
+
+		next = cur->shadow;
+		if (next == VM_OBJECT_NULL) {
+			/* end of the shadow chain */
+			break;
+		}
+
+		assert(cur->ref_count >= cur->res_count);
+
+		/* take the shadow's lock before letting go of ours */
+		vm_object_lock(next);
+		if (cur != object) {
+			vm_object_unlock(cur);
+		}
+		cur = next;
+	}
+
+	/* the caller's object is never unlocked here */
+	if (cur != object) {
+		vm_object_unlock(cur);
+	}
+	assert(object->ref_count >= object->res_count);
+}
+#endif /* TASK_SWAPPER */
+
+/*
+ *	vm_object_reference:
+ *
+ *	Take an additional reference on "object".  The object is locked
+ *	for the duration of the update and unlocked again before
+ *	returning.  A NULL object is silently ignored.
+ */
+#if defined(vm_object_reference)
+#undef vm_object_reference
+#endif
+__private_extern__ void
+vm_object_reference(
+	vm_object_t	object)
+{
+	if (object != VM_OBJECT_NULL) {
+		vm_object_lock(object);
+		/* the object must already be referenced by someone */
+		assert(object->ref_count > 0);
+		vm_object_reference_locked(object);
+		vm_object_unlock(object);
+	}
+}
+
+/*
+ * vm_object_transpose
+ *
+ * This routine takes two VM objects of the same size and exchanges
+ * their backing store.
+ * The objects should be "quiesced" via a UPL operation with UPL_SET_IO_WIRE
+ * and UPL_BLOCK_ACCESS if they are referenced anywhere.
+ *
+ * The VM objects must not be locked by caller.
+ *
+ * Parameters:
+ *   object1, object2   the two distinct, non-NULL VM objects to exchange
+ *   transpose_size     must be equal to the "vo_size" of both objects
+ *
+ * Returns:
+ *   KERN_SUCCESS        the resident pages and the pager-related fields
+ *                       of the two objects have been exchanged
+ *   KERN_INVALID_VALUE  the objects are identical or NULL; or either one
+ *                       is not alive, is terminating, has a copy or a
+ *                       shadow, is shadowed, or has a "purgable" state
+ *                       other than VM_PURGABLE_DENY; or the sizes do not
+ *                       all match
+ *
+ * "vm_object_transpose_count" is incremented on every call, whether or
+ * not the transposition succeeded.
+ */
+unsigned int vm_object_transpose_count = 0; /* number of calls, successful or not */
+kern_return_t
+vm_object_transpose(
+ vm_object_t object1,
+ vm_object_t object2,
+ vm_object_size_t transpose_size)
+{
+ vm_object_t tmp_object;
+ kern_return_t retval;
+ boolean_t object1_locked, object2_locked;
+ vm_page_t page;
+ vm_object_offset_t page_offset;
+
+ tmp_object = VM_OBJECT_NULL;
+ object1_locked = FALSE; object2_locked = FALSE;
+
+ if (object1 == object2 ||
+ object1 == VM_OBJECT_NULL ||
+ object2 == VM_OBJECT_NULL) {
+ /*
+ * If the 2 VM objects are the same, there's
+ * no point in exchanging their backing store.
+ * A NULL object is rejected here as well.
+ */
+ retval = KERN_INVALID_VALUE;
+ goto done;
+ }
+
+ /*
+ * Since we need to lock both objects at the same time,
+ * make sure we always lock them in the same order to
+ * avoid deadlocks.
+ * The order used is by increasing address: "object1" and "object2"
+ * are swapped if needed, using "tmp_object" as scratch space.
+ */
+ if (object1 > object2) {
+ tmp_object = object1;
+ object1 = object2;
+ object2 = tmp_object;
+ }
+
+ /*
+ * Allocate a temporary VM object to hold object1's contents
+ * while we copy object2 to object1.
+ */
+ tmp_object = vm_object_allocate(transpose_size);
+ vm_object_lock(tmp_object);
+ tmp_object->can_persist = FALSE;
+
+
+ /*
+ * Grab control of the 1st VM object.
+ */
+ vm_object_lock(object1);
+ object1_locked = TRUE;
+ if (!object1->alive || object1->terminating ||
+ object1->copy || object1->shadow || object1->shadowed ||
+ object1->purgable != VM_PURGABLE_DENY) {
+ /*
+ * We don't deal with copy or shadow objects (yet).
+ */
+ retval = KERN_INVALID_VALUE;
+ goto done;
+ }
+ /*
+ * We're about to mess with the object's backing store and
+ * taking a "paging_in_progress" reference wouldn't be enough
+ * to prevent any paging activity on this object, so the caller should
+ * have "quiesced" the objects beforehand, via a UPL operation with
+ * UPL_SET_IO_WIRE (to make sure all the pages are there and wired)
+ * and UPL_BLOCK_ACCESS (to mark the pages "busy").
+ *
+ * Wait for any paging operation to complete (but only paging, not
+ * other kind of activities not linked to the pager). After we're
+ * satisfied that there's no more paging in progress, we keep the
+ * object locked, to guarantee that no one tries to access its pager.
+ */
+ vm_object_paging_only_wait(object1, THREAD_UNINT);
+
+ /*
+ * Same as above for the 2nd object...
+ */
+ vm_object_lock(object2);
+ object2_locked = TRUE;
+ if (! object2->alive || object2->terminating ||
+ object2->copy || object2->shadow || object2->shadowed ||
+ object2->purgable != VM_PURGABLE_DENY) {
+ retval = KERN_INVALID_VALUE;
+ goto done;
+ }
+ vm_object_paging_only_wait(object2, THREAD_UNINT);
+
+
+ if (object1->vo_size != object2->vo_size ||
+ object1->vo_size != transpose_size) {
+ /*
+ * If the 2 objects don't have the same size, we can't
+ * exchange their backing stores or one would overflow.
+ * If their size doesn't match the caller's
+ * "transpose_size", we can't do it either because the
+ * transpose operation will affect the entire span of
+ * the objects.
+ */
+ retval = KERN_INVALID_VALUE;
+ goto done;
+ }
+
+
+ /*
+ * Transpose the lists of resident pages.
+ * This also updates the resident_page_count and the memq_hint.
+ */
+ if (object1->phys_contiguous || vm_page_queue_empty(&object1->memq)) {
+ /*
+ * No pages in object1, just transfer pages
+ * from object2 to object1. No need to go through
+ * an intermediate object.
+ */
+ while (!vm_page_queue_empty(&object2->memq)) {
+ page = (vm_page_t) vm_page_queue_first(&object2->memq);
+ vm_page_rename(page, object1, page->offset);
+ }
+ assert(vm_page_queue_empty(&object2->memq));
+ } else if (object2->phys_contiguous || vm_page_queue_empty(&object2->memq)) {
+ /*
+ * No pages in object2, just transfer pages
+ * from object1 to object2. No need to go through
+ * an intermediate object.
+ */
+ while (!vm_page_queue_empty(&object1->memq)) {
+ page = (vm_page_t) vm_page_queue_first(&object1->memq);
+ vm_page_rename(page, object2, page->offset);
+ }
+ assert(vm_page_queue_empty(&object1->memq));
+ } else {
+ /*
+ * transfer object1's pages to tmp_object:
+ * each page is removed from object1, its offset (saved
+ * before the removal) is put back, and the page is queued
+ * directly on tmp_object's "memq".
+ */
+ while (!vm_page_queue_empty(&object1->memq)) {
+ page = (vm_page_t) vm_page_queue_first(&object1->memq);
+ page_offset = page->offset;
+ vm_page_remove(page, TRUE);
+ page->offset = page_offset;
+ vm_page_queue_enter(&tmp_object->memq, page, vm_page_t, listq);
+ }
+ assert(vm_page_queue_empty(&object1->memq));
+ /* transfer object2's pages to object1 */
+ while (!vm_page_queue_empty(&object2->memq)) {
+ page = (vm_page_t) vm_page_queue_first(&object2->memq);
+ vm_page_rename(page, object1, page->offset);
+ }
+ assert(vm_page_queue_empty(&object2->memq));
+ /* transfer tmp_object's pages to object2 */
+ while (!vm_page_queue_empty(&tmp_object->memq)) {
+ page = (vm_page_t) vm_page_queue_first(&tmp_object->memq);
+ vm_page_queue_remove(&tmp_object->memq, page,
+ vm_page_t, listq);
+ vm_page_insert(page, object2, page->offset);
+ }
+ assert(vm_page_queue_empty(&tmp_object->memq));
+ }
+
+#define __TRANSPOSE_FIELD(field) \
+MACRO_BEGIN \
+ tmp_object->field = object1->field; \
+ object1->field = object2->field; \
+ object2->field = tmp_object->field; \
+MACRO_END
+
+ /*
+ * The comments below go through the object's fields one by one:
+ * each field is either exchanged with __TRANSPOSE_FIELD(), using
+ * "tmp_object" as the intermediate storage, or left alone with a
+ * note explaining why.
+ */
+ /* "Lock" refers to the object not its contents */
+ /* "size" should be identical */
+ assert(object1->vo_size == object2->vo_size);
+ /* "memq_hint" was updated above when transposing pages */
+ /* "ref_count" refers to the object not its contents */
+ assert(object1->ref_count >= 1);
+ assert(object2->ref_count >= 1);
+#if TASK_SWAPPER
+ /* "res_count" refers to the object not its contents */
+#endif
+ /* "resident_page_count" was updated above when transposing pages */
+ /* "wired_page_count" was updated above when transposing pages */
+ /* "reusable_page_count" was updated above when transposing pages */
+ /* there should be no "copy" */
+ assert(!object1->copy);
+ assert(!object2->copy);
+ /* there should be no "shadow" */
+ assert(!object1->shadow);
+ assert(!object2->shadow);
+ __TRANSPOSE_FIELD(vo_shadow_offset); /* used by phys_contiguous objects */
+ __TRANSPOSE_FIELD(pager);
+ __TRANSPOSE_FIELD(paging_offset);
+ __TRANSPOSE_FIELD(pager_control);
+ /* update the memory_objects' pointers back to the VM objects */
+ if (object1->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
+ memory_object_control_collapse(object1->pager_control,
+ object1);
+ }
+ if (object2->pager_control != MEMORY_OBJECT_CONTROL_NULL) {
+ memory_object_control_collapse(object2->pager_control,
+ object2);
+ }
+ __TRANSPOSE_FIELD(copy_strategy);
+ /* "paging_in_progress" refers to the object not its contents */
+ assert(!object1->paging_in_progress);
+ assert(!object2->paging_in_progress);
+ assert(object1->activity_in_progress);
+ assert(object2->activity_in_progress);
+ /* "all_wanted" refers to the object not its contents */
+ __TRANSPOSE_FIELD(pager_created);
+ __TRANSPOSE_FIELD(pager_initialized);
+ __TRANSPOSE_FIELD(pager_ready);
+ __TRANSPOSE_FIELD(pager_trusted);
+ __TRANSPOSE_FIELD(can_persist);
+ __TRANSPOSE_FIELD(internal);
+ __TRANSPOSE_FIELD(private);
+ __TRANSPOSE_FIELD(pageout);
+ /* "alive" should be set */
+ assert(object1->alive);
+ assert(object2->alive);
+ /* "purgeable" should be non-purgeable */
+ assert(object1->purgable == VM_PURGABLE_DENY);
+ assert(object2->purgable == VM_PURGABLE_DENY);
+ /* "shadowed" refers to the object not its contents */
+ __TRANSPOSE_FIELD(purgeable_when_ripe);
+ __TRANSPOSE_FIELD(true_share);
+ /* "terminating" should not be set */
+ assert(!object1->terminating);
+ assert(!object2->terminating);
+ /*
+ * transfer "named" reference if needed:
+ * when exactly one of the objects is "named", one reference is
+ * moved from that object to the other before the flag is exchanged.
+ */
+ if (object1->named && !object2->named) {
+ assert(object1->ref_count >= 2);
+ assert(object2->ref_count >= 1);
+ object1->ref_count--;
+ object2->ref_count++;
+ } else if (!object1->named && object2->named) {
+ assert(object1->ref_count >= 1);
+ assert(object2->ref_count >= 2);
+ object1->ref_count++;
+ object2->ref_count--;
+ }
+ __TRANSPOSE_FIELD(named);
+ /* "shadow_severed" refers to the object not its contents */
+ __TRANSPOSE_FIELD(phys_contiguous);
+ __TRANSPOSE_FIELD(nophyscache);
+ /* "cached_list.next" points to transposed object */
+ object1->cached_list.next = (queue_entry_t) object2;
+ object2->cached_list.next = (queue_entry_t) object1;
+ /* "cached_list.prev" should be NULL */
+ assert(object1->cached_list.prev == NULL);
+ assert(object2->cached_list.prev == NULL);
+ __TRANSPOSE_FIELD(last_alloc);
+ __TRANSPOSE_FIELD(sequential);
+ __TRANSPOSE_FIELD(pages_created);
+ __TRANSPOSE_FIELD(pages_used);
+ __TRANSPOSE_FIELD(scan_collisions);
+ __TRANSPOSE_FIELD(cow_hint);
+ __TRANSPOSE_FIELD(wimg_bits);
+ __TRANSPOSE_FIELD(set_cache_attr);
+ __TRANSPOSE_FIELD(code_signed);
+ object1->transposed = TRUE;
+ object2->transposed = TRUE;
+ __TRANSPOSE_FIELD(mapping_in_progress);
+ __TRANSPOSE_FIELD(volatile_empty);
+ __TRANSPOSE_FIELD(volatile_fault);
+ __TRANSPOSE_FIELD(all_reusable);
+ assert(object1->blocked_access);
+ assert(object2->blocked_access);
+ assert(object1->__object2_unused_bits == 0);
+ assert(object2->__object2_unused_bits == 0);
+#if UPL_DEBUG
+ /* "uplq" refers to the object not its contents (see upl_transpose()) */
+#endif
+ assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.next == NULL));
+ assert((object1->purgable == VM_PURGABLE_DENY) || (object1->objq.prev == NULL));
+ assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.next == NULL));
+ assert((object2->purgable == VM_PURGABLE_DENY) || (object2->objq.prev == NULL));
+
+#undef __TRANSPOSE_FIELD
+
+ retval = KERN_SUCCESS;
+
+done:
+ /*
+ * Cleanup.
+ * Reached on success and on every failure path; "tmp_object" and
+ * the "objectN_locked" flags tell us what has to be undone.
+ */
+ if (tmp_object != VM_OBJECT_NULL) {
+ vm_object_unlock(tmp_object);
+ /*
+ * Re-initialize the temporary object to avoid
+ * deallocating a real pager.
+ */
+ _vm_object_allocate(transpose_size, tmp_object);
+ vm_object_deallocate(tmp_object);
+ tmp_object = VM_OBJECT_NULL;
+ }
+
+ if (object1_locked) {
+ vm_object_unlock(object1);
+ object1_locked = FALSE;
+ }
+ if (object2_locked) {
+ vm_object_unlock(object2);
+ object2_locked = FALSE;
+ }
+
+ vm_object_transpose_count++; /* counted even when the transposition failed */
+
+ return retval;
+}
+
+
+/*
+ * vm_object_cluster_size
+ *
+ * Determine how big a cluster we should issue an I/O for...
+ *
+ * Inputs:  *start  == offset of page needed
+ *          *length == maximum cluster pager can handle
+ * Outputs: *start  == beginning offset of cluster
+ *          *length == length of cluster to try
+ *          *io_streaming == 1 if the cluster was sized for a sequential
+ *                           access pattern, 0 otherwise
+ *
+ * The original *start will be encompassed by the cluster, and the
+ * cluster returned is always at least one page long.
+ *
+ */
+extern int speculative_reads_disabled; /* when set, vm_object_cluster_size() returns a single-page cluster */
+
+/*
+ * Default upper and lower bounds, in bytes, for the pre-heat cluster
+ * computed by vm_object_cluster_size().
+ *
+ * Try to always keep these values an even multiple of PAGE_SIZE. We use these values
+ * to derive min_ph_size and max_ph_size (note: bytes, not # of pages) and expect those values to
+ * always be page-aligned. The derivation could involve operations (e.g. division)
+ * that could give us non-page-size aligned values if we start out with values that
+ * are odd multiples of PAGE_SIZE.
+ */
+#if CONFIG_EMBEDDED
+ unsigned int preheat_max_bytes = (1024 * 512);
+#else /* CONFIG_EMBEDDED */
+ unsigned int preheat_max_bytes = MAX_UPL_TRANSFER_BYTES;
+#endif /* CONFIG_EMBEDDED */
+unsigned int preheat_min_bytes = (1024 * 32);
+
+
+__private_extern__ void
+vm_object_cluster_size(vm_object_t object, vm_object_offset_t *start,
+ vm_size_t *length, vm_object_fault_info_t fault_info, uint32_t *io_streaming)
+{
+ vm_size_t pre_heat_size;
+ vm_size_t tail_size;
+ vm_size_t head_size;
+ vm_size_t max_length;
+ vm_size_t cluster_size;
+ vm_object_offset_t object_size;
+ vm_object_offset_t orig_start;
+ vm_object_offset_t target_start;
+ vm_object_offset_t offset;
+ vm_behavior_t behavior;
+ boolean_t look_behind = TRUE;
+ boolean_t look_ahead = TRUE;
+ boolean_t isSSD = FALSE;
+ uint32_t throttle_limit;
+ int sequential_run;
+ int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
+ vm_size_t max_ph_size;
+ vm_size_t min_ph_size;
+
+ assert( !(*length & PAGE_MASK));
+ assert( !(*start & PAGE_MASK_64));
+
+ /*
+ * remember maximum length of run requested
+ */
+ max_length = *length;
+ /*
+ * we'll always return a cluster size of at least
+ * 1 page, since the original fault must always
+ * be processed
+ */
+ *length = PAGE_SIZE;
+ *io_streaming = 0;
+
+ if (speculative_reads_disabled || fault_info == NULL) {
+ /*
+ * no cluster... just fault the page in
+ * (the object has not been locked yet, so return directly
+ * instead of going through "out")
+ */
+ return;
+ }
+ orig_start = *start;
+ target_start = orig_start;
+ cluster_size = round_page(fault_info->cluster_size);
+ behavior = fault_info->behavior;
+
+ vm_object_lock(object);
+
+ if (object->pager == MEMORY_OBJECT_NULL)
+ goto out; /* pager is gone for this object, nothing more to do */
+
+ vnode_pager_get_isSSD(object->pager, &isSSD);
+
+ min_ph_size = round_page(preheat_min_bytes);
+ max_ph_size = round_page(preheat_max_bytes);
+
+#if !CONFIG_EMBEDDED
+ if (isSSD) {
+ min_ph_size /= 2;
+ max_ph_size /= 8;
+
+ if (min_ph_size & PAGE_MASK_64) {
+ min_ph_size = trunc_page(min_ph_size);
+ }
+
+ if (max_ph_size & PAGE_MASK_64) {
+ max_ph_size = trunc_page(max_ph_size);
+ }
+ }
+#endif /* !CONFIG_EMBEDDED */
+
+ if (min_ph_size < PAGE_SIZE)
+ min_ph_size = PAGE_SIZE;
+
+ if (max_ph_size < PAGE_SIZE)
+ max_ph_size = PAGE_SIZE;
+ else if (max_ph_size > MAX_UPL_TRANSFER_BYTES)
+ max_ph_size = MAX_UPL_TRANSFER_BYTES;
+
+ if (max_length > max_ph_size)
+ max_length = max_ph_size;
+
+ if (max_length <= PAGE_SIZE)
+ goto out;
+
+ if (object->internal)
+ object_size = object->vo_size;
+ else
+ vnode_pager_get_object_size(object->pager, &object_size);
+
+ object_size = round_page_64(object_size);
+
+ if (orig_start >= object_size) {
+ /*
+ * fault occurred beyond the EOF...
+ * we need to punt w/o changing the
+ * starting offset
+ */
+ goto out;
+ }
+ if (object->pages_used > object->pages_created) {
+ /*
+ * must have wrapped our 32 bit counters
+ * so reset
+ */
+ object->pages_used = object->pages_created = 0;
+ }
+ if ((sequential_run = object->sequential)) {
+ if (sequential_run < 0) {
+ sequential_behavior = VM_BEHAVIOR_RSEQNTL;
+ sequential_run = 0 - sequential_run;
+ } else {
+ sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
+ }
+
+ }
+ switch (behavior) {
+
+ default:
+ behavior = VM_BEHAVIOR_DEFAULT; /* FALLTHROUGH: unknown behaviors are handled as VM_BEHAVIOR_DEFAULT */
+
+ case VM_BEHAVIOR_DEFAULT:
+ if (object->internal && fault_info->user_tag == VM_MEMORY_STACK)
+ goto out;
+
+ if (sequential_run >= (3 * PAGE_SIZE)) {
+ pre_heat_size = sequential_run + PAGE_SIZE;
+
+ if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL)
+ look_behind = FALSE;
+ else
+ look_ahead = FALSE;
+
+ *io_streaming = 1;
+ } else {
+
+ if (object->pages_created < (20 * (min_ph_size >> PAGE_SHIFT))) {
+ /*
+ * prime the pump
+ */
+ pre_heat_size = min_ph_size;
+ } else {
+ /*
+ * Linear growth in PH size: The maximum size is max_length...
+ * this calculation will result in a size that is neither a
+ * power of 2 nor a multiple of PAGE_SIZE... so round
+ * it up to the nearest PAGE_SIZE boundary
+ */
+ pre_heat_size = (max_length * (uint64_t)object->pages_used) / object->pages_created;
+
+ if (pre_heat_size < min_ph_size)
+ pre_heat_size = min_ph_size;
+ else
+ pre_heat_size = round_page(pre_heat_size);
+ }
+ }
+ break;
+
+ case VM_BEHAVIOR_RANDOM:
+ if ((pre_heat_size = cluster_size) <= PAGE_SIZE)
+ goto out;
+ break;
+
+ case VM_BEHAVIOR_SEQUENTIAL:
+ if ((pre_heat_size = cluster_size) == 0)
+ pre_heat_size = sequential_run + PAGE_SIZE;
+ look_behind = FALSE;
+ *io_streaming = 1;
+
+ break;
+
+ case VM_BEHAVIOR_RSEQNTL:
+ if ((pre_heat_size = cluster_size) == 0)
+ pre_heat_size = sequential_run + PAGE_SIZE;
+ look_ahead = FALSE;
+ *io_streaming = 1;
+
+ break;
+
+ }
+ throttle_limit = (uint32_t) max_length;
+ assert(throttle_limit == max_length);
+
+ if (vnode_pager_get_throttle_io_limit(object->pager, &throttle_limit) == KERN_SUCCESS) {
+ if (max_length > throttle_limit)
+ max_length = throttle_limit;
+ }
+ if (pre_heat_size > max_length)
+ pre_heat_size = max_length;
+
+ if (behavior == VM_BEHAVIOR_DEFAULT && (pre_heat_size > min_ph_size)) {
+
+ unsigned int consider_free = vm_page_free_count + vm_page_cleaned_count;
+
+ if (consider_free < vm_page_throttle_limit) {
+ pre_heat_size = trunc_page(pre_heat_size / 16);
+ } else if (consider_free < vm_page_free_target) {
+ pre_heat_size = trunc_page(pre_heat_size / 4);
+ }
+
+ if (pre_heat_size < min_ph_size)
+ pre_heat_size = min_ph_size;
+ }
+ if (look_ahead == TRUE) {
+ if (look_behind == TRUE) {
+ /*
+ * if we get here it's due to a random access...
+ * so we want to center the original fault address
+ * within the cluster we will issue... make sure
+ * to calculate 'head_size' as a multiple of PAGE_SIZE...
+ * 'pre_heat_size' is a multiple of PAGE_SIZE but not
+ * necessarily an even number of pages so we need to truncate
+ * the result to a PAGE_SIZE boundary
+ */
+ head_size = trunc_page(pre_heat_size / 2);
+
+ if (target_start > head_size)
+ target_start -= head_size;
+ else
+ target_start = 0;
+
+ /*
+ * 'target_start' at this point represents the beginning offset
+ * of the cluster we are considering... 'orig_start' will be in
+ * the center of this cluster if we didn't have to clip the start
+ * due to running into the start of the file
+ */
+ }
+ if ((target_start + pre_heat_size) > object_size)
+ pre_heat_size = (vm_size_t)(round_page_64(object_size - target_start));
+ /*
+ * at this point calculate the number of pages beyond the original fault
+ * address that we want to consider... this is guaranteed not to extend beyond
+ * the current EOF...
+ */
+ assert((vm_size_t)(orig_start - target_start) == (orig_start - target_start));
+ tail_size = pre_heat_size - (vm_size_t)(orig_start - target_start) - PAGE_SIZE;
+ } else {
+ if (pre_heat_size > target_start) {
+ /*
+ * since pre_heat_size is always smaller than 2^32,
+ * if it is larger than target_start (a 64 bit value)
+ * it is safe to clip target_start to 32 bits
+ */
+ pre_heat_size = (vm_size_t) target_start;
+ }
+ tail_size = 0;
+ }
+ assert( !(target_start & PAGE_MASK_64));
+ assert( !(pre_heat_size & PAGE_MASK_64));
+
+ if (pre_heat_size <= PAGE_SIZE)
+ goto out;
+
+ if (look_behind == TRUE) {
+ /*
+ * take a look at the pages before the original
+ * faulting offset... recalculate this in case
+ * we had to clip 'pre_heat_size' above to keep
+ * from running past the EOF.
+ */
+ head_size = pre_heat_size - tail_size - PAGE_SIZE;
+
+ for (offset = orig_start - PAGE_SIZE_64; head_size; offset -= PAGE_SIZE_64, head_size -= PAGE_SIZE) {
+ /*
+ * don't poke below the lowest offset
+ */
+ if (offset < fault_info->lo_offset)
+ break;
+ /*
+ * for external objects or internal objects w/o a pager,
+ * VM_COMPRESSOR_PAGER_STATE_GET will return VM_EXTERNAL_STATE_UNKNOWN
+ */
+ if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) {
+ break;
+ }
+ if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
+ /*
+ * don't bridge resident pages
+ */
+ break;
+ }
+ *start = offset;
+ *length += PAGE_SIZE;
+ }
+ }
+ if (look_ahead == TRUE) {
+ for (offset = orig_start + PAGE_SIZE_64; tail_size; offset += PAGE_SIZE_64, tail_size -= PAGE_SIZE) {
+ /*
+ * don't poke above the highest offset
+ */
+ if (offset >= fault_info->hi_offset)
+ break;
+ assert(offset < object_size);
+
+ /*
+ * for external objects or internal objects w/o a pager,
+ * VM_COMPRESSOR_PAGER_STATE_GET will return VM_EXTERNAL_STATE_UNKNOWN
+ */
+ if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset) == VM_EXTERNAL_STATE_ABSENT) {
+ break;
+ }
+ if (vm_page_lookup(object, offset) != VM_PAGE_NULL) {
+ /*
+ * don't bridge resident pages
+ */
+ break;
+ }
+ *length += PAGE_SIZE;
+ }
+ }
+out: /* every path reaching this label holds the object lock */
+ if (*length > max_length)
+ *length = max_length;
+
+ vm_object_unlock(object);
+
+ DTRACE_VM1(clustersize, vm_size_t, *length);
+}
+
+
+/*
+ *	vm_object_page_op
+ *
+ *	Query and/or change the state of the single page of "object" found
+ *	at "offset".  This belongs to the UPL family of operations but acts
+ *	on the VM object directly instead of going through a UPL.
+ *
+ *	"ops" is a mask of UPL_POP_* bits:
+ *	  UPL_POP_PHYSICAL	only report the first physical page number of
+ *				a physically contiguous object
+ *	  UPL_POP_DUMP		disconnect the page from any pmap and free it
+ *	  UPL_POP_SET / _CLR	set / clear the page attributes selected by
+ *				UPL_POP_DIRTY, _PAGEOUT, _PRECIOUS, _ABSENT
+ *				and _BUSY
+ *
+ *	If "flags" is not NULL, it receives the page's UPL_POP_* state as it
+ *	was before any of the requested changes.  If "phys_entry" is not
+ *	NULL, it receives the page's physical page number.
+ *
+ *	The object is locked on entry to this routine and unlocked again
+ *	before it returns.
+ *
+ *	Returns:
+ *	  KERN_SUCCESS		the operation was carried out
+ *	  KERN_INVALID_OBJECT	UPL_POP_PHYSICAL on an object that is not
+ *				physically contiguous, or any other request
+ *				on one that is
+ *	  KERN_FAILURE		no page is resident at "offset"
+ */
+
+kern_return_t
+vm_object_page_op(
+	vm_object_t		object,
+	vm_object_offset_t	offset,
+	int			ops,
+	ppnum_t			*phys_entry,
+	int			*flags)
+{
+	vm_page_t	page;
+
+	vm_object_lock(object);
+
+	if (ops & UPL_POP_PHYSICAL) {
+		kern_return_t	kr = KERN_INVALID_OBJECT;
+
+		if (object->phys_contiguous) {
+			if (phys_entry != NULL) {
+				*phys_entry = (ppnum_t)
+					(object->vo_shadow_offset >> PAGE_SHIFT);
+			}
+			kr = KERN_SUCCESS;
+		}
+		vm_object_unlock(object);
+		return kr;
+	}
+
+	/* page-level operations make no sense on a contiguous object */
+	if (object->phys_contiguous) {
+		vm_object_unlock(object);
+		return KERN_INVALID_OBJECT;
+	}
+
+	/*
+	 * Find the page.  If the request needs to own the busy bit
+	 * (setting "busy", or dumping the page) and someone else is
+	 * already working on the page, sleep and look it up again.
+	 */
+	for (;;) {
+		page = vm_page_lookup(object, offset);
+		if (page == VM_PAGE_NULL) {
+			vm_object_unlock(object);
+			return KERN_FAILURE;
+		}
+
+		if ((page->busy || page->cleaning) &&
+		    ((ops & UPL_POP_DUMP) ||
+		     ((ops & UPL_POP_SET) && (ops & UPL_POP_BUSY)))) {
+			PAGE_SLEEP(object, page, THREAD_UNINT);
+			continue;
+		}
+		break;
+	}
+
+	if (ops & UPL_POP_DUMP) {
+		if (page->pmapped == TRUE) {
+			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
+		}
+
+		VM_PAGE_FREE(page);
+
+		vm_object_unlock(object);
+		return KERN_SUCCESS;
+	}
+
+	if (flags != NULL) {
+		/* snapshot the page state before applying the requested ops */
+		int	state = 0;
+
+		if (page->dirty) {
+			state |= UPL_POP_DIRTY;
+		}
+		if (page->free_when_done) {
+			state |= UPL_POP_PAGEOUT;
+		}
+		if (page->precious) {
+			state |= UPL_POP_PRECIOUS;
+		}
+		if (page->absent) {
+			state |= UPL_POP_ABSENT;
+		}
+		if (page->busy) {
+			state |= UPL_POP_BUSY;
+		}
+		*flags = state;
+	}
+
+	if (ops & UPL_POP_SET) {
+		/*
+		 * The caller is expected to set UPL_POP_BUSY with, or
+		 * before, this call.  The assertion cannot catch every
+		 * violation of that convention, since the page might
+		 * happen to be busy for some other reason, but repeated
+		 * violations will trip it sooner or later.
+		 */
+		assert(page->busy || (ops & UPL_POP_BUSY));
+		if (ops & UPL_POP_DIRTY) {
+			SET_PAGE_DIRTY(page, FALSE);
+		}
+		if (ops & UPL_POP_PAGEOUT) {
+			page->free_when_done = TRUE;
+		}
+		if (ops & UPL_POP_PRECIOUS) {
+			page->precious = TRUE;
+		}
+		if (ops & UPL_POP_ABSENT) {
+			page->absent = TRUE;
+		}
+		if (ops & UPL_POP_BUSY) {
+			page->busy = TRUE;
+		}
+	}
+
+	if (ops & UPL_POP_CLR) {
+		assert(page->busy);
+		if (ops & UPL_POP_DIRTY) {
+			page->dirty = FALSE;
+		}
+		if (ops & UPL_POP_PAGEOUT) {
+			page->free_when_done = FALSE;
+		}
+		if (ops & UPL_POP_PRECIOUS) {
+			page->precious = FALSE;
+		}
+		if (ops & UPL_POP_ABSENT) {
+			page->absent = FALSE;
+		}
+		if (ops & UPL_POP_BUSY) {
+			page->busy = FALSE;
+			PAGE_WAKEUP(page);
+		}
+	}
+
+	if (phys_entry != NULL) {
+		/*
+		 * The physical page number stays valid only for as
+		 * long as the page is kept busy.
+		 */
+		assert(page->busy);
+		*phys_entry = VM_PAGE_GET_PHYS_PAGE(page);
+	}
+
+	vm_object_unlock(object);
+	return KERN_SUCCESS;
+
+}
+
+/*
+ * vm_object_range_op
+ *
+ * Multi-page companion to vm_object_page_op for callers that do not need
+ * per-page state handed back, which lets them avoid the cost of building
+ * a UPL.  The pages of "object" are visited from the page containing
+ * offset_beg up to (but not including) offset_end:
+ *
+ *   UPL_ROP_DUMP     every resident page is freed; a busy or cleaning
+ *                    page is waited for and then looked up again.
+ *   UPL_ROP_ABSENT   (only when UPL_ROP_DUMP is clear) stop at the first
+ *                    resident page that is not absent, or that is busy.
+ *   UPL_ROP_PRESENT  stop at the first offset with no resident page.
+ *
+ * If "range" is not NULL it receives the number of bytes between
+ * offset_beg and the offset at which the walk stopped, never more than
+ * offset_end - offset_beg.
+ *
+ * Returns KERN_INVALID_ARGUMENT if the span does not fit in 32 bits,
+ * KERN_INVALID_OBJECT for a physically contiguous object (in which case
+ * "*range" is left untouched), and KERN_SUCCESS otherwise.
+ */
+
+kern_return_t
+vm_object_range_op(
+	vm_object_t		object,
+	vm_object_offset_t	offset_beg,
+	vm_object_offset_t	offset_end,
+	int			ops,
+	uint32_t		*range)
+{
+	vm_object_offset_t	cur_offset;
+	vm_page_t		pg;
+
+	/* the result has to be representable in "*range" */
+	if (offset_end - offset_beg > (uint32_t) -1) {
+		return KERN_INVALID_ARGUMENT;
+	}
+
+	/*
+	 * Shortcut taken without the object lock: with no resident pages
+	 * there is nothing to walk.
+	 */
+	if (object->resident_page_count == 0) {
+		if (range != NULL) {
+			if (ops & UPL_ROP_PRESENT) {
+				*range = 0;
+			} else {
+				*range = (uint32_t) (offset_end - offset_beg);
+				assert(*range == (offset_end - offset_beg));
+			}
+		}
+		return KERN_SUCCESS;
+	}
+
+	vm_object_lock(object);
+
+	if (object->phys_contiguous) {
+		vm_object_unlock(object);
+		return KERN_INVALID_OBJECT;
+	}
+
+	/* start on the page boundary at or below offset_beg */
+	cur_offset = offset_beg & ~PAGE_MASK_64;
+
+	while (cur_offset < offset_end) {
+		pg = vm_page_lookup(object, cur_offset);
+
+		if (pg == VM_PAGE_NULL) {
+			if (ops & UPL_ROP_PRESENT) {
+				break;
+			}
+		} else if (ops & UPL_ROP_DUMP) {
+			if (pg->busy || pg->cleaning) {
+				/*
+				 * Somebody else owns the page right now.
+				 * Sleep, then retry this same offset: the
+				 * page's state (even its owning object)
+				 * may have changed while we slept, so it
+				 * must be looked up again.
+				 */
+				PAGE_SLEEP(object, pg, THREAD_UNINT);
+				continue;
+			}
+			if (pg->laundry) {
+				vm_pageout_steal_laundry(pg, FALSE);
+			}
+			if (pg->pmapped == TRUE) {
+				pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(pg));
+			}
+			VM_PAGE_FREE(pg);
+		} else if ((ops & UPL_ROP_ABSENT) &&
+		    (!pg->absent || pg->busy)) {
+			break;
+		}
+
+		cur_offset += PAGE_SIZE;
+	}
+	vm_object_unlock(object);
+
+	if (range != NULL) {
+		/* clamp the stopping point into [offset_beg, offset_end] */
+		if (cur_offset > offset_end) {
+			cur_offset = offset_end;
+		}
+		if (cur_offset > offset_beg) {
+			*range = (uint32_t) (cur_offset - offset_beg);
+			assert(*range == (cur_offset - offset_beg));
+		} else {
+			*range = 0;
+		}
+	}
+	return KERN_SUCCESS;
+}
+
+/*
+ * Point a pager directly at a range of memory (the pager may be backed by
+ * a non-device vnode).  "base_vaddr" is a kernel virtual address that is
+ * expected to start a physically contiguous range of "size" bytes, which
+ * is installed in the pager's VM object at "offset".
+ *
+ * The object is marked private before it is populated; if populating
+ * fails, the previous value of the private flag is put back.
+ *
+ * Returns KERN_FAILURE when base_vaddr has no physical page or the
+ * control has no VM object, otherwise the result of
+ * vm_object_populate_with_private().
+ */
+kern_return_t pager_map_to_phys_contiguous(
+	memory_object_control_t	object,
+	memory_object_offset_t	offset,
+	addr64_t		base_vaddr,
+	vm_size_t		size)
+{
+	ppnum_t		first_page;
+	boolean_t	was_private;
+	kern_return_t	kr;
+	vm_object_t	vmobj;
+
+	first_page = pmap_find_phys(kernel_pmap, base_vaddr);
+	if (first_page == 0) {
+		return KERN_FAILURE;
+	}
+
+	vmobj = memory_object_control_to_vm_object(object);
+	if (!vmobj) {
+		return KERN_FAILURE;
+	}
+
+	/* remember the old flag so a failed populate can be undone */
+	was_private = vmobj->private;
+	if (vmobj->private != TRUE) {
+		vm_object_lock(vmobj);
+		vmobj->private = TRUE;
+		vm_object_unlock(vmobj);
+	}
+
+	kr = vm_object_populate_with_private(vmobj, offset, first_page, size);
+
+	if (kr != KERN_SUCCESS && vmobj->private != was_private) {
+		vm_object_lock(vmobj);
+		vmobj->private = was_private;
+		vm_object_unlock(vmobj);
+	}
+
+	return kr;
+}
+
+/*
+ * Counts lock requests that targeted the object currently recorded in
+ * vm_pageout_scan_wants_object.
+ */
+uint32_t scan_object_collision = 0;
+
+/*
+ * Take the object lock exclusively.  If the object is the one recorded
+ * in vm_pageout_scan_wants_object, count the collision and pause briefly
+ * before competing for the lock.  On DEVELOPMENT/DEBUG kernels the
+ * calling thread is recorded as the lock owner.
+ */
+void
+vm_object_lock(vm_object_t object)
+{
+	if (vm_pageout_scan_wants_object == object) {
+		scan_object_collision += 1;
+		mutex_pause(2);
+	}
+
+	lck_rw_lock_exclusive(&object->Lock);
+
+#if DEVELOPMENT || DEBUG
+	object->Lock_owner = current_thread();
+#endif
+}
+
+/*
+ * Report whether "object" is the one recorded in
+ * vm_pageout_scan_wants_object.  A match is counted in
+ * scan_object_collision and TRUE is returned; otherwise FALSE.
+ */
+boolean_t
+vm_object_lock_avoid(vm_object_t object)
+{
+	if (object != vm_pageout_scan_wants_object) {
+		return FALSE;
+	}
+
+	scan_object_collision++;
+	return TRUE;
+}
+
+/*
+ * Attempt to take the object lock exclusively without blocking.
+ * Returns the result of lck_rw_try_lock_exclusive(); on
+ * DEVELOPMENT/DEBUG kernels a successful attempt also records the
+ * calling thread as the lock owner.
+ */
+boolean_t
+_vm_object_lock_try(vm_object_t object)
+{
+	boolean_t	acquired = lck_rw_try_lock_exclusive(&object->Lock);
+
+#if DEVELOPMENT || DEBUG
+	if (acquired == TRUE) {
+		object->Lock_owner = current_thread();
+	}
+#endif
+	return acquired;
+}
+
+/*
+ * Try-lock variant that first backs off when vm_object_lock_avoid()
+ * reports a collision.  This is called from the hibernate path, so the
+ * pause is only taken when interrupts are enabled and the preemption
+ * level is zero.
+ */
+boolean_t
+vm_object_lock_try(vm_object_t object)
+{
+	if (vm_object_lock_avoid(object)) {
+		/* only pause in a context where blocking is allowed */
+		if (ml_get_interrupts_enabled() && get_preemption_level() == 0) {
+			mutex_pause(2);
+		}
+	}
+
+	return _vm_object_lock_try(object);
+}
+
+/*
+ * Take the object lock shared, pausing briefly first when
+ * vm_object_lock_avoid() reports a collision.
+ */
+void
+vm_object_lock_shared(vm_object_t object)
+{
+	boolean_t	back_off = vm_object_lock_avoid(object);
+
+	if (back_off) {
+		mutex_pause(2);
+	}
+
+	lck_rw_lock_shared(&object->Lock);
+}
+
+/*
+ * Offer to yield a shared hold on the object lock.  The caller must
+ * hold the lock shared (asserted).  A yield is forced when
+ * vm_object_lock_avoid() reports a collision.  Returns whatever
+ * lck_rw_lock_yield_shared() returns.
+ */
+boolean_t
+vm_object_lock_yield_shared(vm_object_t object)
+{
+	boolean_t	force_yield;
+
+	vm_object_lock_assert_shared(object);
+
+	force_yield = vm_object_lock_avoid(object);
+
+	return lck_rw_lock_yield_shared(&object->Lock, force_yield);
+}
+
+/*
+ * Attempt to take the object lock shared without blocking on the lock
+ * itself.  A brief pause is taken first when vm_object_lock_avoid()
+ * reports a collision.  Returns the result of lck_rw_try_lock_shared().
+ */
+boolean_t
+vm_object_lock_try_shared(vm_object_t object)
+{
+	boolean_t	got_shared;
+
+	if (vm_object_lock_avoid(object)) {
+		mutex_pause(2);
+	}
+
+	got_shared = lck_rw_try_lock_shared(&object->Lock);
+	return got_shared;
+}
+
+/*
+ * Try to convert a shared hold on the object lock into an exclusive one.
+ * Returns the result of lck_rw_lock_shared_to_exclusive(); on
+ * DEVELOPMENT/DEBUG kernels a successful upgrade records the calling
+ * thread as the lock owner.
+ */
+boolean_t
+vm_object_lock_upgrade(vm_object_t object)
+{
+	boolean_t	upgraded = lck_rw_lock_shared_to_exclusive(&object->Lock);
+
+#if DEVELOPMENT || DEBUG
+	if (upgraded == TRUE) {
+		object->Lock_owner = current_thread();
+	}
+#endif
+	return upgraded;
+}
+
+/*
+ * Release the object lock.  On DEVELOPMENT/DEBUG kernels, when an owner
+ * is recorded it must be the calling thread (panic otherwise) and the
+ * record is cleared before the lock is dropped.
+ */
+void
+vm_object_unlock(vm_object_t object)
+{
+#if DEVELOPMENT || DEBUG
+	if (object->Lock_owner) {
+		if (object->Lock_owner != current_thread()) {
+			panic("vm_object_unlock: not owner - %p\n", object);
+		}
+		object->Lock_owner = 0;
+	}
+#endif
+
+	lck_rw_done(&object->Lock);
+}
+
+
+/* Number of calls made to vm_object_change_wimg_mode(). */
+unsigned int vm_object_change_wimg_mode_count = 0;
+
+/*
+ * Apply a new cache (WIMG) mode to an object.
+ *
+ * The object must be locked exclusively by the caller (asserted).  After
+ * waiting for paging activity on the object, the cache attributes of
+ * every non-fictitious page on the object's memq are set to "wimg_mode",
+ * and the mode is recorded in the object.  set_cache_attr is cleared
+ * only when the mode is VM_WIMG_USE_DEFAULT.
+ */
+void
+vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode)
+{
+	vm_page_t	pg;
+
+	vm_object_lock_assert_exclusive(object);
+
+	vm_object_paging_wait(object, THREAD_UNINT);
+
+	vm_page_queue_iterate(&object->memq, pg, vm_page_t, listq) {
+		if (pg->fictitious) {
+			continue;
+		}
+		pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(pg), wimg_mode);
+	}
+
+	object->set_cache_attr = (wimg_mode == VM_WIMG_USE_DEFAULT) ? FALSE : TRUE;
+	object->wimg_bits = wimg_mode;
+
+	vm_object_change_wimg_mode_count++;
+}
+
+#if CONFIG_FREEZE
+
+/*
+ * This routine does the "relocation" of previously
+ * compressed pages belonging to this object that are
+ * residing in a number of compressed segments into
+ * a set of compressed segments dedicated to hold
+ * compressed pages belonging to this object.
+ */
+
+extern void *freezer_chead;
+extern char *freezer_compressor_scratch_buf;
+extern int c_freezer_compression_count;
+extern AbsoluteTime c_freezer_last_yield_ts;
+
+#define MAX_FREE_BATCH 32 /* the freezer's local free list is flushed once it holds this many pages */
+#define FREEZER_DUTY_CYCLE_ON_MS 5 /* c_freezer_should_yield() reports true after more than this many ms since the last yield */
+#define FREEZER_DUTY_CYCLE_OFF_MS 5 /* value handed to thread_yield_internal() when the freezer yields */
+
+static int c_freezer_should_yield(void);
+
+
+/*
+ * Decide whether the freezer has been running long enough to yield.
+ * Returns 1 when more than FREEZER_DUTY_CYCLE_ON_MS milliseconds have
+ * passed since c_freezer_last_yield_ts, 0 otherwise.  The timestamp must
+ * already be set (asserted).
+ */
+static int
+c_freezer_should_yield(void)
+{
+	AbsoluteTime	elapsed;
+	uint64_t	elapsed_ns;
+
+	assert(c_freezer_last_yield_ts);
+
+	/* elapsed = current uptime - time of the last yield */
+	clock_get_uptime(&elapsed);
+	SUB_ABSOLUTETIME(&elapsed, &c_freezer_last_yield_ts);
+	absolutetime_to_nanoseconds(elapsed, &elapsed_ns);
+
+	return (elapsed_ns > 1000 * 1000 * FREEZER_DUTY_CYCLE_ON_MS) ? 1 : 0;
+}
+
+
+/*
+ * Tell the compressor that nothing more will be added through
+ * "freezer_chead" by calling vm_compressor_finished_filling() on it.
+ */
+void
+vm_object_compressed_freezer_done(void)
+{
+	vm_compressor_finished_filling(&freezer_chead);
+}
+
+/*
+ * vm_object_compressed_freezer_pageout:
+ *
+ * Freezer pass over one internal object (asserted).  Resident pages that
+ * are eligible are either freed outright (clean and not precious) or
+ * handed to vm_pageout_compress_page() using "freezer_chead".  Pages that
+ * were compressed successfully are collected on a local list and freed in
+ * batches of MAX_FREE_BATCH.  When VM_CONFIG_FREEZER_SWAP_IS_ACTIVE, any
+ * pages of the object that were already compressed are first relocated
+ * via vm_compressor_pager_relocate().
+ *
+ * The routine takes the object lock itself and drops it around every
+ * call into the compressor; the object is unlocked on return.
+ */
+void
+vm_object_compressed_freezer_pageout(
+ vm_object_t object)
+{
+ vm_page_t p;
+ vm_page_t local_freeq = NULL; /* compressed pages awaiting vm_page_free_list(), chained via snext */
+ int local_freed = 0; /* number of pages currently on local_freeq */
+ kern_return_t retval = KERN_SUCCESS;
+ int obj_resident_page_count_snapshot = 0;
+
+ assert(object != VM_OBJECT_NULL);
+ assert(object->internal);
+
+ vm_object_lock(object);
+ /*
+ * A pager is required.  If none has been set up yet, try a collapse
+ * first and then create a compressor pager; give up if the object still
+ * has no usable pager afterwards.
+ */
+ if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
+
+ if (!object->pager_initialized) {
+
+ vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
+
+ if (!object->pager_initialized)
+ vm_object_compressor_pager_create(object);
+ }
+
+ if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
+ vm_object_unlock(object);
+ return;
+ }
+ }
+
+ if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+ vm_object_offset_t curr_offset = 0;
+
+ /*
+ * Go through the object and make sure that any
+ * previously compressed pages are relocated into
+ * a compressed segment associated with our "freezer_chead".
+ */
+ while (curr_offset < object->vo_size) {
+
+ curr_offset = vm_compressor_pager_next_compressed(object->pager, curr_offset);
+
+ if (curr_offset == (vm_object_offset_t) -1)
+ break;
+
+ retval = vm_compressor_pager_relocate(object->pager, curr_offset, &freezer_chead);
+
+ if (retval != KERN_SUCCESS)
+ break;
+
+ curr_offset += PAGE_SIZE_64;
+ }
+ }
+
+ /*
+ * We can't hold the object lock while heading down into the compressed pager
+ * layer because we might need the kernel map lock down there to allocate new
+ * compressor data structures. And if this same object is mapped in the kernel
+ * and there's a fault on it, then that thread will want the object lock while
+ * holding the kernel map lock.
+ *
+ * Since we are going to drop/grab the object lock repeatedly, we must make sure
+ * we won't be stuck in an infinite loop if the same page(s) keep getting
+ * decompressed. So we grab a snapshot of the number of pages in the object and
+ * we won't process any more than that number of pages.
+ */
+
+ obj_resident_page_count_snapshot = object->resident_page_count;
+ /*
+ * Hold an activity reference for the whole scan; it is released by the
+ * vm_object_activity_end() call that follows the loop.
+ */
+ vm_object_activity_begin(object);
+
+ while ((obj_resident_page_count_snapshot--) && !vm_page_queue_empty(&object->memq)) {
+ /*
+ * Always work on the page at the head of memq.  A page that is skipped
+ * or handed to the compressor is moved to the tail first, so the next
+ * iteration sees a different head.
+ */
+ p = (vm_page_t)vm_page_queue_first(&object->memq);
+
+ KERNEL_DEBUG(0xe0430004 | DBG_FUNC_START, object, local_freed, 0, 0, 0);
+
+ vm_page_lockspin_queues();
+
+ if (p->cleaning || p->fictitious || p->busy || p->absent || p->unusual || p->error || VM_PAGE_WIRED(p)) {
+
+ vm_page_unlock_queues();
+
+ KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 1, 0, 0);
+
+ vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
+ vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+
+ continue;
+ }
+ /*
+ * Disconnect the page from the pmap layer.  If pmap reports it as
+ * modified, mark the page dirty so that it is compressed rather than
+ * freed by the clean-page check below.
+ */
+ if (p->pmapped == TRUE) {
+ int refmod_state, pmap_flags;
+
+ if (p->dirty || p->precious) {
+ pmap_flags = PMAP_OPTIONS_COMPRESSOR;
+ } else {
+ pmap_flags = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
+ }
+
+ refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p), pmap_flags, NULL);
+ if (refmod_state & VM_MEM_MODIFIED) {
+ SET_PAGE_DIRTY(p, FALSE);
+ }
+ }
+
+ if (p->dirty == FALSE && p->precious == FALSE) {
+ /*
+ * Clean and non-precious page.
+ */
+ vm_page_unlock_queues();
+ VM_PAGE_FREE(p);
+
+ KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 2, 0, 0);
+ continue;
+ }
+
+ if (p->laundry)
+ vm_pageout_steal_laundry(p, TRUE);
+
+ vm_page_queues_remove(p, TRUE);
+
+ vm_page_unlock_queues();
+
+
+ /*
+ * In case the compressor fails to compress this page, we need it at
+ * the back of the object memq so that we don't keep trying to process it.
+ * Make the move here while we have the object lock held.
+ */
+
+ vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
+ vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+
+ /*
+ * Grab an activity_in_progress here for vm_pageout_compress_page() to consume.
+ *
+ * Mark the page busy so no one messes with it while we have the object lock dropped.
+ */
+
+ p->busy = TRUE;
+
+ vm_object_activity_begin(object);
+
+ vm_object_unlock(object);
+
+ /*
+ * arg3 == FALSE tells vm_pageout_compress_page that we don't hold the object lock and the pager may not be initialized.
+ */
+ if (vm_pageout_compress_page(&freezer_chead, freezer_compressor_scratch_buf, p, FALSE) == KERN_SUCCESS) {
+ /*
+ * page has already been un-tabled from the object via 'vm_page_remove'
+ */
+ p->snext = local_freeq;
+ local_freeq = p;
+ local_freed++;
+
+ if (local_freed >= MAX_FREE_BATCH) {
+
+ vm_page_free_list(local_freeq, TRUE);
+
+ local_freeq = NULL;
+ local_freed = 0;
+ }
+ c_freezer_compression_count++;
+ }
+ KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 0, 0, 0);
+ /*
+ * Only consider yielding while local_freeq is empty, i.e. nothing has
+ * been compressed yet or the batch has just been flushed.
+ */
+ if (local_freed == 0 && c_freezer_should_yield()) {
+
+ thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS);
+ clock_get_uptime(&c_freezer_last_yield_ts);
+ }
+
+ vm_object_lock(object);
+ }
+ /* Free whatever remains of the last, partial batch. */
+ if (local_freeq) {
+ vm_page_free_list(local_freeq, TRUE);
+
+ local_freeq = NULL;
+ local_freed = 0;
+ }
+
+ vm_object_activity_end(object);
+
+ vm_object_unlock(object);
+
+ if (c_freezer_should_yield()) {
+
+ thread_yield_internal(FREEZER_DUTY_CYCLE_OFF_MS);
+ clock_get_uptime(&c_freezer_last_yield_ts);
+ }
+}
+
+#endif /* CONFIG_FREEZE */
+
+/*
+ * vm_object_pageout:
+ *
+ * Walk the resident pages of "object" and pass every eligible dirty or
+ * precious page to vm_pageout_cluster(); eligible pages that are clean
+ * and not precious are freed instead.
+ *
+ * Nothing is done when VM_CONFIG_COMPRESSOR_IS_PRESENT is false, when
+ * the object is not internal, is terminating or is not alive, or when no
+ * pager can be set up for it.  The walk stops early once
+ * vm_compressor_low_on_space() reports true.
+ *
+ * The routine takes the object lock itself; the object is unlocked on
+ * return.
+ */
+void
+vm_object_pageout(
+ vm_object_t object)
+{
+ vm_page_t p, next;
+ struct vm_pageout_queue *iq;
+
+ if (!VM_CONFIG_COMPRESSOR_IS_PRESENT)
+ return;
+
+ iq = &vm_pageout_queue_internal;
+
+ assert(object != VM_OBJECT_NULL );
+
+ vm_object_lock(object);
+
+ if (!object->internal ||
+ object->terminating ||
+ !object->alive) {
+ vm_object_unlock(object);
+ return;
+ }
+ /*
+ * A pager is required.  If none has been set up yet, try a collapse
+ * first and then create a compressor pager; give up if the object still
+ * has no usable pager afterwards.
+ */
+ if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
+
+ if (!object->pager_initialized) {
+
+ vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
+
+ if (!object->pager_initialized)
+ vm_object_compressor_pager_create(object);
+ }
+
+ if (!object->pager_initialized || object->pager == MEMORY_OBJECT_NULL) {
+ vm_object_unlock(object);
+ return;
+ }
+ }
+ /*
+ * Restart point: the object lock is dropped while waiting on a throttled
+ * queue below, so the walk begins again from the head of memq.
+ */
+ReScan:
+ next = (vm_page_t)vm_page_queue_first(&object->memq);
+
+ while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) {
+ p = next;
+ next = (vm_page_t)vm_page_queue_next(&next->listq); /* fetched before p is processed; p may be freed below */
+
+ assert(p->vm_page_q_state != VM_PAGE_ON_FREE_Q);
+
+ if ((p->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) ||
+ p->cleaning ||
+ p->laundry ||
+ p->busy ||
+ p->absent ||
+ p->error ||
+ p->fictitious ||
+ VM_PAGE_WIRED(p)) {
+ /*
+ * Page is already being cleaned or can't be cleaned.
+ */
+ continue;
+ }
+
+ /* Throw to the pageout queue */
+
+ vm_page_lockspin_queues();
+
+ if (vm_compressor_low_on_space()) {
+ vm_page_unlock_queues();
+ break;
+ }
+ /*
+ * The internal pageout queue is throttled: set up a wait on it, drop the
+ * page queues lock and the object lock, block, then retake the object
+ * lock and restart the walk.
+ */
+ if (VM_PAGE_Q_THROTTLED(iq)) {
+
+ iq->pgo_draining = TRUE;
+
+ assert_wait((event_t) (&iq->pgo_laundry + 1),
+ THREAD_INTERRUPTIBLE);
+ vm_page_unlock_queues();
+ vm_object_unlock(object);
+
+ thread_block(THREAD_CONTINUE_NULL);
+
+ vm_object_lock(object);
+ goto ReScan;
+ }
+ /* Sanity checks on the page state before it is handed off. */
+ assert(!p->fictitious);
+ assert(!p->busy);
+ assert(!p->absent);
+ assert(!p->unusual);
+ assert(!p->error);
+ assert(!VM_PAGE_WIRED(p));
+ assert(!p->cleaning);
+
+ if (p->pmapped == TRUE) {
+ int refmod_state;
+ int pmap_options;
+
+ /*
+ * Tell pmap the page should be accounted
+ * for as "compressed" if it's been modified.
+ */
+ pmap_options =
+ PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
+ if (p->dirty || p->precious) {
+ /*
+ * We already know it's been modified,
+ * so tell pmap to account for it
+ * as "compressed".
+ */
+ pmap_options = PMAP_OPTIONS_COMPRESSOR;
+ }
+ refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(p),
+ pmap_options,
+ NULL);
+ if (refmod_state & VM_MEM_MODIFIED) {
+ SET_PAGE_DIRTY(p, FALSE);
+ }
+ }
+ /* Clean and not precious: there is nothing to preserve, so free the page. */
+ if (!p->dirty && !p->precious) {
+ vm_page_unlock_queues();
+ VM_PAGE_FREE(p);
+ continue;
+ }
+ vm_page_queues_remove(p, TRUE);
+
+ vm_pageout_cluster(p);
+
+ vm_page_unlock_queues();
+ }
+ vm_object_unlock(object);
+}
+
+
+#if CONFIG_IOSCHED
+/*
+ * Queue a request to reprioritize the I/O covering "len" at block
+ * "blkno" on the device vnode behind the object's pager, and wake the
+ * reprioritize thread.
+ *
+ * The request is dropped silently if the device vnode cannot be found
+ * or if no request structure can be allocated without blocking.
+ */
+void
+vm_page_request_reprioritize(vm_object_t o, uint64_t blkno, uint32_t len, int prio)
+{
+	io_reprioritize_req_t	request;
+	struct vnode		*dev_vnode = NULL;
+
+	if (vnode_pager_get_object_devvp(o->pager, (uintptr_t *)&dev_vnode) != KERN_SUCCESS) {
+		return;
+	}
+
+	/*
+	 * The object lock is held here, so use the non-blocking allocator:
+	 * blocking for memory could deadlock under low-memory conditions.
+	 */
+	request = (io_reprioritize_req_t)zalloc_noblock(io_reprioritize_req_zone);
+	if (request == NULL) {
+		return;
+	}
+
+	request->blkno = blkno;
+	request->len = len;
+	request->priority = prio;
+	request->devvp = dev_vnode;
+
+	/* append to the list serviced by the reprioritize thread */
+	IO_REPRIORITIZE_LIST_LOCK();
+	queue_enter(&io_reprioritize_list, request, io_reprioritize_req_t, io_reprioritize_list);
+	IO_REPRIORITIZE_LIST_UNLOCK();
+
+	/* and let that thread know there is work */
+	IO_REPRIO_THREAD_WAKEUP();
+}
+
+/*
+ * Expedite the real I/O that backs a decompression UPL.
+ *
+ * Nothing is done unless "upl" is tracked by its object and supports
+ * expedite.  The reprioritization info of upl->decmp_io_upl is copied
+ * out under the upl lock, and one reprioritize request at priority
+ * "prio" is queued for every entry with a non-zero block number and
+ * length.  The routine gives up quietly if the upl lock cannot be taken
+ * without blocking, if the real I/O upl has gone away or changed, if
+ * the copy buffer cannot be allocated, or if the device vnode cannot be
+ * found.
+ */
+void
+vm_decmp_upl_reprioritize(upl_t upl, int prio)
+{
+	int			cursor;
+	vm_object_t		object;
+	io_reprioritize_req_t	request;
+	struct vnode		*dev_vnode = NULL;
+	uint64_t		blkno;
+	uint32_t		len;
+	upl_t			io_upl;
+	uint64_t		*info;
+	int			io_upl_size;
+	boolean_t		have_devvp;
+
+	if (!(upl->flags & UPL_TRACKED_BY_OBJECT) || !(upl->flags & UPL_EXPEDITE_SUPPORTED)) {
+		return;
+	}
+
+	/*
+	 * No allocation may happen while the upl lock is held: if memory
+	 * is short, the pageout thread could be waiting on this very lock
+	 * while we wait for it to free memory.  So the work is done in two
+	 * locked phases with the allocation in between.
+	 */
+
+	/* Phase 1: learn how large the reprioritization info is. */
+	if (!upl_try_lock(upl)) {
+		return;
+	}
+
+	if (upl->decmp_io_upl == NULL) {
+		/* the real I/O upl is already gone; nothing to expedite */
+		upl_unlock(upl);
+		return;
+	}
+
+	io_upl = upl->decmp_io_upl;
+	assert((io_upl->flags & UPL_DECMP_REAL_IO) != 0);
+	io_upl_size = io_upl->size;
+	upl_unlock(upl);
+
+	/* Unlocked: allocate the private copy. */
+	info = (uint64_t *)kalloc(sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
+	if (info == NULL) {
+		return;
+	}
+
+	/* Phase 2: relock, revalidate, and copy out what is needed. */
+	if (!upl_try_lock(upl)) {
+		goto out;
+	}
+
+	if (upl->decmp_io_upl == NULL || upl->decmp_io_upl != io_upl) {
+		/* the real I/O upl went away or changed while unlocked */
+		upl_unlock(upl);
+		goto out;
+	}
+	memcpy(info, io_upl->upl_reprio_info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
+
+	/* the VM object behind this UPL */
+	object = (io_upl->flags & UPL_SHADOWED) ? io_upl->map_object->shadow
+						: io_upl->map_object;
+
+	/* the device vnode for that object */
+	have_devvp = (object && object->pager &&
+	    vnode_pager_get_object_devvp(object->pager, (uintptr_t *)&dev_vnode) == KERN_SUCCESS);
+
+	upl_unlock(upl);
+
+	if (!have_devvp) {
+		goto out;
+	}
+
+	/* Everything needed is now in local storage: issue the requests. */
+	for (cursor = 0; cursor < io_upl_size; ) {
+		uint64_t	entry = info[(cursor / PAGE_SIZE)];
+
+		blkno = entry & UPL_REPRIO_INFO_MASK;
+		len = (entry >> UPL_REPRIO_INFO_SHIFT) & UPL_REPRIO_INFO_MASK;
+
+		/*
+		 * Some expedites may be spurious, because blkno and len are
+		 * not cleared from upl_reprio_info once the I/O completes.
+		 */
+		if (blkno == 0 || len == 0) {
+			cursor += PAGE_SIZE;
+			continue;
+		}
+
+		request = (io_reprioritize_req_t)zalloc(io_reprioritize_req_zone);
+		assert(request != NULL);
+		request->blkno = blkno;
+		request->len = len;
+		request->priority = prio;
+		request->devvp = dev_vnode;
+
+		IO_REPRIORITIZE_LIST_LOCK();
+		queue_enter(&io_reprioritize_list, request, io_reprioritize_req_t, io_reprioritize_list);
+		IO_REPRIORITIZE_LIST_UNLOCK();
+
+		cursor += len;
+	}
+
+	/* wake the thread that services the list */
+	IO_REPRIO_THREAD_WAKEUP();
+
+out:
+	kfree(info, sizeof(uint64_t) * (io_upl_size / PAGE_SIZE));
+}
+
+/*
+ * A thread is about to wait on page "m" of object "o".  Look through the
+ * UPLs attached to the object for one that supports expedite, has a
+ * upl_priority value greater than the current thread's effective I/O
+ * tier, and contains the page; if one is found, request that its I/O be
+ * reprioritized to the current thread's tier.  The search ends at the
+ * first UPL that contains the page.
+ */
+void
+vm_page_handle_prio_inversion(vm_object_t o, vm_page_t m)
+{
+	upl_t		upl;
+	upl_page_info_t	*page_list;
+	unsigned int	idx, page_count;
+	int		cur_tier;
+
+	cur_tier = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
+
+	queue_iterate(&o->uplq, upl, upl_t, uplq) {
+		/* skip UPLs that cannot be expedited or need no boost */
+		if (((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) || upl->upl_priority <= cur_tier) {
+			continue;
+		}
+
+		page_list = UPL_GET_INTERNAL_PAGE_LIST(upl);
+		page_count = (upl->size / PAGE_SIZE);
+
+		/* look for the contended page in this UPL's page list */
+		for (idx = 0; idx < page_count; idx++) {
+			if (!UPL_PAGE_PRESENT(page_list, idx) ||
+			    VM_PAGE_GET_PHYS_PAGE(m) != page_list[idx].phys_addr) {
+				continue;
+			}
+
+			if ((upl->flags & UPL_DECMP_REQ) && upl->decmp_io_upl) {
+				/* decompression request: expedite the real I/O upl */
+				KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(upl->upl_creator), VM_KERNEL_UNSLIDE_OR_PERM(m),
+				    VM_KERNEL_UNSLIDE_OR_PERM(upl), upl->upl_priority, 0);
+				vm_decmp_upl_reprioritize(upl, cur_tier);
+			} else {
+				KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_EXPEDITE)) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(upl->upl_creator), VM_KERNEL_UNSLIDE_OR_PERM(m),
+				    upl->upl_reprio_info[idx], upl->upl_priority, 0);
+				if (UPL_REPRIO_INFO_BLKNO(upl, idx) != 0 && UPL_REPRIO_INFO_LEN(upl, idx) != 0) {
+					vm_page_request_reprioritize(o, UPL_REPRIO_INFO_BLKNO(upl, idx), UPL_REPRIO_INFO_LEN(upl, idx), cur_tier);
+				}
+			}
+			break;
+		}
+
+		/* the inner loop ended early only if the page was found */
+		if (idx != page_count) {
+			break;
+		}
+	}
+}
+
+/*
+ * Sleep on page "m" of object "o" until it is woken, returning the
+ * result of thread_sleep_vm_object().  The page is marked wanted before
+ * sleeping.  When the object has I/O tracking enabled and the page is
+ * busy, being cleaned or wired, a priority-inversion check is made
+ * first so that the I/O holding the page can be expedited.
+ */
+wait_result_t
+vm_page_sleep(vm_object_t o, vm_page_t m, int interruptible)
+{
+	wait_result_t	wres;
+
+	KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_START, o, m, 0, 0, 0);
+
+	if (o->io_tracking) {
+		if ((m->busy == TRUE) || (m->cleaning == TRUE) || VM_PAGE_WIRED(m)) {
+			/* page is tied up by an I/O: reprioritize it if needed */
+			vm_page_handle_prio_inversion(o, m);
+		}
+	}
+
+	m->wanted = TRUE;
+	wres = thread_sleep_vm_object(o, m, interruptible);
+
+	KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_END, o, m, 0, 0, 0);
+
+	return wres;
+}
+
+/*
+ * Body of the I/O reprioritize thread: drain io_reprioritize_list,
+ * issuing each request through vnode_pager_issue_reprioritize_io() and
+ * freeing it, then hand control to IO_REPRIO_THREAD_CONTINUATION().
+ * The list lock is never held across the call that issues the request.
+ */
+static void
+io_reprioritize_thread(void *param __unused, wait_result_t wr __unused)
+{
+	io_reprioritize_req_t	request = NULL;
+
+	IO_REPRIORITIZE_LIST_LOCK();
+	while (!queue_empty(&io_reprioritize_list)) {
+		queue_remove_first(&io_reprioritize_list, request, io_reprioritize_req_t, io_reprioritize_list);
+		IO_REPRIORITIZE_LIST_UNLOCK();
+
+		vnode_pager_issue_reprioritize_io(request->devvp, request->blkno, request->len, request->priority);
+		zfree(io_reprioritize_req_zone, request);
+
+		IO_REPRIORITIZE_LIST_LOCK();
+	}
+	IO_REPRIORITIZE_LIST_UNLOCK();
+
+	IO_REPRIO_THREAD_CONTINUATION();
+}
+#endif