1c79356b 1/*
fe8ab488 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
1c79356b 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
1c79356b 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
8f6c56a5 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
8f6c56a5 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b
A
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
1c79356b 65
91447636
A
66#include <stdint.h>
67
68#include <debug.h>
1c79356b
A
69#include <mach_pagemap.h>
70#include <mach_cluster_stats.h>
1c79356b
A
71
72#include <mach/mach_types.h>
73#include <mach/memory_object.h>
74#include <mach/memory_object_default.h>
0b4e3aa0 75#include <mach/memory_object_control_server.h>
1c79356b 76#include <mach/mach_host_server.h>
91447636
A
77#include <mach/upl.h>
78#include <mach/vm_map.h>
1c79356b
A
79#include <mach/vm_param.h>
80#include <mach/vm_statistics.h>
2d21ac55 81#include <mach/sdt.h>
91447636
A
82
83#include <kern/kern_types.h>
1c79356b 84#include <kern/counters.h>
91447636
A
85#include <kern/host_statistics.h>
86#include <kern/machine.h>
87#include <kern/misc_protos.h>
b0d623f7 88#include <kern/sched.h>
1c79356b 89#include <kern/thread.h>
1c79356b 90#include <kern/xpr.h>
91447636 91#include <kern/kalloc.h>
39037602 92#include <kern/policy_internal.h>
91447636
A
93
94#include <machine/vm_tuning.h>
b0d623f7 95#include <machine/commpage.h>
91447636 96
1c79356b 97#include <vm/pmap.h>
39236c6e 98#include <vm/vm_compressor_pager.h>
55e303ae 99#include <vm/vm_fault.h>
1c79356b
A
100#include <vm/vm_map.h>
101#include <vm/vm_object.h>
102#include <vm/vm_page.h>
103#include <vm/vm_pageout.h>
91447636 104#include <vm/vm_protos.h> /* must be last */
2d21ac55
A
105#include <vm/memory_object.h>
106#include <vm/vm_purgeable_internal.h>
6d2010ae 107#include <vm/vm_shared_region.h>
39236c6e
A
108#include <vm/vm_compressor.h>
109
fe8ab488
A
110#if CONFIG_PHANTOM_CACHE
111#include <vm/vm_phantom_cache.h>
112#endif
91447636
A
113/*
114 * ENCRYPTED SWAP:
115 */
316670eb 116#include <libkern/crypto/aes.h>
b0d623f7 117extern u_int32_t random(void); /* from <libkern/libkern.h> */
55e303ae 118
316670eb
A
119extern int cs_debug;
120
b0d623f7
A
121#if UPL_DEBUG
122#include <libkern/OSDebug.h>
123#endif
91447636 124
fe8ab488
A
125extern void m_drain(void);
126
127#if VM_PRESSURE_EVENTS
128extern unsigned int memorystatus_available_pages;
129extern unsigned int memorystatus_available_pages_pressure;
130extern unsigned int memorystatus_available_pages_critical;
131extern unsigned int memorystatus_frozen_count;
132extern unsigned int memorystatus_suspended_count;
133
39236c6e
A
134extern vm_pressure_level_t memorystatus_vm_pressure_level;
135int memorystatus_purge_on_warning = 2;
136int memorystatus_purge_on_urgent = 5;
137int memorystatus_purge_on_critical = 8;
138
39236c6e
A
139void vm_pressure_response(void);
140boolean_t vm_pressure_thread_running = FALSE;
316670eb 141extern void consider_vm_pressure_events(void);
fe8ab488
A
142
143#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
144#endif /* VM_PRESSURE_EVENTS */
145
39236c6e 146boolean_t vm_pressure_changed = FALSE;
6d2010ae 147
2d21ac55 148#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE /* maximum iterations of the active queue to move pages to inactive */
2d21ac55
A
149#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE 100
150#endif
91447636 151
2d21ac55 152#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
2d21ac55
A
153#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
154#endif
91447636
A
155
156#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
157#define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
158#endif
159
160#ifndef VM_PAGEOUT_INACTIVE_RELIEF
161#define VM_PAGEOUT_INACTIVE_RELIEF 50 /* minimum number of pages to move to the inactive q */
162#endif
163
1c79356b 164#ifndef VM_PAGE_LAUNDRY_MAX
6d2010ae 165#define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
1c79356b
A
166#endif /* VM_PAGE_LAUNDRY_MAX */
167
1c79356b 168#ifndef VM_PAGEOUT_BURST_WAIT
fe8ab488 169#define VM_PAGEOUT_BURST_WAIT 10 /* milliseconds */
1c79356b
A
170#endif /* VM_PAGEOUT_BURST_WAIT */
171
172#ifndef VM_PAGEOUT_EMPTY_WAIT
173#define VM_PAGEOUT_EMPTY_WAIT 200 /* milliseconds */
174#endif /* VM_PAGEOUT_EMPTY_WAIT */
175
91447636
A
176#ifndef VM_PAGEOUT_DEADLOCK_WAIT
177#define VM_PAGEOUT_DEADLOCK_WAIT 300 /* milliseconds */
178#endif /* VM_PAGEOUT_DEADLOCK_WAIT */
179
180#ifndef VM_PAGEOUT_IDLE_WAIT
181#define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
182#endif /* VM_PAGEOUT_IDLE_WAIT */
183
39236c6e
A
184#ifndef VM_PAGEOUT_SWAP_WAIT
185#define VM_PAGEOUT_SWAP_WAIT 50 /* milliseconds */
186#endif /* VM_PAGEOUT_SWAP_WAIT */
187
316670eb
A
188#ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
189#define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED 1000 /* maximum pages considered before we issue a pressure event */
190#endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
191
192#ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
193#define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS 5 /* seconds */
194#endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
195
6d2010ae
A
196unsigned int vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
197unsigned int vm_page_speculative_percentage = 5;
198
2d21ac55 199#ifndef VM_PAGE_SPECULATIVE_TARGET
6d2010ae 200#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
2d21ac55
A
201#endif /* VM_PAGE_SPECULATIVE_TARGET */
202
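/*
 * Illustrative arithmetic only (not part of the original source): with the
 * default vm_page_speculative_percentage of 5, the macro above reduces to
 *
 *	VM_PAGE_SPECULATIVE_TARGET(total) = total * 1 / (100 / 5) = total / 20
 *
 * i.e. roughly 5% of the active+inactive page count.  Note that the divisor
 * (100 / vm_page_speculative_percentage) uses integer division, and
 * vm_pageout_scan() clamps the percentage to the range 1..50 before using it.
 */
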
6d2010ae 203
2d21ac55
A
204#ifndef VM_PAGE_INACTIVE_HEALTHY_LIMIT
205#define VM_PAGE_INACTIVE_HEALTHY_LIMIT(total) ((total) * 1 / 200)
206#endif /* VM_PAGE_INACTIVE_HEALTHY_LIMIT */
207
91447636 208
1c79356b
A
209/*
210 * To obtain a reasonable LRU approximation, the inactive queue
211 * needs to be large enough to give pages on it a chance to be
212 * referenced a second time. This macro defines the fraction
213 * of active+inactive pages that should be inactive.
214 * The pageout daemon uses it to update vm_page_inactive_target.
215 *
216 * If vm_page_free_count falls below vm_page_free_target and
217 * vm_page_inactive_count is below vm_page_inactive_target,
218 * then the pageout daemon starts running.
219 */
220
221#ifndef VM_PAGE_INACTIVE_TARGET
316670eb 222#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
1c79356b
A
223#endif /* VM_PAGE_INACTIVE_TARGET */
224
225/*
226 * Once the pageout daemon starts running, it keeps going
227 * until vm_page_free_count meets or exceeds vm_page_free_target.
228 */
229
230#ifndef VM_PAGE_FREE_TARGET
231#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
232#endif /* VM_PAGE_FREE_TARGET */
233
39236c6e 234
1c79356b
A
235/*
236 * The pageout daemon always starts running once vm_page_free_count
237 * falls below vm_page_free_min.
238 */
239
240#ifndef VM_PAGE_FREE_MIN
2d21ac55 241#define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
1c79356b
A
242#endif /* VM_PAGE_FREE_MIN */
243
fe8ab488
A
244#define VM_PAGE_FREE_RESERVED_LIMIT 1700
245#define VM_PAGE_FREE_MIN_LIMIT 3500
246#define VM_PAGE_FREE_TARGET_LIMIT 4000
2d21ac55 247
1c79356b
A
248/*
249 * When vm_page_free_count falls below vm_page_free_reserved,
250 * only vm-privileged threads can allocate pages. vm-privilege
251 * allows the pageout daemon and default pager (and any other
252 * associated threads needed for default pageout) to continue
253 * operation by dipping into the reserved pool of pages.
254 */
255
256#ifndef VM_PAGE_FREE_RESERVED
91447636 257#define VM_PAGE_FREE_RESERVED(n) \
b0d623f7 258 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
1c79356b
A
259#endif /* VM_PAGE_FREE_RESERVED */
260
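/*
 * Illustrative arithmetic only (not part of the original source): on a
 * hypothetical machine with 262144 managed pages (1 GB of 4 KB pages),
 * the tuning macros above evaluate to roughly
 *
 *	VM_PAGE_INACTIVE_TARGET(262144) = 262144 / 2        = 131072 pages
 *	VM_PAGE_FREE_TARGET(262144)     = 15 + 262144 / 80  = 3291 pages
 *	VM_PAGE_FREE_MIN(262144)        = 10 + 262144 / 100 = 2631 pages
 *	VM_PAGE_FREE_RESERVED(n)        = 6 * 128 + (n)     = 768 + (n) pages
 *
 * before any of the VM_PAGE_FREE_*_LIMIT caps defined above are applied.
 */
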
2d21ac55
A
261/*
262 * When we dequeue pages from the inactive list, they are
263 * reactivated (i.e., put back on the active queue) if referenced.
264 * However, it is possible to starve the free list if other
265 * processors are referencing pages faster than we can turn off
266 * the referenced bit. So we limit the number of reactivations
267 * we will make per call of vm_pageout_scan().
268 */
269#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
270#ifndef VM_PAGE_REACTIVATE_LIMIT
2d21ac55 271#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX))
2d21ac55 272#endif /* VM_PAGE_REACTIVATE_LIMIT */
3e170ce0 273#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
2d21ac55 274
91447636 275
316670eb
A
276extern boolean_t hibernate_cleaning_in_progress;
277
0b4e3aa0
A
278/*
279 * Exported variable used to broadcast the activation of the pageout scan.
280 * Working Set uses this to throttle its use of pmap removes. In this
281 * way, code which runs within memory in an uncontested context does
282 * not keep encountering soft faults.
283 */
284
285unsigned int vm_pageout_scan_event_counter = 0;
1c79356b
A
286
287/*
288 * Forward declarations for internal routines.
289 */
39236c6e
A
290struct cq {
291 struct vm_pageout_queue *q;
292 void *current_chead;
293 char *scratch_buf;
3e170ce0 294 int id;
39236c6e 295};
3e170ce0
A
296#define MAX_COMPRESSOR_THREAD_COUNT 8
297
298struct cq ciq[MAX_COMPRESSOR_THREAD_COUNT];
299
300void *vm_pageout_immediate_chead;
301char *vm_pageout_immediate_scratch_buf;
39236c6e 302
91447636 303
39236c6e
A
304#if VM_PRESSURE_EVENTS
305void vm_pressure_thread(void);
fe8ab488
A
306
307boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
308boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
309
310boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
311boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
39236c6e 312#endif
91447636 313static void vm_pageout_garbage_collect(int);
91447636 314static void vm_pageout_iothread_external(void);
39236c6e 315static void vm_pageout_iothread_internal(struct cq *cq);
316670eb 316static void vm_pageout_adjust_io_throttles(struct vm_pageout_queue *, struct vm_pageout_queue *, boolean_t);
91447636 317
1c79356b
A
318extern void vm_pageout_continue(void);
319extern void vm_pageout_scan(void);
1c79356b 320
3e170ce0
A
321static void vm_pageout_immediate(vm_page_t, boolean_t);
322boolean_t vm_compressor_immediate_preferred = FALSE;
323boolean_t vm_compressor_immediate_preferred_override = FALSE;
324boolean_t vm_restricted_to_single_processor = FALSE;
4bd07ac2
A
325static boolean_t vm_pageout_waiter = FALSE;
326static boolean_t vm_pageout_running = FALSE;
327
3e170ce0 328
2d21ac55
A
329static thread_t vm_pageout_external_iothread = THREAD_NULL;
330static thread_t vm_pageout_internal_iothread = THREAD_NULL;
331
1c79356b
A
332unsigned int vm_pageout_reserved_internal = 0;
333unsigned int vm_pageout_reserved_really = 0;
334
39236c6e 335unsigned int vm_pageout_swap_wait = 0;
91447636 336unsigned int vm_pageout_idle_wait = 0; /* milliseconds */
55e303ae 337unsigned int vm_pageout_empty_wait = 0; /* milliseconds */
91447636
A
338unsigned int vm_pageout_burst_wait = 0; /* milliseconds */
339unsigned int vm_pageout_deadlock_wait = 0; /* milliseconds */
340unsigned int vm_pageout_deadlock_relief = 0;
341unsigned int vm_pageout_inactive_relief = 0;
342unsigned int vm_pageout_burst_active_throttle = 0;
343unsigned int vm_pageout_burst_inactive_throttle = 0;
1c79356b 344
6d2010ae
A
345int vm_upl_wait_for_pages = 0;
346
b0d623f7 347
1c79356b
A
348/*
349 * These variables record the pageout daemon's actions:
350 * how many pages it looks at and what happens to those pages.
351 * No locking needed because only one thread modifies the variables.
352 */
353
354unsigned int vm_pageout_active = 0; /* debugging */
355unsigned int vm_pageout_inactive = 0; /* debugging */
356unsigned int vm_pageout_inactive_throttled = 0; /* debugging */
357unsigned int vm_pageout_inactive_forced = 0; /* debugging */
358unsigned int vm_pageout_inactive_nolock = 0; /* debugging */
359unsigned int vm_pageout_inactive_avoid = 0; /* debugging */
360unsigned int vm_pageout_inactive_busy = 0; /* debugging */
6d2010ae 361unsigned int vm_pageout_inactive_error = 0; /* debugging */
1c79356b 362unsigned int vm_pageout_inactive_absent = 0; /* debugging */
6d2010ae 363unsigned int vm_pageout_inactive_notalive = 0; /* debugging */
1c79356b 364unsigned int vm_pageout_inactive_used = 0; /* debugging */
6d2010ae 365unsigned int vm_pageout_cache_evicted = 0; /* debugging */
1c79356b 366unsigned int vm_pageout_inactive_clean = 0; /* debugging */
6d2010ae 367unsigned int vm_pageout_speculative_clean = 0; /* debugging */
316670eb
A
368
369unsigned int vm_pageout_freed_from_cleaned = 0;
370unsigned int vm_pageout_freed_from_speculative = 0;
371unsigned int vm_pageout_freed_from_inactive_clean = 0;
372
373unsigned int vm_pageout_enqueued_cleaned_from_inactive_clean = 0;
374unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
375
376unsigned int vm_pageout_cleaned_reclaimed = 0; /* debugging; how many cleaned pages are reclaimed by the pageout scan */
377unsigned int vm_pageout_cleaned_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
378unsigned int vm_pageout_cleaned_reference_reactivated = 0;
379unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
380unsigned int vm_pageout_cleaned_fault_reactivated = 0;
381unsigned int vm_pageout_cleaned_commit_reactivated = 0; /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
382unsigned int vm_pageout_cleaned_busy = 0;
383unsigned int vm_pageout_cleaned_nolock = 0;
384
6d2010ae
A
385unsigned int vm_pageout_inactive_dirty_internal = 0; /* debugging */
386unsigned int vm_pageout_inactive_dirty_external = 0; /* debugging */
b0d623f7 387unsigned int vm_pageout_inactive_deactivated = 0; /* debugging */
316670eb 388unsigned int vm_pageout_inactive_anonymous = 0; /* debugging */
1c79356b 389unsigned int vm_pageout_dirty_no_pager = 0; /* debugging */
3e170ce0 390unsigned int vm_pageout_purged_objects = 0; /* used for sysctl vm stats */
1c79356b
A
391unsigned int vm_stat_discard = 0; /* debugging */
392unsigned int vm_stat_discard_sent = 0; /* debugging */
393unsigned int vm_stat_discard_failure = 0; /* debugging */
394unsigned int vm_stat_discard_throttle = 0; /* debugging */
2d21ac55
A
395unsigned int vm_pageout_reactivation_limit_exceeded = 0; /* debugging */
396unsigned int vm_pageout_catch_ups = 0; /* debugging */
397unsigned int vm_pageout_inactive_force_reclaim = 0; /* debugging */
1c79356b 398
6d2010ae 399unsigned int vm_pageout_scan_reclaimed_throttled = 0;
91447636 400unsigned int vm_pageout_scan_active_throttled = 0;
6d2010ae
A
401unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
402unsigned int vm_pageout_scan_inactive_throttled_external = 0;
91447636
A
403unsigned int vm_pageout_scan_throttle = 0; /* debugging */
404unsigned int vm_pageout_scan_burst_throttle = 0; /* debugging */
405unsigned int vm_pageout_scan_empty_throttle = 0; /* debugging */
39236c6e 406unsigned int vm_pageout_scan_swap_throttle = 0; /* debugging */
91447636
A
407unsigned int vm_pageout_scan_deadlock_detected = 0; /* debugging */
408unsigned int vm_pageout_scan_active_throttle_success = 0; /* debugging */
409unsigned int vm_pageout_scan_inactive_throttle_success = 0; /* debugging */
316670eb 410unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0; /* debugging */
3e170ce0
A
411unsigned int vm_pageout_scan_throttle_deferred = 0; /* debugging */
412unsigned int vm_pageout_scan_yield_unthrottled = 0; /* debugging */
b0d623f7
A
413unsigned int vm_page_speculative_count_drifts = 0;
414unsigned int vm_page_speculative_count_drift_max = 0;
415
316670eb 416
55e303ae
A
417/*
418 * Backing store throttle when BS is exhausted
419 */
420unsigned int vm_backing_store_low = 0;
1c79356b
A
421
422unsigned int vm_pageout_out_of_line = 0;
423unsigned int vm_pageout_in_place = 0;
55e303ae 424
b0d623f7
A
425unsigned int vm_page_steal_pageout_page = 0;
426
39037602
A
427struct vm_config vm_config;
428
91447636
A
429/*
430 * ENCRYPTED SWAP:
431 * counters and statistics...
432 */
433unsigned long vm_page_decrypt_counter = 0;
434unsigned long vm_page_decrypt_for_upl_counter = 0;
435unsigned long vm_page_encrypt_counter = 0;
436unsigned long vm_page_encrypt_abort_counter = 0;
437unsigned long vm_page_encrypt_already_encrypted_counter = 0;
438boolean_t vm_pages_encrypted = FALSE; /* are there encrypted pages ? */
439
39037602
A
440struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
441struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
91447636 442
2d21ac55
A
443unsigned int vm_page_speculative_target = 0;
444
445vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
446
0b4c1975 447boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
b0d623f7
A
448
449#if DEVELOPMENT || DEBUG
4a3eedf9 450unsigned long vm_cs_validated_resets = 0;
b0d623f7 451#endif
55e303ae 452
6d2010ae
A
453int vm_debug_events = 0;
454
316670eb 455#if CONFIG_MEMORYSTATUS
39236c6e
A
456#if !CONFIG_JETSAM
457extern boolean_t memorystatus_idle_exit_from_VM(void);
316670eb 458#endif
39236c6e
A
459extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
460extern void memorystatus_on_pageout_scan_end(void);
39037602
A
461
462uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
463uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
464#if DEVELOPMENT || DEBUG
465uint32_t vm_grab_anon_overrides = 0;
466uint32_t vm_grab_anon_nops = 0;
467#endif
468
316670eb 469#endif
6d2010ae 470
55e303ae
A
471/*
472 * Routine: vm_backing_store_disable
473 * Purpose:
474 * Suspend non-privileged threads wishing to extend
475 * backing store when we are low on backing store
476 * (Synchronized by caller)
477 */
478void
479vm_backing_store_disable(
480 boolean_t disable)
481{
482 if(disable) {
483 vm_backing_store_low = 1;
484 } else {
485 if(vm_backing_store_low) {
486 vm_backing_store_low = 0;
487 thread_wakeup((event_t) &vm_backing_store_low);
488 }
489 }
490}
491
492
1c79356b
A
493#if MACH_CLUSTER_STATS
494unsigned long vm_pageout_cluster_dirtied = 0;
495unsigned long vm_pageout_cluster_cleaned = 0;
496unsigned long vm_pageout_cluster_collisions = 0;
497unsigned long vm_pageout_cluster_clusters = 0;
498unsigned long vm_pageout_cluster_conversions = 0;
499unsigned long vm_pageout_target_collisions = 0;
500unsigned long vm_pageout_target_page_dirtied = 0;
501unsigned long vm_pageout_target_page_freed = 0;
1c79356b
A
502#define CLUSTER_STAT(clause) clause
503#else /* MACH_CLUSTER_STATS */
504#define CLUSTER_STAT(clause)
505#endif /* MACH_CLUSTER_STATS */
506
507/*
508 * Routine: vm_pageout_object_terminate
509 * Purpose:
2d21ac55 510 * Destroy the pageout_object, and perform all of the
1c79356b
A
511 * required cleanup actions.
512 *
513 * In/Out conditions:
514 * The object must be locked, and will be returned locked.
515 */
516void
517vm_pageout_object_terminate(
518 vm_object_t object)
519{
520 vm_object_t shadow_object;
521
522 /*
523 * Deal with the deallocation (last reference) of a pageout object
524 * (used for cleaning-in-place) by dropping the paging references/
525 * freeing pages in the original object.
526 */
527
528 assert(object->pageout);
529 shadow_object = object->shadow;
530 vm_object_lock(shadow_object);
531
39037602 532 while (!vm_page_queue_empty(&object->memq)) {
1c79356b
A
533 vm_page_t p, m;
534 vm_object_offset_t offset;
535
39037602 536 p = (vm_page_t) vm_page_queue_first(&object->memq);
1c79356b
A
537
538 assert(p->private);
39037602
A
539 assert(p->free_when_done);
540 p->free_when_done = FALSE;
1c79356b 541 assert(!p->cleaning);
316670eb 542 assert(!p->laundry);
1c79356b
A
543
544 offset = p->offset;
545 VM_PAGE_FREE(p);
546 p = VM_PAGE_NULL;
547
548 m = vm_page_lookup(shadow_object,
6d2010ae 549 offset + object->vo_shadow_offset);
1c79356b
A
550
551 if(m == VM_PAGE_NULL)
552 continue;
1c79356b 553
1c79356b
A
554 assert((m->dirty) || (m->precious) ||
555 (m->busy && m->cleaning));
556
557 /*
558 * Handle the trusted pager throttle.
55e303ae 559 * Also decrement the burst throttle (if external).
1c79356b
A
560 */
561 vm_page_lock_queues();
39037602 562 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)
91447636 563 vm_pageout_throttle_up(m);
1c79356b
A
564
565 /*
566 * Handle the "target" page(s). These pages are to be freed if
567 * successfully cleaned. Target pages are always busy, and are
568 * wired exactly once. The initial target pages are not mapped
569 * (so cannot be referenced or modified), but converted target
570 * pages may have been modified between the selection as an
571 * adjacent page and conversion to a target.
572 */
39037602 573 if (m->free_when_done) {
1c79356b 574 assert(m->busy);
39037602 575 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
1c79356b
A
576 assert(m->wire_count == 1);
577 m->cleaning = FALSE;
2d21ac55 578 m->encrypted_cleaning = FALSE;
39037602 579 m->free_when_done = FALSE;
1c79356b
A
580#if MACH_CLUSTER_STATS
581 if (m->wanted) vm_pageout_target_collisions++;
582#endif
583 /*
584 * Revoke all access to the page. Since the object is
585 * locked, and the page is busy, this prevents the page
91447636 586 * from being dirtied after the pmap_disconnect() call
1c79356b 587 * returns.
91447636 588 *
1c79356b
A
589 * Since the page is left "dirty" but "not modified", we
590 * can detect whether the page was redirtied during
591 * pageout by checking the modify state.
592 */
39037602 593 if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
316670eb
A
594 SET_PAGE_DIRTY(m, FALSE);
595 } else {
596 m->dirty = FALSE;
597 }
1c79356b
A
598
599 if (m->dirty) {
600 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
0b4c1975 601 vm_page_unwire(m, TRUE); /* reactivates */
2d21ac55 602 VM_STAT_INCR(reactivations);
1c79356b 603 PAGE_WAKEUP_DONE(m);
1c79356b
A
604 } else {
605 CLUSTER_STAT(vm_pageout_target_page_freed++;)
606 vm_page_free(m);/* clears busy, etc. */
607 }
608 vm_page_unlock_queues();
609 continue;
610 }
611 /*
612 * Handle the "adjacent" pages. These pages were cleaned in
613 * place, and should be left alone.
614 * If prep_pin_count is nonzero, then someone is using the
615 * page, so make it active.
616 */
39037602 617 if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) {
0b4e3aa0 618 if (m->reference)
1c79356b
A
619 vm_page_activate(m);
620 else
621 vm_page_deactivate(m);
622 }
6d2010ae
A
623 if (m->overwriting) {
624 /*
625 * the (COPY_OUT_FROM == FALSE) request_page_list case
626 */
627 if (m->busy) {
628 /*
629 * We do not re-set m->dirty !
630 * The page was busy so no extraneous activity
631 * could have occurred. COPY_INTO is a read into the
632 * new pages. CLEAN_IN_PLACE does actually write
633 * out the pages but handling outside of this code
634 * will take care of resetting dirty. We clear the
635 * modify however for the Programmed I/O case.
636 */
39037602 637 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
2d21ac55 638
6d2010ae
A
639 m->busy = FALSE;
640 m->absent = FALSE;
641 } else {
642 /*
643 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
644 * Occurs when the original page was wired
645 * at the time of the list request
646 */
647 assert(VM_PAGE_WIRED(m));
648 vm_page_unwire(m, TRUE); /* reactivates */
649 }
1c79356b
A
650 m->overwriting = FALSE;
651 } else {
6d2010ae
A
652 /*
653 * Set the dirty state according to whether or not the page was
654 * modified during the pageout. Note that we purposefully do
655 * NOT call pmap_clear_modify since the page is still mapped.
656 * If the page were to be dirtied between the 2 calls, this
657 * fact would be lost. This code is only necessary to
658 * maintain statistics, since the pmap module is always
659 * consulted if m->dirty is false.
660 */
1c79356b 661#if MACH_CLUSTER_STATS
39037602 662 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
1c79356b
A
663
664 if (m->dirty) vm_pageout_cluster_dirtied++;
665 else vm_pageout_cluster_cleaned++;
666 if (m->wanted) vm_pageout_cluster_collisions++;
667#else
316670eb 668 m->dirty = FALSE;
1c79356b
A
669#endif
670 }
6d2010ae
A
671 if (m->encrypted_cleaning == TRUE) {
672 m->encrypted_cleaning = FALSE;
673 m->busy = FALSE;
674 }
1c79356b
A
675 m->cleaning = FALSE;
676
1c79356b
A
677 /*
678 * Wake up any thread waiting for the page to be un-cleaning.
679 */
680 PAGE_WAKEUP(m);
681 vm_page_unlock_queues();
682 }
683 /*
684 * Account for the paging reference taken in vm_paging_object_allocate.
685 */
b0d623f7 686 vm_object_activity_end(shadow_object);
1c79356b
A
687 vm_object_unlock(shadow_object);
688
689 assert(object->ref_count == 0);
690 assert(object->paging_in_progress == 0);
b0d623f7 691 assert(object->activity_in_progress == 0);
1c79356b
A
692 assert(object->resident_page_count == 0);
693 return;
694}
695
1c79356b
A
696/*
697 * Routine: vm_pageclean_setup
698 *
699 * Purpose: set up a page to be cleaned (made non-dirty), but not
700 * necessarily flushed from the VM page cache.
701 * This is accomplished by cleaning in place.
702 *
b0d623f7
A
703 * The page must not be busy, and new_object
704 * must be locked.
705 *
1c79356b 706 */
3e170ce0 707static void
1c79356b
A
708vm_pageclean_setup(
709 vm_page_t m,
710 vm_page_t new_m,
711 vm_object_t new_object,
712 vm_object_offset_t new_offset)
713{
1c79356b 714 assert(!m->busy);
2d21ac55 715#if 0
1c79356b 716 assert(!m->cleaning);
2d21ac55 717#endif
1c79356b
A
718
719 XPR(XPR_VM_PAGEOUT,
39037602
A
720 "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
721 VM_PAGE_OBJECT(m), m->offset, m,
b0d623f7 722 new_m, new_offset);
1c79356b 723
39037602 724 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
1c79356b
A
725
726 /*
727 * Mark original page as cleaning in place.
728 */
729 m->cleaning = TRUE;
316670eb 730 SET_PAGE_DIRTY(m, FALSE);
1c79356b
A
731 m->precious = FALSE;
732
733 /*
734 * Convert the fictitious page to a private shadow of
735 * the real page.
736 */
737 assert(new_m->fictitious);
39037602 738 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
1c79356b
A
739 new_m->fictitious = FALSE;
740 new_m->private = TRUE;
39037602
A
741 new_m->free_when_done = TRUE;
742 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
b0d623f7
A
743
744 vm_page_lockspin_queues();
3e170ce0 745 vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
b0d623f7 746 vm_page_unlock_queues();
1c79356b 747
3e170ce0 748 vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
1c79356b
A
749 assert(!new_m->wanted);
750 new_m->busy = FALSE;
751}
752
1c79356b
A
753/*
754 * Routine: vm_pageout_initialize_page
755 * Purpose:
756 * Causes the specified page to be initialized in
757 * the appropriate memory object. This routine is used to push
758 * pages into a copy-object when they are modified in the
759 * permanent object.
760 *
761 * The page is moved to a temporary object and paged out.
762 *
763 * In/out conditions:
764 * The page in question must not be on any pageout queues.
765 * The object to which it belongs must be locked.
766 * The page must be busy, but not hold a paging reference.
767 *
768 * Implementation:
769 * Move this page to a completely new object.
770 */
771void
772vm_pageout_initialize_page(
773 vm_page_t m)
774{
1c79356b
A
775 vm_object_t object;
776 vm_object_offset_t paging_offset;
2d21ac55 777 memory_object_t pager;
1c79356b
A
778
779 XPR(XPR_VM_PAGEOUT,
780 "vm_pageout_initialize_page, page 0x%X\n",
b0d623f7 781 m, 0, 0, 0, 0);
39037602
A
782
783 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
784
785 object = VM_PAGE_OBJECT(m);
786
1c79356b 787 assert(m->busy);
39037602 788 assert(object->internal);
1c79356b
A
789
790 /*
791 * Verify that we really want to clean this page
792 */
793 assert(!m->absent);
794 assert(!m->error);
795 assert(m->dirty);
796
797 /*
798 * Create a paging reference to let us play with the object.
799 */
1c79356b 800 paging_offset = m->offset + object->paging_offset;
2d21ac55
A
801
802 if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
1c79356b 803 panic("reservation without pageout?"); /* alan */
39037602
A
804
805 VM_PAGE_FREE(m);
2d21ac55
A
806 vm_object_unlock(object);
807
808 return;
809 }
810
811 /*
812 * If there's no pager, then we can't clean the page. This should
813 * never happen since this should be a copy object and therefore not
814 * an external object, so the pager should always be there.
815 */
816
817 pager = object->pager;
818
819 if (pager == MEMORY_OBJECT_NULL) {
2d21ac55 820 panic("missing pager for copy object");
39037602
A
821
822 VM_PAGE_FREE(m);
1c79356b
A
823 return;
824 }
825
316670eb
A
826 /*
827 * set the page for future call to vm_fault_list_request
828 */
39037602 829 pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
316670eb 830 SET_PAGE_DIRTY(m, FALSE);
b0d623f7 831
316670eb
A
832 /*
833 * keep the object from collapsing or terminating
834 */
835 vm_object_paging_begin(object);
55e303ae 836 vm_object_unlock(object);
1c79356b
A
837
838 /*
839 * Write the data to its pager.
840 * Note that the data is passed by naming the new object,
841 * not a virtual address; the pager interface has been
842 * manipulated to use the "internal memory" data type.
843 * [The object reference from its allocation is donated
844 * to the eventual recipient.]
845 */
2d21ac55 846 memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
1c79356b
A
847
848 vm_object_lock(object);
2d21ac55 849 vm_object_paging_end(object);
1c79356b
A
850}
851
852#if MACH_CLUSTER_STATS
853#define MAXCLUSTERPAGES 16
854struct {
855 unsigned long pages_in_cluster;
856 unsigned long pages_at_higher_offsets;
857 unsigned long pages_at_lower_offsets;
858} cluster_stats[MAXCLUSTERPAGES];
859#endif /* MACH_CLUSTER_STATS */
860
1c79356b
A
861
862/*
863 * vm_pageout_cluster:
864 *
91447636
A
865 * Given a page, queue it to the appropriate I/O thread,
866 * which will page it out and attempt to clean adjacent pages
1c79356b
A
867 * in the same operation.
868 *
39236c6e 869 * The object and queues must be locked. We will take a
55e303ae 870 * paging reference to prevent deallocation or collapse when we
91447636
A
871 * release the object lock back at the call site. The I/O thread
872 * is responsible for consuming this reference.
55e303ae
A
873 *
874 * The page must not be on any pageout queue.
1c79356b 875 */
91447636 876
3e170ce0 877int
39037602 878vm_pageout_cluster(vm_page_t m, boolean_t immediate_ok, boolean_t keep_object_locked)
1c79356b 879{
39037602 880 vm_object_t object = VM_PAGE_OBJECT(m);
91447636
A
881 struct vm_pageout_queue *q;
882
1c79356b
A
883
884 XPR(XPR_VM_PAGEOUT,
885 "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
b0d623f7
A
886 object, m->offset, m, 0, 0);
887
888 VM_PAGE_CHECK(m);
39037602 889 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
6d2010ae 890 vm_object_lock_assert_exclusive(object);
1c79356b 891
91447636
A
892 /*
893 * Only a certain kind of page is appreciated here.
894 */
316670eb 895 assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
39037602
A
896 assert(!m->cleaning && !m->laundry);
897 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
55e303ae
A
898
899 /*
316670eb 900 * protect the object from collapse or termination
55e303ae 901 */
316670eb 902 vm_object_activity_begin(object);
55e303ae 903
39236c6e 904 if (object->internal == TRUE) {
39037602 905 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
39236c6e 906
39037602 907 m->busy = TRUE;
3e170ce0 908
39037602
A
909 if (vm_compressor_immediate_preferred == TRUE && immediate_ok == TRUE) {
910 panic("immediate compressor mode no longer supported\n");
911
912 if (keep_object_locked == FALSE)
913 vm_object_unlock(object);
914 vm_page_unlock_queues();
3e170ce0 915
39037602
A
916 vm_pageout_immediate(m, keep_object_locked);
917
918 return (1);
3e170ce0 919 }
91447636 920 q = &vm_pageout_queue_internal;
39236c6e 921 } else
91447636 922 q = &vm_pageout_queue_external;
d1ecb069 923
39236c6e 924 /*
d1ecb069
A
925 * pgo_laundry count is tied to the laundry bit
926 */
6d2010ae 927 m->laundry = TRUE;
91447636 928 q->pgo_laundry++;
1c79356b 929
39037602
A
930 m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q;
931 vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
91447636
A
932
933 if (q->pgo_idle == TRUE) {
39236c6e
A
934 q->pgo_idle = FALSE;
935 thread_wakeup((event_t) &q->pgo_pending);
1c79356b 936 }
b0d623f7 937 VM_PAGE_CHECK(m);
3e170ce0
A
938
939 return (0);
1c79356b
A
940}
941
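/*
 * Illustrative sketch only, compiled out (not part of the original source):
 * the calling convention vm_pageout_cluster() expects, mirroring how
 * vm_pageout_page_queue() below drives it.  The caller holds the object
 * lock (exclusive) and the page queues lock, and the page is dirty or
 * precious, unwired, and not yet on any pageout queue.  The helper name
 * is hypothetical.
 */
#if 0
static void
vm_pageout_cluster_usage_sketch(vm_page_t m)
{
	vm_object_t object = VM_PAGE_OBJECT(m);

	vm_object_lock(object);
	vm_page_lock_queues();

	/*
	 * immediate_ok == FALSE: always hand the page to an I/O thread.
	 * keep_object_locked == FALSE: the object lock is still held by us
	 * on return, since only the (unsupported) immediate path drops it.
	 */
	(void) vm_pageout_cluster(m, FALSE, FALSE);

	vm_page_unlock_queues();
	vm_object_unlock(object);
}
#endif
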
55e303ae 942
91447636 943unsigned long vm_pageout_throttle_up_count = 0;
1c79356b
A
944
945/*
b0d623f7
A
946 * A page is back from laundry or we are stealing it back from
947 * the laundering state. See if there are some pages waiting to
91447636 948 * go to laundry and if we can let some of them go now.
1c79356b 949 *
91447636 950 * Object and page queues must be locked.
1c79356b 951 */
91447636
A
952void
953vm_pageout_throttle_up(
6d2010ae 954 vm_page_t m)
1c79356b 955{
6d2010ae 956 struct vm_pageout_queue *q;
39037602 957 vm_object_t m_object;
1c79356b 958
39037602 959 m_object = VM_PAGE_OBJECT(m);
1c79356b 960
39037602
A
961 assert(m_object != VM_OBJECT_NULL);
962 assert(m_object != kernel_object);
963
964 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
965 vm_object_lock_assert_exclusive(m_object);
316670eb 966
6d2010ae 967 vm_pageout_throttle_up_count++;
0b4c1975 968
39037602 969 if (m_object->internal == TRUE)
6d2010ae
A
970 q = &vm_pageout_queue_internal;
971 else
972 q = &vm_pageout_queue_external;
d1ecb069 973
39037602 974 if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
0b4c1975 975
39037602
A
976 vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
977 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
1c79356b 978
39037602 979 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
91447636 980
39037602 981 vm_object_activity_end(m_object);
6d2010ae 982 }
316670eb 983 if (m->laundry == TRUE) {
91447636 984
6d2010ae
A
985 m->laundry = FALSE;
986 q->pgo_laundry--;
91447636 987
6d2010ae
A
988 if (q->pgo_throttled == TRUE) {
989 q->pgo_throttled = FALSE;
990 thread_wakeup((event_t) &q->pgo_laundry);
991 }
992 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
993 q->pgo_draining = FALSE;
994 thread_wakeup((event_t) (&q->pgo_laundry+1));
995 }
996 }
997}
91447636 998
b0d623f7 999
39236c6e
A
1000static void
1001vm_pageout_throttle_up_batch(
1002 struct vm_pageout_queue *q,
1003 int batch_cnt)
1004{
39037602 1005 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
39236c6e
A
1006
1007 vm_pageout_throttle_up_count += batch_cnt;
1008
1009 q->pgo_laundry -= batch_cnt;
1010
1011 if (q->pgo_throttled == TRUE) {
1012 q->pgo_throttled = FALSE;
1013 thread_wakeup((event_t) &q->pgo_laundry);
1014 }
1015 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
1016 q->pgo_draining = FALSE;
1017 thread_wakeup((event_t) (&q->pgo_laundry+1));
1018 }
1019}
1020
1021
1022
b0d623f7
A
1023/*
1024 * VM memory pressure monitoring.
1025 *
1026 * vm_pageout_scan() keeps track of the number of pages it considers and
1027 * reclaims, in the currently active vm_pageout_stats[vm_pageout_stat_now].
1028 *
1029 * compute_memory_pressure() is called every second from compute_averages()
1030 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
1031 * of reclaimed pages in a new vm_pageout_stats[] bucket.
1032 *
1033 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
1034 * The caller provides the number of seconds ("nsecs") worth of statistics
1035 * it wants, up to 30 seconds.
1036 * It computes the number of pages reclaimed in the past "nsecs" seconds and
1037 * also returns the number of pages the system still needs to reclaim at this
1038 * moment in time.
1039 */
1040#define VM_PAGEOUT_STAT_SIZE 31
1041struct vm_pageout_stat {
1042 unsigned int considered;
1043 unsigned int reclaimed;
1044} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0}, };
1045unsigned int vm_pageout_stat_now = 0;
1046unsigned int vm_memory_pressure = 0;
1047
1048#define VM_PAGEOUT_STAT_BEFORE(i) \
1049 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
1050#define VM_PAGEOUT_STAT_AFTER(i) \
1051 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
1052
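/*
 * Illustrative example only (not part of the original source): with
 * VM_PAGEOUT_STAT_SIZE == 31 the bucket index wraps as
 *
 *	VM_PAGEOUT_STAT_BEFORE(0)  == 30	VM_PAGEOUT_STAT_AFTER(30) == 0
 *	VM_PAGEOUT_STAT_BEFORE(12) == 11	VM_PAGEOUT_STAT_AFTER(12) == 13
 *
 * so compute_memory_pressure() reports the bucket that was filled during the
 * previous one-second interval, and mach_vm_pressure_monitor() can walk up to
 * 30 seconds back without touching the bucket that is currently accumulating.
 */
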
15129b1c
A
1053#if VM_PAGE_BUCKETS_CHECK
1054int vm_page_buckets_check_interval = 10; /* in seconds */
1055#endif /* VM_PAGE_BUCKETS_CHECK */
1056
b0d623f7
A
1057/*
1058 * Called from compute_averages().
1059 */
1060void
1061compute_memory_pressure(
1062 __unused void *arg)
1063{
1064 unsigned int vm_pageout_next;
1065
15129b1c
A
1066#if VM_PAGE_BUCKETS_CHECK
1067 /* check the consistency of VM page buckets at regular interval */
1068 static int counter = 0;
1069 if ((++counter % vm_page_buckets_check_interval) == 0) {
1070 vm_page_buckets_check();
1071 }
1072#endif /* VM_PAGE_BUCKETS_CHECK */
1073
b0d623f7
A
1074 vm_memory_pressure =
1075 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed;
1076
1077 commpage_set_memory_pressure( vm_memory_pressure );
1078
1079 /* move "now" forward */
1080 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
1081 vm_pageout_stats[vm_pageout_next].considered = 0;
1082 vm_pageout_stats[vm_pageout_next].reclaimed = 0;
1083 vm_pageout_stat_now = vm_pageout_next;
1084}
1085
316670eb
A
1086
1087/*
1088 * IMPORTANT
1089 * mach_vm_ctl_page_free_wanted() is called indirectly, via
1090 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
1091 * it must be safe in the restricted stackshot context. Locks and/or
1092 * blocking are not allowable.
1093 */
b0d623f7
A
1094unsigned int
1095mach_vm_ctl_page_free_wanted(void)
1096{
1097 unsigned int page_free_target, page_free_count, page_free_wanted;
1098
1099 page_free_target = vm_page_free_target;
1100 page_free_count = vm_page_free_count;
1101 if (page_free_target > page_free_count) {
1102 page_free_wanted = page_free_target - page_free_count;
1103 } else {
1104 page_free_wanted = 0;
1105 }
1106
1107 return page_free_wanted;
1108}
1109
316670eb
A
1110
1111/*
1112 * IMPORTANT:
1113 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1114 * wait_for_pressure FALSE, so that code path must remain safe in the
1115 * restricted stackshot context. No blocking or locks are allowable
1116 * on that code path.
1117 */
1118
b0d623f7
A
1119kern_return_t
1120mach_vm_pressure_monitor(
1121 boolean_t wait_for_pressure,
1122 unsigned int nsecs_monitored,
1123 unsigned int *pages_reclaimed_p,
1124 unsigned int *pages_wanted_p)
1125{
1126 wait_result_t wr;
1127 unsigned int vm_pageout_then, vm_pageout_now;
1128 unsigned int pages_reclaimed;
1129
1130 /*
1131 * We don't take the vm_page_queue_lock here because we don't want
1132 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1133 * thread when it's trying to reclaim memory. We don't need fully
1134 * accurate monitoring anyway...
1135 */
1136
1137 if (wait_for_pressure) {
1138 /* wait until there's memory pressure */
1139 while (vm_page_free_count >= vm_page_free_target) {
1140 wr = assert_wait((event_t) &vm_page_free_wanted,
1141 THREAD_INTERRUPTIBLE);
1142 if (wr == THREAD_WAITING) {
1143 wr = thread_block(THREAD_CONTINUE_NULL);
1144 }
1145 if (wr == THREAD_INTERRUPTED) {
1146 return KERN_ABORTED;
1147 }
1148 if (wr == THREAD_AWAKENED) {
1149 /*
1150 * The memory pressure might have already
1151 * been relieved but let's not block again
1152 * and let's report that there was memory
1153 * pressure at some point.
1154 */
1155 break;
1156 }
1157 }
1158 }
1159
1160 /* provide the number of pages the system wants to reclaim */
1161 if (pages_wanted_p != NULL) {
1162 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1163 }
1164
1165 if (pages_reclaimed_p == NULL) {
1166 return KERN_SUCCESS;
1167 }
1168
1169 /* provide number of pages reclaimed in the last "nsecs_monitored" */
39037602
A
1170 vm_pageout_now = vm_pageout_stat_now;
1171 pages_reclaimed = 0;
1172 for (vm_pageout_then =
1173 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1174 vm_pageout_then != vm_pageout_now &&
1175 nsecs_monitored-- != 0;
1176 vm_pageout_then =
1177 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1178 pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed;
1179 }
b0d623f7
A
1180 *pages_reclaimed_p = pages_reclaimed;
1181
1182 return KERN_SUCCESS;
1183}
1184
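/*
 * Illustrative sketch only, compiled out (not part of the original source):
 * a non-blocking poll of the pressure monitor using the interface above,
 * asking for the pages reclaimed over the last 10 seconds.  The function
 * name is hypothetical.
 */
#if 0
static void
vm_pressure_monitor_poll_sketch(void)
{
	unsigned int pages_reclaimed = 0;
	unsigned int pages_wanted = 0;

	if (mach_vm_pressure_monitor(FALSE,	/* don't block waiting for pressure */
				     10,	/* last 10 seconds of statistics */
				     &pages_reclaimed,
				     &pages_wanted) == KERN_SUCCESS) {
		printf("vm pressure: %u pages reclaimed in the last 10s, %u still wanted\n",
		       pages_reclaimed, pages_wanted);
	}
}
#endif
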
b0d623f7 1185
316670eb 1186
39037602
A
1187#if DEVELOPMENT || DEBUG
1188
3e170ce0 1189static void
39037602
A
1190vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1191
1192/*
1193 * condition variable used to make sure there is
1194 * only a single sweep going on at a time
1195 */
1196boolean_t vm_pageout_disconnect_all_pages_active = FALSE;
1197
1198
1199void
1200vm_pageout_disconnect_all_pages()
1201{
1202 vm_page_lock_queues();
1203
1204 if (vm_pageout_disconnect_all_pages_active == TRUE) {
1205 vm_page_unlock_queues();
1206 return;
1207 }
1208 vm_pageout_disconnect_all_pages_active = TRUE;
1209 vm_page_unlock_queues();
1210
1211 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1212 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1213 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, vm_page_active_count);
1214
1215 vm_pageout_disconnect_all_pages_active = FALSE;
1216}
1217
1218
1219void
1220vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1221{
1222 vm_page_t m;
1223 vm_object_t t_object = NULL;
1224 vm_object_t l_object = NULL;
1225 vm_object_t m_object = NULL;
1226 int delayed_unlock = 0;
1227 int try_failed_count = 0;
1228 int disconnected_count = 0;
1229 int paused_count = 0;
1230 int object_locked_count = 0;
1231
1232 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_START,
1233 q, qcount, 0, 0, 0);
1234
1235 vm_page_lock_queues();
1236
1237 while (qcount && !vm_page_queue_empty(q)) {
1238
1239 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1240
1241 m = (vm_page_t) vm_page_queue_first(q);
1242 m_object = VM_PAGE_OBJECT(m);
1243
1244 /*
1245 * check to see if we currently are working
1246 * with the same object... if so, we've
1247 * already got the lock
1248 */
1249 if (m_object != l_object) {
1250 /*
1251 * the object associated with candidate page is
1252 * different from the one we were just working
1253 * with... dump the lock if we still own it
1254 */
1255 if (l_object != NULL) {
1256 vm_object_unlock(l_object);
1257 l_object = NULL;
1258 }
1259 if (m_object != t_object)
1260 try_failed_count = 0;
1261
1262 /*
1263 * Try to lock object; since we've already got the
1264 * page queues lock, we can only 'try' for this one.
1265 * if the 'try' fails, we need to do a mutex_pause
1266 * to allow the owner of the object lock a chance to
1267 * run...
1268 */
1269 if ( !vm_object_lock_try_scan(m_object)) {
1270
1271 if (try_failed_count > 20) {
1272 goto reenter_pg_on_q;
1273 }
1274 vm_page_unlock_queues();
1275 mutex_pause(try_failed_count++);
1276 vm_page_lock_queues();
1277 delayed_unlock = 0;
1278
1279 paused_count++;
1280
1281 t_object = m_object;
1282 continue;
1283 }
1284 object_locked_count++;
1285
1286 l_object = m_object;
1287 }
1288 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
1289 /*
1290 * put it back on the head of its queue
1291 */
1292 goto reenter_pg_on_q;
1293 }
1294 if (m->pmapped == TRUE) {
1295
1296 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1297
1298 disconnected_count++;
1299 }
1300reenter_pg_on_q:
1301 vm_page_queue_remove(q, m, vm_page_t, pageq);
1302 vm_page_queue_enter(q, m, vm_page_t, pageq);
1303
1304 qcount--;
1305 try_failed_count = 0;
1306
1307 if (delayed_unlock++ > 128) {
1308
1309 if (l_object != NULL) {
1310 vm_object_unlock(l_object);
1311 l_object = NULL;
1312 }
1313 lck_mtx_yield(&vm_page_queue_lock);
1314 delayed_unlock = 0;
1315 }
1316 }
1317 if (l_object != NULL) {
1318 vm_object_unlock(l_object);
1319 l_object = NULL;
1320 }
1321 vm_page_unlock_queues();
1322
1323 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS)) | DBG_FUNC_END,
1324 q, disconnected_count, object_locked_count, paused_count, 0);
1325}
1326
1327#endif
1328
1329
1330static void
1331vm_pageout_page_queue(vm_page_queue_head_t *, int);
3e170ce0
A
1332
1333/*
1334 * condition variable used to make sure there is
1335 * only a single sweep going on at a time
1336 */
1337boolean_t vm_pageout_anonymous_pages_active = FALSE;
1338
1339
1340void
1341vm_pageout_anonymous_pages()
1342{
39037602 1343 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3e170ce0
A
1344
1345 vm_page_lock_queues();
1346
1347 if (vm_pageout_anonymous_pages_active == TRUE) {
1348 vm_page_unlock_queues();
1349 return;
1350 }
1351 vm_pageout_anonymous_pages_active = TRUE;
1352 vm_page_unlock_queues();
1353
1354 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count);
1355 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count);
1356 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count);
1357
39037602
A
1358 if (VM_CONFIG_SWAP_IS_PRESENT)
1359 vm_consider_swapping();
3e170ce0
A
1360
1361 vm_page_lock_queues();
1362 vm_pageout_anonymous_pages_active = FALSE;
1363 vm_page_unlock_queues();
1364 }
1365}
1366
1367
1368void
39037602 1369vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
3e170ce0
A
1370{
1371 vm_page_t m;
1372 vm_object_t t_object = NULL;
1373 vm_object_t l_object = NULL;
1374 vm_object_t m_object = NULL;
1375 int delayed_unlock = 0;
1376 int try_failed_count = 0;
1377 int refmod_state;
1378 int pmap_options;
1379 struct vm_pageout_queue *iq;
39037602 1380 ppnum_t phys_page;
3e170ce0
A
1381
1382
1383 iq = &vm_pageout_queue_internal;
1384
1385 vm_page_lock_queues();
1386
39037602 1387 while (qcount && !vm_page_queue_empty(q)) {
3e170ce0 1388
39037602 1389 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3e170ce0
A
1390
1391 if (VM_PAGE_Q_THROTTLED(iq)) {
1392
1393 if (l_object != NULL) {
1394 vm_object_unlock(l_object);
1395 l_object = NULL;
1396 }
1397 iq->pgo_draining = TRUE;
1398
1399 assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1400 vm_page_unlock_queues();
1401
1402 thread_block(THREAD_CONTINUE_NULL);
1403
1404 vm_page_lock_queues();
1405 delayed_unlock = 0;
1406 continue;
1407 }
39037602
A
1408 m = (vm_page_t) vm_page_queue_first(q);
1409 m_object = VM_PAGE_OBJECT(m);
3e170ce0
A
1410
1411 /*
1412 * check to see if we currently are working
1413 * with the same object... if so, we've
1414 * already got the lock
1415 */
1416 if (m_object != l_object) {
1417 if ( !m_object->internal)
1418 goto reenter_pg_on_q;
1419
1420 /*
1421 * the object associated with candidate page is
1422 * different from the one we were just working
1423 * with... dump the lock if we still own it
1424 */
1425 if (l_object != NULL) {
1426 vm_object_unlock(l_object);
1427 l_object = NULL;
1428 }
1429 if (m_object != t_object)
1430 try_failed_count = 0;
1431
1432 /*
1433 * Try to lock object; since we've already got the
1434 * page queues lock, we can only 'try' for this one.
1435 * if the 'try' fails, we need to do a mutex_pause
1436 * to allow the owner of the object lock a chance to
1437 * run...
1438 */
1439 if ( !vm_object_lock_try_scan(m_object)) {
1440
1441 if (try_failed_count > 20) {
1442 goto reenter_pg_on_q;
1443 }
1444 vm_page_unlock_queues();
1445 mutex_pause(try_failed_count++);
1446 vm_page_lock_queues();
1447 delayed_unlock = 0;
1448
1449 t_object = m_object;
1450 continue;
1451 }
1452 l_object = m_object;
1453 }
39037602 1454 if ( !m_object->alive || m->encrypted_cleaning || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
3e170ce0
A
1455 /*
1456 * page is not to be cleaned
1457 * put it back on the head of its queue
1458 */
1459 goto reenter_pg_on_q;
1460 }
39037602
A
1461 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1462
3e170ce0 1463 if (m->reference == FALSE && m->pmapped == TRUE) {
39037602 1464 refmod_state = pmap_get_refmod(phys_page);
3e170ce0
A
1465
1466 if (refmod_state & VM_MEM_REFERENCED)
1467 m->reference = TRUE;
1468 if (refmod_state & VM_MEM_MODIFIED) {
1469 SET_PAGE_DIRTY(m, FALSE);
1470 }
1471 }
1472 if (m->reference == TRUE) {
1473 m->reference = FALSE;
39037602 1474 pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
3e170ce0
A
1475 goto reenter_pg_on_q;
1476 }
1477 if (m->pmapped == TRUE) {
1478 if (m->dirty || m->precious) {
1479 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1480 } else {
1481 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1482 }
39037602 1483 refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
3e170ce0
A
1484 if (refmod_state & VM_MEM_MODIFIED) {
1485 SET_PAGE_DIRTY(m, FALSE);
1486 }
1487 }
1488 if ( !m->dirty && !m->precious) {
1489 vm_page_unlock_queues();
1490 VM_PAGE_FREE(m);
1491 vm_page_lock_queues();
1492 delayed_unlock = 0;
1493
1494 goto next_pg;
1495 }
1496 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1497
1498 if (!m_object->pager_initialized) {
1499
1500 vm_page_unlock_queues();
1501
1502 vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1503
1504 if (!m_object->pager_initialized)
1505 vm_object_compressor_pager_create(m_object);
1506
1507 vm_page_lock_queues();
1508 delayed_unlock = 0;
1509 }
1510 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL)
1511 goto reenter_pg_on_q;
1512 /*
1513 * vm_object_compressor_pager_create will drop the object lock
1514 * which means 'm' may no longer be valid to use
1515 */
1516 continue;
1517 }
1518 /*
1519 * we've already factored out pages in the laundry which
1520 * means this page can't be on the pageout queue so it's
1521 * safe to do the vm_page_queues_remove
1522 */
39037602 1523 vm_page_queues_remove(m, TRUE);
3e170ce0 1524
39037602 1525 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
3e170ce0 1526
39037602 1527 vm_pageout_cluster(m, FALSE, FALSE);
3e170ce0
A
1528
1529 goto next_pg;
1530
1531reenter_pg_on_q:
39037602
A
1532 vm_page_queue_remove(q, m, vm_page_t, pageq);
1533 vm_page_queue_enter(q, m, vm_page_t, pageq);
3e170ce0
A
1534next_pg:
1535 qcount--;
1536 try_failed_count = 0;
1537
1538 if (delayed_unlock++ > 128) {
1539
1540 if (l_object != NULL) {
1541 vm_object_unlock(l_object);
1542 l_object = NULL;
1543 }
1544 lck_mtx_yield(&vm_page_queue_lock);
1545 delayed_unlock = 0;
1546 }
1547 }
1548 if (l_object != NULL) {
1549 vm_object_unlock(l_object);
1550 l_object = NULL;
1551 }
1552 vm_page_unlock_queues();
1553}
1554
1555
1556
316670eb
A
1557/*
1558 * function in BSD to apply I/O throttle to the pageout thread
1559 */
1560extern void vm_pageout_io_throttle(void);
1561
39037602 1562#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
39236c6e
A
1563 MACRO_BEGIN \
1564 /* \
1565 * If a "reusable" page somehow made it back into \
1566 * the active queue, it's been re-used and is not \
1567 * quite re-usable. \
1568 * If the VM object was "all_reusable", consider it \
1569 * as "all re-used" instead of converting it to \
1570 * "partially re-used", which could be expensive. \
1571 */ \
39037602 1572 assert(VM_PAGE_OBJECT((m)) == (obj)); \
39236c6e 1573 if ((m)->reusable || \
39037602
A
1574 (obj)->all_reusable) { \
1575 vm_object_reuse_pages((obj), \
39236c6e
A
1576 (m)->offset, \
1577 (m)->offset + PAGE_SIZE_64, \
1578 FALSE); \
1579 } \
1580 MACRO_END
1581
1582
1583#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
6d2010ae
A
1584#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1585
1586#define FCS_IDLE 0
1587#define FCS_DELAYED 1
1588#define FCS_DEADLOCK_DETECTED 2
1589
1590struct flow_control {
1591 int state;
1592 mach_timespec_t ts;
1593};
1594
39037602
A
1595#if CONFIG_BACKGROUND_QUEUE
1596uint64_t vm_pageout_considered_bq_internal = 0;
1597uint64_t vm_pageout_considered_bq_external = 0;
1598uint64_t vm_pageout_rejected_bq_internal = 0;
1599uint64_t vm_pageout_rejected_bq_external = 0;
1600#endif
316670eb 1601uint32_t vm_pageout_considered_page = 0;
39236c6e 1602uint32_t vm_page_filecache_min = 0;
316670eb 1603
39236c6e 1604#define ANONS_GRABBED_LIMIT 2
6d2010ae 1605
39037602
A
1606#if CONFIG_SECLUDED_MEMORY
1607extern vm_page_t vm_page_grab_secluded(void);
1608uint64_t vm_pageout_freed_from_secluded = 0;
1609uint64_t vm_pageout_secluded_reactivated = 0; /* debugging; how many secluded pages are found to be referenced on pageout (and are therefore reactivated) */
1610uint64_t vm_pageout_secluded_burst_count = 0;
1611#endif /* CONFIG_SECLUDED_MEMORY */
1612
6d2010ae
A
1613/*
1614 * vm_pageout_scan does the dirty work for the pageout daemon.
316670eb
A
1615 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
1616 * held and vm_page_free_wanted == 0.
6d2010ae 1617 */
1c79356b
A
1618void
1619vm_pageout_scan(void)
1620{
91447636
A
1621 unsigned int loop_count = 0;
1622 unsigned int inactive_burst_count = 0;
1623 unsigned int active_burst_count = 0;
2d21ac55
A
1624 unsigned int reactivated_this_call;
1625 unsigned int reactivate_limit;
1626 vm_page_t local_freeq = NULL;
55e303ae 1627 int local_freed = 0;
2d21ac55 1628 int delayed_unlock;
6d2010ae 1629 int delayed_unlock_limit = 0;
91447636
A
1630 int refmod_state = 0;
1631 int vm_pageout_deadlock_target = 0;
1632 struct vm_pageout_queue *iq;
1633 struct vm_pageout_queue *eq;
2d21ac55 1634 struct vm_speculative_age_q *sq;
b0d623f7 1635 struct flow_control flow_control = { 0, { 0, 0 } };
91447636 1636 boolean_t inactive_throttled = FALSE;
2d21ac55 1637 boolean_t try_failed;
6d2010ae
A
1638 mach_timespec_t ts;
1639 unsigned int msecs = 0;
91447636 1640 vm_object_t object;
2d21ac55 1641 vm_object_t last_object_tried;
2d21ac55
A
1642 uint32_t catch_up_count = 0;
1643 uint32_t inactive_reclaim_run;
316670eb
A
1644 boolean_t exceeded_burst_throttle;
1645 boolean_t grab_anonymous = FALSE;
39236c6e
A
1646 boolean_t force_anonymous = FALSE;
1647 int anons_grabbed = 0;
39037602
A
1648 int page_prev_q_state = 0;
1649 boolean_t requeue_insert_first = FALSE;
1650#if CONFIG_BACKGROUND_QUEUE
1651 boolean_t ignore_reference = FALSE;
1652#endif
1653#if CONFIG_SECLUDED_MEMORY
1654 boolean_t ignore_reference_secluded;
1655#endif /* CONFIG_SECLUDED_MEMORY */
6d2010ae
A
1656 int cache_evict_throttle = 0;
1657 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
fe8ab488 1658 int force_purge = 0;
3e170ce0
A
1659#define DELAY_SPECULATIVE_AGE 1000
1660 int delay_speculative_age = 0;
39037602 1661 vm_object_t m_object = VM_OBJECT_NULL;
fe8ab488
A
1662
1663#if VM_PRESSURE_EVENTS
39236c6e 1664 vm_pressure_level_t pressure_level;
fe8ab488 1665#endif /* VM_PRESSURE_EVENTS */
6d2010ae 1666
3e170ce0 1667 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
6d2010ae
A
1668 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
1669 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
91447636
A
1670
1671 flow_control.state = FCS_IDLE;
1672 iq = &vm_pageout_queue_internal;
1673 eq = &vm_pageout_queue_external;
2d21ac55
A
1674 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
1675
1c79356b
A
1676
1677 XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
1678
2d21ac55
A
1679
1680 vm_page_lock_queues();
1681 delayed_unlock = 1; /* must be nonzero if Qs are locked, 0 if unlocked */
1682
1683 /*
1684 * Calculate the max number of referenced pages on the inactive
1685 * queue that we will reactivate.
1686 */
1687 reactivated_this_call = 0;
1688 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
1689 vm_page_inactive_count);
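	/*
	 * reactivate_limit caps how many referenced pages this call will put
	 * back on the active queue; once reactivated_this_call reaches it,
	 * referenced pages are no longer reactivated and continue to be
	 * considered for reclaim (see the reactivate_limit check further down).
	 */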
1690 inactive_reclaim_run = 0;
1691
316670eb 1692 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2d21ac55 1693
6d2010ae 1694 /*
1c79356b
A
1695 * We want to gradually dribble pages from the active queue
1696 * to the inactive queue. If we let the inactive queue get
1697 * very small, and then suddenly dump many pages into it,
1698 * those pages won't get a sufficient chance to be referenced
1699 * before we start taking them from the inactive queue.
1700 *
6d2010ae
A
1701 * We must limit the rate at which we send pages to the pagers
1702 * so that we don't tie up too many pages in the I/O queues.
1703 * We implement a throttling mechanism using the laundry count
1704 * to limit the number of pages outstanding to the default
1705 * and external pagers. We can bypass the throttles and look
1706 * for clean pages if the pageout queues don't drain in a timely
1707 * fashion since this may indicate that the pageout paths are
1708 * stalled waiting for memory, which only we can provide.
1c79356b 1709 */
91447636 1710
1c79356b 1711
91447636 1712Restart:
39037602
A
1713
1714
2d21ac55 1715 assert(delayed_unlock!=0);
39236c6e 1716
91447636
A
1717 /*
 1718 * Recalculate vm_page_inactive_target.
1719 */
1720 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2d21ac55
A
1721 vm_page_inactive_count +
1722 vm_page_speculative_count);
316670eb 1723
39236c6e
A
1724 vm_page_anonymous_min = vm_page_inactive_target / 20;
1725
316670eb 1726
2d21ac55
A
1727 /*
 1728 * don't want to wake the pageout_scan thread up every time we fall below
1729 * the targets... set a low water mark at 0.25% below the target
1730 */
1731 vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
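	/*
	 * e.g. with a (purely hypothetical) vm_page_inactive_target of
	 * 100000 pages, vm_page_inactive_min works out to
	 * 100000 - (100000 / 400) = 99750, i.e. 0.25% below the target.
	 */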
1c79356b 1732
6d2010ae
A
1733 if (vm_page_speculative_percentage > 50)
1734 vm_page_speculative_percentage = 50;
1735 else if (vm_page_speculative_percentage <= 0)
1736 vm_page_speculative_percentage = 1;
1737
2d21ac55
A
1738 vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1739 vm_page_inactive_count);
6d2010ae 1740
2d21ac55
A
1741 object = NULL;
1742 last_object_tried = NULL;
1743 try_failed = FALSE;
1744
1745 if ((vm_page_inactive_count + vm_page_speculative_count) < VM_PAGE_INACTIVE_HEALTHY_LIMIT(vm_page_active_count))
1746 catch_up_count = vm_page_inactive_count + vm_page_speculative_count;
1747 else
1748 catch_up_count = 0;
39236c6e 1749
55e303ae 1750 for (;;) {
91447636 1751 vm_page_t m;
1c79356b 1752
2d21ac55 1753 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
1c79356b 1754
39037602
A
1755#if CONFIG_SECLUDED_MEMORY
1756 if (vm_page_secluded_count > vm_page_secluded_target &&
1757 object != NULL) {
1758 vm_object_unlock(object);
1759 object = NULL;
1760 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
1761 }
1762
1763 /*
1764 * Deal with secluded_q overflow.
1765 */
1766 if (vm_page_secluded_count > vm_page_secluded_target &&
1767 secluded_aging_policy == SECLUDED_AGING_FIFO) {
1768 unsigned int secluded_overflow;
1769 vm_page_t secluded_page;
1770
1771 /*
1772 * SECLUDED_AGING_FIFO:
1773 * No aging, just reclaim the excess pages
1774 * at the tail of the secluded queue.
1775 * We're reclaiming pages and we're not hogging
1776 * any global lock, so no need for throttling.
1777 */
1778
1779 secluded_overflow = (vm_page_secluded_count -
1780 vm_page_secluded_target);
1781 /* transfer to free queue */
1782 vm_page_unlock_queues();
1783 while (secluded_overflow--) {
1784 secluded_page = vm_page_grab_secluded();
1785 if (secluded_page == VM_PAGE_NULL) {
1786 break;
1787 }
1788 assert(secluded_page->busy);
1789 assert(secluded_page->pageq.next == 0 &&
1790 secluded_page->pageq.prev == 0);
1791
1792 secluded_page->snext = local_freeq;
1793 local_freeq = secluded_page;
1794 local_freed++;
1795 secluded_page = VM_PAGE_NULL;
1796 }
1797 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1798 secluded_aging_policy == SECLUDED_AGING_ALONG_ACTIVE) {
1799 unsigned int secluded_overflow;
1800 vm_page_t secluded_page;
1801
1802 /*
1803 * SECLUDED_AGING_ALONG_ACTIVE:
1804 * There might be free pages at the tail of the
1805 * secluded queue:
1806 * just move them to the free queue (in batches).
1807 * There can also be an excessive number of "inuse"
1808 * pages:
1809 * we age them by resetting their "referenced" bit and
1810 * moving them to the inactive queue. Their trip
1811 * through the secluded queue was equivalent to a trip
1812 * through the active queue.
1813 *
1814 * We're holding the page queue lock, so we need
1815 * to throttle and give someone else a chance to
1816 * grab that lock if needed.
1817 *
1818 * We're also limiting the number of secluded "inuse"
1819 * pages that get moved to the inactive queue, using
 1820 * the same "active_burst_count" method we use when
1821 * balancing the active and inactive queues, because
1822 * there can be a large number
1823 * of extra "inuse" pages and handling them gets in the
1824 * way of actually reclaiming memory.
1825 */
1826
1827 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1828 vm_page_secluded_count_inuse);
1829 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT;
1830 delayed_unlock = 1;
1831 secluded_overflow = (vm_page_secluded_count -
1832 vm_page_secluded_target);
1833 while (secluded_overflow-- > 0 &&
1834 vm_page_secluded_count > vm_page_secluded_target) {
1835 assert((vm_page_secluded_count_free +
1836 vm_page_secluded_count_inuse) ==
1837 vm_page_secluded_count);
d190cdc3 1838 secluded_page = vm_page_queue_first(&vm_page_queue_secluded);
39037602
A
1839 assert(secluded_page->vm_page_q_state ==
1840 VM_PAGE_ON_SECLUDED_Q);
d190cdc3 1841 vm_page_queues_remove(secluded_page, FALSE);
39037602
A
1842 assert(!secluded_page->fictitious);
1843 assert(!VM_PAGE_WIRED(secluded_page));
1844 if (secluded_page->vm_page_object == 0) {
1845 /* transfer to free queue */
1846 assert(secluded_page->busy);
39037602
A
1847 secluded_page->snext = local_freeq;
1848 local_freeq = secluded_page;
1849 local_freed++;
1850 } else {
39037602
A
1851 /* transfer to head of inactive queue */
1852 pmap_clear_refmod_options(
1853 VM_PAGE_GET_PHYS_PAGE(secluded_page),
1854 VM_MEM_REFERENCED,
1855 PMAP_OPTIONS_NOFLUSH,
1856 (void *)NULL);
1857 vm_page_enqueue_inactive(secluded_page,
1858 FALSE);
1859 if (active_burst_count-- == 0) {
1860 vm_pageout_secluded_burst_count++;
1861 break;
1862 }
1863 }
1864 secluded_page = VM_PAGE_NULL;
1865 if (delayed_unlock++ > delayed_unlock_limit) {
1866 if (local_freeq) {
1867 vm_page_unlock_queues();
1868 VM_DEBUG_EVENT(
1869 vm_pageout_freelist,
1870 VM_PAGEOUT_FREELIST,
1871 DBG_FUNC_START,
1872 vm_page_free_count,
1873 local_freed,
1874 delayed_unlock_limit,
1875 1);
1876 vm_page_free_list(local_freeq,
1877 TRUE);
1878 VM_DEBUG_EVENT(
1879 vm_pageout_freelist,
1880 VM_PAGEOUT_FREELIST,
1881 DBG_FUNC_END,
1882 vm_page_free_count,
1883 0, 0, 1);
1884 local_freeq = NULL;
1885 local_freed = 0;
1886 vm_page_lock_queues();
1887 } else {
1888 lck_mtx_yield(&vm_page_queue_lock);
1889 }
1890 delayed_unlock = 1;
1891 }
1892 }
1893 delayed_unlock = 1;
1894 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1895 secluded_aging_policy == SECLUDED_AGING_AFTER_INACTIVE) {
1896 /*
1897 * SECLUDED_AGING_AFTER_INACTIVE:
1898 * No balancing needed at this point: when we get to
1899 * the "choose a victim" part below, we'll consider the
1900 * extra secluded pages before any inactive page.
1901 */
1902 } else if (vm_page_secluded_count > vm_page_secluded_target &&
1903 secluded_aging_policy == SECLUDED_AGING_BEFORE_ACTIVE) {
1904 unsigned int secluded_overflow;
1905 vm_page_t secluded_page;
1906
1907 /*
1908 * SECLUDED_AGING_BEFORE_ACTIVE:
1909 * Excess secluded pages go to the active queue and
1910 * will later go to the inactive queue.
1911 */
1912 active_burst_count = MIN(vm_pageout_burst_active_throttle,
1913 vm_page_secluded_count_inuse);
1914 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT;
1915 delayed_unlock = 1;
1916 secluded_overflow = (vm_page_secluded_count -
1917 vm_page_secluded_target);
1918 while (secluded_overflow-- > 0 &&
1919 vm_page_secluded_count > vm_page_secluded_target) {
1920 assert((vm_page_secluded_count_free +
1921 vm_page_secluded_count_inuse) ==
1922 vm_page_secluded_count);
d190cdc3 1923 secluded_page = vm_page_queue_first(&vm_page_queue_secluded);
39037602
A
1924 assert(secluded_page->vm_page_q_state ==
1925 VM_PAGE_ON_SECLUDED_Q);
d190cdc3 1926 vm_page_queues_remove(secluded_page, FALSE);
39037602
A
1927 assert(!secluded_page->fictitious);
1928 assert(!VM_PAGE_WIRED(secluded_page));
1929 if (secluded_page->vm_page_object == 0) {
1930 /* transfer to free queue */
1931 assert(secluded_page->busy);
39037602
A
1932 secluded_page->snext = local_freeq;
1933 local_freeq = secluded_page;
1934 local_freed++;
1935 } else {
39037602
A
1936 /* transfer to head of active queue */
1937 vm_page_enqueue_active(secluded_page,
1938 FALSE);
1939 if (active_burst_count-- == 0) {
1940 vm_pageout_secluded_burst_count++;
1941 break;
1942 }
1943 }
1944 secluded_page = VM_PAGE_NULL;
1945 if (delayed_unlock++ > delayed_unlock_limit) {
1946 if (local_freeq) {
1947 vm_page_unlock_queues();
1948 VM_DEBUG_EVENT(
1949 vm_pageout_freelist,
1950 VM_PAGEOUT_FREELIST,
1951 DBG_FUNC_START,
1952 vm_page_free_count,
1953 local_freed,
1954 delayed_unlock_limit,
1955 1);
1956 vm_page_free_list(local_freeq,
1957 TRUE);
1958 VM_DEBUG_EVENT(
1959 vm_pageout_freelist,
1960 VM_PAGEOUT_FREELIST,
1961 DBG_FUNC_END,
1962 vm_page_free_count,
1963 0, 0, 1);
1964 local_freeq = NULL;
1965 local_freed = 0;
1966 vm_page_lock_queues();
1967 } else {
1968 lck_mtx_yield(&vm_page_queue_lock);
1969 }
1970 delayed_unlock = 1;
1971 }
1972 }
1973 delayed_unlock = 1;
1974 } else if (vm_page_secluded_count > vm_page_secluded_target) {
1975 panic("unsupported secluded_aging_policy %d\n",
1976 secluded_aging_policy);
1977 }
1978 if (local_freeq) {
1979 vm_page_unlock_queues();
1980 VM_DEBUG_EVENT(vm_pageout_freelist,
1981 VM_PAGEOUT_FREELIST,
1982 DBG_FUNC_START,
1983 vm_page_free_count,
1984 local_freed,
1985 0,
1986 0);
1987 vm_page_free_list(local_freeq, TRUE);
1988 VM_DEBUG_EVENT(vm_pageout_freelist,
1989 VM_PAGEOUT_FREELIST,
1990 DBG_FUNC_END,
1991 vm_page_free_count, 0, 0, 0);
1992 local_freeq = NULL;
1993 local_freed = 0;
1994 vm_page_lock_queues();
1995 }
1996#endif /* CONFIG_SECLUDED_MEMORY */
1997
3e170ce0
A
1998 assert(delayed_unlock);
1999
6d2010ae
A
2000 if (vm_upl_wait_for_pages < 0)
2001 vm_upl_wait_for_pages = 0;
91447636 2002
6d2010ae
A
2003 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
2004
2005 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX)
2006 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
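	/*
	 * The loops below batch work done under the page-queue lock:
	 * reclaimed pages are chained on local_freeq and, once delayed_unlock
	 * exceeds delayed_unlock_limit, the lock is dropped, the batch is
	 * handed to vm_page_free_list() and the lock is re-taken, roughly:
	 *
	 *	if (delayed_unlock++ > delayed_unlock_limit) {
	 *		if (local_freeq) {
	 *			vm_page_unlock_queues();
	 *			vm_page_free_list(local_freeq, TRUE);
	 *			local_freeq = NULL;
	 *			local_freed = 0;
	 *			vm_page_lock_queues();
	 *		} else
	 *			lck_mtx_yield(&vm_page_queue_lock);
	 *		delayed_unlock = 1;
	 *	}
	 */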
91447636 2007
1c79356b 2008 /*
6d2010ae 2009 * Move pages from active to inactive if we're below the target
1c79356b 2010 */
316670eb 2011 /* if we are trying to make clean, we need to make sure we actually have inactive - mj */
b0d623f7 2012 if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
316670eb 2013 goto done_moving_active_pages;
2d21ac55 2014
6d2010ae
A
2015 if (object != NULL) {
2016 vm_object_unlock(object);
2017 object = NULL;
2018 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2019 }
2020 /*
 2021 * Don't sweep through the active queue for more pages than the
 2022 * throttle allows; it should be kept relatively low
2023 */
39236c6e 2024 active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
6d2010ae
A
2025
2026 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
2027 vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
2028
2029 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
2030 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2031 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
39236c6e
A
2032 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START);
2033
2d21ac55 2034
39037602 2035 while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) {
1c79356b 2036
1c79356b 2037 vm_pageout_active++;
55e303ae 2038
39037602 2039 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
91447636 2040
39037602 2041 assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
91447636 2042 assert(!m->laundry);
39037602
A
2043 assert(VM_PAGE_OBJECT(m) != kernel_object);
2044 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2d21ac55
A
2045
2046 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1c79356b 2047
fe8ab488
A
2048 /*
2049 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2050 *
2051 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2052 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
 2053 * new reference happens. If no further references happen on the page after that remote TLB flush,
2054 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2055 * by pageout_scan, which is just fine since the last reference would have happened quite far
2056 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2057 * have happened before we moved the page
2058 */
39037602 2059 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2d21ac55 2060
fe8ab488
A
2061 /*
2062 * The page might be absent or busy,
2063 * but vm_page_deactivate can handle that.
2064 * FALSE indicates that we don't want a H/W clear reference
2065 */
2066 vm_page_deactivate_internal(m, FALSE);
1c79356b 2067
fe8ab488 2068 if (delayed_unlock++ > delayed_unlock_limit) {
6d2010ae 2069
fe8ab488
A
2070 if (local_freeq) {
2071 vm_page_unlock_queues();
91447636 2072
fe8ab488
A
2073 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2074 vm_page_free_count, local_freed, delayed_unlock_limit, 1);
2075
2076 vm_page_free_list(local_freeq, TRUE);
2077
2078 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2079 vm_page_free_count, 0, 0, 1);
6d2010ae 2080
fe8ab488
A
2081 local_freeq = NULL;
2082 local_freed = 0;
b0d623f7 2083 vm_page_lock_queues();
fe8ab488
A
2084 } else {
2085 lck_mtx_yield(&vm_page_queue_lock);
39236c6e 2086 }
fe8ab488
A
2087
2088 delayed_unlock = 1;
91447636 2089
91447636 2090 /*
fe8ab488
A
2091 * continue the while loop processing
2092 * the active queue... need to hold
2093 * the page queues lock
91447636 2094 */
55e303ae 2095 }
1c79356b 2096 }
91447636 2097
6d2010ae
A
2098 VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
2099 vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
39236c6e 2100 memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END);
91447636
A
2101
2102 /**********************************************************************
2103 * above this point we're playing with the active queue
2104 * below this point we're playing with the throttling mechanisms
2105 * and the inactive queue
2106 **********************************************************************/
2107
2d21ac55 2108done_moving_active_pages:
91447636 2109
39037602
A
2110#if CONFIG_BACKGROUND_QUEUE
2111 if ((vm_page_free_count + local_freed >= vm_page_free_target) &&
2112 ((vm_page_background_mode < VM_PAGE_BG_LEVEL_2) || (vm_page_background_count <= vm_page_background_target)))
2113#else
2114 if (vm_page_free_count + local_freed >= vm_page_free_target)
2115#endif
2116 {
91447636
A
2117 if (object != NULL) {
2118 vm_object_unlock(object);
2119 object = NULL;
2120 }
2d21ac55
A
2121 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2122
3e170ce0
A
2123 vm_page_unlock_queues();
2124
55e303ae 2125 if (local_freeq) {
6d2010ae
A
2126
2127 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2128 vm_page_free_count, local_freed, delayed_unlock_limit, 2);
2129
316670eb 2130 vm_page_free_list(local_freeq, TRUE);
55e303ae 2131
6d2010ae
A
2132 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2133 vm_page_free_count, local_freed, 0, 2);
2134
2d21ac55 2135 local_freeq = NULL;
55e303ae
A
2136 local_freed = 0;
2137 }
3e170ce0
A
2138 vm_consider_waking_compactor_swapper();
2139
2140 vm_page_lock_queues();
2141
316670eb
A
2142 /*
2143 * make sure the pageout I/O threads are running
2144 * throttled in case there are still requests
2145 * in the laundry... since we have met our targets
2146 * we don't need the laundry to be cleaned in a timely
2147 * fashion... so let's avoid interfering with foreground
2148 * activity
2149 */
2150 vm_pageout_adjust_io_throttles(iq, eq, TRUE);
2151
2d21ac55 2152 /*
6d2010ae 2153 * recalculate vm_page_inactive_target
593a1d5f
A
2154 */
2155 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2156 vm_page_inactive_count +
2157 vm_page_speculative_count);
2d21ac55 2158 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
39037602 2159 !vm_page_queue_empty(&vm_page_queue_active)) {
6d2010ae
A
2160 /*
2161 * inactive target still not met... keep going
2162 * until we get the queues balanced...
2163 */
2d21ac55 2164 continue;
6d2010ae 2165 }
b0d623f7 2166 lck_mtx_lock(&vm_page_queue_free_lock);
55e303ae 2167
0b4e3aa0 2168 if ((vm_page_free_count >= vm_page_free_target) &&
2d21ac55 2169 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
6d2010ae
A
2170 /*
2171 * done - we have met our target *and*
2172 * there is no one waiting for a page.
2173 */
316670eb 2174return_from_scan:
2d21ac55
A
2175 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2176
3e170ce0 2177 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
39236c6e 2178 vm_pageout_inactive, vm_pageout_inactive_used, 0, 0);
3e170ce0 2179 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
6d2010ae
A
2180 vm_pageout_speculative_clean, vm_pageout_inactive_clean,
2181 vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
2182
91447636 2183 return;
0b4e3aa0 2184 }
b0d623f7 2185 lck_mtx_unlock(&vm_page_queue_free_lock);
1c79356b 2186 }
b0d623f7 2187
2d21ac55 2188 /*
b0d623f7
A
2189 * Before anything, we check if we have any ripe volatile
2190 * objects around. If so, try to purge the first object.
2191 * If the purge fails, fall through to reclaim a page instead.
 2192 * If the purge succeeds, go back to the top and re-evaluate
2193 * the new memory situation.
2d21ac55 2194 */
fe8ab488 2195
2d21ac55 2196 assert (available_for_purge>=0);
fe8ab488 2197 force_purge = 0; /* no force-purging */
39236c6e 2198
fe8ab488
A
2199#if VM_PRESSURE_EVENTS
2200 pressure_level = memorystatus_vm_pressure_level;
6d2010ae 2201
fe8ab488 2202 if (pressure_level > kVMPressureNormal) {
39236c6e 2203
39236c6e
A
2204 if (pressure_level >= kVMPressureCritical) {
2205 force_purge = memorystatus_purge_on_critical;
2206 } else if (pressure_level >= kVMPressureUrgent) {
2207 force_purge = memorystatus_purge_on_urgent;
2208 } else if (pressure_level >= kVMPressureWarning) {
2209 force_purge = memorystatus_purge_on_warning;
39236c6e 2210 }
fe8ab488
A
2211 }
2212#endif /* VM_PRESSURE_EVENTS */
2213
2214 if (available_for_purge || force_purge) {
2215
2216 if (object != NULL) {
2217 vm_object_unlock(object);
2218 object = NULL;
2219 }
2220
2221 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
2222
2223 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
2224 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
3e170ce0 2225 vm_pageout_purged_objects++;
6d2010ae 2226 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
39236c6e 2227 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
b0d623f7
A
2228 continue;
2229 }
6d2010ae 2230 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
39236c6e 2231 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2d21ac55 2232 }
fe8ab488 2233
39037602 2234 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
2d21ac55 2235 /*
6d2010ae 2236 * try to pull pages from the aging bins...
2d21ac55
A
2237 * see vm_page.h for an explanation of how
2238 * this mechanism works
2239 */
2240 struct vm_speculative_age_q *aq;
2d21ac55 2241 boolean_t can_steal = FALSE;
b0d623f7 2242 int num_scanned_queues;
2d21ac55
A
2243
2244 aq = &vm_page_queue_speculative[speculative_steal_index];
2245
b0d623f7 2246 num_scanned_queues = 0;
39037602 2247 while (vm_page_queue_empty(&aq->age_q) &&
b0d623f7 2248 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2d21ac55
A
2249
2250 speculative_steal_index++;
2251
2252 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
2253 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2254
2255 aq = &vm_page_queue_speculative[speculative_steal_index];
2256 }
b0d623f7 2257
6d2010ae 2258 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
b0d623f7
A
2259 /*
2260 * XXX We've scanned all the speculative
2261 * queues but still haven't found one
2262 * that is not empty, even though
2263 * vm_page_speculative_count is not 0.
6d2010ae
A
2264 *
2265 * report the anomaly...
b0d623f7 2266 */
b0d623f7
A
2267 printf("vm_pageout_scan: "
2268 "all speculative queues empty "
2269 "but count=%d. Re-adjusting.\n",
2270 vm_page_speculative_count);
6d2010ae 2271 if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
b0d623f7
A
2272 vm_page_speculative_count_drift_max = vm_page_speculative_count;
2273 vm_page_speculative_count_drifts++;
39037602
A
2274#if DEVELOPMENT || DEBUG
2275 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2276#endif /* DEVELOPMENT || DEBUG */
b0d623f7
A
2277 /* readjust... */
2278 vm_page_speculative_count = 0;
2279 /* ... and continue */
2280 continue;
2281 }
2282
2d21ac55
A
2283 if (vm_page_speculative_count > vm_page_speculative_target)
2284 can_steal = TRUE;
2285 else {
3e170ce0
A
2286 if (!delay_speculative_age) {
2287 mach_timespec_t ts_fully_aged;
2d21ac55 2288
3e170ce0
A
2289 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
2290 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
2291 * 1000 * NSEC_PER_USEC;
55e303ae 2292
3e170ce0
A
2293 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2294
2295 clock_sec_t sec;
2296 clock_nsec_t nsec;
2297 clock_get_system_nanotime(&sec, &nsec);
2298 ts.tv_sec = (unsigned int) sec;
2299 ts.tv_nsec = nsec;
2d21ac55 2300
3e170ce0
A
2301 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0)
2302 can_steal = TRUE;
2303 else
2304 delay_speculative_age++;
2305 } else {
2306 delay_speculative_age++;
2307 if (delay_speculative_age == DELAY_SPECULATIVE_AGE)
2308 delay_speculative_age = 0;
2309 }
2d21ac55
A
2310 }
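		/*
		 * Worked example (values purely illustrative): with
		 * VM_PAGE_MAX_SPECULATIVE_AGE_Q == 10 and
		 * vm_page_speculative_q_age_ms == 500, a bin is considered
		 * fully aged 10 * 500 = 5000 ms after aq->age_ts, i.e.
		 * ts_fully_aged adds { .tv_sec = 5, .tv_nsec = 0 } to it.
		 */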
2311 if (can_steal == TRUE)
3e170ce0 2312 vm_page_speculate_ageit(aq);
2d21ac55 2313 }
39037602
A
2314#if CONFIG_BACKGROUND_QUEUE
2315 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 &&
2316 ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target)))
2317#else
2318 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0)
2319#endif
2320 {
6d2010ae
A
2321 int pages_evicted;
2322
2323 if (object != NULL) {
2324 vm_object_unlock(object);
2325 object = NULL;
2326 }
2327 pages_evicted = vm_object_cache_evict(100, 10);
2328
2329 if (pages_evicted) {
2330
2331 vm_pageout_cache_evicted += pages_evicted;
2332
2333 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2334 vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
39236c6e 2335 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
6d2010ae
A
2336
2337 /*
2338 * we just freed up to 100 pages,
2339 * so go back to the top of the main loop
 2340 * and re-evaluate the memory situation
2341 */
2342 continue;
2343 } else
2344 cache_evict_throttle = 100;
2345 }
2346 if (cache_evict_throttle)
2347 cache_evict_throttle--;
2348
3e170ce0 2349#if CONFIG_JETSAM
04b8595b 2350 /*
3e170ce0
A
 2351 * don't let the filecache_min fall below 1/7 (roughly 14%) of available memory
2352 * on systems with an active compressor that isn't nearing its
2353 * limits w/r to accepting new data
04b8595b
A
2354 *
2355 * on systems w/o the compressor/swapper, the filecache is always
2356 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2357 * since most (if not all) of the anonymous pages are in the
2358 * throttled queue (which isn't counted as available) which
2359 * effectively disables this filter
2360 */
3e170ce0
A
2361 if (vm_compressor_low_on_space())
2362 vm_page_filecache_min = 0;
2363 else
2364 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 7);
2365#else
2366 /*
2367 * don't let the filecache_min fall below 33% of available memory...
2368 */
04b8595b 2369 vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
3e170ce0 2370#endif
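		/*
		 * i.e. vm_page_filecache_min is pinned at roughly 1/7th (~14%)
		 * of AVAILABLE_NON_COMPRESSED_MEMORY when CONFIG_JETSAM is set
		 * (dropping to 0 if the compressor is low on space) and at
		 * 1/3rd (~33%) otherwise.
		 */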
39037602
A
2371 if (vm_page_free_count < (vm_page_free_reserved / 4))
2372 vm_page_filecache_min = 0;
91447636 2373
316670eb 2374 exceeded_burst_throttle = FALSE;
1c79356b
A
2375 /*
2376 * Sometimes we have to pause:
2377 * 1) No inactive pages - nothing to do.
316670eb 2378 * 2) Loop control - no acceptable pages found on the inactive queue
91447636 2379 * within the last vm_pageout_burst_inactive_throttle iterations
316670eb 2380 * 3) Flow control - default pageout queue is full
1c79356b 2381 */
39037602
A
2382 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2383 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2384 vm_page_queue_empty(&sq->age_q)) {
91447636
A
2385 vm_pageout_scan_empty_throttle++;
2386 msecs = vm_pageout_empty_wait;
2387 goto vm_pageout_scan_delay;
2388
b0d623f7 2389 } else if (inactive_burst_count >=
593a1d5f
A
2390 MIN(vm_pageout_burst_inactive_throttle,
2391 (vm_page_inactive_count +
2392 vm_page_speculative_count))) {
91447636
A
2393 vm_pageout_scan_burst_throttle++;
2394 msecs = vm_pageout_burst_wait;
316670eb
A
2395
2396 exceeded_burst_throttle = TRUE;
91447636
A
2397 goto vm_pageout_scan_delay;
2398
39236c6e
A
2399 } else if (vm_page_free_count > (vm_page_free_reserved / 4) &&
2400 VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
2401 vm_pageout_scan_swap_throttle++;
2402 msecs = vm_pageout_swap_wait;
2403 goto vm_pageout_scan_delay;
2404
6d2010ae 2405 } else if (VM_PAGE_Q_THROTTLED(iq) &&
39037602 2406 VM_DYNAMIC_PAGING_ENABLED()) {
b0d623f7
A
2407 clock_sec_t sec;
2408 clock_nsec_t nsec;
91447636
A
2409
2410 switch (flow_control.state) {
2411
2412 case FCS_IDLE:
316670eb 2413 if ((vm_page_free_count + local_freed) < vm_page_free_target) {
39236c6e 2414
3e170ce0
A
2415 if (object != NULL) {
2416 vm_object_unlock(object);
2417 object = NULL;
2418 }
2419 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2420
2421 vm_page_unlock_queues();
2422
2423 if (local_freeq) {
2424
2425 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2426 vm_page_free_count, local_freed, delayed_unlock_limit, 3);
2427
2428 vm_page_free_list(local_freeq, TRUE);
2429
2430 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2431 vm_page_free_count, local_freed, 0, 3);
2432
2433 local_freeq = NULL;
2434 local_freed = 0;
2435 }
2436 thread_yield_internal(1);
2437
2438 vm_page_lock_queues();
2439
2440 if (!VM_PAGE_Q_THROTTLED(iq)) {
2441 vm_pageout_scan_yield_unthrottled++;
2442 continue;
2443 }
39037602
A
2444 if (vm_page_pageable_external_count > vm_page_filecache_min &&
2445 !vm_page_queue_empty(&vm_page_queue_inactive)) {
39236c6e 2446 anons_grabbed = ANONS_GRABBED_LIMIT;
3e170ce0 2447 vm_pageout_scan_throttle_deferred++;
316670eb
A
2448 goto consider_inactive;
2449 }
39236c6e 2450 if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count)
316670eb
A
2451 continue;
2452 }
91447636
A
2453reset_deadlock_timer:
2454 ts.tv_sec = vm_pageout_deadlock_wait / 1000;
2455 ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
b0d623f7
A
2456 clock_get_system_nanotime(&sec, &nsec);
2457 flow_control.ts.tv_sec = (unsigned int) sec;
2458 flow_control.ts.tv_nsec = nsec;
91447636
A
2459 ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
2460
2461 flow_control.state = FCS_DELAYED;
2462 msecs = vm_pageout_deadlock_wait;
1c79356b 2463
91447636
A
2464 break;
2465
2466 case FCS_DELAYED:
b0d623f7
A
2467 clock_get_system_nanotime(&sec, &nsec);
2468 ts.tv_sec = (unsigned int) sec;
2469 ts.tv_nsec = nsec;
91447636
A
2470
2471 if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
2472 /*
2473 * the pageout thread for the default pager is potentially
2474 * deadlocked since the
2475 * default pager queue has been throttled for more than the
2476 * allowable time... we need to move some clean pages or dirty
2477 * pages belonging to the external pagers if they aren't throttled
2478 * vm_page_free_wanted represents the number of threads currently
2479 * blocked waiting for pages... we'll move one page for each of
2480 * these plus a fixed amount to break the logjam... once we're done
 2481 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2482 * with a new timeout target since we have no way of knowing
2483 * whether we've broken the deadlock except through observation
2484 * of the queue associated with the default pager... we need to
2d21ac55 2485 * stop moving pages and allow the system to run to see what
91447636
A
2486 * state it settles into.
2487 */
2d21ac55 2488 vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
91447636
A
2489 vm_pageout_scan_deadlock_detected++;
2490 flow_control.state = FCS_DEADLOCK_DETECTED;
91447636
A
2491 thread_wakeup((event_t) &vm_pageout_garbage_collect);
2492 goto consider_inactive;
2493 }
2494 /*
2495 * just resniff instead of trying
2496 * to compute a new delay time... we're going to be
2497 * awakened immediately upon a laundry completion,
2498 * so we won't wait any longer than necessary
2499 */
2500 msecs = vm_pageout_idle_wait;
2501 break;
1c79356b 2502
91447636
A
2503 case FCS_DEADLOCK_DETECTED:
2504 if (vm_pageout_deadlock_target)
2505 goto consider_inactive;
2506 goto reset_deadlock_timer;
55e303ae 2507
91447636 2508 }
91447636
A
2509vm_pageout_scan_delay:
2510 if (object != NULL) {
2511 vm_object_unlock(object);
2512 object = NULL;
2513 }
2d21ac55
A
2514 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2515
fe8ab488
A
2516 vm_page_unlock_queues();
2517
55e303ae 2518 if (local_freeq) {
6d2010ae
A
2519
2520 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2521 vm_page_free_count, local_freed, delayed_unlock_limit, 3);
2522
316670eb 2523 vm_page_free_list(local_freeq, TRUE);
55e303ae 2524
6d2010ae
A
2525 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2526 vm_page_free_count, local_freed, 0, 3);
2527
2d21ac55 2528 local_freeq = NULL;
55e303ae 2529 local_freed = 0;
fe8ab488 2530 }
3e170ce0 2531 vm_consider_waking_compactor_swapper();
b0d623f7 2532
fe8ab488
A
2533 vm_page_lock_queues();
2534
2535 if (flow_control.state == FCS_DELAYED &&
2536 !VM_PAGE_Q_THROTTLED(iq)) {
2537 flow_control.state = FCS_IDLE;
2538 goto consider_inactive;
55e303ae 2539 }
316670eb
A
2540
2541 if (vm_page_free_count >= vm_page_free_target) {
2542 /*
39236c6e 2543 * we're here because:
316670eb 2544 * 1) someone else freed up some pages while we had
39236c6e 2545 * the queues unlocked above, and
316670eb
A
 2546 * 2) we've hit one of the 3 conditions that
 2547 * cause us to pause the pageout scan thread
2548 *
2549 * since we already have enough free pages,
2550 * let's avoid stalling and return normally
2551 *
2552 * before we return, make sure the pageout I/O threads
2553 * are running throttled in case there are still requests
2554 * in the laundry... since we have enough free pages
2555 * we don't need the laundry to be cleaned in a timely
2556 * fashion... so let's avoid interfering with foreground
2557 * activity
2558 *
2559 * we don't want to hold vm_page_queue_free_lock when
2560 * calling vm_pageout_adjust_io_throttles (since it
 2561 * may cause other locks to be taken), we do the initial
2562 * check outside of the lock. Once we take the lock,
2563 * we recheck the condition since it may have changed.
2564 * if it has, no problem, we will make the threads
2565 * non-throttled before actually blocking
2566 */
2567 vm_pageout_adjust_io_throttles(iq, eq, TRUE);
2568 }
2569 lck_mtx_lock(&vm_page_queue_free_lock);
0b4e3aa0 2570
39236c6e
A
2571 if (vm_page_free_count >= vm_page_free_target &&
2572 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
316670eb
A
2573 goto return_from_scan;
2574 }
2575 lck_mtx_unlock(&vm_page_queue_free_lock);
2576
2577 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2578 /*
2579 * we're most likely about to block due to one of
2580 * the 3 conditions that cause vm_pageout_scan to
2581 * not be able to make forward progress w/r
2582 * to providing new pages to the free queue,
2583 * so unthrottle the I/O threads in case we
2584 * have laundry to be cleaned... it needs
2585 * to be completed ASAP.
2586 *
2587 * even if we don't block, we want the io threads
2588 * running unthrottled since the sum of free +
2589 * clean pages is still under our free target
2590 */
2591 vm_pageout_adjust_io_throttles(iq, eq, FALSE);
2592 }
2593 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2594 /*
2595 * if we get here we're below our free target and
2596 * we're stalling due to a full laundry queue or
 2597 * we don't have any inactive pages other than
2598 * those in the clean queue...
2599 * however, we have pages on the clean queue that
2600 * can be moved to the free queue, so let's not
2601 * stall the pageout scan
2602 */
2603 flow_control.state = FCS_IDLE;
2604 goto consider_inactive;
2605 }
6d2010ae
A
2606 VM_CHECK_MEMORYSTATUS;
2607
316670eb
A
2608 if (flow_control.state != FCS_IDLE)
2609 vm_pageout_scan_throttle++;
2610 iq->pgo_throttled = TRUE;
2611
2d21ac55 2612 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
2d21ac55 2613 counter(c_vm_pageout_scan_block++);
1c79356b 2614
91447636 2615 vm_page_unlock_queues();
2d21ac55
A
2616
2617 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
b0d623f7 2618
6d2010ae
A
2619 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2620 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
39236c6e 2621 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
6d2010ae 2622
91447636
A
2623 thread_block(THREAD_CONTINUE_NULL);
2624
6d2010ae
A
2625 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2626 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
39236c6e 2627 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
6d2010ae 2628
91447636
A
2629 vm_page_lock_queues();
2630 delayed_unlock = 1;
2631
2632 iq->pgo_throttled = FALSE;
0b4e3aa0 2633
2d21ac55 2634 if (loop_count >= vm_page_inactive_count)
55e303ae 2635 loop_count = 0;
91447636
A
2636 inactive_burst_count = 0;
2637
1c79356b
A
2638 goto Restart;
2639 /*NOTREACHED*/
2640 }
2641
91447636
A
2642
2643 flow_control.state = FCS_IDLE;
2644consider_inactive:
6d2010ae
A
2645 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
2646 vm_pageout_inactive_external_forced_reactivate_limit);
91447636
A
2647 loop_count++;
2648 inactive_burst_count++;
1c79356b 2649 vm_pageout_inactive++;
39236c6e 2650
316670eb
A
2651
2652 /*
2653 * Choose a victim.
2654 */
39236c6e 2655 while (1) {
3e170ce0
A
2656 uint32_t inactive_external_count;
2657
39037602
A
2658#if CONFIG_BACKGROUND_QUEUE
2659 ignore_reference = FALSE;
2660#endif /* CONFIG_BACKGROUND_QUEUE */
2661
2d21ac55 2662 m = NULL;
39037602 2663 m_object = VM_OBJECT_NULL;
91447636 2664
39037602 2665 if (VM_DYNAMIC_PAGING_ENABLED()) {
b0d623f7 2666 assert(vm_page_throttled_count == 0);
39037602 2667 assert(vm_page_queue_empty(&vm_page_queue_throttled));
91447636 2668 }
39037602
A
2669
2670
2671#if CONFIG_SECLUDED_MEMORY
2672 if ((secluded_aging_policy ==
2673 SECLUDED_AGING_AFTER_INACTIVE) &&
2674 vm_page_secluded_count > vm_page_secluded_target) {
2675 /*
2676 * SECLUDED_AGING_AFTER_INACTIVE:
2677 * Secluded pages have already been aged
2678 * through the active and inactive queues, and
2679 * we now have too many of them, so let's
2680 * balance that queue by considering reclaiming
2681 * the oldest page in the secluded queue.
2682 */
2683 assert(!vm_page_queue_empty(&vm_page_queue_secluded));
2684 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_secluded);
2685 if (m->vm_page_object == 0) {
2686 /*
2687 * It's already a free page:
2688 * just move it to a free queue.
2689 */
2690 vm_page_queues_remove(m, TRUE);
2691 assert(m->busy);
2692 assert(m->pageq.next == 0);
2693 assert(m->pageq.prev == 0);
2694 m->snext = local_freeq;
2695 local_freeq = m;
2696 local_freed++;
2697 goto done_with_inactivepage;
2698 }
2699 /*
2700 * Not a free page: we've found our next
2701 * "victim".
2702 */
2703 break;
2704 }
2705#endif /* CONFIG_SECLUDED_MEMORY */
2706
2707#if CONFIG_BACKGROUND_QUEUE
2708 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2709 vm_object_t bg_m_object = NULL;
2710
2711 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2712
2713 bg_m_object = VM_PAGE_OBJECT(m);
2714
743345f9
A
2715 if (!VM_PAGE_PAGEABLE(m)) {
2716 /*
2717 * This page is on the background queue
2718 * but not on a pageable queue. This is
2719 * likely a transient state and whoever
2720 * took it out of its pageable queue
2721 * will likely put it back on a pageable
2722 * queue soon but we can't deal with it
2723 * at this point, so let's ignore this
2724 * page.
2725 */
2726 } else if (force_anonymous == FALSE || bg_m_object->internal) {
39037602
A
2727 ignore_reference = TRUE;
2728
2729 if (bg_m_object->internal)
2730 vm_pageout_considered_bq_internal++;
2731 else
2732 vm_pageout_considered_bq_external++;
2733
39037602
A
2734 break;
2735 }
2736 }
2737#endif
2738
2d21ac55 2739 /*
39236c6e
A
2740 * The most eligible pages are ones we paged in speculatively,
2741 * but which have not yet been touched.
2d21ac55 2742 */
39037602
A
2743 if (!vm_page_queue_empty(&sq->age_q) && force_anonymous == FALSE) {
2744 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
6d2010ae 2745
39037602 2746 assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
316670eb 2747
39236c6e
A
2748 break;
2749 }
2750 /*
2751 * Try a clean-queue inactive page.
2752 */
39037602
A
2753 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2754 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
316670eb 2755
39037602 2756 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
316670eb 2757
39236c6e
A
2758 break;
2759 }
316670eb 2760
39236c6e 2761 grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
3e170ce0 2762 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
316670eb 2763
3e170ce0
A
2764 if ((vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) ||
2765 ((inactive_external_count < vm_page_anonymous_count) && (inactive_external_count < (vm_page_pageable_external_count / 3)))) {
39236c6e
A
2766 grab_anonymous = TRUE;
2767 anons_grabbed = 0;
9bccf70c 2768 }
39037602
A
2769#if CONFIG_JETSAM
2770 /* If the file-backed pool has accumulated
2771 * significantly more pages than the jetsam
2772 * threshold, prefer to reclaim those
2773 * inline to minimise compute overhead of reclaiming
2774 * anonymous pages.
2775 * This calculation does not account for the CPU local
2776 * external page queues, as those are expected to be
2777 * much smaller relative to the global pools.
2778 */
2779 if (grab_anonymous) {
2780 if (vm_page_pageable_external_count >
2781 vm_page_filecache_min) {
2782 if ((vm_page_pageable_external_count *
2783 vm_pageout_memorystatus_fb_factor_dr) >
2784 (memorystatus_available_pages_critical *
2785 vm_pageout_memorystatus_fb_factor_nr)) {
2786 grab_anonymous = FALSE;
2787#if DEVELOPMENT || DEBUG
2788 vm_grab_anon_overrides++;
2789#endif
2790 }
2791 }
2792#if DEVELOPMENT || DEBUG
2793 if (grab_anonymous) {
2794 vm_grab_anon_nops++;
2795
2796 }
2797#endif
2798 }
2799#endif /* CONFIG_JETSAM */
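			/*
			 * Illustrative arithmetic only (the factor values here are
			 * hypothetical): with vm_pageout_memorystatus_fb_factor_dr == 2,
			 * vm_pageout_memorystatus_fb_factor_nr == 5 and
			 * memorystatus_available_pages_critical == 4000, grab_anonymous
			 * is forced back to FALSE once vm_page_pageable_external_count
			 * exceeds (4000 * 5) / 2 == 10000 pages (and is already above
			 * vm_page_filecache_min), so a large file cache gets trimmed
			 * before anonymous pages are pushed to the compressor.
			 */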
6d2010ae 2800
39037602 2801 if (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
39236c6e 2802
39037602
A
2803 if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) {
2804 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
39236c6e 2805
39037602 2806 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
39236c6e
A
2807 anons_grabbed = 0;
2808
04b8595b
A
2809 if (vm_page_pageable_external_count < vm_page_filecache_min) {
2810 if ((++reactivated_this_call % 100))
2811 goto must_activate_page;
2812 /*
2813 * steal 1% of the file backed pages even if
2814 * we are under the limit that has been set
2815 * for a healthy filecache
2816 */
2817 }
2d21ac55
A
2818 break;
2819 }
2820 }
39037602
A
2821 if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) {
2822 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
39236c6e 2823
39037602 2824 assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
39236c6e
A
2825 anons_grabbed++;
2826
2827 break;
2828 }
316670eb 2829
2d21ac55 2830 /*
316670eb
A
2831 * if we've gotten here, we have no victim page.
2832 * if making clean, free the local freed list and return.
2833 * if making free, check to see if we've finished balancing the queues
 2834 * yet; if we haven't, just continue, else panic
2d21ac55 2835 */
316670eb 2836 vm_page_unlock_queues();
6d2010ae 2837
316670eb
A
2838 if (object != NULL) {
2839 vm_object_unlock(object);
2840 object = NULL;
2841 }
2842 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2843
2844 if (local_freeq) {
2845 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
2846 vm_page_free_count, local_freed, delayed_unlock_limit, 5);
2847
2848 vm_page_free_list(local_freeq, TRUE);
2849
2850 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
2851 vm_page_free_count, local_freed, 0, 5);
2852
2853 local_freeq = NULL;
2854 local_freed = 0;
2855 }
2856 vm_page_lock_queues();
2857 delayed_unlock = 1;
2858
fe8ab488
A
2859 force_anonymous = FALSE;
2860
316670eb
A
2861 if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
2862 goto Restart;
2863
39037602 2864 if (!vm_page_queue_empty(&sq->age_q))
fe8ab488
A
2865 goto Restart;
2866
316670eb
A
2867 panic("vm_pageout: no victim");
2868
2869 /* NOTREACHED */
9bccf70c 2870 }
d190cdc3 2871 assert(VM_PAGE_PAGEABLE(m));
39037602 2872 m_object = VM_PAGE_OBJECT(m);
39236c6e 2873 force_anonymous = FALSE;
316670eb 2874
39037602
A
2875 page_prev_q_state = m->vm_page_q_state;
2876 requeue_insert_first = FALSE;
316670eb
A
2877 /*
2878 * we just found this page on one of our queues...
2879 * it can't also be on the pageout queue, so safe
3e170ce0 2880 * to call vm_page_queues_remove
316670eb 2881 */
39037602 2882 vm_page_queues_remove(m, TRUE);
2d21ac55 2883
91447636 2884 assert(!m->laundry);
6d2010ae
A
2885 assert(!m->private);
2886 assert(!m->fictitious);
39037602
A
2887 assert(m_object != kernel_object);
2888 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2d21ac55 2889
6d2010ae 2890
39037602
A
2891 if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q &&
2892 page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)
b0d623f7 2893 vm_pageout_stats[vm_pageout_stat_now].considered++;
b0d623f7 2894
2d21ac55 2895 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
1c79356b 2896
91447636 2897 /*
2d21ac55
A
2898 * check to see if we currently are working
2899 * with the same object... if so, we've
2900 * already got the lock
91447636 2901 */
39037602 2902 if (m_object != object) {
2d21ac55
A
2903 /*
2904 * the object associated with candidate page is
2905 * different from the one we were just working
2906 * with... dump the lock if we still own it
2907 */
91447636
A
2908 if (object != NULL) {
2909 vm_object_unlock(object);
2910 object = NULL;
2d21ac55 2911 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
91447636 2912 }
2d21ac55
A
2913 /*
 2914 * Try to lock object; since we've already got the
2915 * page queues lock, we can only 'try' for this one.
2916 * if the 'try' fails, we need to do a mutex_pause
2917 * to allow the owner of the object lock a chance to
2918 * run... otherwise, we're likely to trip over this
2919 * object in the same state as we work our way through
2920 * the queue... clumps of pages associated with the same
2921 * object are fairly typical on the inactive and active queues
2922 */
39037602 2923 if (!vm_object_lock_try_scan(m_object)) {
6d2010ae
A
2924 vm_page_t m_want = NULL;
2925
b0d623f7
A
2926 vm_pageout_inactive_nolock++;
2927
39037602 2928 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb
A
2929 vm_pageout_cleaned_nolock++;
2930
39037602
A
2931 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
2932 requeue_insert_first = TRUE;
2d21ac55 2933
39037602 2934 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2d21ac55
A
2935 m->reference = FALSE;
2936
6d2010ae
A
2937 /*
2938 * m->object must be stable since we hold the page queues lock...
2939 * we can update the scan_collisions field sans the object lock
2940 * since it is a separate field and this is the only spot that does
2941 * a read-modify-write operation and it is never executed concurrently...
2942 * we can asynchronously set this field to 0 when creating a UPL, so it
 2943 * is possible for the value to be a bit non-deterministic, but that's ok
2944 * since it's only used as a hint
2945 */
39037602
A
2946 m_object->scan_collisions = 1;
2947
2948 if ( !vm_page_queue_empty(&sq->age_q) )
2949 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2950 else if ( !vm_page_queue_empty(&vm_page_queue_cleaned))
2951 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2952 else if ( !vm_page_queue_empty(&vm_page_queue_inactive) &&
2953 (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)))
2954 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2955 else if ( !vm_page_queue_empty(&vm_page_queue_anonymous))
2956 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
39236c6e 2957
2d21ac55
A
2958 /*
2959 * this is the next object we're going to be interested in
2960 * try to make sure its available after the mutex_yield
2961 * returns control
2962 */
6d2010ae 2963 if (m_want)
39037602 2964 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2d21ac55 2965
91447636
A
2966 /*
2967 * force us to dump any collected free pages
2968 * and to pause before moving on
2969 */
2d21ac55 2970 try_failed = TRUE;
55e303ae 2971
6d2010ae 2972 goto requeue_page;
1c79356b 2973 }
39037602 2974 object = m_object;
2d21ac55 2975 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
0b4e3aa0 2976
2d21ac55 2977 try_failed = FALSE;
1c79356b 2978 }
39037602
A
2979 assert(m_object == object);
2980 assert(VM_PAGE_OBJECT(m) == m_object);
2981
6d2010ae
A
2982 if (catch_up_count)
2983 catch_up_count--;
1c79356b 2984
6d2010ae
A
2985 if (m->busy) {
2986 if (m->encrypted_cleaning) {
2987 /*
2988 * ENCRYPTED SWAP:
2989 * if this page has already been picked up as
2990 * part of a page-out cluster, it will be busy
2991 * because it is being encrypted (see
2992 * vm_object_upl_request()). But we still
2993 * want to demote it from "clean-in-place"
2994 * (aka "adjacent") to "clean-and-free" (aka
2995 * "target"), so let's ignore its "busy" bit
2996 * here and proceed to check for "cleaning" a
2997 * little bit below...
2998 *
2999 * CAUTION CAUTION:
3000 * A "busy" page should still be left alone for
3001 * most purposes, so we have to be very careful
3002 * not to process that page too much.
3003 */
3004 assert(m->cleaning);
3005 goto consider_inactive_page;
2d21ac55 3006 }
2d21ac55 3007
1c79356b
A
3008 /*
3009 * Somebody is already playing with this page.
6d2010ae 3010 * Put it back on the appropriate queue
2d21ac55 3011 *
1c79356b 3012 */
1c79356b 3013 vm_pageout_inactive_busy++;
316670eb 3014
39037602 3015 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb
A
3016 vm_pageout_cleaned_busy++;
3017
6d2010ae 3018requeue_page:
39037602 3019 if (requeue_insert_first)
3e170ce0 3020 vm_page_enqueue_inactive(m, TRUE);
39037602
A
3021 else
3022 vm_page_enqueue_inactive(m, FALSE);
3023#if CONFIG_BACKGROUND_QUEUE
3024 if (ignore_reference == TRUE) {
3025 if (m_object->internal)
3026 vm_pageout_rejected_bq_internal++;
3027 else
3028 vm_pageout_rejected_bq_external++;
6d2010ae 3029 }
39037602 3030#endif
91447636 3031 goto done_with_inactivepage;
1c79356b
A
3032 }
3033
6d2010ae 3034
1c79356b 3035 /*
6d2010ae
A
3036 * If it's absent, in error or the object is no longer alive,
3037 * we can reclaim the page... in the no longer alive case,
3038 * there are 2 states the page can be in that preclude us
3039 * from reclaiming it - busy or cleaning - that we've already
3040 * dealt with
1c79356b 3041 */
6d2010ae 3042 if (m->absent || m->error || !object->alive) {
1c79356b 3043
6d2010ae
A
3044 if (m->absent)
3045 vm_pageout_inactive_absent++;
3046 else if (!object->alive)
3047 vm_pageout_inactive_notalive++;
3048 else
3049 vm_pageout_inactive_error++;
316670eb 3050reclaim_page:
91447636
A
3051 if (vm_pageout_deadlock_target) {
3052 vm_pageout_scan_inactive_throttle_success++;
3053 vm_pageout_deadlock_target--;
3054 }
2d21ac55
A
3055
3056 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3057
b0d623f7 3058 if (object->internal) {
2d21ac55
A
3059 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3060 } else {
3061 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3062 }
316670eb
A
3063 assert(!m->cleaning);
3064 assert(!m->laundry);
3065
3066 m->busy = TRUE;
2d21ac55 3067
b0d623f7
A
3068 /*
3069 * remove page from object here since we're already
3070 * behind the object lock... defer the rest of the work
3071 * we'd normally do in vm_page_free_prepare_object
3072 * until 'vm_page_free_list' is called
3073 */
3074 if (m->tabled)
3075 vm_page_remove(m, TRUE);
55e303ae 3076
39037602
A
3077 assert(m->pageq.next == 0 && m->pageq.prev == 0);
3078 m->snext = local_freeq;
55e303ae 3079 local_freeq = m;
91447636 3080 local_freed++;
316670eb 3081
39037602
A
3082#if CONFIG_SECLUDED_MEMORY
3083 if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q)
3084 vm_pageout_freed_from_secluded++;
3085#endif /* CONFIG_SECLUDED_MEMORY */
3086 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
316670eb 3087 vm_pageout_freed_from_speculative++;
39037602 3088 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb
A
3089 vm_pageout_freed_from_cleaned++;
3090 else
3091 vm_pageout_freed_from_inactive_clean++;
55e303ae 3092
39037602
A
3093 if (page_prev_q_state != VM_PAGE_ON_SPECULATIVE_Q &&
3094 page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)
b0d623f7 3095 vm_pageout_stats[vm_pageout_stat_now].reclaimed++;
b0d623f7 3096
fe8ab488 3097 inactive_burst_count = 0;
91447636 3098 goto done_with_inactivepage;
1c79356b 3099 }
b0d623f7
A
3100 /*
3101 * If the object is empty, the page must be reclaimed even
3102 * if dirty or used.
3103 * If the page belongs to a volatile object, we stick it back
3104 * on.
3105 */
3106 if (object->copy == VM_OBJECT_NULL) {
3107 if (object->purgable == VM_PURGABLE_EMPTY) {
b0d623f7
A
3108 if (m->pmapped == TRUE) {
3109 /* unmap the page */
39037602 3110 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
b0d623f7 3111 if (refmod_state & VM_MEM_MODIFIED) {
316670eb 3112 SET_PAGE_DIRTY(m, FALSE);
b0d623f7
A
3113 }
3114 }
3115 if (m->dirty || m->precious) {
3116 /* we saved the cost of cleaning this page ! */
3117 vm_page_purged_count++;
3118 }
3119 goto reclaim_page;
3120 }
39236c6e 3121
39037602 3122 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
39236c6e
A
3123 /*
3124 * With the VM compressor, the cost of
3125 * reclaiming a page is much lower (no I/O),
3126 * so if we find a "volatile" page, it's better
3127 * to let it get compressed rather than letting
3128 * it occupy a full page until it gets purged.
3129 * So no need to check for "volatile" here.
3130 */
3131 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3132 /*
3133 * Avoid cleaning a "volatile" page which might
3134 * be purged soon.
3135 */
3136
b0d623f7
A
3137 /* if it's wired, we can't put it on our queue */
3138 assert(!VM_PAGE_WIRED(m));
6d2010ae 3139
b0d623f7 3140 /* just stick it back on! */
6d2010ae 3141 reactivated_this_call++;
316670eb 3142
39037602 3143 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb
A
3144 vm_pageout_cleaned_volatile_reactivated++;
3145
b0d623f7
A
3146 goto reactivate_page;
3147 }
3148 }
3149
316670eb 3150consider_inactive_page:
6d2010ae
A
3151 if (m->busy) {
3152 /*
3153 * CAUTION CAUTION:
3154 * A "busy" page should always be left alone, except...
3155 */
3156 if (m->cleaning && m->encrypted_cleaning) {
3157 /*
3158 * ENCRYPTED_SWAP:
3159 * We could get here with a "busy" page
3160 * if it's being encrypted during a
3161 * "clean-in-place" operation. We'll deal
3162 * with it right away by testing if it has been
3163 * referenced and either reactivating it or
3164 * promoting it from "clean-in-place" to
3165 * "clean-and-free".
3166 */
3167 } else {
3168 panic("\"busy\" page considered for pageout\n");
3169 }
3170 }
3171
1c79356b
A
3172 /*
3173 * If it's being used, reactivate.
3174 * (Fictitious pages are either busy or absent.)
2d21ac55
A
3175 * First, update the reference and dirty bits
3176 * to make sure the page is unreferenced.
1c79356b 3177 */
2d21ac55
A
3178 refmod_state = -1;
3179
3180 if (m->reference == FALSE && m->pmapped == TRUE) {
39037602 3181 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
91447636
A
3182
3183 if (refmod_state & VM_MEM_REFERENCED)
3184 m->reference = TRUE;
316670eb
A
3185 if (refmod_state & VM_MEM_MODIFIED) {
3186 SET_PAGE_DIRTY(m, FALSE);
3187 }
91447636 3188 }
316670eb 3189
6d2010ae 3190 /*
39037602 3191 * if (m->cleaning && !m->free_when_done)
6d2010ae 3192 * If already cleaning this page in place and it hasn't
39236c6e
A
3193 * been recently referenced, just pull off the queue.
3194 * We can leave the page mapped, and upl_commit_range
3195 * will put it on the clean queue.
6d2010ae
A
3196 *
3197 * note: if m->encrypted_cleaning == TRUE, then
3198 * m->cleaning == TRUE
3199 * and we'll handle it here
316670eb 3200 *
39037602 3201 * if (m->free_when_done && !m->cleaning)
316670eb
A
3202 * an msync INVALIDATE is in progress...
3203 * this page has been marked for destruction
3204 * after it has been cleaned,
3205 * but not yet gathered into a UPL
3206 * where 'cleaning' will be set...
3207 * just leave it off the paging queues
3208 *
39037602 3209 * if (m->free_when_done && m->cleaning)
316670eb
A
3210 * an msync INVALIDATE is in progress
3211 * and the UPL has already gathered this page...
3212 * just leave it off the paging queues
6d2010ae 3213 */
316670eb
A
3214
3215 /*
39037602 3216 * page with m->free_when_done and still on the queues means that an
39236c6e 3217 * MS_INVALIDATE is in progress on this page... leave it alone
316670eb 3218 */
39037602 3219 if (m->free_when_done) {
316670eb
A
3220 goto done_with_inactivepage;
3221 }
3222
3223 /* if cleaning, reactivate if referenced. otherwise, just pull off queue */
6d2010ae 3224 if (m->cleaning) {
6d2010ae
A
3225 if (m->reference == TRUE) {
3226 reactivated_this_call++;
3227 goto reactivate_page;
316670eb 3228 } else {
316670eb 3229 goto done_with_inactivepage;
6d2010ae 3230 }
6d2010ae
A
3231 }
3232
39236c6e
A
3233 if (m->reference || m->dirty) {
3234 /* deal with a rogue "reusable" page */
39037602 3235 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
39236c6e 3236 }
b0d623f7 3237
39037602
A
3238#if CONFIG_SECLUDED_MEMORY
3239 if (secluded_for_filecache &&
3240 vm_page_secluded_target > 0 &&
3241 m_object->eligible_for_secluded &&
3242 secluded_aging_policy == SECLUDED_AGING_FIFO) {
3243 /*
3244 * SECLUDED_AGING_FIFO:
3245 * This victim page is eligible for the secluded pool
3246 * and we're not aging secluded pages, so let's not
3247 * reactivate it if it's been re-referenced.
3248 * Later on, we'll move it to the secluded queue
3249 * instead of freeing it.
3250 */
3251 ignore_reference_secluded = TRUE;
3252 } else {
3253 ignore_reference_secluded = FALSE;
3254 }
3255#endif /* CONFIG_SECLUDED_MEMORY */
3256
fe8ab488 3257 if (!m->no_cache &&
39037602
A
3258#if CONFIG_BACKGROUND_QUEUE
3259 ignore_reference == FALSE &&
3260#endif
3261#if CONFIG_SECLUDED_MEMORY
3262 ignore_reference_secluded == FALSE &&
3263#endif /* CONFIG_SECLUDED_MEMORY */
fe8ab488
A
3264 (m->reference ||
3265 (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) {
2d21ac55
A
3266 /*
3267 * The page we pulled off the inactive list has
3268 * been referenced. It is possible for other
3269 * processors to be touching pages faster than we
3270 * can clear the referenced bit and traverse the
3271 * inactive queue, so we limit the number of
3272 * reactivations.
3273 */
3274 if (++reactivated_this_call >= reactivate_limit) {
3275 vm_pageout_reactivation_limit_exceeded++;
3276 } else if (catch_up_count) {
3277 vm_pageout_catch_ups++;
3278 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3279 vm_pageout_inactive_force_reclaim++;
3280 } else {
b0d623f7 3281 uint32_t isinuse;
316670eb 3282
39037602 3283 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb
A
3284 vm_pageout_cleaned_reference_reactivated++;
3285
2d21ac55 3286reactivate_page:
b0d623f7
A
3287 if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
3288 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3289 /*
3290 * no explicit mappings of this object exist
3291 * and it's not open via the filesystem
3292 */
3293 vm_page_deactivate(m);
3294 vm_pageout_inactive_deactivated++;
3295 } else {
04b8595b 3296must_activate_page:
b0d623f7
A
3297 /*
3298 * The page was/is being used, so put back on active list.
3299 */
3300 vm_page_activate(m);
3301 VM_STAT_INCR(reactivations);
fe8ab488 3302 inactive_burst_count = 0;
b0d623f7 3303 }
39037602
A
3304#if CONFIG_BACKGROUND_QUEUE
3305 if (ignore_reference == TRUE) {
3306 if (m_object->internal)
3307 vm_pageout_rejected_bq_internal++;
3308 else
3309 vm_pageout_rejected_bq_external++;
3310 }
3311#endif
3312 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb 3313 vm_pageout_cleaned_reactivated++;
39037602
A
3314#if CONFIG_SECLUDED_MEMORY
3315 if (page_prev_q_state == VM_PAGE_ON_SECLUDED_Q)
3316 vm_pageout_secluded_reactivated++;
3317#endif /* CONFIG_SECLUDED_MEMORY */
316670eb 3318
2d21ac55 3319 vm_pageout_inactive_used++;
55e303ae 3320
2d21ac55
A
3321 goto done_with_inactivepage;
3322 }
3323 /*
3324 * Make sure we call pmap_get_refmod() if it
3325 * wasn't already called just above, to update
3326 * the dirty bit.
3327 */
3328 if ((refmod_state == -1) && !m->dirty && m->pmapped) {
39037602 3329 refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
316670eb
A
3330 if (refmod_state & VM_MEM_MODIFIED) {
3331 SET_PAGE_DIRTY(m, FALSE);
3332 }
2d21ac55 3333 }
1c79356b
A
3334 }
3335
91447636
A
3336 XPR(XPR_VM_PAGEOUT,
3337 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
b0d623f7 3338 object, m->offset, m, 0,0);
0b4e3aa0 3339
91447636
A
3340 /*
3341 * we've got a candidate page to steal...
3342 *
3343 * m->dirty is up to date courtesy of the
3344 * preceding check for m->reference... if
3345 * we get here, then m->reference had to be
2d21ac55
A
3346 * FALSE (or possibly "reactivate_limit" was
3347 * exceeded), but in either case we called
3348 * pmap_get_refmod() and updated both
3349 * m->reference and m->dirty
91447636
A
3350 *
3351 * if it's dirty or precious we need to
3352 * see if the target queue is throttled
3353 * if it is, we need to skip over it by moving it back
3354 * to the end of the inactive queue
3355 */
b0d623f7 3356
91447636
A
3357 inactive_throttled = FALSE;
3358
3359 if (m->dirty || m->precious) {
3360 if (object->internal) {
2d21ac55 3361 if (VM_PAGE_Q_THROTTLED(iq))
91447636
A
3362 inactive_throttled = TRUE;
3363 } else if (VM_PAGE_Q_THROTTLED(eq)) {
2d21ac55 3364 inactive_throttled = TRUE;
1c79356b 3365 }
91447636 3366 }
2d21ac55 3367throttle_inactive:
39037602 3368 if (!VM_DYNAMIC_PAGING_ENABLED() &&
6d2010ae
A
3369 object->internal && m->dirty &&
3370 (object->purgable == VM_PURGABLE_DENY ||
3371 object->purgable == VM_PURGABLE_NONVOLATILE ||
3372 object->purgable == VM_PURGABLE_VOLATILE)) {
3e170ce0 3373 vm_page_check_pageable_safe(m);
39037602
A
3374 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3375 vm_page_queue_enter(&vm_page_queue_throttled, m,
3376 vm_page_t, pageq);
3377 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
6d2010ae
A
3378 vm_page_throttled_count++;
3379
3380 vm_pageout_scan_reclaimed_throttled++;
3381
fe8ab488 3382 inactive_burst_count = 0;
6d2010ae
A
3383 goto done_with_inactivepage;
3384 }
3385 if (inactive_throttled == TRUE) {
3386
39236c6e
A
3387 if (object->internal == FALSE) {
3388 /*
3389 * we need to break up the following potential deadlock case...
3390 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
3391 * b) The thread doing the writing is waiting for pages while holding the truncate lock
3392 * c) Most of the pages in the inactive queue belong to this file.
3393 *
3394 * we are potentially in this deadlock because...
3395 * a) the external pageout queue is throttled
3396 * b) we're done with the active queue and moved on to the inactive queue
3397 * c) we've got a dirty external page
6d2010ae 3398 *
39236c6e
A
3399 * since we don't know the reason for the external pageout queue being throttled we
3400 * must suspect that we are deadlocked, so move the current page onto the active queue
3401 * in an effort to cause a page from the active queue to 'age' to the inactive queue
3402 *
3403 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
3404 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
3405 * pool the next time we select a victim page... if we can make enough new free pages,
3406 * the deadlock will break, the external pageout queue will empty and it will no longer
3407 * be throttled
3408 *
3409 * if we have jetsam configured, keep a count of the pages reactivated this way so
3410 * that we can try to find clean pages in the active/inactive queues before
3411 * deciding to jetsam a process
6d2010ae 3412 */
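			/*
			 * In short (summarizing the block below): reactivate this dirty
			 * external page, drop the pageout threads' I/O throttles, and
			 * either burn down a reactivation budget before asking jetsam to
			 * kill a process (CONFIG_MEMORYSTATUS && CONFIG_JETSAM), or set
			 * 'force_anonymous' so the next victim is taken from the
			 * anonymous/cleaned pools instead.
			 */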
3e170ce0 3413 vm_pageout_scan_inactive_throttled_external++;
39236c6e 3414
3e170ce0 3415 vm_page_check_pageable_safe(m);
39037602
A
3416 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3417 vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
3418 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
316670eb 3419 vm_page_active_count++;
fe8ab488 3420 vm_page_pageable_external_count++;
316670eb
A
3421
3422 vm_pageout_adjust_io_throttles(iq, eq, FALSE);
3423
39236c6e 3424#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
6d2010ae
A
3425 vm_pageout_inactive_external_forced_reactivate_limit--;
3426
39236c6e 3427 if (vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
6d2010ae 3428 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
6d2010ae
A
3429 /*
3430 * Possible deadlock scenario so request jetsam action
3431 */
3432 assert(object);
3433 vm_object_unlock(object);
3434 object = VM_OBJECT_NULL;
3435 vm_page_unlock_queues();
39236c6e 3436
3e170ce0 3437 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_START,
39236c6e 3438 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
6d2010ae 3439
39236c6e
A
3440 /* Kill first suitable process */
3441 if (memorystatus_kill_on_VM_page_shortage(FALSE) == FALSE) {
6d2010ae
A
3442 panic("vm_pageout_scan: Jetsam request failed\n");
3443 }
39236c6e 3444
3e170ce0 3445 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END, 0, 0, 0, 0);
6d2010ae 3446
316670eb 3447 vm_pageout_inactive_external_forced_jetsam_count++;
6d2010ae
A
3448 vm_page_lock_queues();
3449 delayed_unlock = 1;
2d21ac55 3450 }
39236c6e
A
3451#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
3452 force_anonymous = TRUE;
3453#endif
fe8ab488 3454 inactive_burst_count = 0;
6d2010ae
A
3455 goto done_with_inactivepage;
3456 } else {
39236c6e
A
3457 vm_pageout_scan_inactive_throttled_internal++;
3458
3e170ce0 3459 goto must_activate_page;
1c79356b 3460 }
1c79356b 3461 }
2d21ac55 3462
1c79356b 3463 /*
91447636
A
3464 * we've got a page that we can steal...
3465 * eliminate all mappings and make sure
3466 * we have the up-to-date modified state
316670eb 3467 *
91447636
A
3468 * if we need to do a pmap_disconnect then we
3469 * need to re-evaluate m->dirty since the pmap_disconnect
3470 * provides the true state atomically... the
3471 * page was still mapped up to the pmap_disconnect
3472 * and may have been dirtied at the last microsecond
3473 *
2d21ac55
A
3474 * Note that if 'pmapped' is FALSE then the page is not
3475 * and has not been in any map, so there is no point calling
39236c6e
A
3476 * pmap_disconnect(). m->dirty could have been set in anticipation
3477 * of likely usage of the page.
91447636 3478 */
2d21ac55 3479 if (m->pmapped == TRUE) {
3e170ce0 3480 int pmap_options;
0b4e3aa0 3481
3e170ce0
A
3482 /*
3483 * Don't count this page as going into the compressor
3484 * if any of these are true:
39037602
A
3485 * 1) compressed pager isn't enabled
3486 * 2) Freezer enabled device with compressed pager
3e170ce0
A
3487 * backend (exclusive use) i.e. most of the VM system
3488 * (including vm_pageout_scan) has no knowledge of
3489 * the compressor
39037602 3490 * 3) This page belongs to a file and hence will not be
3e170ce0
A
3491 * sent into the compressor
3492 */
39037602 3493 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3e170ce0
A
3494 object->internal == FALSE) {
3495 pmap_options = 0;
3496 } else if (m->dirty || m->precious) {
fe8ab488 3497 /*
3e170ce0
A
3498 * VM knows that this page is dirty (or
3499 * precious) and needs to be compressed
3500 * rather than freed.
3501 * Tell the pmap layer to count this page
3502 * as "compressed".
fe8ab488 3503 */
3e170ce0 3504 pmap_options = PMAP_OPTIONS_COMPRESSOR;
39236c6e 3505 } else {
3e170ce0
A
3506 /*
3507 * VM does not know if the page needs to
3508 * be preserved but the pmap layer might tell
3509 * us if any mapping has "modified" it.
3510 * Let the pmap layer count this page
3511 * as compressed if and only if it has been
3512 * modified.
3513 */
3514 pmap_options =
3515 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
316670eb 3516 }
39037602 3517 refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
3e170ce0
A
3518 pmap_options,
3519 NULL);
39236c6e
A
3520 if (refmod_state & VM_MEM_MODIFIED) {
3521 SET_PAGE_DIRTY(m, FALSE);
91447636
A
3522 }
3523 }
2d21ac55
A
3524 /*
3525 * reset our count of pages that have been reclaimed
3526 * since the last page was 'stolen'
3527 */
3528 inactive_reclaim_run = 0;
3529
1c79356b
A
3530 /*
3531 * If it's clean and not precious, we can free the page.
3532 */
1c79356b 3533 if (!m->dirty && !m->precious) {
b0d623f7 3534
39037602 3535 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
6d2010ae
A
3536 vm_pageout_speculative_clean++;
3537 else {
39037602 3538 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
316670eb 3539 vm_pageout_inactive_anonymous++;
39037602 3540 else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
316670eb
A
3541 vm_pageout_cleaned_reclaimed++;
3542
6d2010ae
A
3543 vm_pageout_inactive_clean++;
3544 }
316670eb 3545
39037602
A
3546#if CONFIG_SECLUDED_MEMORY
3547 if (secluded_for_filecache &&
3548 vm_page_secluded_target > 0 &&
3549 !m->fictitious &&
3550 m_object->eligible_for_secluded &&
3551 num_tasks_can_use_secluded_mem == 0 &&
3552 (secluded_aging_policy == SECLUDED_AGING_FIFO ||
3553 ((secluded_aging_policy ==
3554 SECLUDED_AGING_AFTER_INACTIVE) &&
3555 (page_prev_q_state != VM_PAGE_ON_SECLUDED_Q)))) {
3556 assert(page_prev_q_state != VM_PAGE_ON_SECLUDED_Q);
3557 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
3558 LCK_MTX_ASSERT(&vm_page_queue_lock,
3559 LCK_MTX_ASSERT_OWNED);
3560 vm_page_queue_enter(&vm_page_queue_secluded,
3561 m,
3562 vm_page_t,
3563 pageq);
3564 m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q;
3565 vm_object_unlock(m_object);
3566 object = VM_OBJECT_NULL;
3567 vm_page_secluded_count++;
3568 vm_page_secluded_count_inuse++;
3569 assert(!m_object->internal);
3570// vm_page_pageable_external_count++;
3571 m = VM_PAGE_NULL;
3572 goto done_with_inactivepage;
3573 }
3574#endif /* CONFIG_SECLUDED_MEMORY */
3575
316670eb
A
3576 /*
3577 * OK, at this point we have found a page we are going to free.
3578 */
fe8ab488
A
3579#if CONFIG_PHANTOM_CACHE
3580 if (!object->internal)
3581 vm_phantom_cache_add_ghost(m);
3582#endif
1c79356b
A
3583 goto reclaim_page;
3584 }
2d21ac55
A
3585
3586 /*
3587 * The page may have been dirtied since the last check
3588 * for a throttled target queue (which may have been skipped
3589 * if the page was clean then). With the dirty page
3590 * disconnected here, we can make one final check.
3591 */
6d2010ae
A
3592 if (object->internal) {
3593 if (VM_PAGE_Q_THROTTLED(iq))
3594 inactive_throttled = TRUE;
3595 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3596 inactive_throttled = TRUE;
3597 }
2d21ac55 3598
316670eb 3599 if (inactive_throttled == TRUE)
6d2010ae 3600 goto throttle_inactive;
39236c6e 3601
fe8ab488
A
3602#if VM_PRESSURE_EVENTS
3603#if CONFIG_JETSAM
3604
3605 /*
3606 * If Jetsam is enabled, then the sending
3607 * of memory pressure notifications is handled
3608 * from the same thread that takes care of high-water
3609 * and other jetsams i.e. the memorystatus_thread.
3610 */
3611
3612#else /* CONFIG_JETSAM */
3613
39236c6e 3614 vm_pressure_response();
fe8ab488
A
3615
3616#endif /* CONFIG_JETSAM */
39236c6e 3617#endif /* VM_PRESSURE_EVENTS */
316670eb 3618
39037602 3619 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
316670eb 3620 vm_pageout_inactive_anonymous++;
6d2010ae
A
3621 if (object->internal)
3622 vm_pageout_inactive_dirty_internal++;
3623 else
3624 vm_pageout_inactive_dirty_external++;
39236c6e 3625
3e170ce0
A
3626 /*
3627 * do NOT set the pageout bit!
3628 * sure, we might need free pages, but this page is going to take time to become free
3629 * anyway, so we may as well put it on the clean queue first and take it from there later
3630 * if necessary. that way, we'll ensure we don't free up too much. -mj
3631 */
39037602 3632 vm_pageout_cluster(m, FALSE, FALSE);
1c79356b 3633
91447636 3634done_with_inactivepage:
39236c6e 3635
6d2010ae 3636 if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
fe8ab488 3637 boolean_t need_delay = TRUE;
1c79356b 3638
91447636 3639 if (object != NULL) {
b0d623f7 3640 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
91447636
A
3641 vm_object_unlock(object);
3642 object = NULL;
3643 }
fe8ab488
A
3644 vm_page_unlock_queues();
3645
91447636 3646 if (local_freeq) {
6d2010ae
A
3647
3648 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
3649 vm_page_free_count, local_freed, delayed_unlock_limit, 4);
316670eb
A
3650
3651 vm_page_free_list(local_freeq, TRUE);
91447636 3652
6d2010ae
A
3653 VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
3654 vm_page_free_count, local_freed, 0, 4);
3655
2d21ac55 3656 local_freeq = NULL;
91447636 3657 local_freed = 0;
fe8ab488
A
3658 need_delay = FALSE;
3659 }
3e170ce0
A
3660 vm_consider_waking_compactor_swapper();
3661
fe8ab488
A
3662 vm_page_lock_queues();
3663
3664 if (need_delay == TRUE)
b0d623f7 3665 lck_mtx_yield(&vm_page_queue_lock);
2d21ac55
A
3666
3667 delayed_unlock = 1;
1c79356b 3668 }
316670eb 3669 vm_pageout_considered_page++;
39236c6e 3670
91447636
A
3671 /*
3672 * back to top of pageout scan loop
3673 */
1c79356b 3674 }
1c79356b
A
3675}
3676
1c79356b 3677
1c79356b
A
3678int vm_page_free_count_init;
3679
3680void
3681vm_page_free_reserve(
3682 int pages)
3683{
3684 int free_after_reserve;
3685
39037602 3686 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1c79356b 3687
39236c6e
A
3688 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT))
3689 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3690 else
3691 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
6d2010ae 3692
39236c6e
A
3693 } else {
3694 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT)
3695 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3696 else
3697 vm_page_free_reserved += pages;
3698 }
1c79356b
A
3699 free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
3700
3701 vm_page_free_min = vm_page_free_reserved +
3702 VM_PAGE_FREE_MIN(free_after_reserve);
3703
2d21ac55
A
3704 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT)
3705 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3706
1c79356b
A
3707 vm_page_free_target = vm_page_free_reserved +
3708 VM_PAGE_FREE_TARGET(free_after_reserve);
3709
39037602
A
3710 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT)
3711 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
39236c6e 3712
39037602
A
3713 if (vm_page_free_target < vm_page_free_min + 5)
3714 vm_page_free_target = vm_page_free_min + 5;
39236c6e 3715
39037602
A
3716 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3717}
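/*
 * Illustrative walk-through of the computation above, using hypothetical
 * numbers (VM_PAGE_FREE_MIN(), VM_PAGE_FREE_TARGET() and the *_LIMIT macros
 * are configuration dependent, so actual values will differ):
 *
 *   vm_page_free_count_init = 250000, vm_page_free_reserved = 100
 *   free_after_reserve      = 250000 - 100 = 249900
 *   vm_page_free_min        = 100 + VM_PAGE_FREE_MIN(249900),
 *                             capped at VM_PAGE_FREE_MIN_LIMIT
 *   vm_page_free_target     = 100 + VM_PAGE_FREE_TARGET(249900),
 *                             capped at VM_PAGE_FREE_TARGET_LIMIT and raised
 *                             to at least vm_page_free_min + 5
 *   vm_page_throttle_limit  = vm_page_free_target - (vm_page_free_target / 2),
 *                             i.e. roughly half of the free target
 */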
39236c6e 3718
39037602
A
3719/*
3720 * vm_pageout is the high level pageout daemon.
3721 */
39236c6e 3722
39037602
A
3723void
3724vm_pageout_continue(void)
3725{
3726 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3727 vm_pageout_scan_event_counter++;
3728
3729 lck_mtx_lock(&vm_page_queue_free_lock);
3730 vm_pageout_running = TRUE;
3731 lck_mtx_unlock(&vm_page_queue_free_lock);
3732
3733 vm_pageout_scan();
3734 /*
3735 * we hold both the vm_page_queue_free_lock
3736 * and the vm_page_queues_lock at this point
3737 */
3738 assert(vm_page_free_wanted == 0);
3739 assert(vm_page_free_wanted_privileged == 0);
3740 assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3741
3742 vm_pageout_running = FALSE;
3743 if (vm_pageout_waiter) {
3744 vm_pageout_waiter = FALSE;
3745 thread_wakeup((event_t)&vm_pageout_waiter);
39236c6e 3746 }
39236c6e 3747
39037602 3748 lck_mtx_unlock(&vm_page_queue_free_lock);
39236c6e
A
3749 vm_page_unlock_queues();
3750
39037602
A
3751 counter(c_vm_pageout_block++);
3752 thread_block((thread_continue_t)vm_pageout_continue);
39236c6e
A
3753 /*NOTREACHED*/
3754}
3755
39037602
A
3756kern_return_t
3757vm_pageout_wait(uint64_t deadline)
3758{
3759 kern_return_t kr;
3760
3761 lck_mtx_lock(&vm_page_queue_free_lock);
3762 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr); ) {
3763 vm_pageout_waiter = TRUE;
3764 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3765 &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
3766 (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3767 kr = KERN_OPERATION_TIMED_OUT;
3768 }
3769 }
3770 lck_mtx_unlock(&vm_page_queue_free_lock);
3771
3772 return (kr);
3773}
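#if 0
/*
 * Hypothetical caller sketch (not part of this file): wait up to ~100ms for
 * the in-flight vm_pageout_scan() pass to finish.  The deadline argument is
 * an absolute-time value, as consumed by lck_mtx_sleep_deadline() above;
 * the helper names assume the usual kern/clock.h interfaces are available.
 */
static void
vm_pageout_wait_example(void)
{
	uint64_t interval, deadline;

	/* convert 100ms into absolute time units and form a deadline */
	nanoseconds_to_absolutetime(100 * 1000 * 1000ULL, &interval);
	clock_absolutetime_interval_to_deadline(interval, &deadline);

	if (vm_pageout_wait(deadline) == KERN_OPERATION_TIMED_OUT) {
		/* the pageout daemon was still running when the deadline expired */
	}
}
#endif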
3774
39236c6e
A
3775
3776static void
3777vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
3778{
3779 vm_page_t m = NULL;
3780 vm_object_t object;
3781 vm_object_offset_t offset;
3782 memory_object_t pager;
3783
3784
3785 if (vm_pageout_internal_iothread != THREAD_NULL)
3786 current_thread()->options &= ~TH_OPT_VMPRIV;
3787
3788 vm_page_lockspin_queues();
3789
39037602 3790 while ( !vm_page_queue_empty(&q->pgo_pending) ) {
39236c6e
A
3791
3792 q->pgo_busy = TRUE;
39037602 3793 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
39236c6e 3794
39037602
A
3795 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
3796 VM_PAGE_CHECK(m);
39236c6e
A
3797 /*
3798 * grab a snapshot of the object and offset this
3799 * page is tabled in so that we can relookup this
3800 * page after we've taken the object lock - these
3801 * fields are stable while we hold the page queues lock
3802 * but as soon as we drop it, there is nothing to keep
3803 * this page in this object... we hold an activity_in_progress
3804 * on this object which will keep it from terminating
3805 */
39037602 3806 object = VM_PAGE_OBJECT(m);
39236c6e
A
3807 offset = m->offset;
3808
39037602
A
3809 if (object->object_slid) {
3810 panic("slid page %p not allowed on this path\n", m);
3811 }
3812 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3813 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3814
39236c6e
A
3815 vm_page_unlock_queues();
3816
3817 vm_object_lock(object);
3818
3819 m = vm_page_lookup(object, offset);
3820
3821 if (m == NULL ||
39037602 3822 m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
39236c6e
A
3823 /*
3824 * it's either the same page that someone else has
3825 * started cleaning (or it's finished cleaning or
3826 * been put back on the pageout queue), or
3827 * the page has been freed or we have found a
3828 * new page at this offset... in all of these cases
3829 * we merely need to release the activity_in_progress
3830 * we took when we put the page on the pageout queue
3831 */
3832 vm_object_activity_end(object);
3833 vm_object_unlock(object);
3834
3835 vm_page_lockspin_queues();
3836 continue;
3837 }
3838 pager = object->pager;
3839
3840 if (pager == MEMORY_OBJECT_NULL) {
3841 /*
3842 * This pager has been destroyed by either
3843 * memory_object_destroy or vm_object_destroy, and
3844 * so there is nowhere for the page to go.
3845 */
39037602 3846 if (m->free_when_done) {
39236c6e
A
3847 /*
3848 * Just free the page... VM_PAGE_FREE takes
3849 * care of cleaning up all the state...
3850 * including doing the vm_pageout_throttle_up
3851 */
3852 VM_PAGE_FREE(m);
3853 } else {
3854 vm_page_lockspin_queues();
3855
3856 vm_pageout_throttle_up(m);
3857 vm_page_activate(m);
3858
3859 vm_page_unlock_queues();
3860
3861 /*
3862 * And we are done with it.
3863 */
3864 }
3865 vm_object_activity_end(object);
3866 vm_object_unlock(object);
3867
3868 vm_page_lockspin_queues();
3869 continue;
3870 }
3871#if 0
3872 /*
3873 * we don't hold the page queue lock
3874 * so this check isn't safe to make
3875 */
3876 VM_PAGE_CHECK(m);
3877#endif
3878 /*
3879 * give back the activity_in_progress reference we
3880 * took when we queued up this page and replace it
3881 * with a paging_in_progress reference that will
3882 * also hold the paging offset from changing and
3883 * prevent the object from terminating
3884 */
3885 vm_object_activity_end(object);
3886 vm_object_paging_begin(object);
3887 vm_object_unlock(object);
3888
3889 /*
3890 * Send the data to the pager.
3891 * any pageout clustering happens there
3892 */
3893 memory_object_data_return(pager,
3894 m->offset + object->paging_offset,
3895 PAGE_SIZE,
3896 NULL,
3897 NULL,
3898 FALSE,
3899 FALSE,
3900 0);
3901
3902 vm_object_lock(object);
3903 vm_object_paging_end(object);
3904 vm_object_unlock(object);
3905
3906 vm_pageout_io_throttle();
3907
3908 vm_page_lockspin_queues();
3909 }
3910 q->pgo_busy = FALSE;
3911 q->pgo_idle = TRUE;
3912
3913 assert_wait((event_t) &q->pgo_pending, THREAD_UNINT);
3914 vm_page_unlock_queues();
3915
3916 thread_block_parameter((thread_continue_t)vm_pageout_iothread_external_continue, (void *) q);
3917 /*NOTREACHED*/
3918}
3919
3920
3921uint32_t vm_compressor_failed;
3922
3e170ce0 3923#define MAX_FREE_BATCH 32
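/*
 * MAX_FREE_BATCH is the number of locally collected pages the compressor
 * thread accumulates on 'local_freeq' before handing them back via
 * vm_page_free_list() in vm_pageout_iothread_internal_continue() below,
 * batching the free-list work instead of freeing one page at a time.
 */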
39037602
A
3924uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
3925 * this thread.
3926 */
3927uint64_t vm_compressor_thread_runtime;
3e170ce0 3928
39236c6e
A
3929static void
3930vm_pageout_iothread_internal_continue(struct cq *cq)
3931{
3932 struct vm_pageout_queue *q;
3933 vm_page_t m = NULL;
39236c6e
A
3934 boolean_t pgo_draining;
3935 vm_page_t local_q;
3936 int local_cnt;
3937 vm_page_t local_freeq = NULL;
3938 int local_freed = 0;
3939 int local_batch_size;
39236c6e
A
3940
3941
3942 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
3943
3944 q = cq->q;
3e170ce0 3945 local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 2);
39236c6e 3946
3e170ce0
A
3947#if RECORD_THE_COMPRESSED_DATA
3948 if (q->pgo_laundry)
3949 c_compressed_record_init();
3950#endif
39236c6e 3951 while (TRUE) {
3e170ce0 3952 int pages_left_on_q = 0;
39236c6e
A
3953
3954 local_cnt = 0;
3955 local_q = NULL;
3956
3957 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
3958
3959 vm_page_lock_queues();
3960
3961 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
3962
3e170ce0 3963 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
39236c6e 3964
39037602 3965 while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
39236c6e 3966
39037602
A
3967 vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
3968 assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
39236c6e 3969 VM_PAGE_CHECK(m);
39037602
A
3970
3971 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
3972 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3973 m->laundry = FALSE;
39236c6e 3974
39037602 3975 m->snext = local_q;
39236c6e
A
3976 local_q = m;
3977 local_cnt++;
3978 }
3979 if (local_q == NULL)
3980 break;
3981
3982 q->pgo_busy = TRUE;
3983
3e170ce0 3984 if ((pgo_draining = q->pgo_draining) == FALSE) {
39236c6e 3985 vm_pageout_throttle_up_batch(q, local_cnt);
3e170ce0
A
3986 pages_left_on_q = q->pgo_laundry;
3987 } else
3988 pages_left_on_q = q->pgo_laundry - local_cnt;
39236c6e
A
3989
3990 vm_page_unlock_queues();
3991
3e170ce0
A
3992#if !RECORD_THE_COMPRESSED_DATA
3993 if (pages_left_on_q >= local_batch_size && cq->id < (vm_compressor_thread_count - 1))
3994 thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
3995#endif
3996 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
39236c6e
A
3997
3998 while (local_q) {
3e170ce0
A
3999
4000 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4001
39236c6e 4002 m = local_q;
39037602
A
4003 local_q = m->snext;
4004 m->snext = NULL;
39236c6e 4005
3e170ce0 4006 if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) {
39236c6e 4007
39037602 4008 m->snext = local_freeq;
3e170ce0
A
4009 local_freeq = m;
4010 local_freed++;
39236c6e 4011
3e170ce0 4012 if (local_freed >= MAX_FREE_BATCH) {
39236c6e 4013
3e170ce0
A
4014 vm_page_free_list(local_freeq, TRUE);
4015 local_freeq = NULL;
4016 local_freed = 0;
39236c6e 4017 }
39236c6e 4018 }
3e170ce0
A
4019#if !CONFIG_JETSAM
4020 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
39236c6e
A
4021 kern_return_t wait_result;
4022 int need_wakeup = 0;
4023
4024 if (local_freeq) {
4025 vm_page_free_list(local_freeq, TRUE);
4026
4027 local_freeq = NULL;
4028 local_freed = 0;
b0d623f7 4029
39236c6e
A
4030 continue;
4031 }
4032 lck_mtx_lock_spin(&vm_page_queue_free_lock);
b0d623f7 4033
3e170ce0
A
4034 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4035
39236c6e
A
4036 if (vm_page_free_wanted_privileged++ == 0)
4037 need_wakeup = 1;
4038 wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
91447636 4039
39236c6e 4040 lck_mtx_unlock(&vm_page_queue_free_lock);
91447636 4041
39236c6e
A
4042 if (need_wakeup)
4043 thread_wakeup((event_t)&vm_page_free_wanted);
316670eb 4044
39236c6e 4045 if (wait_result == THREAD_WAITING)
3e170ce0 4046
39236c6e
A
4047 thread_block(THREAD_CONTINUE_NULL);
4048 } else
4049 lck_mtx_unlock(&vm_page_queue_free_lock);
4050 }
3e170ce0 4051#endif
39236c6e
A
4052 }
4053 if (local_freeq) {
4054 vm_page_free_list(local_freeq, TRUE);
4055
4056 local_freeq = NULL;
4057 local_freed = 0;
4058 }
4059 if (pgo_draining == TRUE) {
4060 vm_page_lockspin_queues();
4061 vm_pageout_throttle_up_batch(q, local_cnt);
4062 vm_page_unlock_queues();
4063 }
0b4c1975 4064 }
39236c6e
A
4065 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4066
4067 /*
4068 * queue lock is held and our q is empty
4069 */
91447636
A
4070 q->pgo_busy = FALSE;
4071 q->pgo_idle = TRUE;
316670eb 4072
3e170ce0 4073 assert_wait((event_t) ((uintptr_t)&q->pgo_pending + cq->id), THREAD_UNINT);
91447636
A
4074 vm_page_unlock_queues();
4075
39037602
A
4076 if (__improbable(vm_compressor_time_thread)) {
4077 vm_compressor_thread_runtime = thread_get_runtime_self();
4078 }
4079
39236c6e
A
4080 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4081
4082 thread_block_parameter((thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
91447636
A
4083 /*NOTREACHED*/
4084}
4085
4086
316670eb 4087
3e170ce0
A
4088static void
4089vm_pageout_immediate(vm_page_t m, boolean_t object_locked_by_caller)
4090{
4091 assert(vm_pageout_immediate_scratch_buf);
4092
4093 if (vm_pageout_compress_page(&vm_pageout_immediate_chead, vm_pageout_immediate_scratch_buf, m, object_locked_by_caller) == KERN_SUCCESS) {
4094
4095 vm_page_free_prepare_object(m, TRUE);
39037602 4096 vm_page_release(m, TRUE);
3e170ce0
A
4097 }
4098}
4099
4100
4101kern_return_t
4102vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, boolean_t object_locked_by_caller)
4103{
4104 vm_object_t object;
4105 memory_object_t pager;
4106 int compressed_count_delta;
4107 kern_return_t retval;
4108
39037602
A
4109 object = VM_PAGE_OBJECT(m);
4110
4111 if (object->object_slid) {
3e170ce0
A
4112 panic("slid page %p not allowed on this path\n", m);
4113 }
39037602
A
4114 assert(!m->free_when_done);
4115 assert(!m->laundry);
3e170ce0 4116
3e170ce0
A
4117 pager = object->pager;
4118
4119 if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)) {
4120
4121 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4122
4123 vm_object_lock(object);
4124
4125 /*
4126 * If there is no memory object for the page, create
4127 * one and hand it to the compression pager.
4128 */
4129
4130 if (!object->pager_initialized)
4131 vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4132 if (!object->pager_initialized)
4133 vm_object_compressor_pager_create(object);
4134
39037602
A
4135 pager = object->pager;
4136
4137 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
3e170ce0 4138 /*
39037602
A
4139 * Still no pager for the object,
4140 * or the pager has been destroyed.
3e170ce0
A
4141 * Reactivate the page.
4142 *
4143 * Should only happen if there is no
4144 * compression pager
4145 */
3e170ce0
A
4146 PAGE_WAKEUP_DONE(m);
4147
4148 vm_page_lockspin_queues();
4149 vm_page_activate(m);
4150 vm_pageout_dirty_no_pager++;
4151 vm_page_unlock_queues();
4152
4153 /*
4154 * And we are done with it.
4155 */
4156 vm_object_activity_end(object);
4157 vm_object_unlock(object);
4158
4159 return KERN_FAILURE;
4160 }
3e170ce0
A
4161 vm_object_unlock(object);
4162
4163 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4164 }
4165 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4166
4167 if (object_locked_by_caller == FALSE)
4168 assert(object->activity_in_progress > 0);
4169
4170 retval = vm_compressor_pager_put(
4171 pager,
4172 m->offset + object->paging_offset,
39037602 4173 VM_PAGE_GET_PHYS_PAGE(m),
3e170ce0
A
4174 current_chead,
4175 scratch_buf,
4176 &compressed_count_delta);
4177
4178 if (object_locked_by_caller == FALSE) {
4179 vm_object_lock(object);
4180
4181 assert(object->activity_in_progress > 0);
39037602 4182 assert(VM_PAGE_OBJECT(m) == object);
3e170ce0
A
4183 }
4184
4185 vm_compressor_pager_count(pager,
4186 compressed_count_delta,
4187 FALSE, /* shared_lock */
4188 object);
4189
39037602 4190 assert( !VM_PAGE_WIRED(m));
3e170ce0
A
4191
4192 if (retval == KERN_SUCCESS) {
4193 /*
4194 * If the object is purgeable, its owner's
4195 * purgeable ledgers will be updated in
4196 * vm_page_remove() but the page still
4197 * contributes to the owner's memory footprint,
4198 * so account for it as such.
4199 */
4200 if (object->purgable != VM_PURGABLE_DENY &&
4201 object->vo_purgeable_owner != NULL) {
4202 /* one more compressed purgeable page */
4203 vm_purgeable_compressed_update(object,
4204 +1);
4205 }
4206 VM_STAT_INCR(compressions);
4207
4208 if (m->tabled)
4209 vm_page_remove(m, TRUE);
4210
4211 } else {
4212 PAGE_WAKEUP_DONE(m);
4213
4214 vm_page_lockspin_queues();
4215
4216 vm_page_activate(m);
4217 vm_compressor_failed++;
4218
4219 vm_page_unlock_queues();
4220 }
4221 if (object_locked_by_caller == FALSE) {
4222 vm_object_activity_end(object);
4223 vm_object_unlock(object);
4224 }
4225 return retval;
4226}
4227
4228
316670eb
A
4229static void
4230vm_pageout_adjust_io_throttles(struct vm_pageout_queue *iq, struct vm_pageout_queue *eq, boolean_t req_lowpriority)
4231{
4232 uint32_t policy;
4233 boolean_t set_iq = FALSE;
4234 boolean_t set_eq = FALSE;
4235
4236 if (hibernate_cleaning_in_progress == TRUE)
4237 req_lowpriority = FALSE;
4238
316670eb
A
4239 if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority)
4240 set_eq = TRUE;
4241
4242 if (set_iq == TRUE || set_eq == TRUE) {
4243
4244 vm_page_unlock_queues();
4245
4246 if (req_lowpriority == TRUE) {
39236c6e 4247 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
316670eb
A
4248 DTRACE_VM(laundrythrottle);
4249 } else {
39236c6e 4250 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
316670eb
A
4251 DTRACE_VM(laundryunthrottle);
4252 }
4253 if (set_iq == TRUE) {
39037602
A
4254 proc_set_thread_policy_with_tid(kernel_task, iq->pgo_tid,
4255 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
39236c6e 4256
316670eb
A
4257 iq->pgo_lowpriority = req_lowpriority;
4258 }
4259 if (set_eq == TRUE) {
39037602
A
4260 proc_set_thread_policy_with_tid(kernel_task, eq->pgo_tid,
4261 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
39236c6e 4262
316670eb
A
4263 eq->pgo_lowpriority = req_lowpriority;
4264 }
4265 vm_page_lock_queues();
4266 }
4267}
4268
4269
91447636
A
4270static void
4271vm_pageout_iothread_external(void)
4272{
2d21ac55
A
4273 thread_t self = current_thread();
4274
4275 self->options |= TH_OPT_VMPRIV;
91447636 4276
39037602 4277 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
39236c6e 4278
39037602
A
4279 proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4280 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
316670eb
A
4281
4282 vm_page_lock_queues();
4283
4284 vm_pageout_queue_external.pgo_tid = self->thread_id;
4285 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4286 vm_pageout_queue_external.pgo_inited = TRUE;
4287
4288 vm_page_unlock_queues();
4289
39037602 4290 vm_pageout_iothread_external_continue(&vm_pageout_queue_external);
316670eb 4291
91447636
A
4292 /*NOTREACHED*/
4293}
4294
39236c6e 4295
91447636 4296static void
39236c6e 4297vm_pageout_iothread_internal(struct cq *cq)
91447636
A
4298{
4299 thread_t self = current_thread();
4300
4301 self->options |= TH_OPT_VMPRIV;
4302
316670eb
A
4303 vm_page_lock_queues();
4304
4305 vm_pageout_queue_internal.pgo_tid = self->thread_id;
4306 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4307 vm_pageout_queue_internal.pgo_inited = TRUE;
4308
4309 vm_page_unlock_queues();
4310
39037602
A
4311 if (vm_restricted_to_single_processor == TRUE)
4312 thread_vm_bind_group_add();
39236c6e 4313
39037602 4314 vm_pageout_iothread_internal_continue(cq);
316670eb 4315
91447636
A
4316 /*NOTREACHED*/
4317}
4318
b0d623f7 4319kern_return_t
0b4c1975 4320vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
b0d623f7
A
4321{
4322 if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
4323 return KERN_SUCCESS;
4324 } else {
4325 return KERN_FAILURE; /* Already set */
4326 }
4327}
4328
39236c6e
A
4329extern boolean_t memorystatus_manual_testing_on;
4330extern unsigned int memorystatus_level;
4331
4332
39236c6e
A
4333#if VM_PRESSURE_EVENTS
4334
fe8ab488
A
4335boolean_t vm_pressure_events_enabled = FALSE;
4336
39236c6e
A
4337void
4338vm_pressure_response(void)
4339{
4340
39236c6e
A
4341 vm_pressure_level_t old_level = kVMPressureNormal;
4342 int new_level = -1;
39037602 4343 unsigned int total_pages;
fe8ab488
A
4344 uint64_t available_memory = 0;
4345
4346 if (vm_pressure_events_enabled == FALSE)
4347 return;
4348
4349
39037602 4350 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
fe8ab488 4351
39236c6e 4352
39037602
A
4353 total_pages = (unsigned int) atop_64(max_mem);
4354#if CONFIG_SECLUDED_MEMORY
4355 total_pages -= vm_page_secluded_count;
4356#endif /* CONFIG_SECLUDED_MEMORY */
4357 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
39236c6e
A
4358
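	/*
	 * Worked example with hypothetical numbers: if atop_64(max_mem) yields
	 * roughly 1,000,000 pages (after subtracting any secluded pages) and
	 * AVAILABLE_NON_COMPRESSED_MEMORY evaluates to 250,000 pages, then
	 * memorystatus_level = (250000 * 100) / 1000000 = 25, i.e. about 25%
	 * of memory is still available in the non-compressed sense.
	 */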
4359 if (memorystatus_manual_testing_on) {
4360 return;
4361 }
4362
4363 old_level = memorystatus_vm_pressure_level;
4364
4365 switch (memorystatus_vm_pressure_level) {
4366
4367 case kVMPressureNormal:
4368 {
4369 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4370 new_level = kVMPressureCritical;
4371 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4372 new_level = kVMPressureWarning;
4373 }
4374 break;
4375 }
4376
4377 case kVMPressureWarning:
4378 case kVMPressureUrgent:
4379 {
4380 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4381 new_level = kVMPressureNormal;
4382 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4383 new_level = kVMPressureCritical;
4384 }
4385 break;
4386 }
4387
4388 case kVMPressureCritical:
4389 {
4390 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4391 new_level = kVMPressureNormal;
4392 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4393 new_level = kVMPressureWarning;
4394 }
4395 break;
4396 }
4397
4398 default:
4399 return;
4400 }
4401
4402 if (new_level != -1) {
4403 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4404
fe8ab488 4405 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != new_level)) {
39236c6e
A
4406 if (vm_pressure_thread_running == FALSE) {
4407 thread_wakeup(&vm_pressure_thread);
4408 }
fe8ab488
A
4409
4410 if (old_level != new_level) {
4411 thread_wakeup(&vm_pressure_changed);
4412 }
39236c6e
A
4413 }
4414 }
4415
4416}
4417#endif /* VM_PRESSURE_EVENTS */
4418
4419kern_return_t
4420mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) {
4421
4422#if !VM_PRESSURE_EVENTS
fe8ab488 4423
39236c6e
A
4424 return KERN_FAILURE;
4425
4426#else /* VM_PRESSURE_EVENTS */
4427
4428 kern_return_t kr = KERN_SUCCESS;
4429
4430 if (pressure_level != NULL) {
4431
4432 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4433
4434 if (wait_for_pressure == TRUE) {
4435 wait_result_t wr = 0;
4436
4437 while (old_level == *pressure_level) {
4438 wr = assert_wait((event_t) &vm_pressure_changed,
4439 THREAD_INTERRUPTIBLE);
4440 if (wr == THREAD_WAITING) {
4441 wr = thread_block(THREAD_CONTINUE_NULL);
4442 }
4443 if (wr == THREAD_INTERRUPTED) {
4444 return KERN_ABORTED;
4445 }
4446 if (wr == THREAD_AWAKENED) {
4447
4448 old_level = memorystatus_vm_pressure_level;
4449
4450 if (old_level != *pressure_level) {
4451 break;
4452 }
4453 }
4454 }
4455 }
4456
4457 *pressure_level = old_level;
4458 kr = KERN_SUCCESS;
4459 } else {
4460 kr = KERN_INVALID_ARGUMENT;
4461 }
4462
4463 return kr;
4464#endif /* VM_PRESSURE_EVENTS */
4465}
4466
4467#if VM_PRESSURE_EVENTS
4468void
316670eb 4469vm_pressure_thread(void) {
fe8ab488 4470 static boolean_t thread_initialized = FALSE;
316670eb 4471
fe8ab488 4472 if (thread_initialized == TRUE) {
39236c6e 4473 vm_pressure_thread_running = TRUE;
316670eb 4474 consider_vm_pressure_events();
39236c6e 4475 vm_pressure_thread_running = FALSE;
316670eb
A
4476 }
4477
fe8ab488 4478 thread_initialized = TRUE;
316670eb
A
4479 assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
4480 thread_block((thread_continue_t)vm_pressure_thread);
4481}
39236c6e
A
4482#endif /* VM_PRESSURE_EVENTS */
4483
316670eb
A
4484
4485uint32_t vm_pageout_considered_page_last = 0;
4486
4487/*
4488 * called once per-second via "compute_averages"
4489 */
4490void
39037602 4491compute_pageout_gc_throttle(__unused void *arg)
316670eb
A
4492{
4493 if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
4494
4495 vm_pageout_considered_page_last = vm_pageout_considered_page;
4496
4497 thread_wakeup((event_t) &vm_pageout_garbage_collect);
4498 }
4499}
4500
4501
91447636
A
4502static void
4503vm_pageout_garbage_collect(int collect)
4504{
316670eb 4505
91447636 4506 if (collect) {
b0d623f7 4507 boolean_t buf_large_zfree = FALSE;
316670eb
A
4508 boolean_t first_try = TRUE;
4509
91447636
A
4510 stack_collect();
4511
91447636 4512 consider_machine_collect();
fe8ab488 4513 m_drain();
316670eb
A
4514
4515 do {
4516 if (consider_buffer_cache_collect != NULL) {
4517 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4518 }
4519 if (first_try == TRUE || buf_large_zfree == TRUE) {
4520 /*
4521 * consider_zone_gc should be last, because the other operations
4522 * might return memory to zones.
4523 */
39037602 4524 consider_zone_gc();
316670eb
A
4525 }
4526 first_try = FALSE;
4527
4528 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
91447636
A
4529
4530 consider_machine_adjust();
4531 }
91447636
A
4532 assert_wait((event_t) &vm_pageout_garbage_collect, THREAD_UNINT);
4533
4534 thread_block_parameter((thread_continue_t) vm_pageout_garbage_collect, (void *)1);
4535 /*NOTREACHED*/
4536}
4537
4538
15129b1c
A
4539#if VM_PAGE_BUCKETS_CHECK
4540#if VM_PAGE_FAKE_BUCKETS
4541extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4542#endif /* VM_PAGE_FAKE_BUCKETS */
4543#endif /* VM_PAGE_BUCKETS_CHECK */
91447636 4544
39037602 4545
fe8ab488 4546#define FBDP_TEST_COLLAPSE_COMPRESSOR 0
39037602
A
4547#define FBDP_TEST_WIRE_AND_EXTRACT 0
4548#define FBDP_TEST_PAGE_WIRE_OVERFLOW 0
4549
fe8ab488
A
4550#if FBDP_TEST_COLLAPSE_COMPRESSOR
4551extern boolean_t vm_object_collapse_compressor_allowed;
4552#include <IOKit/IOLib.h>
4553#endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */
4554
fe8ab488
A
4555#if FBDP_TEST_WIRE_AND_EXTRACT
4556extern ledger_template_t task_ledger_template;
4557#include <mach/mach_vm.h>
4558extern ppnum_t vm_map_get_phys_page(vm_map_t map,
4559 vm_offset_t offset);
4560#endif /* FBDP_TEST_WIRE_AND_EXTRACT */
4561
3e170ce0
A
4562
4563void
4564vm_set_restrictions()
4565{
4566 host_basic_info_data_t hinfo;
4567 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
4568
4569#define BSD_HOST 1
4570 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
4571
4572 assert(hinfo.max_cpus > 0);
4573
4574 if (hinfo.max_cpus <= 3) {
4575 /*
4576 * on systems with a limited number of CPUs, bind the
4577 * 4 major threads that can free memory and that tend to use
4578 * a fair bit of CPU under pressured conditions to a single processor.
4579 * This ensures that these threads don't hog all of the available CPUs
4580 * (important for camera launch), while allowing them to run independently
4581 * w/r to locks... the 4 threads are
4582 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
4583 * vm_compressor_swap_trigger_thread (minor and major compactions),
4584 * memorystatus_thread (jetsams).
4585 *
4586 * the first time the thread is run, it is responsible for checking the
4587 * state of vm_restricted_to_single_processor, and if TRUE it calls
4588 * thread_bind_master... someday this should be replaced with a group
4589 * scheduling mechanism and KPI.
4590 */
4591 vm_restricted_to_single_processor = TRUE;
4592 }
4593}
4594
4595
91447636
A
4596void
4597vm_pageout(void)
4598{
4599 thread_t self = current_thread();
4600 thread_t thread;
4601 kern_return_t result;
4602 spl_t s;
4603
4604 /*
4605 * Set thread privileges.
4606 */
4607 s = splsched();
3e170ce0 4608
91447636 4609 thread_lock(self);
3e170ce0
A
4610 self->options |= TH_OPT_VMPRIV;
4611 sched_set_thread_base_priority(self, BASEPRI_PREEMPT - 1);
91447636 4612 thread_unlock(self);
2d21ac55
A
4613
4614 if (!self->reserved_stack)
4615 self->reserved_stack = self->kernel_stack;
4616
3e170ce0
A
4617 if (vm_restricted_to_single_processor == TRUE)
4618 thread_vm_bind_group_add();
4619
91447636
A
4620 splx(s);
4621
4622 /*
4623 * Initialize some paging parameters.
4624 */
4625
39236c6e
A
4626 if (vm_pageout_swap_wait == 0)
4627 vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
4628
91447636
A
4629 if (vm_pageout_idle_wait == 0)
4630 vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
4631
4632 if (vm_pageout_burst_wait == 0)
4633 vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
4634
4635 if (vm_pageout_empty_wait == 0)
4636 vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
4637
4638 if (vm_pageout_deadlock_wait == 0)
4639 vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
4640
4641 if (vm_pageout_deadlock_relief == 0)
4642 vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
4643
4644 if (vm_pageout_inactive_relief == 0)
4645 vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
4646
4647 if (vm_pageout_burst_active_throttle == 0)
4648 vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
4649
4650 if (vm_pageout_burst_inactive_throttle == 0)
4651 vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
4652
4653 /*
4654 * Set kernel task to low backing store privileged
55e303ae
A
4655 * status
4656 */
4657 task_lock(kernel_task);
4658 kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
4659 task_unlock(kernel_task);
4660
1c79356b 4661 vm_page_free_count_init = vm_page_free_count;
2d21ac55 4662
1c79356b
A
4663 /*
4664 * even if we've already called vm_page_free_reserve
4665 * call it again here to ensure that the targets are
4666 * accurately calculated (it uses vm_page_free_count_init)
4667 * calling it with an arg of 0 will not change the reserve
4668 * but will re-calculate free_min and free_target
4669 */
91447636
A
4670 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
4671 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
55e303ae 4672 } else
1c79356b
A
4673 vm_page_free_reserve(0);
4674
55e303ae 4675
39037602 4676 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
91447636
A
4677 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
4678 vm_pageout_queue_external.pgo_laundry = 0;
4679 vm_pageout_queue_external.pgo_idle = FALSE;
4680 vm_pageout_queue_external.pgo_busy = FALSE;
4681 vm_pageout_queue_external.pgo_throttled = FALSE;
0b4c1975 4682 vm_pageout_queue_external.pgo_draining = FALSE;
316670eb
A
4683 vm_pageout_queue_external.pgo_lowpriority = FALSE;
4684 vm_pageout_queue_external.pgo_tid = -1;
4685 vm_pageout_queue_external.pgo_inited = FALSE;
4686
39037602 4687 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
2d21ac55 4688 vm_pageout_queue_internal.pgo_maxlaundry = 0;
91447636
A
4689 vm_pageout_queue_internal.pgo_laundry = 0;
4690 vm_pageout_queue_internal.pgo_idle = FALSE;
4691 vm_pageout_queue_internal.pgo_busy = FALSE;
4692 vm_pageout_queue_internal.pgo_throttled = FALSE;
0b4c1975 4693 vm_pageout_queue_internal.pgo_draining = FALSE;
316670eb
A
4694 vm_pageout_queue_internal.pgo_lowpriority = FALSE;
4695 vm_pageout_queue_internal.pgo_tid = -1;
4696 vm_pageout_queue_internal.pgo_inited = FALSE;
55e303ae 4697
2d21ac55
A
4698 /* internal pageout thread started when default pager registered first time */
4699 /* external pageout and garbage collection threads started here */
55e303ae 4700
2d21ac55
A
4701 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
4702 BASEPRI_PREEMPT - 1,
4703 &vm_pageout_external_iothread);
91447636
A
4704 if (result != KERN_SUCCESS)
4705 panic("vm_pageout_iothread_external: create failed");
55e303ae 4706
2d21ac55 4707 thread_deallocate(vm_pageout_external_iothread);
9bccf70c 4708
2d21ac55 4709 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
316670eb 4710 BASEPRI_DEFAULT,
2d21ac55 4711 &thread);
91447636
A
4712 if (result != KERN_SUCCESS)
4713 panic("vm_pageout_garbage_collect: create failed");
55e303ae 4714
91447636 4715 thread_deallocate(thread);
55e303ae 4716
39236c6e 4717#if VM_PRESSURE_EVENTS
316670eb
A
4718 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
4719 BASEPRI_DEFAULT,
4720 &thread);
4721
4722 if (result != KERN_SUCCESS)
4723 panic("vm_pressure_thread: create failed");
4724
4725 thread_deallocate(thread);
39236c6e 4726#endif
316670eb 4727
8f6c56a5 4728 vm_object_reaper_init();
39037602
A
4729
4730
4731 bzero(&vm_config, sizeof(vm_config));
4732
4733 switch(vm_compressor_mode) {
4734
4735 case VM_PAGER_DEFAULT:
4736 printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
4737
4738 case VM_PAGER_COMPRESSOR_WITH_SWAP:
4739 vm_config.compressor_is_present = TRUE;
4740 vm_config.swap_is_present = TRUE;
4741 vm_config.compressor_is_active = TRUE;
4742 vm_config.swap_is_active = TRUE;
4743 break;
4744
4745 case VM_PAGER_COMPRESSOR_NO_SWAP:
4746 vm_config.compressor_is_present = TRUE;
4747 vm_config.swap_is_present = TRUE;
4748 vm_config.compressor_is_active = TRUE;
4749 break;
4750
4751 case VM_PAGER_FREEZER_DEFAULT:
4752 printf("mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
4753
4754 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
4755 vm_config.compressor_is_present = TRUE;
4756 vm_config.swap_is_present = TRUE;
4757 break;
4758
4759 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
4760 vm_config.compressor_is_present = TRUE;
4761 vm_config.swap_is_present = TRUE;
4762 vm_config.compressor_is_active = TRUE;
4763 vm_config.freezer_swap_is_active = TRUE;
4764 break;
4765
4766 case VM_PAGER_NOT_CONFIGURED:
4767 break;
4768
4769 default:
4770 printf("unknown compressor mode - %x\n", vm_compressor_mode);
4771 break;
4772 }
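	/*
	 * Net effect of the switch above (note the intentional fall-throughs
	 * from the two deprecated modes):
	 *
	 *   VM_PAGER_DEFAULT / VM_PAGER_COMPRESSOR_WITH_SWAP:
	 *       compressor + swap present and active
	 *   VM_PAGER_COMPRESSOR_NO_SWAP:
	 *       compressor + swap present, only the compressor active
	 *   VM_PAGER_FREEZER_DEFAULT / VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
	 *       compressor + swap present, neither active
	 *   VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
	 *       compressor active, swap active only for the freezer
	 *   VM_PAGER_NOT_CONFIGURED:
	 *       nothing configured
	 */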
4773 if (VM_CONFIG_COMPRESSOR_IS_PRESENT)
39236c6e 4774 vm_compressor_pager_init();
2d21ac55 4775
fe8ab488
A
4776#if VM_PRESSURE_EVENTS
4777 vm_pressure_events_enabled = TRUE;
4778#endif /* VM_PRESSURE_EVENTS */
4779
4780#if CONFIG_PHANTOM_CACHE
4781 vm_phantom_cache_init();
4782#endif
15129b1c
A
4783#if VM_PAGE_BUCKETS_CHECK
4784#if VM_PAGE_FAKE_BUCKETS
4785 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
fe8ab488
A
4786 (uint64_t) vm_page_fake_buckets_start,
4787 (uint64_t) vm_page_fake_buckets_end);
15129b1c
A
4788 pmap_protect(kernel_pmap,
4789 vm_page_fake_buckets_start,
4790 vm_page_fake_buckets_end,
4791 VM_PROT_READ);
4792// *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
4793#endif /* VM_PAGE_FAKE_BUCKETS */
4794#endif /* VM_PAGE_BUCKETS_CHECK */
4795
fe8ab488
A
4796#if VM_OBJECT_TRACKING
4797 vm_object_tracking_init();
4798#endif /* VM_OBJECT_TRACKING */
4799
4800
4801#if FBDP_TEST_COLLAPSE_COMPRESSOR
4802 vm_object_size_t backing_size, top_size;
4803 vm_object_t backing_object, top_object;
4804 vm_map_offset_t backing_offset, top_offset;
4805 unsigned char *backing_address, *top_address;
4806 kern_return_t kr;
4807
4808 printf("FBDP_TEST_COLLAPSE_COMPRESSOR:\n");
4809
4810 /* create backing object */
4811 backing_size = 15 * PAGE_SIZE;
4812 backing_object = vm_object_allocate(backing_size);
4813 assert(backing_object != VM_OBJECT_NULL);
4814 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: created backing object %p\n",
4815 backing_object);
4816 /* map backing object */
4817 backing_offset = 0;
4818 kr = vm_map_enter(kernel_map, &backing_offset, backing_size, 0,
4819 VM_FLAGS_ANYWHERE, backing_object, 0, FALSE,
4820 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
4821 assert(kr == KERN_SUCCESS);
4822 backing_address = (unsigned char *) backing_offset;
4823 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4824 "mapped backing object %p at 0x%llx\n",
4825 backing_object, (uint64_t) backing_offset);
4826 /* populate with pages to be compressed in backing object */
4827 backing_address[0x1*PAGE_SIZE] = 0xB1;
4828 backing_address[0x4*PAGE_SIZE] = 0xB4;
4829 backing_address[0x7*PAGE_SIZE] = 0xB7;
4830 backing_address[0xa*PAGE_SIZE] = 0xBA;
4831 backing_address[0xd*PAGE_SIZE] = 0xBD;
4832 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4833 "populated pages to be compressed in "
4834 "backing_object %p\n", backing_object);
4835 /* compress backing object */
4836 vm_object_pageout(backing_object);
4837 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: compressing backing_object %p\n",
4838 backing_object);
4839 /* wait for all the pages to be gone */
4840 while (*(volatile int *)&backing_object->resident_page_count != 0)
4841 IODelay(10);
4842 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: backing_object %p compressed\n",
4843 backing_object);
4844 /* populate with pages to be resident in backing object */
4845 backing_address[0x0*PAGE_SIZE] = 0xB0;
4846 backing_address[0x3*PAGE_SIZE] = 0xB3;
4847 backing_address[0x6*PAGE_SIZE] = 0xB6;
4848 backing_address[0x9*PAGE_SIZE] = 0xB9;
4849 backing_address[0xc*PAGE_SIZE] = 0xBC;
4850 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4851 "populated pages to be resident in "
4852 "backing_object %p\n", backing_object);
4853 /* leave the other pages absent */
4854 /* mess with the paging_offset of the backing_object */
4855 assert(backing_object->paging_offset == 0);
4856 backing_object->paging_offset = 0x3000;
4857
4858 /* create top object */
4859 top_size = 9 * PAGE_SIZE;
4860 top_object = vm_object_allocate(top_size);
4861 assert(top_object != VM_OBJECT_NULL);
4862 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: created top object %p\n",
4863 top_object);
4864 /* map top object */
4865 top_offset = 0;
4866 kr = vm_map_enter(kernel_map, &top_offset, top_size, 0,
4867 VM_FLAGS_ANYWHERE, top_object, 0, FALSE,
4868 VM_PROT_DEFAULT, VM_PROT_DEFAULT, VM_INHERIT_DEFAULT);
4869 assert(kr == KERN_SUCCESS);
4870 top_address = (unsigned char *) top_offset;
4871 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4872 "mapped top object %p at 0x%llx\n",
4873 top_object, (uint64_t) top_offset);
4874 /* populate with pages to be compressed in top object */
4875 top_address[0x3*PAGE_SIZE] = 0xA3;
4876 top_address[0x4*PAGE_SIZE] = 0xA4;
4877 top_address[0x5*PAGE_SIZE] = 0xA5;
4878 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4879 "populated pages to be compressed in "
4880 "top_object %p\n", top_object);
4881 /* compress top object */
4882 vm_object_pageout(top_object);
4883 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: compressing top_object %p\n",
4884 top_object);
4885 /* wait for all the pages to be gone */
4886 while (top_object->resident_page_count != 0);
4887 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: top_object %p compressed\n",
4888 top_object);
4889 /* populate with pages to be resident in top object */
4890 top_address[0x0*PAGE_SIZE] = 0xA0;
4891 top_address[0x1*PAGE_SIZE] = 0xA1;
4892 top_address[0x2*PAGE_SIZE] = 0xA2;
4893 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4894 "populated pages to be resident in "
4895 "top_object %p\n", top_object);
4896 /* leave the other pages absent */
4897
4898 /* link the 2 objects */
4899 vm_object_reference(backing_object);
4900 top_object->shadow = backing_object;
4901 top_object->vo_shadow_offset = 0x3000;
4902 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: linked %p and %p\n",
4903 top_object, backing_object);
4904
4905 /* unmap backing object */
4906 vm_map_remove(kernel_map,
4907 backing_offset,
4908 backing_offset + backing_size,
4909 0);
4910 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4911 "unmapped backing_object %p [0x%llx:0x%llx]\n",
4912 backing_object,
4913 (uint64_t) backing_offset,
4914 (uint64_t) (backing_offset + backing_size));
4915
4916 /* collapse */
4917 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: collapsing %p\n", top_object);
4918 vm_object_lock(top_object);
4919 vm_object_collapse(top_object, 0, FALSE);
4920 vm_object_unlock(top_object);
4921 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: collapsed %p\n", top_object);
4922
4923 /* did it work? */
4924 if (top_object->shadow != VM_OBJECT_NULL) {
4925 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: not collapsed\n");
4926 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4927 if (vm_object_collapse_compressor_allowed) {
4928 panic("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4929 }
4930 } else {
4931 /* check the contents of the mapping */
4932 unsigned char expect[9] =
4933 { 0xA0, 0xA1, 0xA2, /* resident in top */
4934 0xA3, 0xA4, 0xA5, /* compressed in top */
4935 0xB9, /* resident in backing + shadow_offset */
4936 0xBD, /* compressed in backing + shadow_offset + paging_offset */
4937 0x00 }; /* absent in both */
4938 unsigned char actual[9];
4939 unsigned int i, errors;
4940
4941 errors = 0;
4942 for (i = 0; i < sizeof (actual); i++) {
4943 actual[i] = (unsigned char) top_address[i*PAGE_SIZE];
4944 if (actual[i] != expect[i]) {
4945 errors++;
4946 }
4947 }
4948 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: "
4949 "actual [%x %x %x %x %x %x %x %x %x] "
4950 "expect [%x %x %x %x %x %x %x %x %x] "
4951 "%d errors\n",
4952 actual[0], actual[1], actual[2], actual[3],
4953 actual[4], actual[5], actual[6], actual[7],
4954 actual[8],
4955 expect[0], expect[1], expect[2], expect[3],
4956 expect[4], expect[5], expect[6], expect[7],
4957 expect[8],
4958 errors);
4959 if (errors) {
4960 panic("FBDP_TEST_COLLAPSE_COMPRESSOR: FAIL\n");
4961 } else {
4962 printf("FBDP_TEST_COLLAPSE_COMPRESSOR: PASS\n");
4963 }
4964 }
4965#endif /* FBDP_TEST_COLLAPSE_COMPRESSOR */
4966
4967#if FBDP_TEST_WIRE_AND_EXTRACT
4968 ledger_t ledger;
4969 vm_map_t user_map, wire_map;
4970 mach_vm_address_t user_addr, wire_addr;
4971 mach_vm_size_t user_size, wire_size;
4972 mach_vm_offset_t cur_offset;
4973 vm_prot_t cur_prot, max_prot;
4974 ppnum_t user_ppnum, wire_ppnum;
4975 kern_return_t kr;
4976
4977 ledger = ledger_instantiate(task_ledger_template,
4978 LEDGER_CREATE_ACTIVE_ENTRIES);
3e170ce0 4979 user_map = vm_map_create(pmap_create(ledger, 0, PMAP_CREATE_64BIT),
fe8ab488
A
4980 0x100000000ULL,
4981 0x200000000ULL,
4982 TRUE);
4983 wire_map = vm_map_create(NULL,
4984 0x100000000ULL,
4985 0x200000000ULL,
4986 TRUE);
4987 user_addr = 0;
4988 user_size = 0x10000;
4989 kr = mach_vm_allocate(user_map,
4990 &user_addr,
4991 user_size,
4992 VM_FLAGS_ANYWHERE);
4993 assert(kr == KERN_SUCCESS);
4994 wire_addr = 0;
4995 wire_size = user_size;
4996 kr = mach_vm_remap(wire_map,
4997 &wire_addr,
4998 wire_size,
4999 0,
5000 VM_FLAGS_ANYWHERE,
5001 user_map,
5002 user_addr,
5003 FALSE,
5004 &cur_prot,
5005 &max_prot,
5006 VM_INHERIT_NONE);
5007 assert(kr == KERN_SUCCESS);
5008 for (cur_offset = 0;
5009 cur_offset < wire_size;
5010 cur_offset += PAGE_SIZE) {
5011 kr = vm_map_wire_and_extract(wire_map,
5012 wire_addr + cur_offset,
39037602 5013 VM_PROT_DEFAULT | VM_PROT_MEMORY_TAG_MAKE(VM_KERN_MEMORY_OSFMK),
fe8ab488
A
5014 TRUE,
5015 &wire_ppnum);
5016 assert(kr == KERN_SUCCESS);
5017 user_ppnum = vm_map_get_phys_page(user_map,
5018 user_addr + cur_offset);
5019 printf("FBDP_TEST_WIRE_AND_EXTRACT: kr=0x%x "
5020 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
5021 kr,
5022 user_map, user_addr + cur_offset, user_ppnum,
5023 wire_map, wire_addr + cur_offset, wire_ppnum);
5024 if (kr != KERN_SUCCESS ||
5025 wire_ppnum == 0 ||
5026 wire_ppnum != user_ppnum) {
5027 panic("FBDP_TEST_WIRE_AND_EXTRACT: FAIL\n");
5028 }
5029 }
5030 cur_offset -= PAGE_SIZE;
5031 kr = vm_map_wire_and_extract(wire_map,
5032 wire_addr + cur_offset,
5033 VM_PROT_DEFAULT,
5034 TRUE,
5035 &wire_ppnum);
5036 assert(kr == KERN_SUCCESS);
5037 printf("FBDP_TEST_WIRE_AND_EXTRACT: re-wire kr=0x%x "
5038 "user[%p:0x%llx:0x%x] wire[%p:0x%llx:0x%x]\n",
5039 kr,
5040 user_map, user_addr + cur_offset, user_ppnum,
5041 wire_map, wire_addr + cur_offset, wire_ppnum);
5042 if (kr != KERN_SUCCESS ||
5043 wire_ppnum == 0 ||
5044 wire_ppnum != user_ppnum) {
5045 panic("FBDP_TEST_WIRE_AND_EXTRACT: FAIL\n");
5046 }
5047
5048 printf("FBDP_TEST_WIRE_AND_EXTRACT: PASS\n");
5049#endif /* FBDP_TEST_WIRE_AND_EXTRACT */
5050
39037602
A
5051#if FBDP_TEST_PAGE_WIRE_OVERFLOW
5052 vm_object_t fbdp_object;
5053 vm_page_t fbdp_page;
5054
5055 printf("FBDP_TEST_PAGE_WIRE_OVERFLOW: starting...\n");
5056
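	/*
	 * This self-test repeatedly wires the same page until its wire_count
	 * would wrap back to 0.  vm_page_wire() is expected to detect the
	 * overflow before that happens, so falling out of the loop and hitting
	 * the panic below means the overflow check did not fire.
	 */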
5057 fbdp_object = vm_object_allocate(PAGE_SIZE);
5058 vm_object_lock(fbdp_object);
5059 fbdp_page = vm_page_alloc(fbdp_object, 0x0);
5060 vm_page_lock_queues();
5061 do {
5062 vm_page_wire(fbdp_page, 1, FALSE);
5063 } while (fbdp_page->wire_count != 0);
5064 vm_page_unlock_queues();
5065 vm_object_unlock(fbdp_object);
5066 panic("FBDP(%p,%p): wire_count overflow not detected\n",
5067 fbdp_object, fbdp_page);
5068#endif /* FBDP_TEST_PAGE_WIRE_OVERFLOW */
5069
91447636 5070 vm_pageout_continue();
2d21ac55
A
5071
5072 /*
5073 * Unreached code!
5074 *
5075 * The vm_pageout_continue() call above never returns, so the code below is never
5076 * executed. We take advantage of this to declare several DTrace VM related probe
5077 * points that our kernel doesn't have an analog for. These are probe points that
5078 * exist in Solaris and are in the DTrace documentation, so people may have written
5079 * scripts that use them. Declaring the probe points here means their scripts will
 5080 * compile and execute, which we want for portability of the scripts, but since this
5081 * section of code is never reached, the probe points will simply never fire. Yes,
5082 * this is basically a hack. The problem is the DTrace probe points were chosen with
5083 * Solaris specific VM events in mind, not portability to different VM implementations.
5084 */
5085
5086 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5087 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5088 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5089 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5090 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5091 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5092 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
91447636 5093 /*NOTREACHED*/
9bccf70c
A
5094}
5095
39236c6e
A
5096
5097
39236c6e
A
5098int vm_compressor_thread_count = 2;
5099
2d21ac55
A
5100kern_return_t
5101vm_pageout_internal_start(void)
5102{
39236c6e
A
5103 kern_return_t result;
5104 int i;
5105 host_basic_info_data_t hinfo;
3e170ce0 5106
39037602 5107 assert (VM_CONFIG_COMPRESSOR_IS_PRESENT);
39236c6e 5108
39037602 5109 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
39236c6e 5110#define BSD_HOST 1
39037602 5111 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
39236c6e 5112
39037602 5113 assert(hinfo.max_cpus > 0);
3e170ce0 5114
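	/*
	 * Clamp the number of compressor threads: keep it below the number of
	 * CPUs where possible and within [1, MAX_COMPRESSOR_THREAD_COUNT].
	 * If "immediate" compression is preferred, a single thread with its
	 * own scratch buffer is used instead.  The internal pageout queue's
	 * maxlaundry is then scaled to 4 * VM_PAGE_LAUNDRY_MAX per
	 * compressor thread.
	 */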
39037602
A
5115 if (vm_compressor_thread_count >= hinfo.max_cpus)
5116 vm_compressor_thread_count = hinfo.max_cpus - 1;
5117 if (vm_compressor_thread_count <= 0)
5118 vm_compressor_thread_count = 1;
5119 else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
5120 vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
3e170ce0 5121
39037602
A
5122 if (vm_compressor_immediate_preferred == TRUE) {
5123 vm_pageout_immediate_chead = NULL;
5124 vm_pageout_immediate_scratch_buf = kalloc(vm_compressor_get_encode_scratch_size());
39236c6e 5125
39037602 5126 vm_compressor_thread_count = 1;
39236c6e 5127 }
2d21ac55 5128
39037602
A
5129 vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
5130
39236c6e 5131 for (i = 0; i < vm_compressor_thread_count; i++) {
3e170ce0
A
5132 ciq[i].id = i;
5133 ciq[i].q = &vm_pageout_queue_internal;
5134 ciq[i].current_chead = NULL;
5135 ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE);
39037602 5136
39236c6e 5137 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_PREEMPT - 1, &vm_pageout_internal_iothread);
3e170ce0 5138
39236c6e
A
5139 if (result == KERN_SUCCESS)
5140 thread_deallocate(vm_pageout_internal_iothread);
5141 else
5142 break;
5143 }
2d21ac55
A
5144 return result;
5145}
5146
fe8ab488
A
5147#if CONFIG_IOSCHED
5148/*
5149 * To support I/O Expedite for compressed files we mark the upls with special flags.
5150 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5151 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5152 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5153 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
5154 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5155 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 5156 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5157 * unless the real I/O upl is being destroyed).
5158 */
5159
5160
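/*
 * Illustrative sketch (not actual decmpfs code) of the calling sequence the
 * comment above describes; "big_upl" is a hypothetical name for the request
 * UPL covering the decompressed range.
 */
#if 0
	upl_t	big_upl;		/* UPL_DECMP_REQ request UPL held by decmpfs */

	/* ... create big_upl over the pages backing the compressed file ... */

	upl_mark_decmp(big_upl);	/* tag it and record it on its creator thread */
	/*
	 * Any expedite-capable internal UPL this thread now creates for the
	 * smaller compressed-data I/Os is linked back to big_upl by
	 * upl_set_decmp_info(), called from upl_create().
	 */
	/* ... issue the real I/Os, inflate the data into big_upl's pages ... */
	upl_unmark_decmp(big_upl);	/* detach the thread from the request UPL */
#endif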
5161static void
5162upl_set_decmp_info(upl_t upl, upl_t src_upl)
5163{
5164 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5165
5166 upl_lock(src_upl);
5167 if (src_upl->decmp_io_upl) {
5168 /*
5169 * If there is already an alive real I/O UPL, ignore this new UPL.
5170 * This case should rarely happen and even if it does, it just means
5171 * that we might issue a spurious expedite which the driver is expected
5172 * to handle.
5173 */
5174 upl_unlock(src_upl);
5175 return;
5176 }
5177 src_upl->decmp_io_upl = (void *)upl;
5178 src_upl->ref_count++;
fe8ab488
A
5179
5180 upl->flags |= UPL_DECMP_REAL_IO;
5181 upl->decmp_io_upl = (void *)src_upl;
04b8595b 5182 upl_unlock(src_upl);
fe8ab488
A
5183}
5184#endif /* CONFIG_IOSCHED */
5185
5186#if UPL_DEBUG
5187int upl_debug_enabled = 1;
5188#else
5189int upl_debug_enabled = 0;
5190#endif
1c79356b 5191
b0d623f7
A
5192static upl_t
5193upl_create(int type, int flags, upl_size_t size)
0b4e3aa0
A
5194{
5195 upl_t upl;
39236c6e 5196 vm_size_t page_field_size = 0;
2d21ac55 5197 int upl_flags = 0;
39236c6e 5198 vm_size_t upl_size = sizeof(struct upl);
0b4e3aa0 5199
b0d623f7
A
5200 size = round_page_32(size);
5201
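	/*
	 * A "lite" UPL tracks its pages in a bitmap appended to the upl
	 * structure: one bit per page, rounded up to whole bytes and then
	 * to a 4-byte boundary.
	 */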
2d21ac55 5202 if (type & UPL_CREATE_LITE) {
b0d623f7 5203 page_field_size = (atop(size) + 7) >> 3;
55e303ae 5204 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
2d21ac55
A
5205
5206 upl_flags |= UPL_LITE;
55e303ae 5207 }
2d21ac55 5208 if (type & UPL_CREATE_INTERNAL) {
39236c6e 5209 upl_size += sizeof(struct upl_page_info) * atop(size);
2d21ac55
A
5210
5211 upl_flags |= UPL_INTERNAL;
0b4e3aa0 5212 }
2d21ac55
A
5213 upl = (upl_t)kalloc(upl_size + page_field_size);
5214
5215 if (page_field_size)
5216 bzero((char *)upl + upl_size, page_field_size);
5217
5218 upl->flags = upl_flags | flags;
0b4e3aa0
A
5219 upl->kaddr = (vm_offset_t)0;
5220 upl->size = 0;
5221 upl->map_object = NULL;
5222 upl->ref_count = 1;
6d2010ae 5223 upl->ext_ref_count = 0;
0c530ab8 5224 upl->highest_page = 0;
0b4e3aa0 5225 upl_lock_init(upl);
b0d623f7 5226 upl->vector_upl = NULL;
3e170ce0 5227 upl->associated_upl = NULL;
fe8ab488
A
5228#if CONFIG_IOSCHED
5229 if (type & UPL_CREATE_IO_TRACKING) {
5230 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5231 }
5232
5233 upl->upl_reprio_info = 0;
5234 upl->decmp_io_upl = 0;
5235 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5236 /* Only support expedite on internal UPLs */
5237 thread_t curthread = current_thread();
5238 upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size));
5239 bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size)));
5240 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5241 if (curthread->decmp_upl != NULL)
5242 upl_set_decmp_info(upl, curthread->decmp_upl);
5243 }
5244#endif
5245#if CONFIG_IOSCHED || UPL_DEBUG
5246 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5247 upl->upl_creator = current_thread();
5248 upl->uplq.next = 0;
5249 upl->uplq.prev = 0;
5250 upl->flags |= UPL_TRACKED_BY_OBJECT;
5251 }
5252#endif
5253
b0d623f7 5254#if UPL_DEBUG
0b4e3aa0
A
5255 upl->ubc_alias1 = 0;
5256 upl->ubc_alias2 = 0;
b0d623f7 5257
b0d623f7
A
5258 upl->upl_state = 0;
5259 upl->upl_commit_index = 0;
5260 bzero(&upl->upl_commit_records[0], sizeof(upl->upl_commit_records));
5261
5262 (void) OSBacktrace(&upl->upl_create_retaddr[0], UPL_DEBUG_STACK_FRAMES);
91447636 5263#endif /* UPL_DEBUG */
b0d623f7 5264
0b4e3aa0
A
5265 return(upl);
5266}
5267
5268static void
2d21ac55 5269upl_destroy(upl_t upl)
0b4e3aa0 5270{
55e303ae 5271 int page_field_size; /* bit field in word size buf */
2d21ac55 5272 int size;
0b4e3aa0 5273
6d2010ae
A
5274 if (upl->ext_ref_count) {
5275 panic("upl(%p) ext_ref_count", upl);
5276 }
5277
fe8ab488
A
5278#if CONFIG_IOSCHED
5279 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5280 upl_t src_upl;
5281 src_upl = upl->decmp_io_upl;
5282 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5283 upl_lock(src_upl);
5284 src_upl->decmp_io_upl = NULL;
5285 upl_unlock(src_upl);
5286 upl_deallocate(src_upl);
5287 }
5288#endif /* CONFIG_IOSCHED */
5289
5290#if CONFIG_IOSCHED || UPL_DEBUG
5291 if ((upl->flags & UPL_TRACKED_BY_OBJECT) && !(upl->flags & UPL_VECTOR)) {
55e303ae 5292 vm_object_t object;
2d21ac55
A
5293
5294 if (upl->flags & UPL_SHADOWED) {
55e303ae
A
5295 object = upl->map_object->shadow;
5296 } else {
5297 object = upl->map_object;
5298 }
fe8ab488 5299
55e303ae 5300 vm_object_lock(object);
2d21ac55 5301 queue_remove(&object->uplq, upl, upl_t, uplq);
316670eb
A
5302 vm_object_activity_end(object);
5303 vm_object_collapse(object, 0, TRUE);
55e303ae 5304 vm_object_unlock(object);
0b4e3aa0 5305 }
fe8ab488 5306#endif
2d21ac55
A
5307 /*
5308 * drop a reference on the map_object whether or
5309 * not a pageout object is inserted
5310 */
5311 if (upl->flags & UPL_SHADOWED)
0b4e3aa0 5312 vm_object_deallocate(upl->map_object);
55e303ae 5313
2d21ac55
A
5314 if (upl->flags & UPL_DEVICE_MEMORY)
5315 size = PAGE_SIZE;
5316 else
5317 size = upl->size;
55e303ae 5318 page_field_size = 0;
2d21ac55 5319
55e303ae 5320 if (upl->flags & UPL_LITE) {
2d21ac55 5321 page_field_size = ((size/PAGE_SIZE) + 7) >> 3;
55e303ae
A
5322 page_field_size = (page_field_size + 3) & 0xFFFFFFFC;
5323 }
b0d623f7
A
5324 upl_lock_destroy(upl);
5325 upl->vector_upl = (vector_upl_t) 0xfeedbeef;
316670eb 5326
fe8ab488
A
5327#if CONFIG_IOSCHED
5328 if (upl->flags & UPL_EXPEDITE_SUPPORTED)
5329 kfree(upl->upl_reprio_info, sizeof(uint64_t) * (size/PAGE_SIZE));
5330#endif
5331
2d21ac55 5332 if (upl->flags & UPL_INTERNAL) {
91447636
A
5333 kfree(upl,
5334 sizeof(struct upl) +
2d21ac55 5335 (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
91447636 5336 + page_field_size);
0b4e3aa0 5337 } else {
91447636 5338 kfree(upl, sizeof(struct upl) + page_field_size);
0b4e3aa0
A
5339 }
5340}
5341
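/*
 * Drop a reference on the UPL; the UPL is torn down via upl_destroy() when
 * the last reference goes away.
 */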
0b4e3aa0 5342void
2d21ac55 5343upl_deallocate(upl_t upl)
0b4e3aa0 5344{
fe8ab488 5345 upl_lock(upl);
b0d623f7
A
5346 if (--upl->ref_count == 0) {
5347 if(vector_upl_is_valid(upl))
5348 vector_upl_deallocate(upl);
fe8ab488 5349 upl_unlock(upl);
0b4e3aa0 5350 upl_destroy(upl);
b0d623f7 5351 }
fe8ab488
A
5352 else
5353 upl_unlock(upl);
5354}
5355
5356#if CONFIG_IOSCHED
5357void
5358upl_mark_decmp(upl_t upl)
5359{
5360 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5361 upl->flags |= UPL_DECMP_REQ;
5362 upl->upl_creator->decmp_upl = (void *)upl;
5363 }
5364}
5365
5366void
5367upl_unmark_decmp(upl_t upl)
5368{
5369 if(upl && (upl->flags & UPL_DECMP_REQ)) {
5370 upl->upl_creator->decmp_upl = NULL;
5371 }
5372}
5373
5374#endif /* CONFIG_IOSCHED */
5375
5376#define VM_PAGE_Q_BACKING_UP(q) \
5377 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5378
5379boolean_t must_throttle_writes(void);
5380
5381boolean_t
5382must_throttle_writes()
5383{
5384 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5385 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10)
5386 return (TRUE);
5387
5388 return (FALSE);
0b4e3aa0 5389}
1c79356b 5390
fe8ab488 5391
b0d623f7
A
5392#if DEVELOPMENT || DEBUG
5393/*/*
91447636
A
5394 * Statistics about UPL enforcement of copy-on-write obligations.
5395 */
5396unsigned long upl_cow = 0;
5397unsigned long upl_cow_again = 0;
91447636
A
5398unsigned long upl_cow_pages = 0;
5399unsigned long upl_cow_again_pages = 0;
b0d623f7
A
5400
5401unsigned long iopl_cow = 0;
5402unsigned long iopl_cow_pages = 0;
5403#endif
91447636 5404
1c79356b 5405/*
0b4e3aa0 5406 * Routine: vm_object_upl_request
1c79356b
A
5407 * Purpose:
5408 * Cause the population of a portion of a vm_object.
5409 * Depending on the nature of the request, the pages
 5410 * returned may contain valid data or be uninitialized.
5411 * A page list structure, listing the physical pages
5412 * will be returned upon request.
5413 * This function is called by the file system or any other
5414 * supplier of backing store to a pager.
5415 * IMPORTANT NOTE: The caller must still respect the relationship
5416 * between the vm_object and its backing memory object. The
5417 * caller MUST NOT substitute changes in the backing file
5418 * without first doing a memory_object_lock_request on the
 5419 * target range unless it is known that the pages are not
5420 * shared with another entity at the pager level.
5421 * Copy_in_to:
5422 * if a page list structure is present
 5423 * return the mapped physical pages; where a
5424 * page is not present, return a non-initialized
5425 * one. If the no_sync bit is turned on, don't
5426 * call the pager unlock to synchronize with other
5427 * possible copies of the page. Leave pages busy
5428 * in the original object, if a page list structure
5429 * was specified. When a commit of the page list
5430 * pages is done, the dirty bit will be set for each one.
5431 * Copy_out_from:
5432 * If a page list structure is present, return
5433 * all mapped pages. Where a page does not exist
5434 * map a zero filled one. Leave pages busy in
5435 * the original object. If a page list structure
5436 * is not specified, this call is a no-op.
5437 *
5438 * Note: access of default pager objects has a rather interesting
5439 * twist. The caller of this routine, presumably the file system
5440 * page cache handling code, will never actually make a request
5441 * against a default pager backed object. Only the default
 5442 * pager will make requests on backing store related vm_objects.
 5443 * In this way the default pager can maintain the relationship
 5444 * between backing store files (abstract memory objects) and
 5445 * the vm_objects (cache objects) they support.
5446 *
5447 */
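/*
 * Minimal usage sketch (hypothetical caller, not compiled): gather a small
 * cluster of dirty pages into an internal "lite" UPL for pageout.  The
 * object, offset and cluster size shown here are placeholders.
 */
#if 0
	upl_t		upl = NULL;
	unsigned int	page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
	kern_return_t	kr;

	kr = vm_object_upl_request(object,
				   offset,		/* object-relative, page aligned */
				   8 * PAGE_SIZE,
				   &upl,
				   NULL,		/* page list lives inside the UPL */
				   &page_list_count,
				   UPL_SET_INTERNAL | UPL_SET_LITE |
				   UPL_COPYOUT_FROM | UPL_RET_ONLY_DIRTY);
	if (kr == KERN_SUCCESS) {
		/* ... issue the I/O, then commit or abort the UPL ... */
		upl_deallocate(upl);
	}
#endif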
91447636 5448
0b4e3aa0
A
5449__private_extern__ kern_return_t
5450vm_object_upl_request(
1c79356b 5451 vm_object_t object,
91447636
A
5452 vm_object_offset_t offset,
5453 upl_size_t size,
1c79356b 5454 upl_t *upl_ptr,
0b4e3aa0
A
5455 upl_page_info_array_t user_page_list,
5456 unsigned int *page_list_count,
3e170ce0 5457 upl_control_flags_t cntrl_flags)
1c79356b 5458{
91447636 5459 vm_page_t dst_page = VM_PAGE_NULL;
2d21ac55
A
5460 vm_object_offset_t dst_offset;
5461 upl_size_t xfer_size;
6d2010ae 5462 unsigned int size_in_pages;
1c79356b 5463 boolean_t dirty;
55e303ae 5464 boolean_t hw_dirty;
1c79356b 5465 upl_t upl = NULL;
91447636
A
5466 unsigned int entry;
5467#if MACH_CLUSTER_STATS
1c79356b 5468 boolean_t encountered_lrp = FALSE;
91447636 5469#endif
1c79356b 5470 vm_page_t alias_page = NULL;
2d21ac55 5471 int refmod_state = 0;
91447636
A
5472 wpl_array_t lite_list = NULL;
5473 vm_object_t last_copy_object;
6d2010ae
A
5474 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
5475 struct vm_page_delayed_work *dwp;
b0d623f7 5476 int dw_count;
6d2010ae 5477 int dw_limit;
fe8ab488 5478 int io_tracking_flag = 0;
39037602
A
5479 int grab_options;
5480 ppnum_t phys_page;
91447636
A
5481
5482 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5483 /*
5484 * For forward compatibility's sake,
5485 * reject any unknown flag.
5486 */
5487 return KERN_INVALID_VALUE;
5488 }
2d21ac55
A
5489 if ( (!object->internal) && (object->paging_offset != 0) )
5490 panic("vm_object_upl_request: external object with non-zero paging offset\n");
5491 if (object->phys_contiguous)
5492 panic("vm_object_upl_request: contiguous object specified\n");
0b4e3aa0 5493
0b4e3aa0 5494
fe8ab488
A
5495 if (size > MAX_UPL_SIZE_BYTES)
5496 size = MAX_UPL_SIZE_BYTES;
1c79356b 5497
2d21ac55 5498 if ( (cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL)
fe8ab488
A
5499 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5500
5501#if CONFIG_IOSCHED || UPL_DEBUG
5502 if (object->io_tracking || upl_debug_enabled)
5503 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5504#endif
5505#if CONFIG_IOSCHED
5506 if (object->io_tracking)
5507 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5508#endif
1c79356b 5509
2d21ac55
A
5510 if (cntrl_flags & UPL_SET_INTERNAL) {
5511 if (cntrl_flags & UPL_SET_LITE) {
55e303ae 5512
fe8ab488 5513 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
91447636 5514
2d21ac55
A
5515 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
5516 lite_list = (wpl_array_t)
91447636 5517 (((uintptr_t)user_page_list) +
2d21ac55 5518 ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
b0d623f7
A
5519 if (size == 0) {
5520 user_page_list = NULL;
5521 lite_list = NULL;
5522 }
1c79356b 5523 } else {
fe8ab488 5524 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
55e303ae 5525
2d21ac55 5526 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
b0d623f7
A
5527 if (size == 0) {
5528 user_page_list = NULL;
5529 }
55e303ae 5530 }
2d21ac55
A
5531 } else {
5532 if (cntrl_flags & UPL_SET_LITE) {
91447636 5533
fe8ab488 5534 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
55e303ae 5535
2d21ac55 5536 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
b0d623f7
A
5537 if (size == 0) {
5538 lite_list = NULL;
5539 }
55e303ae 5540 } else {
fe8ab488 5541 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
0b4e3aa0 5542 }
55e303ae 5543 }
2d21ac55
A
5544 *upl_ptr = upl;
5545
5546 if (user_page_list)
5547 user_page_list[0].device = FALSE;
91447636 5548
2d21ac55
A
5549 if (cntrl_flags & UPL_SET_LITE) {
5550 upl->map_object = object;
5551 } else {
5552 upl->map_object = vm_object_allocate(size);
5553 /*
 5554 * No need to lock the new object: nobody else knows
5555 * about it yet, so it's all ours so far.
5556 */
5557 upl->map_object->shadow = object;
5558 upl->map_object->pageout = TRUE;
5559 upl->map_object->can_persist = FALSE;
5560 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6d2010ae 5561 upl->map_object->vo_shadow_offset = offset;
2d21ac55
A
5562 upl->map_object->wimg_bits = object->wimg_bits;
5563
5564 VM_PAGE_GRAB_FICTITIOUS(alias_page);
5565
5566 upl->flags |= UPL_SHADOWED;
5567 }
5568 /*
91447636
A
5569 * ENCRYPTED SWAP:
5570 * Just mark the UPL as "encrypted" here.
5571 * We'll actually encrypt the pages later,
5572 * in upl_encrypt(), when the caller has
5573 * selected which pages need to go to swap.
5574 */
2d21ac55 5575 if (cntrl_flags & UPL_ENCRYPT)
91447636 5576 upl->flags |= UPL_ENCRYPTED;
2d21ac55
A
5577
5578 if (cntrl_flags & UPL_FOR_PAGEOUT)
91447636 5579 upl->flags |= UPL_PAGEOUT;
2d21ac55 5580
55e303ae 5581 vm_object_lock(object);
b0d623f7 5582 vm_object_activity_begin(object);
2d21ac55 5583
39037602
A
5584 grab_options = 0;
5585#if CONFIG_SECLUDED_MEMORY
5586 if (object->can_grab_secluded) {
5587 grab_options |= VM_PAGE_GRAB_SECLUDED;
5588 }
5589#endif /* CONFIG_SECLUDED_MEMORY */
5590
2d21ac55
A
5591 /*
5592 * we can lock in the paging_offset once paging_in_progress is set
5593 */
5594 upl->size = size;
5595 upl->offset = offset + object->paging_offset;
55e303ae 5596
fe8ab488
A
5597#if CONFIG_IOSCHED || UPL_DEBUG
5598 if (object->io_tracking || upl_debug_enabled) {
5599 vm_object_activity_begin(object);
5600 queue_enter(&object->uplq, upl, upl_t, uplq);
5601 }
5602#endif
2d21ac55 5603 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != VM_OBJECT_NULL) {
91447636 5604 /*
2d21ac55
A
5605 * Honor copy-on-write obligations
5606 *
91447636
A
5607 * The caller is gathering these pages and
5608 * might modify their contents. We need to
5609 * make sure that the copy object has its own
5610 * private copies of these pages before we let
5611 * the caller modify them.
5612 */
5613 vm_object_update(object,
5614 offset,
5615 size,
5616 NULL,
5617 NULL,
5618 FALSE, /* should_return */
5619 MEMORY_OBJECT_COPY_SYNC,
5620 VM_PROT_NO_CHANGE);
b0d623f7 5621#if DEVELOPMENT || DEBUG
91447636
A
5622 upl_cow++;
5623 upl_cow_pages += size >> PAGE_SHIFT;
b0d623f7 5624#endif
55e303ae 5625 }
2d21ac55
A
5626 /*
5627 * remember which copy object we synchronized with
5628 */
91447636 5629 last_copy_object = object->copy;
1c79356b 5630 entry = 0;
55e303ae 5631
2d21ac55
A
5632 xfer_size = size;
5633 dst_offset = offset;
6d2010ae 5634 size_in_pages = size / PAGE_SIZE;
2d21ac55 5635
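	/*
	 * Page-queue state changes (wire, activate, set-reference, ...) are
	 * not applied immediately: they are batched in dw_array as "delayed
	 * work" and flushed through vm_page_do_delayed_work() once dw_limit
	 * entries accumulate, so the page queues lock is taken once per batch
	 * rather than once per page.
	 */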
b0d623f7
A
5636 dwp = &dw_array[0];
5637 dw_count = 0;
6d2010ae
A
5638 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5639
5640 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
fe8ab488 5641 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT))
6d2010ae 5642 object->scan_collisions = 0;
b0d623f7 5643
fe8ab488
A
5644 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5645 boolean_t isSSD = FALSE;
5646
5647 vnode_pager_get_isSSD(object->pager, &isSSD);
5648 vm_object_unlock(object);
5649
5650 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5651
5652 if (isSSD == TRUE)
5653 delay(1000 * size_in_pages);
5654 else
5655 delay(5000 * size_in_pages);
5656 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5657
5658 vm_object_lock(object);
5659 }
5660
2d21ac55
A
5661 while (xfer_size) {
5662
b0d623f7
A
5663 dwp->dw_mask = 0;
5664
2d21ac55 5665 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
2d21ac55
A
5666 vm_object_unlock(object);
5667 VM_PAGE_GRAB_FICTITIOUS(alias_page);
b0d623f7 5668 vm_object_lock(object);
4a3eedf9 5669 }
2d21ac55
A
5670 if (cntrl_flags & UPL_COPYOUT_FROM) {
5671 upl->flags |= UPL_PAGE_SYNC_DONE;
5672
91447636 5673 if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
1c79356b
A
5674 dst_page->fictitious ||
5675 dst_page->absent ||
5676 dst_page->error ||
316670eb
A
5677 dst_page->cleaning ||
5678 (VM_PAGE_WIRED(dst_page))) {
5679
91447636 5680 if (user_page_list)
1c79356b 5681 user_page_list[entry].phys_addr = 0;
2d21ac55 5682
b0d623f7 5683 goto try_next_page;
2d21ac55 5684 }
39037602
A
5685 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
5686
2d21ac55
A
5687 /*
5688 * grab this up front...
 5689 * a high percentage of the time we're going to
5690 * need the hardware modification state a bit later
5691 * anyway... so we can eliminate an extra call into
5692 * the pmap layer by grabbing it here and recording it
5693 */
5694 if (dst_page->pmapped)
39037602 5695 refmod_state = pmap_get_refmod(phys_page);
2d21ac55
A
5696 else
5697 refmod_state = 0;
5698
39037602 5699 if ( (refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
91447636 5700 /*
2d21ac55
A
5701 * page is on inactive list and referenced...
5702 * reactivate it now... this gets it out of the
5703 * way of vm_pageout_scan which would have to
5704 * reactivate it upon tripping over it
91447636 5705 */
b0d623f7 5706 dwp->dw_mask |= DW_vm_page_activate;
2d21ac55
A
5707 }
5708 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
5709 /*
5710 * we're only asking for DIRTY pages to be returned
5711 */
39236c6e 5712 if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
91447636 5713 /*
2d21ac55
A
5714 * if we were the page stolen by vm_pageout_scan to be
5715 * cleaned (as opposed to a buddy being clustered in
5716 * or this request is not being driven by a PAGEOUT cluster
5717 * then we only need to check for the page being dirty or
5718 * precious to decide whether to return it
91447636 5719 */
2d21ac55 5720 if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
91447636 5721 goto check_busy;
2d21ac55 5722 goto dont_return;
1c79356b 5723 }
2d21ac55
A
5724 /*
5725 * this is a request for a PAGEOUT cluster and this page
5726 * is merely along for the ride as a 'buddy'... not only
5727 * does it have to be dirty to be returned, but it also
316670eb 5728 * can't have been referenced recently...
2d21ac55 5729 */
316670eb 5730 if ( (hibernate_cleaning_in_progress == TRUE ||
39037602
A
5731 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) ||
5732 (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
5733 ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
2d21ac55 5734 goto check_busy;
1c79356b 5735 }
2d21ac55
A
5736dont_return:
5737 /*
5738 * if we reach here, we're not to return
5739 * the page... go on to the next one
5740 */
316670eb
A
5741 if (dst_page->laundry == TRUE) {
5742 /*
5743 * if we get here, the page is not 'cleaning' (filtered out above).
5744 * since it has been referenced, remove it from the laundry
5745 * so we don't pay the cost of an I/O to clean a page
5746 * we're just going to take back
5747 */
5748 vm_page_lockspin_queues();
5749
5750 vm_pageout_steal_laundry(dst_page, TRUE);
5751 vm_page_activate(dst_page);
5752
5753 vm_page_unlock_queues();
5754 }
2d21ac55
A
5755 if (user_page_list)
5756 user_page_list[entry].phys_addr = 0;
55e303ae 5757
b0d623f7 5758 goto try_next_page;
2d21ac55
A
5759 }
5760check_busy:
316670eb
A
5761 if (dst_page->busy) {
5762 if (cntrl_flags & UPL_NOBLOCK) {
39037602 5763 if (user_page_list)
2d21ac55 5764 user_page_list[entry].phys_addr = 0;
39037602 5765 dwp->dw_mask = 0;
55e303ae 5766
b0d623f7 5767 goto try_next_page;
1c79356b 5768 }
2d21ac55
A
5769 /*
5770 * someone else is playing with the
5771 * page. We will have to wait.
5772 */
2d21ac55 5773 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
1c79356b 5774
316670eb 5775 continue;
2d21ac55
A
5776 }
5777 /*
5778 * ENCRYPTED SWAP:
5779 * The caller is gathering this page and might
5780 * access its contents later on. Decrypt the
5781 * page before adding it to the UPL, so that
5782 * the caller never sees encrypted data.
5783 */
5784 if (! (cntrl_flags & UPL_ENCRYPT) && dst_page->encrypted) {
5785 int was_busy;
91447636
A
5786
5787 /*
2d21ac55
A
5788 * save the current state of busy
5789 * mark page as busy while decrypt
5790 * is in progress since it will drop
5791 * the object lock...
91447636 5792 */
2d21ac55
A
5793 was_busy = dst_page->busy;
5794 dst_page->busy = TRUE;
91447636 5795
2d21ac55
A
5796 vm_page_decrypt(dst_page, 0);
5797 vm_page_decrypt_for_upl_counter++;
5798 /*
5799 * restore to original busy state
5800 */
5801 dst_page->busy = was_busy;
b0d623f7 5802 }
39037602 5803 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
91447636 5804
b0d623f7
A
5805 vm_page_lockspin_queues();
5806
39037602 5807 if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
b0d623f7
A
5808 /*
5809 * we've buddied up a page for a clustered pageout
5810 * that has already been moved to the pageout
5811 * queue by pageout_scan... we need to remove
5812 * it from the queue and drop the laundry count
5813 * on that queue
5814 */
5815 vm_pageout_throttle_up(dst_page);
5816 }
5817 vm_page_unlock_queues();
91447636 5818 }
2d21ac55
A
5819#if MACH_CLUSTER_STATS
5820 /*
5821 * pageout statistics gathering. count
5822 * all the pages we will page out that
5823 * were not counted in the initial
5824 * vm_pageout_scan work
5825 */
316670eb 5826 if (dst_page->pageout)
2d21ac55 5827 encountered_lrp = TRUE;
39037602 5828 if ((dst_page->dirty || (object->internal && dst_page->precious))) {
2d21ac55
A
5829 if (encountered_lrp)
5830 CLUSTER_STAT(pages_at_higher_offsets++;)
5831 else
5832 CLUSTER_STAT(pages_at_lower_offsets++;)
5833 }
5834#endif
2d21ac55
A
5835 hw_dirty = refmod_state & VM_MEM_MODIFIED;
5836 dirty = hw_dirty ? TRUE : dst_page->dirty;
5837
39037602
A
5838 if (phys_page > upl->highest_page)
5839 upl->highest_page = phys_page;
2d21ac55 5840
39037602 5841 assert (!pmap_is_noencrypt(phys_page));
3e170ce0 5842
2d21ac55 5843 if (cntrl_flags & UPL_SET_LITE) {
b0d623f7 5844 unsigned int pg_num;
2d21ac55 5845
b0d623f7
A
5846 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
5847 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
2d21ac55
A
5848 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
5849
5850 if (hw_dirty)
39037602 5851 pmap_clear_modify(phys_page);
2d21ac55
A
5852
5853 /*
5854 * Mark original page as cleaning
5855 * in place.
5856 */
5857 dst_page->cleaning = TRUE;
5858 dst_page->precious = FALSE;
5859 } else {
5860 /*
5861 * use pageclean setup, it is more
5862 * convenient even for the pageout
5863 * cases here
5864 */
5865 vm_object_lock(upl->map_object);
5866 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
5867 vm_object_unlock(upl->map_object);
5868
5869 alias_page->absent = FALSE;
5870 alias_page = NULL;
1c79356b 5871 }
316670eb
A
5872 if (dirty) {
5873 SET_PAGE_DIRTY(dst_page, FALSE);
5874 } else {
5875 dst_page->dirty = FALSE;
5876 }
55e303ae 5877
2d21ac55
A
5878 if (!dirty)
5879 dst_page->precious = TRUE;
91447636 5880
2d21ac55
A
5881 if ( (cntrl_flags & UPL_ENCRYPT) ) {
5882 /*
5883 * ENCRYPTED SWAP:
5884 * We want to deny access to the target page
5885 * because its contents are about to be
5886 * encrypted and the user would be very
5887 * confused to see encrypted data instead
5888 * of their data.
5889 * We also set "encrypted_cleaning" to allow
5890 * vm_pageout_scan() to demote that page
5891 * from "adjacent/clean-in-place" to
5892 * "target/clean-and-free" if it bumps into
5893 * this page during its scanning while we're
5894 * still processing this cluster.
5895 */
5896 dst_page->busy = TRUE;
5897 dst_page->encrypted_cleaning = TRUE;
5898 }
5899 if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
316670eb 5900 if ( !VM_PAGE_WIRED(dst_page))
39037602 5901 dst_page->free_when_done = TRUE;
2d21ac55
A
5902 }
5903 } else {
5904 if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
91447636 5905 /*
2d21ac55
A
5906 * Honor copy-on-write obligations
5907 *
91447636
A
5908 * The copy object has changed since we
5909 * last synchronized for copy-on-write.
5910 * Another copy object might have been
5911 * inserted while we released the object's
5912 * lock. Since someone could have seen the
5913 * original contents of the remaining pages
5914 * through that new object, we have to
5915 * synchronize with it again for the remaining
5916 * pages only. The previous pages are "busy"
5917 * so they can not be seen through the new
5918 * mapping. The new mapping will see our
5919 * upcoming changes for those previous pages,
5920 * but that's OK since they couldn't see what
5921 * was there before. It's just a race anyway
5922 * and there's no guarantee of consistency or
5923 * atomicity. We just don't want new mappings
5924 * to see both the *before* and *after* pages.
5925 */
5926 if (object->copy != VM_OBJECT_NULL) {
5927 vm_object_update(
5928 object,
5929 dst_offset,/* current offset */
5930 xfer_size, /* remaining size */
5931 NULL,
5932 NULL,
5933 FALSE, /* should_return */
5934 MEMORY_OBJECT_COPY_SYNC,
5935 VM_PROT_NO_CHANGE);
2d21ac55 5936
b0d623f7 5937#if DEVELOPMENT || DEBUG
91447636 5938 upl_cow_again++;
2d21ac55 5939 upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
b0d623f7 5940#endif
91447636 5941 }
2d21ac55
A
5942 /*
5943 * remember the copy object we synced with
5944 */
91447636
A
5945 last_copy_object = object->copy;
5946 }
91447636
A
5947 dst_page = vm_page_lookup(object, dst_offset);
5948
2d21ac55 5949 if (dst_page != VM_PAGE_NULL) {
b0d623f7
A
5950
5951 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
316670eb
A
5952 /*
5953 * skip over pages already present in the cache
5954 */
5955 if (user_page_list)
5956 user_page_list[entry].phys_addr = 0;
b0d623f7 5957
316670eb
A
5958 goto try_next_page;
5959 }
5960 if (dst_page->fictitious) {
5961 panic("need corner case for fictitious page");
b0d623f7 5962 }
2d21ac55 5963
316670eb
A
5964 if (dst_page->busy || dst_page->cleaning) {
5965 /*
5966 * someone else is playing with the
5967 * page. We will have to wait.
5968 */
5969 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
b0d623f7 5970
316670eb
A
5971 continue;
5972 }
39037602 5973 if (dst_page->laundry)
316670eb 5974 vm_pageout_steal_laundry(dst_page, FALSE);
316670eb 5975 } else {
2d21ac55 5976 if (object->private) {
0b4e3aa0
A
5977 /*
5978 * This is a nasty wrinkle for users
5979 * of upl who encounter device or
 5980 * private memory; however, it is
 5981 * unavoidable: only a fault can
2d21ac55 5982 * resolve the actual backing
0b4e3aa0
A
5983 * physical page by asking the
5984 * backing device.
5985 */
2d21ac55 5986 if (user_page_list)
55e303ae 5987 user_page_list[entry].phys_addr = 0;
2d21ac55 5988
b0d623f7 5989 goto try_next_page;
0b4e3aa0 5990 }
6d2010ae
A
5991 if (object->scan_collisions) {
5992 /*
5993 * the pageout_scan thread is trying to steal
5994 * pages from this object, but has run into our
5995 * lock... grab 2 pages from the head of the object...
5996 * the first is freed on behalf of pageout_scan, the
5997 * 2nd is for our own use... we use vm_object_page_grab
5998 * in both cases to avoid taking pages from the free
5999 * list since we are under memory pressure and our
6000 * lock on this object is getting in the way of
6001 * relieving it
6002 */
6003 dst_page = vm_object_page_grab(object);
6004
6005 if (dst_page != VM_PAGE_NULL)
39037602
A
6006 vm_page_release(dst_page,
6007 FALSE);
2d21ac55 6008
6d2010ae
A
6009 dst_page = vm_object_page_grab(object);
6010 }
6011 if (dst_page == VM_PAGE_NULL) {
6012 /*
6013 * need to allocate a page
6014 */
39037602 6015 dst_page = vm_page_grab_options(grab_options);
6d2010ae 6016 }
1c79356b 6017 if (dst_page == VM_PAGE_NULL) {
2d21ac55
A
6018 if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6019 /*
6020 * we don't want to stall waiting for pages to come onto the free list
6021 * while we're already holding absent pages in this UPL
6022 * the caller will deal with the empty slots
6023 */
6024 if (user_page_list)
6025 user_page_list[entry].phys_addr = 0;
6026
6027 goto try_next_page;
6028 }
6029 /*
6030 * no pages available... wait
6031 * then try again for the same
6032 * offset...
6033 */
0b4e3aa0 6034 vm_object_unlock(object);
6d2010ae
A
6035
6036 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6037
6038 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6039
0b4e3aa0 6040 VM_PAGE_WAIT();
6d2010ae
A
6041 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6042
6043 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6044
b0d623f7 6045 vm_object_lock(object);
2d21ac55 6046
0b4e3aa0 6047 continue;
1c79356b 6048 }
b0d623f7 6049 vm_page_insert(dst_page, object, dst_offset);
4a3eedf9 6050
2d21ac55 6051 dst_page->absent = TRUE;
4a3eedf9 6052 dst_page->busy = FALSE;
2d21ac55
A
6053
6054 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
91447636
A
6055 /*
6056 * if UPL_RET_ONLY_ABSENT was specified,
 6057 * then we're definitely setting up a
6058 * upl for a clustered read/pagein
6059 * operation... mark the pages as clustered
2d21ac55
A
6060 * so upl_commit_range can put them on the
6061 * speculative list
91447636
A
6062 */
6063 dst_page->clustered = TRUE;
fe8ab488
A
6064
6065 if ( !(cntrl_flags & UPL_FILE_IO))
6066 VM_STAT_INCR(pageins);
91447636 6067 }
1c79356b 6068 }
39037602
A
6069 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6070
91447636
A
6071 /*
6072 * ENCRYPTED SWAP:
6073 */
6074 if (cntrl_flags & UPL_ENCRYPT) {
6075 /*
6076 * The page is going to be encrypted when we
6077 * get it from the pager, so mark it so.
6078 */
6079 dst_page->encrypted = TRUE;
6080 } else {
6081 /*
6082 * Otherwise, the page will not contain
6083 * encrypted data.
6084 */
6085 dst_page->encrypted = FALSE;
6086 }
1c79356b 6087 dst_page->overwriting = TRUE;
2d21ac55 6088
2d21ac55
A
6089 if (dst_page->pmapped) {
6090 if ( !(cntrl_flags & UPL_FILE_IO))
6091 /*
6092 * eliminate all mappings from the
 6093 * original object and its progeny
55e303ae 6094 */
39037602 6095 refmod_state = pmap_disconnect(phys_page);
2d21ac55 6096 else
39037602 6097 refmod_state = pmap_get_refmod(phys_page);
2d21ac55
A
6098 } else
6099 refmod_state = 0;
55e303ae 6100
2d21ac55
A
6101 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6102 dirty = hw_dirty ? TRUE : dst_page->dirty;
1c79356b 6103
2d21ac55 6104 if (cntrl_flags & UPL_SET_LITE) {
b0d623f7 6105 unsigned int pg_num;
1c79356b 6106
b0d623f7
A
6107 pg_num = (unsigned int) ((dst_offset-offset)/PAGE_SIZE);
6108 assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
2d21ac55 6109 lite_list[pg_num>>5] |= 1 << (pg_num & 31);
91447636 6110
2d21ac55 6111 if (hw_dirty)
39037602 6112 pmap_clear_modify(phys_page);
0b4e3aa0 6113
2d21ac55
A
6114 /*
6115 * Mark original page as cleaning
6116 * in place.
6117 */
6118 dst_page->cleaning = TRUE;
6119 dst_page->precious = FALSE;
6120 } else {
6121 /*
6122 * use pageclean setup, it is more
6123 * convenient even for the pageout
6124 * cases here
6125 */
6126 vm_object_lock(upl->map_object);
6127 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6128 vm_object_unlock(upl->map_object);
0b4e3aa0 6129
2d21ac55
A
6130 alias_page->absent = FALSE;
6131 alias_page = NULL;
6132 }
1c79356b 6133
6d2010ae
A
6134 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6135 upl->flags &= ~UPL_CLEAR_DIRTY;
6136 upl->flags |= UPL_SET_DIRTY;
6137 dirty = TRUE;
6138 upl->flags |= UPL_SET_DIRTY;
6139 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
2d21ac55
A
6140 /*
6141 * clean in place for read implies
6142 * that a write will be done on all
6143 * the pages that are dirty before
6144 * a upl commit is done. The caller
6145 * is obligated to preserve the
6146 * contents of all pages marked dirty
6147 */
6148 upl->flags |= UPL_CLEAR_DIRTY;
6149 }
6150 dst_page->dirty = dirty;
91447636 6151
2d21ac55
A
6152 if (!dirty)
6153 dst_page->precious = TRUE;
6154
b0d623f7 6155 if ( !VM_PAGE_WIRED(dst_page)) {
2d21ac55
A
6156 /*
6157 * deny access to the target page while
6158 * it is being worked on
6159 */
6160 dst_page->busy = TRUE;
6161 } else
b0d623f7 6162 dwp->dw_mask |= DW_vm_page_wire;
2d21ac55 6163
b0d623f7
A
6164 /*
6165 * We might be about to satisfy a fault which has been
6166 * requested. So no need for the "restart" bit.
6167 */
6168 dst_page->restart = FALSE;
6169 if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
2d21ac55
A
6170 /*
6171 * expect the page to be used
6172 */
b0d623f7 6173 dwp->dw_mask |= DW_set_reference;
2d21ac55 6174 }
6d2010ae 6175 if (cntrl_flags & UPL_PRECIOUS) {
39037602 6176 if (object->internal) {
316670eb 6177 SET_PAGE_DIRTY(dst_page, FALSE);
6d2010ae
A
6178 dst_page->precious = FALSE;
6179 } else {
6180 dst_page->precious = TRUE;
6181 }
6182 } else {
6183 dst_page->precious = FALSE;
6184 }
2d21ac55 6185 }
d41d1dae
A
6186 if (dst_page->busy)
6187 upl->flags |= UPL_HAS_BUSY;
6188
39037602
A
6189 if (phys_page > upl->highest_page)
6190 upl->highest_page = phys_page;
6191 assert (!pmap_is_noencrypt(phys_page));
2d21ac55 6192 if (user_page_list) {
39037602
A
6193 user_page_list[entry].phys_addr = phys_page;
6194 user_page_list[entry].free_when_done = dst_page->free_when_done;
2d21ac55 6195 user_page_list[entry].absent = dst_page->absent;
593a1d5f 6196 user_page_list[entry].dirty = dst_page->dirty;
2d21ac55 6197 user_page_list[entry].precious = dst_page->precious;
593a1d5f 6198 user_page_list[entry].device = FALSE;
316670eb 6199 user_page_list[entry].needed = FALSE;
2d21ac55 6200 if (dst_page->clustered == TRUE)
39037602 6201 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
2d21ac55
A
6202 else
6203 user_page_list[entry].speculative = FALSE;
593a1d5f
A
6204 user_page_list[entry].cs_validated = dst_page->cs_validated;
6205 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
c18c124e 6206 user_page_list[entry].cs_nx = dst_page->cs_nx;
3e170ce0 6207 user_page_list[entry].mark = FALSE;
2d21ac55
A
6208 }
6209 /*
6210 * if UPL_RET_ONLY_ABSENT is set, then
6211 * we are working with a fresh page and we've
6212 * just set the clustered flag on it to
6213 * indicate that it was drug in as part of a
6214 * speculative cluster... so leave it alone
6215 */
6216 if ( !(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6217 /*
6218 * someone is explicitly grabbing this page...
6219 * update clustered and speculative state
6220 *
6221 */
fe8ab488
A
6222 if (dst_page->clustered)
6223 VM_PAGE_CONSUME_CLUSTERED(dst_page);
2d21ac55 6224 }
b0d623f7
A
6225try_next_page:
6226 if (dwp->dw_mask) {
6227 if (dwp->dw_mask & DW_vm_page_activate)
6228 VM_STAT_INCR(reactivations);
4a3eedf9 6229
6d2010ae 6230 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
b0d623f7 6231
6d2010ae 6232 if (dw_count >= dw_limit) {
3e170ce0 6233 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
b0d623f7
A
6234
6235 dwp = &dw_array[0];
6236 dw_count = 0;
4a3eedf9 6237 }
2d21ac55 6238 }
2d21ac55
A
6239 entry++;
6240 dst_offset += PAGE_SIZE_64;
6241 xfer_size -= PAGE_SIZE;
6242 }
b0d623f7 6243 if (dw_count)
3e170ce0 6244 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
b0d623f7 6245
2d21ac55 6246 if (alias_page != NULL) {
b0d623f7 6247 VM_PAGE_FREE(alias_page);
1c79356b 6248 }
91447636 6249
2d21ac55
A
6250 if (page_list_count != NULL) {
6251 if (upl->flags & UPL_INTERNAL)
6252 *page_list_count = 0;
6253 else if (*page_list_count > entry)
6254 *page_list_count = entry;
6255 }
b0d623f7
A
6256#if UPL_DEBUG
6257 upl->upl_state = 1;
6258#endif
1c79356b 6259 vm_object_unlock(object);
2d21ac55 6260
1c79356b
A
6261 return KERN_SUCCESS;
6262}
6263
0b4e3aa0
A
6264/*
6265 * Routine: vm_object_super_upl_request
6266 * Purpose:
6267 * Cause the population of a portion of a vm_object
6268 * in much the same way as memory_object_upl_request.
6269 * Depending on the nature of the request, the pages
 6270 * returned may contain valid data or be uninitialized.
6271 * However, the region may be expanded up to the super
6272 * cluster size provided.
6273 */
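/*
 * Worked example (illustrative numbers): with super_cluster = 0x10000,
 * offset = 0x13000 and size = 0x2000, the request is expanded to
 * base_offset = 0x10000 and super_size = 0x10000, since offset + size does
 * not spill past base_offset + super_cluster; if it did, super_size would be
 * doubled to two clusters (and then trimmed to the object size).
 */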
6274
6275__private_extern__ kern_return_t
6276vm_object_super_upl_request(
6277 vm_object_t object,
6278 vm_object_offset_t offset,
91447636
A
6279 upl_size_t size,
6280 upl_size_t super_cluster,
0b4e3aa0
A
6281 upl_t *upl,
6282 upl_page_info_t *user_page_list,
6283 unsigned int *page_list_count,
3e170ce0 6284 upl_control_flags_t cntrl_flags)
0b4e3aa0 6285{
b0d623f7 6286 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR)==UPL_VECTOR))
1c79356b 6287 return KERN_FAILURE;
0b4e3aa0 6288
55e303ae 6289 assert(object->paging_in_progress);
1c79356b 6290 offset = offset - object->paging_offset;
91447636 6291
91447636 6292 if (super_cluster > size) {
1c79356b
A
6293
6294 vm_object_offset_t base_offset;
91447636 6295 upl_size_t super_size;
b0d623f7 6296 vm_object_size_t super_size_64;
1c79356b 6297
2d21ac55
A
6298 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6299 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster<<1 : super_cluster;
6d2010ae 6300 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
b0d623f7
A
6301 super_size = (upl_size_t) super_size_64;
6302 assert(super_size == super_size_64);
2d21ac55
A
6303
6304 if (offset > (base_offset + super_size)) {
6305 panic("vm_object_super_upl_request: Missed target pageout"
6306 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6307 offset, base_offset, super_size, super_cluster,
6308 size, object->paging_offset);
6309 }
91447636
A
6310 /*
6311 * apparently there is a case where the vm requests a
 6312 * page to be written out whose offset is beyond the
6313 * object size
6314 */
b0d623f7
A
6315 if ((offset + size) > (base_offset + super_size)) {
6316 super_size_64 = (offset + size) - base_offset;
6317 super_size = (upl_size_t) super_size_64;
6318 assert(super_size == super_size_64);
6319 }
1c79356b
A
6320
6321 offset = base_offset;
6322 size = super_size;
6323 }
2d21ac55 6324 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags);
1c79356b
A
6325}
6326
b0d623f7 6327
91447636
A
6328kern_return_t
6329vm_map_create_upl(
6330 vm_map_t map,
6331 vm_map_address_t offset,
6332 upl_size_t *upl_size,
6333 upl_t *upl,
6334 upl_page_info_array_t page_list,
6335 unsigned int *count,
3e170ce0 6336 upl_control_flags_t *flags)
91447636 6337{
3e170ce0
A
6338 vm_map_entry_t entry;
6339 upl_control_flags_t caller_flags;
6340 int force_data_sync;
6341 int sync_cow_data;
6342 vm_object_t local_object;
6343 vm_map_offset_t local_offset;
6344 vm_map_offset_t local_start;
6345 kern_return_t ret;
91447636 6346
39037602
A
6347 assert(page_aligned(offset));
6348
91447636
A
6349 caller_flags = *flags;
6350
6351 if (caller_flags & ~UPL_VALID_FLAGS) {
6352 /*
6353 * For forward compatibility's sake,
6354 * reject any unknown flag.
6355 */
6356 return KERN_INVALID_VALUE;
6357 }
91447636
A
6358 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6359 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6360
2d21ac55 6361 if (upl == NULL)
91447636
A
6362 return KERN_INVALID_ARGUMENT;
6363
91447636 6364REDISCOVER_ENTRY:
b0d623f7 6365 vm_map_lock_read(map);
2d21ac55 6366
3e170ce0
A
6367 if (!vm_map_lookup_entry(map, offset, &entry)) {
6368 vm_map_unlock_read(map);
6369 return KERN_FAILURE;
6370 }
2d21ac55 6371
3e170ce0
A
6372 if ((entry->vme_end - offset) < *upl_size) {
6373 *upl_size = (upl_size_t) (entry->vme_end - offset);
6374 assert(*upl_size == entry->vme_end - offset);
6375 }
6376
6377 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6378 *flags = 0;
6379
6380 if (!entry->is_sub_map &&
6381 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6382 if (VME_OBJECT(entry)->private)
6383 *flags = UPL_DEV_MEMORY;
6384
6385 if (VME_OBJECT(entry)->phys_contiguous)
6386 *flags |= UPL_PHYS_CONTIG;
b0d623f7 6387 }
3e170ce0
A
6388 vm_map_unlock_read(map);
6389 return KERN_SUCCESS;
6390 }
2d21ac55 6391
3e170ce0
A
6392 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6393 !VME_OBJECT(entry)->phys_contiguous) {
6394 if (*upl_size > MAX_UPL_SIZE_BYTES)
6395 *upl_size = MAX_UPL_SIZE_BYTES;
6396 }
e2d2fc5c 6397
3e170ce0
A
6398 /*
6399 * Create an object if necessary.
6400 */
6401 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
e2d2fc5c 6402
3e170ce0
A
6403 if (vm_map_lock_read_to_write(map))
6404 goto REDISCOVER_ENTRY;
e2d2fc5c 6405
3e170ce0
A
6406 VME_OBJECT_SET(entry,
6407 vm_object_allocate((vm_size_t)
6408 (entry->vme_end -
6409 entry->vme_start)));
6410 VME_OFFSET_SET(entry, 0);
e2d2fc5c 6411
3e170ce0
A
6412 vm_map_lock_write_to_read(map);
6413 }
b0d623f7 6414
3e170ce0
A
6415 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6416 !(entry->protection & VM_PROT_WRITE)) {
6417 vm_map_unlock_read(map);
6418 return KERN_PROTECTION_FAILURE;
6419 }
6420
39037602 6421
3e170ce0
A
6422 local_object = VME_OBJECT(entry);
6423 assert(local_object != VM_OBJECT_NULL);
6424
39037602
A
6425 if (!entry->is_sub_map &&
6426 !entry->needs_copy &&
6427 *upl_size != 0 &&
3e170ce0
A
6428 local_object->vo_size > *upl_size && /* partial UPL */
6429 entry->wired_count == 0 && /* No COW for entries that are wired */
6430 (map->pmap != kernel_pmap) && /* alias checks */
6431 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6432 ||
39037602 6433 (/* case 2 */
3e170ce0
A
6434 local_object->internal &&
6435 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6436 local_object->ref_count > 1))) {
6437 vm_prot_t prot;
b0d623f7 6438
3e170ce0
A
6439 /*
6440 * Case 1:
6441 * Set up the targeted range for copy-on-write to avoid
6442 * applying true_share/copy_delay to the entire object.
6443 *
6444 * Case 2:
6445 * This map entry covers only part of an internal
6446 * object. There could be other map entries covering
6447 * other areas of this object and some of these map
6448 * entries could be marked as "needs_copy", which
6449 * assumes that the object is COPY_SYMMETRIC.
6450 * To avoid marking this object as COPY_DELAY and
6451 * "true_share", let's shadow it and mark the new
6452 * (smaller) object as "true_share" and COPY_DELAY.
6453 */
b0d623f7 6454
3e170ce0
A
6455 if (vm_map_lock_read_to_write(map)) {
6456 goto REDISCOVER_ENTRY;
91447636 6457 }
3e170ce0
A
6458 vm_map_lock_assert_exclusive(map);
6459 assert(VME_OBJECT(entry) == local_object);
6460
6461 vm_map_clip_start(map,
6462 entry,
6463 vm_map_trunc_page(offset,
6464 VM_MAP_PAGE_MASK(map)));
6465 vm_map_clip_end(map,
6466 entry,
6467 vm_map_round_page(offset + *upl_size,
6468 VM_MAP_PAGE_MASK(map)));
6469 if ((entry->vme_end - offset) < *upl_size) {
6470 *upl_size = (upl_size_t) (entry->vme_end - offset);
6471 assert(*upl_size == entry->vme_end - offset);
fe8ab488 6472 }
e2d2fc5c 6473
3e170ce0
A
6474 prot = entry->protection & ~VM_PROT_WRITE;
6475 if (override_nx(map, VME_ALIAS(entry)) && prot)
6476 prot |= VM_PROT_EXECUTE;
6477 vm_object_pmap_protect(local_object,
6478 VME_OFFSET(entry),
6479 entry->vme_end - entry->vme_start,
6480 ((entry->is_shared ||
6481 map->mapped_in_other_pmaps)
6482 ? PMAP_NULL
6483 : map->pmap),
6484 entry->vme_start,
6485 prot);
e2d2fc5c 6486
3e170ce0 6487 assert(entry->wired_count == 0);
e2d2fc5c 6488
3e170ce0
A
6489 /*
6490 * Lock the VM object and re-check its status: if it's mapped
6491 * in another address space, we could still be racing with
6492 * another thread holding that other VM map exclusively.
6493 */
6494 vm_object_lock(local_object);
6495 if (local_object->true_share) {
6496 /* object is already in proper state: no COW needed */
6497 assert(local_object->copy_strategy !=
6498 MEMORY_OBJECT_COPY_SYMMETRIC);
6499 } else {
6500 /* not true_share: ask for copy-on-write below */
6501 assert(local_object->copy_strategy ==
6502 MEMORY_OBJECT_COPY_SYMMETRIC);
fe8ab488 6503 entry->needs_copy = TRUE;
fe8ab488 6504 }
3e170ce0 6505 vm_object_unlock(local_object);
fe8ab488 6506
3e170ce0
A
6507 vm_map_lock_write_to_read(map);
6508 }
6509
6510 if (entry->needs_copy) {
6511 /*
6512 * Honor copy-on-write for COPY_SYMMETRIC
6513 * strategy.
6514 */
6515 vm_map_t local_map;
6516 vm_object_t object;
6517 vm_object_offset_t new_offset;
6518 vm_prot_t prot;
6519 boolean_t wired;
6520 vm_map_version_t version;
6521 vm_map_t real_map;
6522 vm_prot_t fault_type;
6523
6524 local_map = map;
6525
6526 if (caller_flags & UPL_COPYOUT_FROM) {
6527 fault_type = VM_PROT_READ | VM_PROT_COPY;
6528 vm_counters.create_upl_extra_cow++;
6529 vm_counters.create_upl_extra_cow_pages +=
6530 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6531 } else {
6532 fault_type = VM_PROT_WRITE;
6533 }
6534 if (vm_map_lookup_locked(&local_map,
6535 offset, fault_type,
6536 OBJECT_LOCK_EXCLUSIVE,
6537 &version, &object,
6538 &new_offset, &prot, &wired,
6539 NULL,
6540 &real_map) != KERN_SUCCESS) {
6541 if (fault_type == VM_PROT_WRITE) {
6542 vm_counters.create_upl_lookup_failure_write++;
fe8ab488 6543 } else {
3e170ce0 6544 vm_counters.create_upl_lookup_failure_copy++;
fe8ab488 6545 }
fe8ab488 6546 vm_map_unlock_read(local_map);
3e170ce0 6547 return KERN_FAILURE;
91447636 6548 }
3e170ce0
A
6549 if (real_map != map)
6550 vm_map_unlock(real_map);
6551 vm_map_unlock_read(local_map);
fe8ab488 6552
3e170ce0 6553 vm_object_unlock(object);
2d21ac55 6554
3e170ce0
A
6555 goto REDISCOVER_ENTRY;
6556 }
2d21ac55 6557
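	/*
	 * For a submap entry, recurse into the submap with the offset
	 * translated into that submap's address space; the submap reference
	 * taken here keeps it alive once the parent map is unlocked.
	 */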
39037602
A
6558 if (entry->is_sub_map) {
6559 vm_map_t submap;
6560
6561 submap = VME_SUBMAP(entry);
6562 local_start = entry->vme_start;
6563 local_offset = VME_OFFSET(entry);
6564
6565 vm_map_reference(submap);
6566 vm_map_unlock_read(map);
6567
6568 ret = vm_map_create_upl(submap,
6569 local_offset + (offset - local_start),
6570 upl_size, upl, page_list, count, flags);
6571 vm_map_deallocate(submap);
6572
6573 return ret;
6574 }
6575
3e170ce0
A
6576 if (sync_cow_data &&
6577 (VME_OBJECT(entry)->shadow ||
6578 VME_OBJECT(entry)->copy)) {
6579 local_object = VME_OBJECT(entry);
6580 local_start = entry->vme_start;
6581 local_offset = VME_OFFSET(entry);
6582
6583 vm_object_reference(local_object);
6584 vm_map_unlock_read(map);
91447636 6585
3e170ce0
A
6586 if (local_object->shadow && local_object->copy) {
6587 vm_object_lock_request(local_object->shadow,
6588 ((vm_object_offset_t)
6589 ((offset - local_start) +
6590 local_offset) +
6591 local_object->vo_shadow_offset),
6592 *upl_size, FALSE,
2d21ac55
A
6593 MEMORY_OBJECT_DATA_SYNC,
6594 VM_PROT_NO_CHANGE);
91447636 6595 }
3e170ce0
A
6596 sync_cow_data = FALSE;
6597 vm_object_deallocate(local_object);
91447636 6598
3e170ce0
A
6599 goto REDISCOVER_ENTRY;
6600 }
6601 if (force_data_sync) {
6602 local_object = VME_OBJECT(entry);
91447636 6603 local_start = entry->vme_start;
3e170ce0 6604 local_offset = VME_OFFSET(entry);
2d21ac55 6605
91447636 6606 vm_object_reference(local_object);
b0d623f7 6607 vm_map_unlock_read(map);
2d21ac55 6608
3e170ce0
A
6609 vm_object_lock_request(local_object,
6610 ((vm_object_offset_t)
6611 ((offset - local_start) +
6612 local_offset)),
6613 (vm_object_size_t)*upl_size,
6614 FALSE,
6615 MEMORY_OBJECT_DATA_SYNC,
6616 VM_PROT_NO_CHANGE);
6617
6618 force_data_sync = FALSE;
91447636 6619 vm_object_deallocate(local_object);
2d21ac55 6620
3e170ce0
A
6621 goto REDISCOVER_ENTRY;
6622 }
6623 if (VME_OBJECT(entry)->private)
6624 *flags = UPL_DEV_MEMORY;
6625 else
6626 *flags = 0;
6627
6628 if (VME_OBJECT(entry)->phys_contiguous)
6629 *flags |= UPL_PHYS_CONTIG;
6630
6631 local_object = VME_OBJECT(entry);
6632 local_offset = VME_OFFSET(entry);
6633 local_start = entry->vme_start;
6634
39037602 6635
3e170ce0
A
6636 vm_object_lock(local_object);
6637
6638 /*
6639 * Ensure that this object is "true_share" and "copy_delay" now,
6640 * while we're still holding the VM map lock. After we unlock the map,
6641 * anything could happen to that mapping, including some copy-on-write
6642 * activity. We need to make sure that the IOPL will point at the
6643 * same memory as the mapping.
6644 */
6645 if (local_object->true_share) {
6646 assert(local_object->copy_strategy !=
6647 MEMORY_OBJECT_COPY_SYMMETRIC);
6648 } else if (local_object != kernel_object &&
6649 local_object != compressor_object &&
6650 !local_object->phys_contiguous) {
6651#if VM_OBJECT_TRACKING_OP_TRUESHARE
6652 if (!local_object->true_share &&
6653 vm_object_tracking_inited) {
6654 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
6655 int num = 0;
6656 num = OSBacktrace(bt,
6657 VM_OBJECT_TRACKING_BTDEPTH);
6658 btlog_add_entry(vm_object_tracking_btlog,
6659 local_object,
6660 VM_OBJECT_TRACKING_OP_TRUESHARE,
6661 bt,
6662 num);
6663 }
6664#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
6665 local_object->true_share = TRUE;
6666 if (local_object->copy_strategy ==
6667 MEMORY_OBJECT_COPY_SYMMETRIC) {
6668 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6669 }
6670 }
6671
6672 vm_object_reference_locked(local_object);
6673 vm_object_unlock(local_object);
6674
b0d623f7 6675 vm_map_unlock_read(map);
1c79356b 6676
3e170ce0
A
6677 ret = vm_object_iopl_request(local_object,
6678 ((vm_object_offset_t)
6679 ((offset - local_start) + local_offset)),
6680 *upl_size,
6681 upl,
6682 page_list,
6683 count,
6684 caller_flags);
6685 vm_object_deallocate(local_object);
6686
6687 return ret;
91447636
A
6688}
6689
6690/*
6691 * Internal routine to enter a UPL into a VM map.
6692 *
6693 * JMM - This should just be doable through the standard
6694 * vm_map_enter() API.
6695 */
1c79356b 6696kern_return_t
91447636
A
6697vm_map_enter_upl(
6698 vm_map_t map,
6699 upl_t upl,
b0d623f7 6700 vm_map_offset_t *dst_addr)
1c79356b 6701{
91447636 6702 vm_map_size_t size;
1c79356b 6703 vm_object_offset_t offset;
91447636 6704 vm_map_offset_t addr;
1c79356b
A
6705 vm_page_t m;
6706 kern_return_t kr;
b0d623f7
A
6707 int isVectorUPL = 0, curr_upl=0;
6708 upl_t vector_upl = NULL;
6709 vm_offset_t vector_upl_dst_addr = 0;
6710 vm_map_t vector_upl_submap = NULL;
6711 upl_offset_t subupl_offset = 0;
6712 upl_size_t subupl_size = 0;
1c79356b 6713
0b4e3aa0
A
6714 if (upl == UPL_NULL)
6715 return KERN_INVALID_ARGUMENT;
6716
b0d623f7
A
6717 if((isVectorUPL = vector_upl_is_valid(upl))) {
6718 int mapped=0,valid_upls=0;
6719 vector_upl = upl;
6720
6721 upl_lock(vector_upl);
6722 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6723 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6724 if(upl == NULL)
6725 continue;
6726 valid_upls++;
6727 if (UPL_PAGE_LIST_MAPPED & upl->flags)
6728 mapped++;
6729 }
6730
6731 if(mapped) {
6732 if(mapped != valid_upls)
6733				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped\n", mapped, valid_upls);
6734 else {
6735 upl_unlock(vector_upl);
6736 return KERN_FAILURE;
6737 }
6738 }
6739
6740 kr = kmem_suballoc(map, &vector_upl_dst_addr, vector_upl->size, FALSE, VM_FLAGS_ANYWHERE, &vector_upl_submap);
6741 if( kr != KERN_SUCCESS )
6742 panic("Vector UPL submap allocation failed\n");
6743 map = vector_upl_submap;
6744 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
6745 curr_upl=0;
6746 }
6747 else
6748 upl_lock(upl);
6749
6750process_upl_to_enter:
6751 if(isVectorUPL){
6752 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6753 *dst_addr = vector_upl_dst_addr;
6754 upl_unlock(vector_upl);
6755 return KERN_SUCCESS;
6756 }
6757 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
6758 if(upl == NULL)
6759 goto process_upl_to_enter;
6d2010ae 6760
b0d623f7
A
6761 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
6762 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
d41d1dae
A
6763 } else {
6764 /*
6765 * check to see if already mapped
6766 */
6767 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
6768 upl_unlock(upl);
6769 return KERN_FAILURE;
6770 }
b0d623f7 6771 }
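	/*
	 * If this UPL hasn't been shadowed yet and it either contains busy
	 * pages or is not a device-memory / IO-wired / physically contiguous
	 * UPL, build a shadow map_object populated with wired, private
	 * "alias" pages that reference the same physical pages as the
	 * originals.  The mapping entered below points at those alias pages,
	 * so it is not disturbed by later activity against the original
	 * object's pages.
	 */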
d41d1dae
A
6772 if ((!(upl->flags & UPL_SHADOWED)) &&
6773 ((upl->flags & UPL_HAS_BUSY) ||
6774 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
0b4e3aa0 6775
55e303ae
A
6776 vm_object_t object;
6777 vm_page_t alias_page;
6778 vm_object_offset_t new_offset;
b0d623f7 6779 unsigned int pg_num;
55e303ae
A
6780 wpl_array_t lite_list;
6781
2d21ac55 6782 if (upl->flags & UPL_INTERNAL) {
55e303ae 6783 lite_list = (wpl_array_t)
91447636 6784 ((((uintptr_t)upl) + sizeof(struct upl))
2d21ac55 6785 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
55e303ae 6786 } else {
2d21ac55 6787 lite_list = (wpl_array_t)(((uintptr_t)upl) + sizeof(struct upl));
55e303ae
A
6788 }
6789 object = upl->map_object;
6790 upl->map_object = vm_object_allocate(upl->size);
2d21ac55 6791
55e303ae 6792 vm_object_lock(upl->map_object);
2d21ac55 6793
55e303ae
A
6794 upl->map_object->shadow = object;
6795 upl->map_object->pageout = TRUE;
6796 upl->map_object->can_persist = FALSE;
2d21ac55 6797 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
6d2010ae 6798 upl->map_object->vo_shadow_offset = upl->offset - object->paging_offset;
55e303ae 6799 upl->map_object->wimg_bits = object->wimg_bits;
6d2010ae 6800 offset = upl->map_object->vo_shadow_offset;
55e303ae
A
6801 new_offset = 0;
6802 size = upl->size;
91447636 6803
2d21ac55 6804 upl->flags |= UPL_SHADOWED;
91447636 6805
2d21ac55 6806 while (size) {
b0d623f7
A
6807 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
6808 assert(pg_num == new_offset / PAGE_SIZE);
55e303ae 6809
2d21ac55 6810 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
55e303ae 6811
2d21ac55 6812 VM_PAGE_GRAB_FICTITIOUS(alias_page);
91447636 6813
2d21ac55 6814 vm_object_lock(object);
91447636 6815
2d21ac55
A
6816 m = vm_page_lookup(object, offset);
6817 if (m == VM_PAGE_NULL) {
6818 panic("vm_upl_map: page missing\n");
6819 }
55e303ae 6820
2d21ac55
A
6821 /*
6822 * Convert the fictitious page to a private
6823 * shadow of the real page.
6824 */
6825 assert(alias_page->fictitious);
6826 alias_page->fictitious = FALSE;
6827 alias_page->private = TRUE;
39037602 6828 alias_page->free_when_done = TRUE;
2d21ac55
A
6829 /*
6830 * since m is a page in the upl it must
6831 * already be wired or BUSY, so it's
6832 * safe to assign the underlying physical
6833 * page to the alias
6834 */
39037602 6835 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
2d21ac55
A
6836
6837 vm_object_unlock(object);
6838
6839 vm_page_lockspin_queues();
3e170ce0 6840 vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
2d21ac55
A
6841 vm_page_unlock_queues();
6842
6843 /*
6844 * ENCRYPTED SWAP:
6845 * The virtual page ("m") has to be wired in some way
39037602 6846 * here or its backing physical page could
2d21ac55
A
6847 * be recycled at any time.
6848 * Assuming this is enforced by the caller, we can't
6849 * get an encrypted page here. Since the encryption
6850 * key depends on the VM page's "pager" object and
6851 * the "paging_offset", we couldn't handle 2 pageable
6852 * VM pages (with different pagers and paging_offsets)
6853 * sharing the same physical page: we could end up
6854 * encrypting with one key (via one VM page) and
6855 * decrypting with another key (via the alias VM page).
6856 */
6857 ASSERT_PAGE_DECRYPTED(m);
55e303ae 6858
3e170ce0 6859 vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
2d21ac55
A
6860
6861 assert(!alias_page->wanted);
6862 alias_page->busy = FALSE;
6863 alias_page->absent = FALSE;
6864 }
6865 size -= PAGE_SIZE;
6866 offset += PAGE_SIZE_64;
6867 new_offset += PAGE_SIZE_64;
55e303ae 6868 }
91447636 6869 vm_object_unlock(upl->map_object);
55e303ae 6870 }
d41d1dae 6871 if (upl->flags & UPL_SHADOWED)
55e303ae 6872 offset = 0;
d41d1dae
A
6873 else
6874 offset = upl->offset - upl->map_object->paging_offset;
6d2010ae 6875
1c79356b
A
6876 size = upl->size;
6877
2d21ac55 6878 vm_object_reference(upl->map_object);
1c79356b 6879
b0d623f7
A
6880 if(!isVectorUPL) {
6881 *dst_addr = 0;
6882 /*
6883 * NEED A UPL_MAP ALIAS
6884 */
6885 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3e170ce0
A
6886 VM_FLAGS_ANYWHERE | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK),
6887 upl->map_object, offset, FALSE,
b0d623f7 6888 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
d41d1dae
A
6889
6890 if (kr != KERN_SUCCESS) {
39037602 6891 vm_object_deallocate(upl->map_object);
d41d1dae
A
6892 upl_unlock(upl);
6893 return(kr);
6894 }
b0d623f7
A
6895 }
6896 else {
6897 kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
3e170ce0
A
6898 VM_FLAGS_FIXED | VM_MAKE_TAG(VM_KERN_MEMORY_OSFMK),
6899 upl->map_object, offset, FALSE,
b0d623f7
A
6900 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
6901 if(kr)
6902 panic("vm_map_enter failed for a Vector UPL\n");
6903 }
91447636
A
6904 vm_object_lock(upl->map_object);
6905
2d21ac55 6906 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
1c79356b 6907 m = vm_page_lookup(upl->map_object, offset);
2d21ac55
A
6908
6909 if (m) {
2d21ac55 6910 m->pmapped = TRUE;
b0d623f7
A
6911
6912 /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
6913 * but only in kernel space. If this was on a user map,
6914 * we'd have to set the wpmapped bit. */
6915 /* m->wpmapped = TRUE; */
fe8ab488 6916 assert(map->pmap == kernel_pmap);
9bccf70c 6917
fe8ab488 6918 PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE);
1c79356b 6919 }
2d21ac55 6920 offset += PAGE_SIZE_64;
1c79356b 6921 }
91447636
A
6922 vm_object_unlock(upl->map_object);
6923
2d21ac55
A
6924 /*
6925 * hold a reference for the mapping
6926 */
6927 upl->ref_count++;
1c79356b 6928 upl->flags |= UPL_PAGE_LIST_MAPPED;
b0d623f7
A
6929 upl->kaddr = (vm_offset_t) *dst_addr;
6930 assert(upl->kaddr == *dst_addr);
6931
d41d1dae 6932 if(isVectorUPL)
b0d623f7 6933 goto process_upl_to_enter;
2d21ac55 6934
d41d1dae
A
6935 upl_unlock(upl);
6936
1c79356b
A
6937 return KERN_SUCCESS;
6938}
6939
91447636
A
6940/*
6941 * Internal routine to remove a UPL mapping from a VM map.
6942 *
6943 * XXX - This should just be doable through a standard
6944 * vm_map_remove() operation. Otherwise, implicit clean-up
6945 * of the target map won't be able to correctly remove
6946 * these (and release the reference on the UPL). Having
6947 * to do this means we can't map these into user-space
6948 * maps yet.
6949 */
1c79356b 6950kern_return_t
91447636 6951vm_map_remove_upl(
1c79356b
A
6952 vm_map_t map,
6953 upl_t upl)
6954{
0b4e3aa0 6955 vm_address_t addr;
91447636 6956 upl_size_t size;
b0d623f7
A
6957 int isVectorUPL = 0, curr_upl = 0;
6958 upl_t vector_upl = NULL;
1c79356b 6959
0b4e3aa0
A
6960 if (upl == UPL_NULL)
6961 return KERN_INVALID_ARGUMENT;
6962
b0d623f7
A
6963 if((isVectorUPL = vector_upl_is_valid(upl))) {
6964 int unmapped=0, valid_upls=0;
6965 vector_upl = upl;
6966 upl_lock(vector_upl);
6967 for(curr_upl=0; curr_upl < MAX_VECTOR_UPL_ELEMENTS; curr_upl++) {
6968 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
6969 if(upl == NULL)
6970 continue;
6971 valid_upls++;
6972 if (!(UPL_PAGE_LIST_MAPPED & upl->flags))
6973 unmapped++;
6974 }
6975
6976 if(unmapped) {
6977 if(unmapped != valid_upls)
6978 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped\n", unmapped, valid_upls);
6979 else {
6980 upl_unlock(vector_upl);
6981 return KERN_FAILURE;
6982 }
6983 }
6984 curr_upl=0;
6985 }
6986 else
6987 upl_lock(upl);
6988
6989process_upl_to_remove:
6990 if(isVectorUPL) {
6991 if(curr_upl == MAX_VECTOR_UPL_ELEMENTS) {
6992 vm_map_t v_upl_submap;
6993 vm_offset_t v_upl_submap_dst_addr;
6994 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
6995
6996 vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
6997 vm_map_deallocate(v_upl_submap);
6998 upl_unlock(vector_upl);
6999 return KERN_SUCCESS;
7000 }
7001
7002 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7003 if(upl == NULL)
7004 goto process_upl_to_remove;
7005 }
2d21ac55
A
7006
7007 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
0b4e3aa0 7008 addr = upl->kaddr;
1c79356b 7009 size = upl->size;
2d21ac55 7010
0b4e3aa0
A
7011 assert(upl->ref_count > 1);
7012 upl->ref_count--; /* removing mapping ref */
2d21ac55 7013
1c79356b
A
7014 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7015 upl->kaddr = (vm_offset_t) 0;
b0d623f7
A
7016
7017 if(!isVectorUPL) {
7018 upl_unlock(upl);
7019
39236c6e
A
7020 vm_map_remove(
7021 map,
7022 vm_map_trunc_page(addr,
7023 VM_MAP_PAGE_MASK(map)),
7024 vm_map_round_page(addr + size,
7025 VM_MAP_PAGE_MASK(map)),
b0d623f7
A
7026 VM_MAP_NO_FLAGS);
7027
7028 return KERN_SUCCESS;
7029 }
7030 else {
7031 /*
7032 * If it's a Vectored UPL, we'll be removing the entire
7033			 * submap anyway, so no need to remove individual UPL
7034 * element mappings from within the submap
7035 */
7036 goto process_upl_to_remove;
7037 }
1c79356b 7038 }
0b4e3aa0 7039 upl_unlock(upl);
2d21ac55 7040
0b4e3aa0 7041 return KERN_FAILURE;
1c79356b
A
7042}
7043
39037602 7044
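/*
 * Commit a range of a UPL (or of each sub-UPL of a vector UPL).
 * For every page covered by [offset, offset + size), the lite-list bit is
 * cleared and the page's dirty / reference / code-signing state is updated
 * according to "flags" and the optional "page_list"; the page is then
 * unwired, requeued or freed as appropriate.  *empty is set to TRUE once
 * no pages remain associated with the UPL.
 */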
1c79356b 7045kern_return_t
0b4e3aa0 7046upl_commit_range(
1c79356b 7047 upl_t upl,
91447636
A
7048 upl_offset_t offset,
7049 upl_size_t size,
1c79356b 7050 int flags,
0b4e3aa0
A
7051 upl_page_info_t *page_list,
7052 mach_msg_type_number_t count,
7053 boolean_t *empty)
1c79356b 7054{
b0d623f7 7055 upl_size_t xfer_size, subupl_size = size;
55e303ae 7056 vm_object_t shadow_object;
2d21ac55 7057 vm_object_t object;
39037602 7058 vm_object_t m_object;
1c79356b 7059 vm_object_offset_t target_offset;
b0d623f7 7060 upl_offset_t subupl_offset = offset;
1c79356b 7061 int entry;
55e303ae
A
7062 wpl_array_t lite_list;
7063 int occupied;
91447636 7064 int clear_refmod = 0;
2d21ac55 7065 int pgpgout_count = 0;
6d2010ae
A
7066 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7067 struct vm_page_delayed_work *dwp;
7068 int dw_count;
7069 int dw_limit;
7070 int isVectorUPL = 0;
b0d623f7 7071 upl_t vector_upl = NULL;
6d2010ae 7072 boolean_t should_be_throttled = FALSE;
1c79356b 7073
fe8ab488
A
7074 vm_page_t nxt_page = VM_PAGE_NULL;
7075 int fast_path_possible = 0;
7076 int fast_path_full_commit = 0;
7077 int throttle_page = 0;
7078 int unwired_count = 0;
7079 int local_queue_count = 0;
39037602 7080 vm_page_t first_local, last_local;
fe8ab488 7081
0b4e3aa0
A
7082 *empty = FALSE;
7083
7084 if (upl == UPL_NULL)
7085 return KERN_INVALID_ARGUMENT;
7086
7087 if (count == 0)
7088 page_list = NULL;
7089
b0d623f7
A
7090 if((isVectorUPL = vector_upl_is_valid(upl))) {
7091 vector_upl = upl;
7092 upl_lock(vector_upl);
7093 }
7094 else
7095 upl_lock(upl);
7096
7097process_upl_to_commit:
7098
7099 if(isVectorUPL) {
7100 size = subupl_size;
7101 offset = subupl_offset;
7102 if(size == 0) {
7103 upl_unlock(vector_upl);
7104 return KERN_SUCCESS;
7105 }
7106 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7107 if(upl == NULL) {
7108 upl_unlock(vector_upl);
7109 return KERN_FAILURE;
7110 }
7111 page_list = UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(upl);
7112 subupl_size -= size;
7113 subupl_offset += size;
7114 }
7115
7116#if UPL_DEBUG
7117 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7118 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7119
7120 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7121 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7122
7123 upl->upl_commit_index++;
7124 }
7125#endif
2d21ac55
A
7126 if (upl->flags & UPL_DEVICE_MEMORY)
7127 xfer_size = 0;
7128 else if ((offset + size) <= upl->size)
7129 xfer_size = size;
b0d623f7
A
7130 else {
7131 if(!isVectorUPL)
7132 upl_unlock(upl);
7133 else {
7134 upl_unlock(vector_upl);
7135 }
2d21ac55 7136 return KERN_FAILURE;
91447636 7137 }
6d2010ae
A
7138 if (upl->flags & UPL_SET_DIRTY)
7139 flags |= UPL_COMMIT_SET_DIRTY;
55e303ae
A
7140 if (upl->flags & UPL_CLEAR_DIRTY)
7141 flags |= UPL_COMMIT_CLEAR_DIRTY;
7142
2d21ac55
A
7143 if (upl->flags & UPL_INTERNAL)
7144 lite_list = (wpl_array_t) ((((uintptr_t)upl) + sizeof(struct upl))
7145 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
7146 else
7147 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
1c79356b 7148
2d21ac55
A
7149 object = upl->map_object;
7150
7151 if (upl->flags & UPL_SHADOWED) {
7152 vm_object_lock(object);
7153 shadow_object = object->shadow;
55e303ae 7154 } else {
2d21ac55 7155 shadow_object = object;
55e303ae 7156 }
1c79356b
A
7157 entry = offset/PAGE_SIZE;
7158 target_offset = (vm_object_offset_t)offset;
55e303ae 7159
3e170ce0
A
7160 assert(!(target_offset & PAGE_MASK));
7161 assert(!(xfer_size & PAGE_MASK));
7162
b0d623f7
A
7163 if (upl->flags & UPL_KERNEL_OBJECT)
7164 vm_object_lock_shared(shadow_object);
7165 else
7166 vm_object_lock(shadow_object);
4a3eedf9 7167
b0d623f7
A
7168 if (upl->flags & UPL_ACCESS_BLOCKED) {
7169 assert(shadow_object->blocked_access);
7170 shadow_object->blocked_access = FALSE;
7171 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4a3eedf9 7172 }
4a3eedf9 7173
593a1d5f
A
7174 if (shadow_object->code_signed) {
7175 /*
7176 * CODE SIGNING:
7177 * If the object is code-signed, do not let this UPL tell
7178 * us if the pages are valid or not. Let the pages be
7179 * validated by VM the normal way (when they get mapped or
7180 * copied).
7181 */
7182 flags &= ~UPL_COMMIT_CS_VALIDATED;
7183 }
7184 if (! page_list) {
7185 /*
7186 * No page list to get the code-signing info from !?
7187 */
7188 flags &= ~UPL_COMMIT_CS_VALIDATED;
7189 }
39037602 7190 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal)
6d2010ae 7191 should_be_throttled = TRUE;
593a1d5f 7192
b0d623f7
A
7193 dwp = &dw_array[0];
7194 dw_count = 0;
6d2010ae 7195 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
b0d623f7 7196
fe8ab488
A
7197 if ((upl->flags & UPL_IO_WIRE) &&
7198 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7199 !isVectorUPL &&
7200 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7201 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7202
39037602
A
7203 if (!vm_page_queue_empty(&shadow_object->memq)) {
7204
fe8ab488 7205 if (size == shadow_object->vo_size) {
39037602 7206 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
fe8ab488
A
7207 fast_path_full_commit = 1;
7208 }
7209 fast_path_possible = 1;
7210
39037602 7211 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
fe8ab488
A
7212 (shadow_object->purgable == VM_PURGABLE_DENY ||
7213 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7214 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7215 throttle_page = 1;
7216 }
7217 }
7218 }
39037602
A
7219 first_local = VM_PAGE_NULL;
7220 last_local = VM_PAGE_NULL;
fe8ab488 7221
91447636 7222 while (xfer_size) {
2d21ac55
A
7223 vm_page_t t, m;
7224
b0d623f7
A
7225 dwp->dw_mask = 0;
7226 clear_refmod = 0;
7227
55e303ae 7228 m = VM_PAGE_NULL;
d7e50217 7229
55e303ae 7230 if (upl->flags & UPL_LITE) {
b0d623f7 7231 unsigned int pg_num;
55e303ae 7232
fe8ab488
A
7233 if (nxt_page != VM_PAGE_NULL) {
7234 m = nxt_page;
39037602 7235 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
fe8ab488
A
7236 target_offset = m->offset;
7237 }
b0d623f7
A
7238 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7239 assert(pg_num == target_offset/PAGE_SIZE);
55e303ae
A
7240
7241 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
7242 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
2d21ac55 7243
fe8ab488 7244 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
b0d623f7 7245 m = vm_page_lookup(shadow_object, target_offset + (upl->offset - shadow_object->paging_offset));
fe8ab488
A
7246 } else
7247 m = NULL;
55e303ae 7248 }
2d21ac55
A
7249 if (upl->flags & UPL_SHADOWED) {
7250 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7251
39037602 7252 t->free_when_done = FALSE;
55e303ae 7253
b0d623f7 7254 VM_PAGE_FREE(t);
55e303ae 7255
fe8ab488 7256 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL)
6d2010ae 7257 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
55e303ae
A
7258 }
7259 }
fe8ab488 7260 if (m == VM_PAGE_NULL)
593a1d5f 7261 goto commit_next_page;
55e303ae 7262
39037602
A
7263 m_object = VM_PAGE_OBJECT(m);
7264
7265 if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
39236c6e
A
7266 assert(m->busy);
7267
7268 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7269 goto commit_next_page;
7270 }
7271
593a1d5f
A
7272 if (flags & UPL_COMMIT_CS_VALIDATED) {
7273 /*
7274 * CODE SIGNING:
7275 * Set the code signing bits according to
7276 * what the UPL says they should be.
7277 */
7278 m->cs_validated = page_list[entry].cs_validated;
7279 m->cs_tainted = page_list[entry].cs_tainted;
c18c124e 7280 m->cs_nx = page_list[entry].cs_nx;
593a1d5f 7281 }
15129b1c 7282 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL)
fe8ab488 7283 m->written_by_kernel = TRUE;
15129b1c 7284
593a1d5f 7285 if (upl->flags & UPL_IO_WIRE) {
55e303ae 7286
593a1d5f
A
7287 if (page_list)
7288 page_list[entry].phys_addr = 0;
2d21ac55 7289
6d2010ae 7290 if (flags & UPL_COMMIT_SET_DIRTY) {
316670eb 7291 SET_PAGE_DIRTY(m, FALSE);
6d2010ae 7292 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
593a1d5f 7293 m->dirty = FALSE;
b0d623f7 7294
593a1d5f
A
7295 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
7296 m->cs_validated && !m->cs_tainted) {
4a3eedf9
A
7297 /*
7298 * CODE SIGNING:
7299 * This page is no longer dirty
7300 * but could have been modified,
7301 * so it will need to be
7302 * re-validated.
7303 */
fe8ab488 7304 if (m->slid) {
15129b1c
A
7305 panic("upl_commit_range(%p): page %p was slid\n",
7306 upl, m);
7307 }
7308 assert(!m->slid);
4a3eedf9 7309 m->cs_validated = FALSE;
b0d623f7 7310#if DEVELOPMENT || DEBUG
4a3eedf9 7311 vm_cs_validated_resets++;
b0d623f7 7312#endif
39037602 7313 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
4a3eedf9 7314 }
91447636 7315 clear_refmod |= VM_MEM_MODIFIED;
55e303ae 7316 }
b0d623f7 7317 if (upl->flags & UPL_ACCESS_BLOCKED) {
593a1d5f
A
7318 /*
7319 * We blocked access to the pages in this UPL.
7320 * Clear the "busy" bit and wake up any waiter
7321 * for this page.
7322 */
b0d623f7 7323 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
593a1d5f 7324 }
fe8ab488 7325 if (fast_path_possible) {
39037602
A
7326 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7327 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
fe8ab488 7328 if (m->absent) {
39037602 7329 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
fe8ab488
A
7330 assert(m->wire_count == 0);
7331 assert(m->busy);
7332
0b4c1975 7333 m->absent = FALSE;
d41d1dae 7334 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
fe8ab488
A
7335 } else {
7336 if (m->wire_count == 0)
7337 panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object);
39037602 7338 assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
fe8ab488
A
7339
7340 /*
7341 * XXX FBDP need to update some other
7342 * counters here (purgeable_wired_count)
7343 * (ledgers), ...
7344 */
39037602 7345 assert(m->wire_count > 0);
fe8ab488 7346 m->wire_count--;
7ddcb079 7347
39037602
A
7348 if (m->wire_count == 0) {
7349 m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
fe8ab488 7350 unwired_count++;
39037602 7351 }
d41d1dae 7352 }
fe8ab488 7353 if (m->wire_count == 0) {
39037602
A
7354 assert(m->pageq.next == 0 && m->pageq.prev == 0);
7355
7356 if (last_local == VM_PAGE_NULL) {
7357 assert(first_local == VM_PAGE_NULL);
7358
7359 last_local = m;
7360 first_local = m;
7361 } else {
7362 assert(first_local != VM_PAGE_NULL);
7363
7364 m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7365 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7366 first_local = m;
7367 }
fe8ab488 7368 local_queue_count++;
d41d1dae 7369
fe8ab488 7370 if (throttle_page) {
39037602 7371 m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
fe8ab488 7372 } else {
39037602
A
7373 if (flags & UPL_COMMIT_INACTIVATE) {
7374 if (shadow_object->internal)
7375 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7376 else
7377 m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7378 } else
7379 m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
fe8ab488
A
7380 }
7381 }
7382 } else {
7383 if (flags & UPL_COMMIT_INACTIVATE) {
7384 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7385 clear_refmod |= VM_MEM_REFERENCED;
7386 }
7387 if (m->absent) {
7388 if (flags & UPL_COMMIT_FREE_ABSENT)
7389 dwp->dw_mask |= DW_vm_page_free;
7390 else {
7391 m->absent = FALSE;
7392 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7393
7394 if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
7395 dwp->dw_mask |= DW_vm_page_activate;
7396 }
7397 } else
7398 dwp->dw_mask |= DW_vm_page_unwire;
7399 }
593a1d5f
A
7400 goto commit_next_page;
7401 }
39037602 7402 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
39236c6e 7403
316670eb
A
7404 if (page_list)
7405 page_list[entry].phys_addr = 0;
7406
593a1d5f
A
7407 /*
7408 * make sure to clear the hardware
7409 * modify or reference bits before
7410 * releasing the BUSY bit on this page
7411 * otherwise we risk losing a legitimate
7412 * change of state
7413 */
7414 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7415 m->dirty = FALSE;
2d21ac55 7416
593a1d5f
A
7417 clear_refmod |= VM_MEM_MODIFIED;
7418 }
316670eb
A
7419 if (m->laundry)
7420 dwp->dw_mask |= DW_vm_pageout_throttle_up;
b0d623f7 7421
316670eb 7422 if (VM_PAGE_WIRED(m))
39037602 7423 m->free_when_done = FALSE;
316670eb
A
7424
7425 if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
7426 m->cs_validated && !m->cs_tainted) {
7427 /*
7428 * CODE SIGNING:
7429 * This page is no longer dirty
7430 * but could have been modified,
7431 * so it will need to be
7432 * re-validated.
7433 */
fe8ab488 7434 if (m->slid) {
15129b1c
A
7435 panic("upl_commit_range(%p): page %p was slid\n",
7436 upl, m);
7437 }
7438 assert(!m->slid);
316670eb
A
7439 m->cs_validated = FALSE;
7440#if DEVELOPMENT || DEBUG
7441 vm_cs_validated_resets++;
7442#endif
39037602 7443 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
316670eb
A
7444 }
7445 if (m->overwriting) {
7446 /*
7447 * the (COPY_OUT_FROM == FALSE) request_page_list case
7448 */
7449 if (m->busy) {
fe8ab488 7450#if CONFIG_PHANTOM_CACHE
39037602 7451 if (m->absent && !m_object->internal)
fe8ab488
A
7452 dwp->dw_mask |= DW_vm_phantom_cache_update;
7453#endif
593a1d5f 7454 m->absent = FALSE;
b0d623f7 7455
316670eb
A
7456 dwp->dw_mask |= DW_clear_busy;
7457 } else {
7458 /*
7459 * alternate (COPY_OUT_FROM == FALSE) page_list case
7460 * Occurs when the original page was wired
7461 * at the time of the list request
7462 */
7463 assert(VM_PAGE_WIRED(m));
7464
7465 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
593a1d5f 7466 }
316670eb 7467 m->overwriting = FALSE;
593a1d5f 7468 }
316670eb
A
7469 if (m->encrypted_cleaning == TRUE) {
7470 m->encrypted_cleaning = FALSE;
2d21ac55 7471
316670eb
A
7472 dwp->dw_mask |= DW_clear_busy | DW_PAGE_WAKEUP;
7473 }
7474 m->cleaning = FALSE;
91447636 7475
39037602 7476 if (m->free_when_done) {
316670eb
A
7477 /*
7478 * With the clean queue enabled, UPL_PAGEOUT should
7479				 * no longer set the pageout bit. Its pages now go
7480 * to the clean queue.
7481 */
7482 assert(!(flags & UPL_PAGEOUT));
39037602 7483 assert(!m_object->internal);
316670eb 7484
39037602 7485 m->free_when_done = FALSE;
1c79356b 7486#if MACH_CLUSTER_STATS
593a1d5f 7487 if (m->wanted) vm_pageout_target_collisions++;
1c79356b 7488#endif
b0d623f7 7489 if ((flags & UPL_COMMIT_SET_DIRTY) ||
39037602 7490 (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
593a1d5f
A
7491 /*
7492 * page was re-dirtied after we started
7493 * the pageout... reactivate it since
7494 * we don't know whether the on-disk
7495 * copy matches what is now in memory
2d21ac55 7496 */
316670eb
A
7497 SET_PAGE_DIRTY(m, FALSE);
7498
7499 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
b0d623f7 7500
593a1d5f
A
7501 if (upl->flags & UPL_PAGEOUT) {
7502 CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
7503 VM_STAT_INCR(reactivations);
7504 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
7505 }
593a1d5f
A
7506 } else {
7507 /*
7508 * page has been successfully cleaned
7509 * go ahead and free it for other use
2d21ac55 7510 */
39037602 7511 if (m_object->internal) {
593a1d5f
A
7512 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
7513 } else {
7514 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
7515 }
316670eb
A
7516 m->dirty = FALSE;
7517 m->busy = TRUE;
b0d623f7 7518
316670eb 7519 dwp->dw_mask |= DW_vm_page_free;
de355530 7520 }
593a1d5f
A
7521 goto commit_next_page;
7522 }
7523#if MACH_CLUSTER_STATS
7524 if (m->wpmapped)
39037602 7525 m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
593a1d5f
A
7526
7527 if (m->dirty) vm_pageout_cluster_dirtied++;
7528 else vm_pageout_cluster_cleaned++;
7529 if (m->wanted) vm_pageout_cluster_collisions++;
7530#endif
593a1d5f
A
7531 /*
7532			 * It is part of the semantics of COPYOUT_FROM
7533			 * UPLs that a commit implies cache sync
7534			 * between the vm page and the backing store;
7535			 * this can be used to strip the precious bit
7536 * as well as clean
7537 */
b0d623f7 7538 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
593a1d5f 7539 m->precious = FALSE;
b0d623f7 7540
316670eb
A
7541 if (flags & UPL_COMMIT_SET_DIRTY) {
7542 SET_PAGE_DIRTY(m, FALSE);
7543 } else {
7544 m->dirty = FALSE;
7545 }
7546
7547 /* with the clean queue on, move *all* cleaned pages to the clean queue */
7548 if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
7549 pgpgout_count++;
7550
fe8ab488
A
7551 VM_STAT_INCR(pageouts);
7552 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
b0d623f7 7553
316670eb
A
7554 dwp->dw_mask |= DW_enqueue_cleaned;
7555 vm_pageout_enqueued_cleaned_from_inactive_dirty++;
39037602 7556 } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) {
6d2010ae
A
7557 /*
7558 * page coming back in from being 'frozen'...
7559 * it was dirty before it was frozen, so keep it so
7560 * the vm_page_activate will notice that it really belongs
7561 * on the throttle queue and put it there
7562 */
316670eb 7563 SET_PAGE_DIRTY(m, FALSE);
6d2010ae 7564 dwp->dw_mask |= DW_vm_page_activate;
b0d623f7 7565
6d2010ae 7566 } else {
39037602 7567 if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
b0d623f7
A
7568 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7569 clear_refmod |= VM_MEM_REFERENCED;
39037602 7570 } else if ( !VM_PAGE_PAGEABLE(m)) {
6d2010ae
A
7571
7572 if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
7573 dwp->dw_mask |= DW_vm_page_speculate;
7574 else if (m->reference)
7575 dwp->dw_mask |= DW_vm_page_activate;
7576 else {
7577 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7578 clear_refmod |= VM_MEM_REFERENCED;
7579 }
b0d623f7 7580 }
593a1d5f 7581 }
b0d623f7 7582 if (upl->flags & UPL_ACCESS_BLOCKED) {
2d21ac55 7583 /*
593a1d5f
A
7584			 * We blocked access to the pages in this UPL.
7585 * Clear the "busy" bit on this page before we
7586 * wake up any waiter.
2d21ac55 7587 */
b0d623f7 7588 dwp->dw_mask |= DW_clear_busy;
1c79356b 7589 }
593a1d5f
A
7590 /*
7591		 * Wake up any thread waiting for this page to finish cleaning.
7592 */
b0d623f7 7593 dwp->dw_mask |= DW_PAGE_WAKEUP;
593a1d5f 7594
2d21ac55 7595commit_next_page:
b0d623f7 7596 if (clear_refmod)
39037602 7597 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
b0d623f7 7598
1c79356b
A
7599 target_offset += PAGE_SIZE_64;
7600 xfer_size -= PAGE_SIZE;
7601 entry++;
2d21ac55 7602
b0d623f7
A
7603 if (dwp->dw_mask) {
7604 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
6d2010ae 7605 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
4a3eedf9 7606
6d2010ae 7607 if (dw_count >= dw_limit) {
3e170ce0 7608 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
b0d623f7
A
7609
7610 dwp = &dw_array[0];
7611 dw_count = 0;
7612 }
7613 } else {
7614 if (dwp->dw_mask & DW_clear_busy)
7615 m->busy = FALSE;
7616
7617 if (dwp->dw_mask & DW_PAGE_WAKEUP)
7618 PAGE_WAKEUP(m);
4a3eedf9 7619 }
2d21ac55 7620 }
1c79356b 7621 }
b0d623f7 7622 if (dw_count)
3e170ce0 7623 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
55e303ae 7624
fe8ab488
A
7625 if (fast_path_possible) {
7626
7627 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
7628 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
7629
7630 if (local_queue_count || unwired_count) {
7631
7632 if (local_queue_count) {
fe8ab488 7633 vm_page_t first_target;
39037602 7634 vm_page_queue_head_t *target_queue;
fe8ab488
A
7635
7636 if (throttle_page)
7637 target_queue = &vm_page_queue_throttled;
7638 else {
7639 if (flags & UPL_COMMIT_INACTIVATE) {
7640 if (shadow_object->internal)
7641 target_queue = &vm_page_queue_anonymous;
7642 else
7643 target_queue = &vm_page_queue_inactive;
7644 } else
7645 target_queue = &vm_page_queue_active;
7646 }
7647 /*
7648				 * Transfer the entire local queue to a regular LRU page queue.
7649 */
fe8ab488
A
7650 vm_page_lockspin_queues();
7651
39037602 7652 first_target = (vm_page_t) vm_page_queue_first(target_queue);
fe8ab488 7653
39037602
A
7654 if (vm_page_queue_empty(target_queue))
7655 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
fe8ab488 7656 else
39037602 7657 first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
fe8ab488 7658
39037602
A
7659 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7660 first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
7661 last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
fe8ab488
A
7662
7663 /*
7664 * Adjust the global page counts.
7665 */
7666 if (throttle_page) {
7667 vm_page_throttled_count += local_queue_count;
7668 } else {
7669 if (flags & UPL_COMMIT_INACTIVATE) {
7670 if (shadow_object->internal)
7671 vm_page_anonymous_count += local_queue_count;
7672 vm_page_inactive_count += local_queue_count;
7673
7674 token_new_pagecount += local_queue_count;
7675 } else
7676 vm_page_active_count += local_queue_count;
7677
7678 if (shadow_object->internal)
7679 vm_page_pageable_internal_count += local_queue_count;
7680 else
7681 vm_page_pageable_external_count += local_queue_count;
7682 }
7683 } else {
7684 vm_page_lockspin_queues();
7685 }
7686 if (unwired_count) {
7687 vm_page_wire_count -= unwired_count;
7688 VM_CHECK_MEMORYSTATUS;
7689 }
7690 vm_page_unlock_queues();
7691
7692 shadow_object->wired_page_count -= unwired_count;
3e170ce0
A
7693
7694 if (!shadow_object->wired_page_count) {
7695 VM_OBJECT_UNWIRED(shadow_object);
7696 }
fe8ab488
A
7697 }
7698 }
55e303ae
A
7699 occupied = 1;
7700
7701 if (upl->flags & UPL_DEVICE_MEMORY) {
7702 occupied = 0;
7703 } else if (upl->flags & UPL_LITE) {
7704 int pg_num;
7705 int i;
2d21ac55 7706
55e303ae 7707 occupied = 0;
2d21ac55 7708
fe8ab488
A
7709 if (!fast_path_full_commit) {
7710 pg_num = upl->size/PAGE_SIZE;
7711 pg_num = (pg_num + 31) >> 5;
7712
7713 for (i = 0; i < pg_num; i++) {
7714 if (lite_list[i] != 0) {
7715 occupied = 1;
7716 break;
7717 }
55e303ae
A
7718 }
7719 }
7720 } else {
39037602 7721 if (vm_page_queue_empty(&upl->map_object->memq))
55e303ae 7722 occupied = 0;
55e303ae 7723 }
2d21ac55 7724 if (occupied == 0) {
b0d623f7
A
7725 /*
7726 * If this UPL element belongs to a Vector UPL and is
7727 * empty, then this is the right function to deallocate
7728		 * it. So go ahead and set the *empty variable. The flag
7729 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
7730 * should be considered relevant for the Vector UPL and not
7731 * the internal UPLs.
7732 */
7733 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
0b4e3aa0 7734 *empty = TRUE;
2d21ac55 7735
b0d623f7 7736 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
2d21ac55
A
7737 /*
7738 * this is not a paging object
7739 * so we need to drop the paging reference
7740 * that was taken when we created the UPL
7741 * against this object
7742 */
b0d623f7 7743 vm_object_activity_end(shadow_object);
316670eb 7744 vm_object_collapse(shadow_object, 0, TRUE);
2d21ac55
A
7745 } else {
7746 /*
7747			 * we donated the paging reference to
7748 * the map object... vm_pageout_object_terminate
7749 * will drop this reference
7750 */
7751 }
1c79356b 7752 }
55e303ae 7753 vm_object_unlock(shadow_object);
91447636
A
7754 if (object != shadow_object)
7755 vm_object_unlock(object);
b0d623f7
A
7756
7757 if(!isVectorUPL)
7758 upl_unlock(upl);
7759 else {
7760 /*
7761 * If we completed our operations on an UPL that is
7762 * part of a Vectored UPL and if empty is TRUE, then
7763 * we should go ahead and deallocate this UPL element.
7764 * Then we check if this was the last of the UPL elements
7765 * within that Vectored UPL. If so, set empty to TRUE
7766 * so that in ubc_upl_commit_range or ubc_upl_commit, we
7767 * can go ahead and deallocate the Vector UPL too.
7768 */
7769 if(*empty==TRUE) {
7770 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
7771 upl_deallocate(upl);
7772 }
7773 goto process_upl_to_commit;
7774 }
2d21ac55
A
7775 if (pgpgout_count) {
7776 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
7777 }
7778
1c79356b
A
7779 return KERN_SUCCESS;
7780}
7781
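/*
 * Abort a range of a UPL (or of each sub-UPL of a vector UPL).
 * The "error" bits decide each page's fate: absent pages are freed or
 * marked restart / unavailable / in error, while resident pages are
 * unbusied and either dumped (UPL_ABORT_DUMP_PAGES) or requeued.
 * IO-wire UPLs without UPL_ABORT_DUMP_PAGES are handed to
 * upl_commit_range() instead.  *empty is set to TRUE once no pages
 * remain associated with the UPL.
 */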
0b4e3aa0
A
7782kern_return_t
7783upl_abort_range(
1c79356b 7784 upl_t upl,
91447636
A
7785 upl_offset_t offset,
7786 upl_size_t size,
0b4e3aa0
A
7787 int error,
7788 boolean_t *empty)
1c79356b 7789{
316670eb 7790 upl_page_info_t *user_page_list = NULL;
b0d623f7 7791 upl_size_t xfer_size, subupl_size = size;
55e303ae 7792 vm_object_t shadow_object;
2d21ac55 7793 vm_object_t object;
1c79356b 7794 vm_object_offset_t target_offset;
b0d623f7 7795 upl_offset_t subupl_offset = offset;
1c79356b 7796 int entry;
55e303ae
A
7797 wpl_array_t lite_list;
7798 int occupied;
6d2010ae
A
7799 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
7800 struct vm_page_delayed_work *dwp;
7801 int dw_count;
7802 int dw_limit;
7803 int isVectorUPL = 0;
b0d623f7 7804 upl_t vector_upl = NULL;
1c79356b 7805
0b4e3aa0
A
7806 *empty = FALSE;
7807
7808 if (upl == UPL_NULL)
7809 return KERN_INVALID_ARGUMENT;
7810
2d21ac55 7811 if ( (upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES) )
0b4c1975 7812 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
55e303ae 7813
b0d623f7
A
7814 if((isVectorUPL = vector_upl_is_valid(upl))) {
7815 vector_upl = upl;
7816 upl_lock(vector_upl);
7817 }
7818 else
7819 upl_lock(upl);
7820
7821process_upl_to_abort:
7822 if(isVectorUPL) {
7823 size = subupl_size;
7824 offset = subupl_offset;
7825 if(size == 0) {
7826 upl_unlock(vector_upl);
7827 return KERN_SUCCESS;
7828 }
7829 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7830 if(upl == NULL) {
7831 upl_unlock(vector_upl);
7832 return KERN_FAILURE;
7833 }
7834 subupl_size -= size;
7835 subupl_offset += size;
7836 }
7837
7838 *empty = FALSE;
7839
7840#if UPL_DEBUG
7841 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7842 (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
7843
7844 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7845 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7846 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
7847
7848 upl->upl_commit_index++;
7849 }
7850#endif
2d21ac55 7851 if (upl->flags & UPL_DEVICE_MEMORY)
1c79356b 7852 xfer_size = 0;
2d21ac55
A
7853 else if ((offset + size) <= upl->size)
7854 xfer_size = size;
b0d623f7
A
7855 else {
7856 if(!isVectorUPL)
7857 upl_unlock(upl);
7858 else {
7859 upl_unlock(vector_upl);
7860 }
55e303ae 7861
b0d623f7
A
7862 return KERN_FAILURE;
7863 }
2d21ac55 7864 if (upl->flags & UPL_INTERNAL) {
55e303ae 7865 lite_list = (wpl_array_t)
91447636 7866 ((((uintptr_t)upl) + sizeof(struct upl))
55e303ae 7867 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
316670eb
A
7868
7869 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
55e303ae
A
7870 } else {
7871 lite_list = (wpl_array_t)
91447636 7872 (((uintptr_t)upl) + sizeof(struct upl));
55e303ae 7873 }
2d21ac55
A
7874 object = upl->map_object;
7875
7876 if (upl->flags & UPL_SHADOWED) {
7877 vm_object_lock(object);
7878 shadow_object = object->shadow;
7879 } else
7880 shadow_object = object;
7881
1c79356b
A
7882 entry = offset/PAGE_SIZE;
7883 target_offset = (vm_object_offset_t)offset;
2d21ac55 7884
3e170ce0
A
7885 assert(!(target_offset & PAGE_MASK));
7886 assert(!(xfer_size & PAGE_MASK));
7887
b0d623f7
A
7888 if (upl->flags & UPL_KERNEL_OBJECT)
7889 vm_object_lock_shared(shadow_object);
7890 else
7891 vm_object_lock(shadow_object);
4a3eedf9 7892
b0d623f7
A
7893 if (upl->flags & UPL_ACCESS_BLOCKED) {
7894 assert(shadow_object->blocked_access);
7895 shadow_object->blocked_access = FALSE;
7896 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
4a3eedf9 7897 }
b0d623f7
A
7898
7899 dwp = &dw_array[0];
7900 dw_count = 0;
6d2010ae 7901 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
b0d623f7
A
7902
7903 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT))
7904 panic("upl_abort_range: kernel_object being DUMPED");
4a3eedf9 7905
2d21ac55
A
7906 while (xfer_size) {
7907 vm_page_t t, m;
316670eb
A
7908 unsigned int pg_num;
7909 boolean_t needed;
2d21ac55 7910
316670eb
A
7911 pg_num = (unsigned int) (target_offset/PAGE_SIZE);
7912 assert(pg_num == target_offset/PAGE_SIZE);
7913
7914 needed = FALSE;
b0d623f7 7915
316670eb
A
7916 if (user_page_list)
7917 needed = user_page_list[pg_num].needed;
7918
7919 dwp->dw_mask = 0;
55e303ae 7920 m = VM_PAGE_NULL;
2d21ac55
A
7921
7922 if (upl->flags & UPL_LITE) {
2d21ac55
A
7923
7924 if (lite_list[pg_num>>5] & (1 << (pg_num & 31))) {
55e303ae 7925 lite_list[pg_num>>5] &= ~(1 << (pg_num & 31));
2d21ac55 7926
b0d623f7
A
7927 if ( !(upl->flags & UPL_KERNEL_OBJECT))
7928 m = vm_page_lookup(shadow_object, target_offset +
7929 (upl->offset - shadow_object->paging_offset));
55e303ae
A
7930 }
7931 }
2d21ac55
A
7932 if (upl->flags & UPL_SHADOWED) {
7933 if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
39037602 7934 t->free_when_done = FALSE;
2d21ac55 7935
b0d623f7 7936 VM_PAGE_FREE(t);
2d21ac55
A
7937
7938 if (m == VM_PAGE_NULL)
6d2010ae 7939 m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
55e303ae
A
7940 }
7941 }
b0d623f7
A
7942 if ((upl->flags & UPL_KERNEL_OBJECT))
7943 goto abort_next_page;
7944
2d21ac55
A
7945 if (m != VM_PAGE_NULL) {
7946
39037602 7947 assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
39236c6e 7948
2d21ac55 7949 if (m->absent) {
91447636
A
7950 boolean_t must_free = TRUE;
7951
2d21ac55
A
7952 /*
7953 * COPYOUT = FALSE case
7954 * check for error conditions which must
7955 * be passed back to the pages customer
7956 */
7957 if (error & UPL_ABORT_RESTART) {
1c79356b
A
7958 m->restart = TRUE;
7959 m->absent = FALSE;
2d21ac55 7960 m->unusual = TRUE;
91447636 7961 must_free = FALSE;
2d21ac55 7962 } else if (error & UPL_ABORT_UNAVAILABLE) {
1c79356b
A
7963 m->restart = FALSE;
7964 m->unusual = TRUE;
91447636 7965 must_free = FALSE;
2d21ac55 7966 } else if (error & UPL_ABORT_ERROR) {
1c79356b
A
7967 m->restart = FALSE;
7968 m->absent = FALSE;
1c79356b 7969 m->error = TRUE;
2d21ac55 7970 m->unusual = TRUE;
91447636 7971 must_free = FALSE;
1c79356b 7972 }
316670eb 7973 if (m->clustered && needed == FALSE) {
6d2010ae
A
7974 /*
7975 * This page was a part of a speculative
7976 * read-ahead initiated by the kernel
7977 * itself. No one is expecting this
7978 * page and no one will clean up its
7979 * error state if it ever becomes valid
7980 * in the future.
7981 * We have to free it here.
7982 */
7983 must_free = TRUE;
7984 }
91447636
A
7985
7986 /*
7987 * ENCRYPTED SWAP:
7988 * If the page was already encrypted,
7989 * we don't really need to decrypt it
7990 * now. It will get decrypted later,
7991 * on demand, as soon as someone needs
7992 * to access its contents.
7993 */
1c79356b
A
7994
7995 m->cleaning = FALSE;
2d21ac55 7996 m->encrypted_cleaning = FALSE;
6d2010ae
A
7997
7998 if (m->overwriting && !m->busy) {
7999 /*
8000 * this shouldn't happen since
8001 * this is an 'absent' page, but
8002 * it doesn't hurt to check for
8003 * the 'alternate' method of
8004 * stabilizing the page...
8005 * we will mark 'busy' to be cleared
8006 * in the following code which will
8007					 * take care of the primary stabilization
8008 * method (i.e. setting 'busy' to TRUE)
8009 */
8010 dwp->dw_mask |= DW_vm_page_unwire;
8011 }
1c79356b 8012 m->overwriting = FALSE;
b0d623f7
A
8013
8014 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
91447636 8015
2d21ac55 8016 if (must_free == TRUE)
b0d623f7 8017 dwp->dw_mask |= DW_vm_page_free;
2d21ac55 8018 else
b0d623f7 8019 dwp->dw_mask |= DW_vm_page_activate;
2d21ac55
A
8020 } else {
8021 /*
8022 * Handle the trusted pager throttle.
8023 */
8024 if (m->laundry)
b0d623f7 8025 dwp->dw_mask |= DW_vm_pageout_throttle_up;
2d21ac55 8026
6d2010ae
A
8027 if (upl->flags & UPL_ACCESS_BLOCKED) {
8028 /*
8029 * We blocked access to the pages in this UPL.
8030 * Clear the "busy" bit and wake up any waiter
8031 * for this page.
8032 */
8033 dwp->dw_mask |= DW_clear_busy;
8034 }
6d2010ae
A
8035 if (m->overwriting) {
8036 if (m->busy)
8037 dwp->dw_mask |= DW_clear_busy;
8038 else {
8039 /*
8040 * deal with the 'alternate' method
8041 * of stabilizing the page...
8042 * we will either free the page
8043 * or mark 'busy' to be cleared
8044 * in the following code which will
8045						 * take care of the primary stabilization
8046 * method (i.e. setting 'busy' to TRUE)
8047 */
8048 dwp->dw_mask |= DW_vm_page_unwire;
8049 }
8050 m->overwriting = FALSE;
8051 }
8052 if (m->encrypted_cleaning == TRUE) {
8053 m->encrypted_cleaning = FALSE;
8054
8055 dwp->dw_mask |= DW_clear_busy;
1c79356b 8056 }
39037602 8057 m->free_when_done = FALSE;
2d21ac55 8058 m->cleaning = FALSE;
39037602 8059
2d21ac55 8060 if (error & UPL_ABORT_DUMP_PAGES) {
39037602 8061 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
b0d623f7
A
8062
8063 dwp->dw_mask |= DW_vm_page_free;
2d21ac55 8064 } else {
316670eb
A
8065 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8066 if (error & UPL_ABORT_REFERENCE) {
8067 /*
8068						 * we've been told to explicitly
8069 * reference this page... for
8070 * file I/O, this is done by
8071 * implementing an LRU on the inactive q
8072 */
8073 dwp->dw_mask |= DW_vm_page_lru;
8074
39037602 8075 } else if ( !VM_PAGE_PAGEABLE(m))
316670eb 8076 dwp->dw_mask |= DW_vm_page_deactivate_internal;
2d21ac55 8077 }
6d2010ae 8078 dwp->dw_mask |= DW_PAGE_WAKEUP;
2d21ac55 8079 }
1c79356b 8080 }
2d21ac55 8081 }
b0d623f7 8082abort_next_page:
55e303ae
A
8083 target_offset += PAGE_SIZE_64;
8084 xfer_size -= PAGE_SIZE;
8085 entry++;
b0d623f7
A
8086
8087 if (dwp->dw_mask) {
8088 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
6d2010ae 8089 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
b0d623f7 8090
6d2010ae 8091 if (dw_count >= dw_limit) {
3e170ce0 8092 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
b0d623f7
A
8093
8094 dwp = &dw_array[0];
8095 dw_count = 0;
8096 }
8097 } else {
8098 if (dwp->dw_mask & DW_clear_busy)
8099 m->busy = FALSE;
8100
8101 if (dwp->dw_mask & DW_PAGE_WAKEUP)
8102 PAGE_WAKEUP(m);
8103 }
8104 }
d7e50217 8105 }
b0d623f7 8106 if (dw_count)
3e170ce0 8107 vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
2d21ac55 8108
55e303ae 8109 occupied = 1;
2d21ac55 8110
55e303ae
A
8111 if (upl->flags & UPL_DEVICE_MEMORY) {
8112 occupied = 0;
8113 } else if (upl->flags & UPL_LITE) {
8114 int pg_num;
8115 int i;
2d21ac55 8116
55e303ae
A
8117 pg_num = upl->size/PAGE_SIZE;
8118 pg_num = (pg_num + 31) >> 5;
8119 occupied = 0;
2d21ac55
A
8120
8121 for (i = 0; i < pg_num; i++) {
8122 if (lite_list[i] != 0) {
55e303ae
A
8123 occupied = 1;
8124 break;
8125 }
8126 }
8127 } else {
39037602 8128 if (vm_page_queue_empty(&upl->map_object->memq))
55e303ae 8129 occupied = 0;
55e303ae 8130 }
2d21ac55 8131 if (occupied == 0) {
b0d623f7
A
8132 /*
8133 * If this UPL element belongs to a Vector UPL and is
8134 * empty, then this is the right function to deallocate
8135		 * it. So go ahead and set the *empty variable. The flag
8136 * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view
8137 * should be considered relevant for the Vector UPL and
8138 * not the internal UPLs.
8139 */
8140 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL)
0b4e3aa0 8141 *empty = TRUE;
2d21ac55 8142
b0d623f7 8143 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
2d21ac55
A
8144 /*
8145 * this is not a paging object
8146 * so we need to drop the paging reference
8147 * that was taken when we created the UPL
8148 * against this object
8149 */
b0d623f7 8150 vm_object_activity_end(shadow_object);
316670eb 8151 vm_object_collapse(shadow_object, 0, TRUE);
2d21ac55
A
8152 } else {
8153 /*
8154			 * we donated the paging reference to
8155 * the map object... vm_pageout_object_terminate
8156 * will drop this reference
8157 */
8158 }
1c79356b 8159 }
55e303ae 8160 vm_object_unlock(shadow_object);
91447636
A
8161 if (object != shadow_object)
8162 vm_object_unlock(object);
b0d623f7
A
8163
8164 if(!isVectorUPL)
8165 upl_unlock(upl);
8166 else {
8167 /*
8168 * If we completed our operations on an UPL that is
8169 * part of a Vectored UPL and if empty is TRUE, then
8170 * we should go ahead and deallocate this UPL element.
8171 * Then we check if this was the last of the UPL elements
8172 * within that Vectored UPL. If so, set empty to TRUE
8173 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8174 * can go ahead and deallocate the Vector UPL too.
8175 */
8176 if(*empty == TRUE) {
8177 *empty = vector_upl_set_subupl(vector_upl, upl,0);
8178 upl_deallocate(upl);
8179 }
8180 goto process_upl_to_abort;
8181 }
55e303ae 8182
1c79356b
A
8183 return KERN_SUCCESS;
8184}
8185
2d21ac55 8186
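/*
 * Abort an entire UPL: equivalent to upl_abort_range() over the
 * UPL's full size.
 */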
1c79356b 8187kern_return_t
0b4e3aa0 8188upl_abort(
1c79356b
A
8189 upl_t upl,
8190 int error)
2d21ac55
A
8191{
8192 boolean_t empty;
8193
7e41aa88
A
8194 if (upl == UPL_NULL)
8195 return KERN_INVALID_ARGUMENT;
8196
2d21ac55 8197 return upl_abort_range(upl, 0, upl->size, error, &empty);
1c79356b
A
8198}
8199
55e303ae 8200
2d21ac55
A
8201/* an option on commit should be wire */
8202kern_return_t
8203upl_commit(
8204 upl_t upl,
8205 upl_page_info_t *page_list,
8206 mach_msg_type_number_t count)
8207{
8208 boolean_t empty;
8209
7e41aa88
A
8210 if (upl == UPL_NULL)
8211 return KERN_INVALID_ARGUMENT;
8212
2d21ac55
A
8213 return upl_commit_range(upl, 0, upl->size, 0, page_list, count, &empty);
8214}
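/*
 * Illustrative sketch (not part of the original source): a typical
 * in-kernel sequence for a UPL that has already been created, using only
 * routines defined above.  "page_list", "count" and "io_succeeded" are
 * placeholders assumed to come from the UPL creation step and the I/O
 * completion; error handling and the final upl_deallocate() of the
 * creation reference are elided.
 *
 *	vm_map_offset_t	dst_addr;
 *	boolean_t	empty;
 *	kern_return_t	kr;
 *
 *	kr = vm_map_enter_upl(kernel_map, upl, &dst_addr);
 *	// ... access the pages through dst_addr ...
 *	kr = vm_map_remove_upl(kernel_map, upl);
 *
 *	if (io_succeeded)
 *		kr = upl_commit(upl, page_list, count);
 *	else
 *		kr = upl_abort(upl, UPL_ABORT_ERROR);
 */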
8215
fe8ab488
A
8216
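/*
 * Mark the pages backing an IO-wire UPL as now containing valid data.
 * Pages that were grabbed "absent" (no zero-fill) are still busy; each one
 * has its absent state cleared, is marked dirty, wired and woken up, and
 * the object's and the global wired-page counts are adjusted to match.
 */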
8217void
8218iopl_valid_data(
8219 upl_t upl)
8220{
8221 vm_object_t object;
8222 vm_offset_t offset;
8223 vm_page_t m, nxt_page = VM_PAGE_NULL;
8224 upl_size_t size;
8225 int wired_count = 0;
8226
8227 if (upl == NULL)
8228 panic("iopl_valid_data: NULL upl");
8229 if (vector_upl_is_valid(upl))
8230 panic("iopl_valid_data: vector upl");
8231 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
8232 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8233
8234 object = upl->map_object;
8235
8236 if (object == kernel_object || object == compressor_object)
8237 panic("iopl_valid_data: object == kernel or compressor");
8238
39037602
A
8239 if (object->purgable == VM_PURGABLE_VOLATILE ||
8240 object->purgable == VM_PURGABLE_EMPTY)
8241 panic("iopl_valid_data: object %p purgable %d",
8242 object, object->purgable);
fe8ab488
A
8243
8244 size = upl->size;
8245
8246 vm_object_lock(object);
8247
8248 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE))
39037602 8249 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
fe8ab488
A
8250 else
8251 offset = 0 + upl->offset - object->paging_offset;
8252
8253 while (size) {
8254
8255 if (nxt_page != VM_PAGE_NULL) {
8256 m = nxt_page;
39037602 8257 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
fe8ab488
A
8258 } else {
8259 m = vm_page_lookup(object, offset);
8260 offset += PAGE_SIZE;
8261
8262 if (m == VM_PAGE_NULL)
8263 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8264 }
8265 if (m->busy) {
8266 if (!m->absent)
8267 panic("iopl_valid_data: busy page w/o absent");
8268
8269 if (m->pageq.next || m->pageq.prev)
8270 panic("iopl_valid_data: busy+absent page on page queue");
39037602
A
8271 if (m->reusable) {
8272 panic("iopl_valid_data: %p is reusable", m);
8273 }
fe8ab488
A
8274
8275 m->absent = FALSE;
8276 m->dirty = TRUE;
39037602
A
8277 assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8278 assert(m->wire_count == 0);
fe8ab488 8279 m->wire_count++;
39037602
A
8280 assert(m->wire_count);
8281 if (m->wire_count == 1) {
8282 m->vm_page_q_state = VM_PAGE_IS_WIRED;
8283 wired_count++;
8284 } else {
8285 panic("iopl_valid_data: %p already wired\n", m);
8286 }
fe8ab488
A
8287
8288 PAGE_WAKEUP_DONE(m);
8289 }
8290 size -= PAGE_SIZE;
8291 }
8292 if (wired_count) {
3e170ce0
A
8293
8294 if (!object->wired_page_count) {
8295 VM_OBJECT_WIRED(object);
8296 }
fe8ab488 8297 object->wired_page_count += wired_count;
39037602
A
8298 assert(object->resident_page_count >= object->wired_page_count);
8299
8300 /* no need to adjust purgeable accounting for this object: */
8301 assert(object->purgable != VM_PURGABLE_VOLATILE);
8302 assert(object->purgable != VM_PURGABLE_EMPTY);
fe8ab488
A
8303
8304 vm_page_lockspin_queues();
8305 vm_page_wire_count += wired_count;
8306 vm_page_unlock_queues();
8307 }
8308 vm_object_unlock(object);
8309}
8310
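/*
 * Retag the memory wired by an IO-wire UPL with the given VM tag and
 * return the previous tag (or the new tag when no prior tag was set or
 * the UPL is not a retaggable IO-wire UPL).
 */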
39037602
A
8311vm_tag_t
8312iopl_set_tag(
8313 upl_t upl,
8314 vm_tag_t tag)
8315{
8316 vm_object_t object;
8317 vm_tag_t prior_tag;
8318
8319 if (upl == NULL)
8320 panic("%s: NULL upl", __FUNCTION__);
8321 if (vector_upl_is_valid(upl))
8322 panic("%s: vector upl", __FUNCTION__);
8323 if (kernel_object == upl->map_object)
8324 return (tag);
8325 if ((upl->flags & (UPL_DEVICE_MEMORY|UPL_SHADOWED|UPL_ACCESS_BLOCKED|UPL_IO_WIRE|UPL_INTERNAL)) != UPL_IO_WIRE)
8326 return (tag);
8327
8328 object = upl->map_object;
8329 vm_object_lock(object);
8330
8331 prior_tag = object->wire_tag;
8332 object->wire_tag = tag;
8333 if (VM_KERN_MEMORY_NONE == prior_tag) prior_tag = tag;
8334 vm_object_unlock(object);
8335
8336 return (prior_tag);
8337}
8338
8339
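/*
 * Propagate the object's WIMG cache attributes to the physical pages
 * described by the page list, batching the pmap updates when requested.
 */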
316670eb
A
8340void
8341vm_object_set_pmap_cache_attr(
8342 vm_object_t object,
8343 upl_page_info_array_t user_page_list,
8344 unsigned int num_pages,
8345 boolean_t batch_pmap_op)
8346{
8347 unsigned int cache_attr = 0;
8348
8349 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8350 assert(user_page_list);
8351 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8352 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8353 }
8354}
55e303ae 8355
3e170ce0
A
8356
8357boolean_t vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t);
8358kern_return_t vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_object_offset_t *, int);
8359
8360
8361
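/*
 * Fast IOPL wiring path for an object whose pages are all resident:
 * walk the object's page list, wire each page and record it in the lite
 * list and the caller's page list, returning FALSE if any page is busy,
 * absent, in error or otherwise unsuitable for I/O wiring.
 */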
8362boolean_t
8363vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8364 wpl_array_t lite_list, upl_control_flags_t cntrl_flags)
8365{
8366 vm_page_t dst_page;
8367 vm_tag_t tag;
8368 unsigned int entry;
8369 int page_count;
8370 int delayed_unlock = 0;
8371 boolean_t retval = TRUE;
39037602 8372 ppnum_t phys_page;
3e170ce0
A
8373
8374 vm_object_lock_assert_exclusive(object);
8375 assert(object->purgable != VM_PURGABLE_VOLATILE);
8376 assert(object->purgable != VM_PURGABLE_EMPTY);
8377 assert(object->pager == NULL);
8378 assert(object->copy == NULL);
8379 assert(object->shadow == NULL);
8380
8381 tag = UPL_MEMORY_TAG(cntrl_flags);
8382 page_count = object->resident_page_count;
39037602 8383 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
3e170ce0
A
8384
8385 vm_page_lock_queues();
8386
8387 while (page_count--) {
8388
8389 if (dst_page->busy ||
8390 dst_page->fictitious ||
8391 dst_page->absent ||
8392 dst_page->error ||
8393 dst_page->cleaning ||
8394 dst_page->restart ||
8395 dst_page->encrypted ||
8396 dst_page->laundry) {
8397 retval = FALSE;
8398 goto done;
8399 }
8400 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
8401 retval = FALSE;
8402 goto done;
8403 }
8404 dst_page->reference = TRUE;
8405
8406 vm_page_wire(dst_page, tag, FALSE);
8407
8408 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8409 SET_PAGE_DIRTY(dst_page, FALSE);
8410 }
8411 entry = (unsigned int)(dst_page->offset / PAGE_SIZE);
8412 assert(entry >= 0 && entry < object->resident_page_count);
8413 lite_list[entry>>5] |= 1 << (entry & 31);
8414
39037602
A
8415 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8416
8417 if (phys_page > upl->highest_page)
8418 upl->highest_page = phys_page;
3e170ce0
A
8419
8420 if (user_page_list) {
39037602 8421 user_page_list[entry].phys_addr = phys_page;
3e170ce0
A
8422 user_page_list[entry].absent = dst_page->absent;
8423 user_page_list[entry].dirty = dst_page->dirty;
39037602 8424 user_page_list[entry].free_when_done = dst_page->free_when_done;
3e170ce0
A
8425 user_page_list[entry].precious = dst_page->precious;
8426 user_page_list[entry].device = FALSE;
8427 user_page_list[entry].speculative = FALSE;
8428 user_page_list[entry].cs_validated = FALSE;
8429 user_page_list[entry].cs_tainted = FALSE;
8430 user_page_list[entry].cs_nx = FALSE;
8431 user_page_list[entry].needed = FALSE;
8432 user_page_list[entry].mark = FALSE;
8433 }
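		/*
		 * Don't hold the page-queue lock for the entire walk:
		 * every 256 pages, briefly yield it to any waiters and
		 * re-check the memorystatus thresholds.
		 */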
8434 if (delayed_unlock++ > 256) {
8435 delayed_unlock = 0;
8436 lck_mtx_yield(&vm_page_queue_lock);
8437
8438 VM_CHECK_MEMORYSTATUS;
8439 }
39037602 8440 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq);
3e170ce0
A
8441 }
8442done:
8443 vm_page_unlock_queues();
8444
8445 VM_CHECK_MEMORYSTATUS;
8446
8447 return (retval);
8448}
8449
8450
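/*
 * vm_object_iopl_wire_empty:
 * Fast path used by vm_object_iopl_request() when the object has no
 * resident pages: grab fresh pages and insert them into the object,
 * filling in the lite_list bitmap and the caller's page list as we go.
 * Pages are zero-filled and wired immediately unless the caller passed
 * UPL_NOZEROFILL/UPL_NOZEROFILLIO, in which case they are left absent
 * and busy so the caller's I/O can fill them.
 */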
8451kern_return_t
8452vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
8453 wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_object_offset_t *dst_offset, int page_count)
8454{
8455 vm_page_t dst_page;
8456 vm_tag_t tag;
8457 boolean_t no_zero_fill = FALSE;
8458 int interruptible;
8459 int pages_wired = 0;
8460 int pages_inserted = 0;
8461 int entry = 0;
8462 uint64_t delayed_ledger_update = 0;
8463 kern_return_t ret = KERN_SUCCESS;
39037602
A
8464 int grab_options;
8465 ppnum_t phys_page;
3e170ce0
A
8466
8467 vm_object_lock_assert_exclusive(object);
8468 assert(object->purgable != VM_PURGABLE_VOLATILE);
8469 assert(object->purgable != VM_PURGABLE_EMPTY);
8470 assert(object->pager == NULL);
8471 assert(object->copy == NULL);
8472 assert(object->shadow == NULL);
8473
8474 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8475 interruptible = THREAD_ABORTSAFE;
8476 else
8477 interruptible = THREAD_UNINT;
8478
8479 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
8480 no_zero_fill = TRUE;
8481
8482 tag = UPL_MEMORY_TAG(cntrl_flags);
8483
39037602
A
8484 grab_options = 0;
8485#if CONFIG_SECLUDED_MEMORY
8486 if (object->can_grab_secluded) {
8487 grab_options |= VM_PAGE_GRAB_SECLUDED;
8488 }
8489#endif /* CONFIG_SECLUDED_MEMORY */
8490
3e170ce0
A
8491 while (page_count--) {
8492
39037602
A
8493 while ((dst_page = vm_page_grab_options(grab_options))
8494 == VM_PAGE_NULL) {
3e170ce0
A
8495
8496 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
8497
8498 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
8499
8500 if (vm_page_wait(interruptible) == FALSE) {
8501 /*
8502 * interrupted case
8503 */
8504 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8505
8506 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
8507
8508 ret = MACH_SEND_INTERRUPTED;
8509 goto done;
8510 }
8511 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
8512
8513 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
8514 }
8515 if (no_zero_fill == FALSE)
8516 vm_page_zero_fill(dst_page);
8517 else
8518 dst_page->absent = TRUE;
8519
8520 dst_page->reference = TRUE;
8521
8522 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8523 SET_PAGE_DIRTY(dst_page, FALSE);
8524 }
8525 if (dst_page->absent == FALSE) {
39037602
A
8526 assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q);
8527 assert(dst_page->wire_count == 0);
3e170ce0 8528 dst_page->wire_count++;
39037602
A
8529 dst_page->vm_page_q_state = VM_PAGE_IS_WIRED;
8530 assert(dst_page->wire_count);
3e170ce0
A
8531 pages_wired++;
8532 PAGE_WAKEUP_DONE(dst_page);
8533 }
8534 pages_inserted++;
8535
8536 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
8537
8538 lite_list[entry>>5] |= 1 << (entry & 31);
8539
39037602
A
8540 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8541
8542 if (phys_page > upl->highest_page)
8543 upl->highest_page = phys_page;
3e170ce0
A
8544
8545 if (user_page_list) {
39037602 8546 user_page_list[entry].phys_addr = phys_page;
3e170ce0
A
8547 user_page_list[entry].absent = dst_page->absent;
8548 user_page_list[entry].dirty = dst_page->dirty;
39037602 8549 user_page_list[entry].free_when_done = FALSE;
3e170ce0
A
8550 user_page_list[entry].precious = FALSE;
8551 user_page_list[entry].device = FALSE;
8552 user_page_list[entry].speculative = FALSE;
8553 user_page_list[entry].cs_validated = FALSE;
8554 user_page_list[entry].cs_tainted = FALSE;
8555 user_page_list[entry].cs_nx = FALSE;
8556 user_page_list[entry].needed = FALSE;
8557 user_page_list[entry].mark = FALSE;
8558 }
8559 entry++;
8560 *dst_offset += PAGE_SIZE_64;
8561 }
8562done:
8563 if (pages_wired) {
8564 vm_page_lockspin_queues();
8565 vm_page_wire_count += pages_wired;
8566 vm_page_unlock_queues();
8567 }
8568 if (pages_inserted) {
8569 if (object->internal) {
8570 OSAddAtomic(pages_inserted, &vm_page_internal_count);
8571 } else {
8572 OSAddAtomic(pages_inserted, &vm_page_external_count);
8573 }
8574 }
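	/*
	 * vm_page_insert_internal() accumulated, in delayed_ledger_update,
	 * the number of bytes to charge to the object's purgeable owner;
	 * apply that charge (non-volatile bytes plus physical footprint)
	 * once here rather than per page.
	 */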
8575 if (delayed_ledger_update) {
8576 task_t owner;
8577
8578 owner = object->vo_purgeable_owner;
8579 assert(owner);
8580
8581 /* more non-volatile bytes */
8582 ledger_credit(owner->ledger,
8583 task_ledgers.purgeable_nonvolatile,
8584 delayed_ledger_update);
8585 /* more footprint */
8586 ledger_credit(owner->ledger,
8587 task_ledgers.phys_footprint,
8588 delayed_ledger_update);
8589 }
8590 return (ret);
8591}
8592
8593
b0d623f7
A
8594unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
8595
3e170ce0 8596
55e303ae
A
8597kern_return_t
8598vm_object_iopl_request(
8599 vm_object_t object,
8600 vm_object_offset_t offset,
91447636 8601 upl_size_t size,
55e303ae
A
8602 upl_t *upl_ptr,
8603 upl_page_info_array_t user_page_list,
8604 unsigned int *page_list_count,
3e170ce0 8605 upl_control_flags_t cntrl_flags)
55e303ae
A
8606{
8607 vm_page_t dst_page;
2d21ac55
A
8608 vm_object_offset_t dst_offset;
8609 upl_size_t xfer_size;
55e303ae 8610 upl_t upl = NULL;
91447636
A
8611 unsigned int entry;
8612 wpl_array_t lite_list = NULL;
91447636 8613 int no_zero_fill = FALSE;
6d2010ae 8614 unsigned int size_in_pages;
2d21ac55 8615 u_int32_t psize;
55e303ae
A
8616 kern_return_t ret;
8617 vm_prot_t prot;
2d21ac55 8618 struct vm_object_fault_info fault_info;
6d2010ae
A
8619 struct vm_page_delayed_work dw_array[DEFAULT_DELAYED_WORK_LIMIT];
8620 struct vm_page_delayed_work *dwp;
b0d623f7 8621 int dw_count;
6d2010ae 8622 int dw_limit;
b0d623f7 8623 int dw_index;
39236c6e 8624 boolean_t caller_lookup;
fe8ab488
A
8625 int io_tracking_flag = 0;
8626 int interruptible;
39037602 8627 ppnum_t phys_page;
fe8ab488
A
8628
8629 boolean_t set_cache_attr_needed = FALSE;
8630 boolean_t free_wired_pages = FALSE;
3e170ce0
A
8631 boolean_t fast_path_empty_req = FALSE;
8632 boolean_t fast_path_full_req = FALSE;
55e303ae 8633
91447636
A
8634 if (cntrl_flags & ~UPL_VALID_FLAGS) {
8635 /*
8636 * For forward compatibility's sake,
8637 * reject any unknown flag.
8638 */
8639 return KERN_INVALID_VALUE;
8640 }
0b4c1975 8641 if (vm_lopage_needed == FALSE)
0c530ab8
A
8642 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
8643
8644 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
8645 if ( (cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE))
8646 return KERN_INVALID_VALUE;
8647
8648 if (object->phys_contiguous) {
6d2010ae 8649 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
0c530ab8 8650 return KERN_INVALID_ADDRESS;
2d21ac55 8651
6d2010ae 8652 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
0c530ab8
A
8653 return KERN_INVALID_ADDRESS;
8654 }
8655 }
91447636
A
8656
8657 if (cntrl_flags & UPL_ENCRYPT) {
8658 /*
8659 * ENCRYPTED SWAP:
8660 * The paging path doesn't use this interface,
8661 * so we don't support the UPL_ENCRYPT flag
8662 * here. We won't encrypt the pages.
8663 */
8664 assert(! (cntrl_flags & UPL_ENCRYPT));
8665 }
39236c6e 8666 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO))
91447636
A
8667 no_zero_fill = TRUE;
8668
8669 if (cntrl_flags & UPL_COPYOUT_FROM)
55e303ae 8670 prot = VM_PROT_READ;
91447636 8671 else
55e303ae 8672 prot = VM_PROT_READ | VM_PROT_WRITE;
55e303ae 8673
2d21ac55
A
8674 if ((!object->internal) && (object->paging_offset != 0))
8675 panic("vm_object_iopl_request: external object with non-zero paging offset\n");
8676
fe8ab488
A
8677#if CONFIG_IOSCHED || UPL_DEBUG
8678 if ((object->io_tracking && object != kernel_object) || upl_debug_enabled)
8679 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
8680#endif
8681
8682#if CONFIG_IOSCHED
8683 if (object->io_tracking) {
8684 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
8685 if (object != kernel_object)
8686 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
8687 }
8688#endif
2d21ac55
A
8689
8690 if (object->phys_contiguous)
8691 psize = PAGE_SIZE;
8692 else
8693 psize = size;
8694
8695 if (cntrl_flags & UPL_SET_INTERNAL) {
fe8ab488 8696 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
2d21ac55
A
8697
8698 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
8699 lite_list = (wpl_array_t) (((uintptr_t)user_page_list) +
8700 ((psize / PAGE_SIZE) * sizeof(upl_page_info_t)));
b0d623f7
A
8701 if (size == 0) {
8702 user_page_list = NULL;
8703 lite_list = NULL;
8704 }
2d21ac55 8705 } else {
fe8ab488 8706 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
55e303ae 8707
2d21ac55 8708 lite_list = (wpl_array_t) (((uintptr_t)upl) + sizeof(struct upl));
b0d623f7
A
8709 if (size == 0) {
8710 lite_list = NULL;
8711 }
55e303ae 8712 }
2d21ac55
A
8713 if (user_page_list)
8714 user_page_list[0].device = FALSE;
8715 *upl_ptr = upl;
55e303ae 8716
2d21ac55
A
8717 upl->map_object = object;
8718 upl->size = size;
8719
6d2010ae
A
8720 size_in_pages = size / PAGE_SIZE;
8721
b0d623f7
A
8722 if (object == kernel_object &&
8723 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
8724 upl->flags |= UPL_KERNEL_OBJECT;
8725#if UPL_DEBUG
8726 vm_object_lock(object);
8727#else
8728 vm_object_lock_shared(object);
8729#endif
8730 } else {
8731 vm_object_lock(object);
8732 vm_object_activity_begin(object);
8733 }
2d21ac55
A
8734 /*
8735 * paging in progress also protects the paging_offset
8736 */
8737 upl->offset = offset + object->paging_offset;
55e303ae 8738
b0d623f7
A
8739 if (cntrl_flags & UPL_BLOCK_ACCESS) {
8740 /*
316670eb 8741 * The user requested that access to the pages in this UPL
b0d623f7
A
 8742 * be blocked until the UPL is committed or aborted.
8743 */
8744 upl->flags |= UPL_ACCESS_BLOCKED;
8745 }
8746
fe8ab488
A
8747#if CONFIG_IOSCHED || UPL_DEBUG
8748 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
316670eb 8749 vm_object_activity_begin(object);
2d21ac55 8750 queue_enter(&object->uplq, upl, upl_t, uplq);
fe8ab488
A
8751 }
8752#endif
8753
8754 if (object->phys_contiguous) {
55e303ae 8755
b0d623f7
A
8756 if (upl->flags & UPL_ACCESS_BLOCKED) {
8757 assert(!object->blocked_access);
8758 object->blocked_access = TRUE;
8759 }
8760
2d21ac55 8761 vm_object_unlock(object);
55e303ae 8762
2d21ac55
A
8763 /*
8764 * don't need any shadow mappings for this one
8765 * since it is already I/O memory
8766 */
8767 upl->flags |= UPL_DEVICE_MEMORY;
55e303ae 8768
6d2010ae 8769 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1)>>PAGE_SHIFT);
2d21ac55
A
8770
8771 if (user_page_list) {
6d2010ae 8772 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset)>>PAGE_SHIFT);
2d21ac55 8773 user_page_list[0].device = TRUE;
55e303ae 8774 }
2d21ac55
A
8775 if (page_list_count != NULL) {
8776 if (upl->flags & UPL_INTERNAL)
8777 *page_list_count = 0;
8778 else
8779 *page_list_count = 1;
55e303ae 8780 }
2d21ac55 8781 return KERN_SUCCESS;
55e303ae 8782 }
39236c6e 8783 if (object != kernel_object && object != compressor_object) {
b0d623f7
A
8784 /*
8785 * Protect user space from future COW operations
8786 */
fe8ab488
A
8787#if VM_OBJECT_TRACKING_OP_TRUESHARE
8788 if (!object->true_share &&
8789 vm_object_tracking_inited) {
8790 void *bt[VM_OBJECT_TRACKING_BTDEPTH];
8791 int num = 0;
8792
8793 num = OSBacktrace(bt,
8794 VM_OBJECT_TRACKING_BTDEPTH);
8795 btlog_add_entry(vm_object_tracking_btlog,
8796 object,
8797 VM_OBJECT_TRACKING_OP_TRUESHARE,
8798 bt,
8799 num);
8800 }
8801#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
8802
39037602 8803 vm_object_lock_assert_exclusive(object);
b0d623f7 8804 object->true_share = TRUE;
55e303ae 8805
b0d623f7
A
8806 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)
8807 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
8808 }
91447636 8809
b0d623f7
A
8810 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
8811 object->copy != VM_OBJECT_NULL) {
91447636 8812 /*
b0d623f7
A
8813 * Honor copy-on-write obligations
8814 *
8815 * The caller is gathering these pages and
8816 * might modify their contents. We need to
8817 * make sure that the copy object has its own
8818 * private copies of these pages before we let
8819 * the caller modify them.
8820 *
8821 * NOTE: someone else could map the original object
8822 * after we've done this copy-on-write here, and they
8823 * could then see an inconsistent picture of the memory
8824 * while it's being modified via the UPL. To prevent this,
8825 * we would have to block access to these pages until the
8826 * UPL is released. We could use the UPL_BLOCK_ACCESS
8827 * code path for that...
91447636 8828 */
b0d623f7
A
8829 vm_object_update(object,
8830 offset,
8831 size,
8832 NULL,
8833 NULL,
8834 FALSE, /* should_return */
8835 MEMORY_OBJECT_COPY_SYNC,
8836 VM_PROT_NO_CHANGE);
8837#if DEVELOPMENT || DEBUG
8838 iopl_cow++;
8839 iopl_cow_pages += size >> PAGE_SHIFT;
8840#endif
55e303ae 8841 }
3e170ce0
A
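	/*
	 * Two fast paths cover requests that span an entire object
	 * (offset 0, size == vo_size) with no pager, no shadow, no copy
	 * object, a purgeable state that is neither volatile nor empty,
	 * and no need for 32-bit addresses or blocked access: if every
	 * page is already resident, wire the whole object in one pass
	 * (vm_object_iopl_wire_full); if nothing is resident, populate
	 * it with fresh pages (vm_object_iopl_wire_empty).  Everything
	 * else goes through the general loop below.
	 */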
8842 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
8843 object->purgable != VM_PURGABLE_VOLATILE &&
8844 object->purgable != VM_PURGABLE_EMPTY &&
8845 object->copy == NULL &&
8846 size == object->vo_size &&
8847 offset == 0 &&
8848 object->shadow == NULL &&
8849 object->pager == NULL)
8850 {
8851 if (object->resident_page_count == size_in_pages)
8852 {
8853 assert(object != compressor_object);
8854 assert(object != kernel_object);
8855 fast_path_full_req = TRUE;
8856 }
8857 else if (object->resident_page_count == 0)
8858 {
8859 assert(object != compressor_object);
8860 assert(object != kernel_object);
8861 fast_path_empty_req = TRUE;
8862 set_cache_attr_needed = TRUE;
8863 }
8864 }
8865
fe8ab488
A
8866 if (cntrl_flags & UPL_SET_INTERRUPTIBLE)
8867 interruptible = THREAD_ABORTSAFE;
8868 else
8869 interruptible = THREAD_UNINT;
b0d623f7 8870
55e303ae 8871 entry = 0;
2d21ac55
A
8872
8873 xfer_size = size;
8874 dst_offset = offset;
fe8ab488
A
8875 dw_count = 0;
8876
3e170ce0 8877 if (fast_path_full_req) {
fe8ab488 8878
3e170ce0
A
8879 if (vm_object_iopl_wire_full(object, upl, user_page_list, lite_list, cntrl_flags) == TRUE)
8880 goto finish;
8881 /*
8882 * we couldn't complete the processing of this request on the fast path
8883 * so fall through to the slow path and finish up
8884 */
fe8ab488 8885
3e170ce0 8886 } else if (fast_path_empty_req) {
fe8ab488 8887
3e170ce0
A
8888 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8889 ret = KERN_MEMORY_ERROR;
8890 goto return_err;
fe8ab488 8891 }
3e170ce0
A
8892 ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, &dst_offset, size_in_pages);
8893
8894 if (ret) {
8895 free_wired_pages = TRUE;
8896 goto return_err;
fe8ab488
A
8897 }
8898 goto finish;
8899 }
2d21ac55
A
8900
8901 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
8902 fault_info.user_tag = 0;
8903 fault_info.lo_offset = offset;
8904 fault_info.hi_offset = offset + xfer_size;
8905 fault_info.no_cache = FALSE;
b0d623f7 8906 fault_info.stealth = FALSE;
6d2010ae
A
8907 fault_info.io_sync = FALSE;
8908 fault_info.cs_bypass = FALSE;
fe8ab488
A
8909 fault_info.mark_zf_absent = TRUE;
8910 fault_info.interruptible = interruptible;
8911 fault_info.batch_pmap_op = TRUE;
b0d623f7
A
8912
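	/*
	 * Page-queue work (wiring, setting the reference bit) is batched:
	 * each page's operations are encoded into dw_array and flushed
	 * through vm_page_do_delayed_work() once dw_limit entries have
	 * accumulated, so the page-queue lock is taken per batch instead
	 * of per page.
	 */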
8913 dwp = &dw_array[0];
6d2010ae 8914 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
2d21ac55 8915
55e303ae 8916 while (xfer_size) {
2d21ac55 8917 vm_fault_return_t result;
b0d623f7
A
8918
8919 dwp->dw_mask = 0;
2d21ac55 8920
3e170ce0
A
8921 if (fast_path_full_req) {
8922 /*
8923 * if we get here, it means that we ran into a page
8924 * state we couldn't handle in the fast path and
8925 * bailed out to the slow path... since the order
8926 * we look at pages is different between the 2 paths,
8927 * the following check is needed to determine whether
8928 * this page was already processed in the fast path
8929 */
8930 if (lite_list[entry>>5] & (1 << (entry & 31)))
8931 goto skip_page;
8932 }
55e303ae
A
8933 dst_page = vm_page_lookup(object, dst_offset);
8934
91447636
A
8935 /*
8936 * ENCRYPTED SWAP:
8937 * If the page is encrypted, we need to decrypt it,
8938 * so force a soft page fault.
8939 */
b0d623f7
A
8940 if (dst_page == VM_PAGE_NULL ||
8941 dst_page->busy ||
8942 dst_page->encrypted ||
8943 dst_page->error ||
8944 dst_page->restart ||
8945 dst_page->absent ||
8946 dst_page->fictitious) {
8947
8948 if (object == kernel_object)
8949 panic("vm_object_iopl_request: missing/bad page in kernel object\n");
39236c6e
A
8950 if (object == compressor_object)
8951 panic("vm_object_iopl_request: missing/bad page in compressor object\n");
8952
8953 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
8954 ret = KERN_MEMORY_ERROR;
8955 goto return_err;
8956 }
fe8ab488 8957 set_cache_attr_needed = TRUE;
39236c6e
A
8958
8959 /*
8960 * We just looked up the page and the result remains valid
 8961 * until the object lock is released, so send it to
8962 * vm_fault_page() (as "dst_page"), to avoid having to
8963 * look it up again there.
8964 */
8965 caller_lookup = TRUE;
2d21ac55 8966
55e303ae
A
8967 do {
8968 vm_page_t top_page;
8969 kern_return_t error_code;
2d21ac55 8970
2d21ac55 8971 fault_info.cluster_size = xfer_size;
55e303ae 8972
b0d623f7
A
8973 vm_object_paging_begin(object);
8974
55e303ae 8975 result = vm_fault_page(object, dst_offset,
39236c6e
A
8976 prot | VM_PROT_WRITE, FALSE,
8977 caller_lookup,
2d21ac55
A
8978 &prot, &dst_page, &top_page,
8979 (int *)0,
8980 &error_code, no_zero_fill,
8981 FALSE, &fault_info);
8982
39236c6e
A
8983 /* our lookup is no longer valid at this point */
8984 caller_lookup = FALSE;
8985
2d21ac55
A
8986 switch (result) {
8987
55e303ae
A
8988 case VM_FAULT_SUCCESS:
8989
d41d1dae
A
8990 if ( !dst_page->absent) {
8991 PAGE_WAKEUP_DONE(dst_page);
8992 } else {
8993 /*
8994 * we only get back an absent page if we
8995 * requested that it not be zero-filled
8996 * because we are about to fill it via I/O
8997 *
8998 * absent pages should be left BUSY
8999 * to prevent them from being faulted
9000 * into an address space before we've
9001 * had a chance to complete the I/O on
9002 * them since they may contain info that
9003 * shouldn't be seen by the faulting task
9004 */
9005 }
55e303ae
A
9006 /*
9007 * Release paging references and
9008 * top-level placeholder page, if any.
9009 */
2d21ac55 9010 if (top_page != VM_PAGE_NULL) {
55e303ae 9011 vm_object_t local_object;
2d21ac55 9012
39037602
A
9013 local_object = VM_PAGE_OBJECT(top_page);
9014
9015 /*
9016 * comparing 2 packed pointers
9017 */
9018 if (top_page->vm_page_object != dst_page->vm_page_object) {
2d21ac55 9019 vm_object_lock(local_object);
55e303ae 9020 VM_PAGE_FREE(top_page);
2d21ac55
A
9021 vm_object_paging_end(local_object);
9022 vm_object_unlock(local_object);
55e303ae
A
9023 } else {
9024 VM_PAGE_FREE(top_page);
2d21ac55 9025 vm_object_paging_end(local_object);
55e303ae
A
9026 }
9027 }
b0d623f7 9028 vm_object_paging_end(object);
55e303ae
A
9029 break;
9030
55e303ae
A
9031 case VM_FAULT_RETRY:
9032 vm_object_lock(object);
55e303ae
A
9033 break;
9034
6d2010ae 9035 case VM_FAULT_MEMORY_SHORTAGE:
3e170ce0 9036 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
2d21ac55 9037
6d2010ae 9038 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
55e303ae 9039
55e303ae 9040 if (vm_page_wait(interruptible)) {
3e170ce0 9041 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
6d2010ae
A
9042
9043 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
55e303ae 9044 vm_object_lock(object);
6d2010ae 9045
55e303ae
A
9046 break;
9047 }
3e170ce0 9048 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
6d2010ae
A
9049
9050 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9051
55e303ae
A
9052 /* fall thru */
9053
9054 case VM_FAULT_INTERRUPTED:
9055 error_code = MACH_SEND_INTERRUPTED;
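				/* fall through: report the interruption as the error */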
9056 case VM_FAULT_MEMORY_ERROR:
b0d623f7 9057 memory_error:
2d21ac55 9058 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
0c530ab8 9059
2d21ac55 9060 vm_object_lock(object);
0c530ab8 9061 goto return_err;
b0d623f7
A
9062
9063 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9064 /* success but no page: fail */
9065 vm_object_paging_end(object);
9066 vm_object_unlock(object);
9067 goto memory_error;
9068
9069 default:
9070 panic("vm_object_iopl_request: unexpected error"
9071 " 0x%x from vm_fault_page()\n", result);
55e303ae 9072 }
2d21ac55 9073 } while (result != VM_FAULT_SUCCESS);
b0d623f7 9074
55e303ae 9075 }
39037602
A
9076 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9077
b0d623f7
A
9078 if (upl->flags & UPL_KERNEL_OBJECT)
9079 goto record_phys_addr;
9080
39037602 9081 if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
39236c6e
A
9082 dst_page->busy = TRUE;
9083 goto record_phys_addr;
9084 }
9085
b0d623f7
A
9086 if (dst_page->cleaning) {
9087 /*
316670eb 9088 * Someone else is cleaning this page in place.
b0d623f7
A
9089 * In theory, we should be able to proceed and use this
 9090 * page, but the cleaner will probably end up clearing the
 9091 * "busy" bit in upl_commit_range() even though it didn't set
 9092 * it, which would clear our "busy" bit and open
9093 * us to race conditions.
9094 * We'd better wait for the cleaning to complete and
9095 * then try again.
9096 */
9097 vm_object_iopl_request_sleep_for_cleaning++;
9098 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9099 continue;
9100 }
39037602 9101 if (dst_page->laundry)
316670eb 9102 vm_pageout_steal_laundry(dst_page, FALSE);
39037602 9103
0c530ab8 9104 if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
39037602 9105 phys_page >= (max_valid_dma_address >> PAGE_SHIFT) ) {
0c530ab8
A
9106 vm_page_t low_page;
9107 int refmod;
9108
9109 /*
9110 * support devices that can't DMA above 32 bits
9111 * by substituting pages from a pool of low address
9112 * memory for any pages we find above the 4G mark
9113 * can't substitute if the page is already wired because
9114 * we don't know whether that physical address has been
9115 * handed out to some other 64 bit capable DMA device to use
9116 */
b0d623f7 9117 if (VM_PAGE_WIRED(dst_page)) {
0c530ab8
A
9118 ret = KERN_PROTECTION_FAILURE;
9119 goto return_err;
9120 }
0c530ab8
A
9121 low_page = vm_page_grablo();
9122
9123 if (low_page == VM_PAGE_NULL) {
9124 ret = KERN_RESOURCE_SHORTAGE;
9125 goto return_err;
9126 }
9127 /*
9128 * from here until the vm_page_replace completes
 9129 * we mustn't drop the object lock... we don't
9130 * want anyone refaulting this page in and using
9131 * it after we disconnect it... we want the fault
9132 * to find the new page being substituted.
9133 */
2d21ac55 9134 if (dst_page->pmapped)
39037602 9135 refmod = pmap_disconnect(phys_page);
2d21ac55
A
9136 else
9137 refmod = 0;
d41d1dae 9138
6d2010ae 9139 if (!dst_page->absent)
d41d1dae 9140 vm_page_copy(dst_page, low_page);
2d21ac55 9141
0c530ab8
A
9142 low_page->reference = dst_page->reference;
9143 low_page->dirty = dst_page->dirty;
d41d1dae 9144 low_page->absent = dst_page->absent;
0c530ab8
A
9145
9146 if (refmod & VM_MEM_REFERENCED)
9147 low_page->reference = TRUE;
316670eb
A
9148 if (refmod & VM_MEM_MODIFIED) {
9149 SET_PAGE_DIRTY(low_page, FALSE);
9150 }
0c530ab8 9151
0c530ab8 9152 vm_page_replace(low_page, object, dst_offset);
0c530ab8
A
9153
9154 dst_page = low_page;
9155 /*
9156 * vm_page_grablo returned the page marked
9157 * BUSY... we don't need a PAGE_WAKEUP_DONE
9158 * here, because we've never dropped the object lock
9159 */
d41d1dae
A
9160 if ( !dst_page->absent)
9161 dst_page->busy = FALSE;
39037602
A
9162
9163 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
0c530ab8 9164 }
d41d1dae
A
9165 if ( !dst_page->busy)
9166 dwp->dw_mask |= DW_vm_page_wire;
55e303ae 9167
91447636
A
9168 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9169 /*
9170 * Mark the page "busy" to block any future page fault
6d2010ae
A
9171 * on this page in addition to wiring it.
9172 * We'll also remove the mapping
91447636
A
9173 * of all these pages before leaving this routine.
9174 */
9175 assert(!dst_page->fictitious);
9176 dst_page->busy = TRUE;
9177 }
2d21ac55
A
9178 /*
9179 * expect the page to be used
9180 * page queues lock must be held to set 'reference'
9181 */
b0d623f7 9182 dwp->dw_mask |= DW_set_reference;
55e303ae 9183
316670eb
A
9184 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9185 SET_PAGE_DIRTY(dst_page, TRUE);
9186 }
15129b1c 9187 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
39037602 9188 pmap_sync_page_attributes_phys(phys_page);
15129b1c
A
9189 dst_page->written_by_kernel = FALSE;
9190 }
9191
b0d623f7 9192record_phys_addr:
d41d1dae
A
9193 if (dst_page->busy)
9194 upl->flags |= UPL_HAS_BUSY;
9195
3e170ce0 9196 lite_list[entry>>5] |= 1 << (entry & 31);
55e303ae 9197
39037602
A
9198 if (phys_page > upl->highest_page)
9199 upl->highest_page = phys_page;
55e303ae 9200
2d21ac55 9201 if (user_page_list) {
39037602
A
9202 user_page_list[entry].phys_addr = phys_page;
9203 user_page_list[entry].free_when_done = dst_page->free_when_done;
2d21ac55 9204 user_page_list[entry].absent = dst_page->absent;
593a1d5f 9205 user_page_list[entry].dirty = dst_page->dirty;
2d21ac55 9206 user_page_list[entry].precious = dst_page->precious;
593a1d5f 9207 user_page_list[entry].device = FALSE;
316670eb 9208 user_page_list[entry].needed = FALSE;
2d21ac55 9209 if (dst_page->clustered == TRUE)
39037602 9210 user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
2d21ac55
A
9211 else
9212 user_page_list[entry].speculative = FALSE;
593a1d5f
A
9213 user_page_list[entry].cs_validated = dst_page->cs_validated;
9214 user_page_list[entry].cs_tainted = dst_page->cs_tainted;
c18c124e 9215 user_page_list[entry].cs_nx = dst_page->cs_nx;
3e170ce0 9216 user_page_list[entry].mark = FALSE;
55e303ae 9217 }
39236c6e 9218 if (object != kernel_object && object != compressor_object) {
b0d623f7
A
9219 /*
9220 * someone is explicitly grabbing this page...
9221 * update clustered and speculative state
9222 *
9223 */
fe8ab488
A
9224 if (dst_page->clustered)
9225 VM_PAGE_CONSUME_CLUSTERED(dst_page);
55e303ae 9226 }
3e170ce0 9227skip_page:
55e303ae
A
9228 entry++;
9229 dst_offset += PAGE_SIZE_64;
9230 xfer_size -= PAGE_SIZE;
b0d623f7
A
9231
9232 if (dwp->dw_mask) {
6d2010ae 9233 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
b0d623f7 9234
6d2010ae 9235 if (dw_count >= dw_limit) {
3e170ce0 9236 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
b0d623f7
A
9237
9238 dwp = &dw_array[0];
9239 dw_count = 0;
9240 }
9241 }
55e303ae 9242 }
3e170ce0 9243 assert(entry == size_in_pages);
55e303ae 9244
3e170ce0
A
9245 if (dw_count)
9246 vm_page_do_delayed_work(object, UPL_MEMORY_TAG(cntrl_flags), &dw_array[0], dw_count);
fe8ab488
A
9247finish:
9248 if (user_page_list && set_cache_attr_needed == TRUE)
3e170ce0 9249 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
316670eb 9250
2d21ac55
A
9251 if (page_list_count != NULL) {
9252 if (upl->flags & UPL_INTERNAL)
55e303ae 9253 *page_list_count = 0;
3e170ce0
A
9254 else if (*page_list_count > size_in_pages)
9255 *page_list_count = size_in_pages;
55e303ae 9256 }
55e303ae 9257 vm_object_unlock(object);
55e303ae 9258
91447636
A
9259 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9260 /*
9261 * We've marked all the pages "busy" so that future
9262 * page faults will block.
9263 * Now remove the mapping for these pages, so that they
9264 * can't be accessed without causing a page fault.
9265 */
9266 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9267 PMAP_NULL, 0, VM_PROT_NONE);
b0d623f7
A
9268 assert(!object->blocked_access);
9269 object->blocked_access = TRUE;
91447636 9270 }
3e170ce0 9271
91447636 9272 return KERN_SUCCESS;
0c530ab8 9273
0c530ab8 9274return_err:
b0d623f7 9275 dw_index = 0;
0c530ab8
A
9276
9277 for (; offset < dst_offset; offset += PAGE_SIZE) {
0b4c1975
A
9278 boolean_t need_unwire;
9279
0c530ab8
A
9280 dst_page = vm_page_lookup(object, offset);
9281
9282 if (dst_page == VM_PAGE_NULL)
d41d1dae 9283 panic("vm_object_iopl_request: Wired page missing. \n");
2d21ac55 9284
0b4c1975
A
9285 /*
9286 * if we've already processed this page in an earlier
9287 * dw_do_work, we need to undo the wiring... we will
9288 * leave the dirty and reference bits on if they
9289 * were set, since we don't have a good way of knowing
9290 * what the previous state was and we won't get here
9291 * under any normal circumstances... we will always
9292 * clear BUSY and wakeup any waiters via vm_page_free
9293 * or PAGE_WAKEUP_DONE
9294 */
9295 need_unwire = TRUE;
9296
b0d623f7
A
9297 if (dw_count) {
9298 if (dw_array[dw_index].dw_m == dst_page) {
0b4c1975
A
9299 /*
9300 * still in the deferred work list
9301 * which means we haven't yet called
9302 * vm_page_wire on this page
9303 */
9304 need_unwire = FALSE;
d41d1dae
A
9305
9306 dw_index++;
9307 dw_count--;
b0d623f7
A
9308 }
9309 }
0b4c1975
A
9310 vm_page_lock_queues();
9311
fe8ab488 9312 if (dst_page->absent || free_wired_pages == TRUE) {
d41d1dae 9313 vm_page_free(dst_page);
0b4c1975 9314
d41d1dae
A
9315 need_unwire = FALSE;
9316 } else {
9317 if (need_unwire == TRUE)
9318 vm_page_unwire(dst_page, TRUE);
0b4c1975 9319
0b4c1975 9320 PAGE_WAKEUP_DONE(dst_page);
6d2010ae 9321 }
0c530ab8 9322 vm_page_unlock_queues();
2d21ac55 9323
0b4c1975
A
9324 if (need_unwire == TRUE)
9325 VM_STAT_INCR(reactivations);
0c530ab8 9326 }
b0d623f7
A
9327#if UPL_DEBUG
9328 upl->upl_state = 2;
9329#endif
9330 if (! (upl->flags & UPL_KERNEL_OBJECT)) {
9331 vm_object_activity_end(object);
316670eb 9332 vm_object_collapse(object, 0, TRUE);
b0d623f7 9333 }
0c530ab8
A
9334 vm_object_unlock(object);
9335 upl_destroy(upl);
9336
9337 return ret;
1c79356b
A
9338}
9339
91447636
A
9340kern_return_t
9341upl_transpose(
9342 upl_t upl1,
9343 upl_t upl2)
1c79356b 9344{
91447636
A
9345 kern_return_t retval;
9346 boolean_t upls_locked;
9347 vm_object_t object1, object2;
1c79356b 9348
b0d623f7 9349 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR) || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
91447636
A
9350 return KERN_INVALID_ARGUMENT;
9351 }
9352
9353 upls_locked = FALSE;
1c79356b 9354
91447636
A
9355 /*
9356 * Since we need to lock both UPLs at the same time,
9357 * avoid deadlocks by always taking locks in the same order.
9358 */
9359 if (upl1 < upl2) {
9360 upl_lock(upl1);
9361 upl_lock(upl2);
9362 } else {
9363 upl_lock(upl2);
9364 upl_lock(upl1);
9365 }
9366 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9367
9368 object1 = upl1->map_object;
9369 object2 = upl2->map_object;
9370
9371 if (upl1->offset != 0 || upl2->offset != 0 ||
9372 upl1->size != upl2->size) {
9373 /*
9374 * We deal only with full objects, not subsets.
9375 * That's because we exchange the entire backing store info
9376 * for the objects: pager, resident pages, etc... We can't do
9377 * only part of it.
9378 */
9379 retval = KERN_INVALID_VALUE;
9380 goto done;
9381 }
9382
9383 /*
 9384 * Transpose the VM objects' backing store.
9385 */
9386 retval = vm_object_transpose(object1, object2,
9387 (vm_object_size_t) upl1->size);
9388
9389 if (retval == KERN_SUCCESS) {
9390 /*
9391 * Make each UPL point to the correct VM object, i.e. the
9392 * object holding the pages that the UPL refers to...
9393 */
fe8ab488
A
9394#if CONFIG_IOSCHED || UPL_DEBUG
9395 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9396 vm_object_lock(object1);
9397 vm_object_lock(object2);
9398 }
9399 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
9400 queue_remove(&object1->uplq, upl1, upl_t, uplq);
9401 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
9402 queue_remove(&object2->uplq, upl2, upl_t, uplq);
2d21ac55 9403#endif
91447636
A
9404 upl1->map_object = object2;
9405 upl2->map_object = object1;
fe8ab488
A
9406
9407#if CONFIG_IOSCHED || UPL_DEBUG
9408 if (upl1->flags & UPL_TRACKED_BY_OBJECT)
9409 queue_enter(&object2->uplq, upl1, upl_t, uplq);
9410 if (upl2->flags & UPL_TRACKED_BY_OBJECT)
9411 queue_enter(&object1->uplq, upl2, upl_t, uplq);
9412 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
9413 vm_object_unlock(object2);
9414 vm_object_unlock(object1);
9415 }
2d21ac55 9416#endif
91447636
A
9417 }
9418
9419done:
9420 /*
9421 * Cleanup.
9422 */
9423 if (upls_locked) {
9424 upl_unlock(upl1);
9425 upl_unlock(upl2);
9426 upls_locked = FALSE;
9427 }
9428
9429 return retval;
9430}
9431
316670eb
A
9432void
9433upl_range_needed(
9434 upl_t upl,
9435 int index,
9436 int count)
9437{
9438 upl_page_info_t *user_page_list;
9439 int size_in_pages;
9440
9441 if ( !(upl->flags & UPL_INTERNAL) || count <= 0)
9442 return;
9443
9444 size_in_pages = upl->size / PAGE_SIZE;
9445
9446 user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
9447
9448 while (count-- && index < size_in_pages)
9449 user_page_list[index++].needed = TRUE;
9450}
9451
9452
91447636
A
9453/*
9454 * ENCRYPTED SWAP:
9455 *
9456 * Rationale: the user might have some encrypted data on disk (via
9457 * FileVault or any other mechanism). That data is then decrypted in
9458 * memory, which is safe as long as the machine is secure. But that
9459 * decrypted data in memory could be paged out to disk by the default
 9460 * pager. The data would then be stored on disk in the clear (not encrypted)
9461 * and it could be accessed by anyone who gets physical access to the
9462 * disk (if the laptop or the disk gets stolen for example). This weakens
9463 * the security offered by FileVault.
9464 *
9465 * Solution: the default pager will optionally request that all the
9466 * pages it gathers for pageout be encrypted, via the UPL interfaces,
9467 * before it sends this UPL to disk via the vnode_pageout() path.
9468 *
9469 * Notes:
9470 *
9471 * To avoid disrupting the VM LRU algorithms, we want to keep the
9472 * clean-in-place mechanisms, which allow us to send some extra pages to
9473 * swap (clustering) without actually removing them from the user's
9474 * address space. We don't want the user to unknowingly access encrypted
9475 * data, so we have to actually remove the encrypted pages from the page
9476 * table. When the user accesses the data, the hardware will fail to
9477 * locate the virtual page in its page table and will trigger a page
9478 * fault. We can then decrypt the page and enter it in the page table
9479 * again. Whenever we allow the user to access the contents of a page,
9480 * we have to make sure it's not encrypted.
9481 *
9482 *
9483 */
9484/*
9485 * ENCRYPTED SWAP:
9486 * Reserve of virtual addresses in the kernel address space.
9487 * We need to map the physical pages in the kernel, so that we
9488 * can call the encryption/decryption routines with a kernel
9489 * virtual address. We keep this pool of pre-allocated kernel
9490 * virtual addresses so that we don't have to scan the kernel's
316670eb 9491 * virtual address space each time we need to encrypt or decrypt
91447636
A
9492 * a physical page.
9493 * It would be nice to be able to encrypt and decrypt in physical
9494 * mode but that might not always be more efficient...
9495 */
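/*
 * The pool is a simple array of in-use flags: slot i corresponds to
 * kernel virtual address (vm_paging_base_address + i * PAGE_SIZE), and
 * vm_paging_unmap_object() recovers the slot index from an address as
 * (start - vm_paging_base_address) >> PAGE_SHIFT.
 */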
9496decl_simple_lock_data(,vm_paging_lock)
9497#define VM_PAGING_NUM_PAGES 64
9498vm_map_offset_t vm_paging_base_address = 0;
9499boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
9500int vm_paging_max_index = 0;
2d21ac55
A
9501int vm_paging_page_waiter = 0;
9502int vm_paging_page_waiter_total = 0;
91447636
A
9503unsigned long vm_paging_no_kernel_page = 0;
9504unsigned long vm_paging_objects_mapped = 0;
9505unsigned long vm_paging_pages_mapped = 0;
9506unsigned long vm_paging_objects_mapped_slow = 0;
9507unsigned long vm_paging_pages_mapped_slow = 0;
9508
2d21ac55
A
9509void
9510vm_paging_map_init(void)
9511{
9512 kern_return_t kr;
9513 vm_map_offset_t page_map_offset;
9514 vm_map_entry_t map_entry;
9515
9516 assert(vm_paging_base_address == 0);
9517
9518 /*
9519 * Initialize our pool of pre-allocated kernel
9520 * virtual addresses.
9521 */
9522 page_map_offset = 0;
9523 kr = vm_map_find_space(kernel_map,
9524 &page_map_offset,
9525 VM_PAGING_NUM_PAGES * PAGE_SIZE,
9526 0,
9527 0,
9528 &map_entry);
9529 if (kr != KERN_SUCCESS) {
9530 panic("vm_paging_map_init: kernel_map full\n");
9531 }
3e170ce0
A
9532 VME_OBJECT_SET(map_entry, kernel_object);
9533 VME_OFFSET_SET(map_entry, page_map_offset);
6d2010ae
A
9534 map_entry->protection = VM_PROT_NONE;
9535 map_entry->max_protection = VM_PROT_NONE;
9536 map_entry->permanent = TRUE;
2d21ac55
A
9537 vm_object_reference(kernel_object);
9538 vm_map_unlock(kernel_map);
9539
9540 assert(vm_paging_base_address == 0);
9541 vm_paging_base_address = page_map_offset;
9542}
9543
91447636
A
9544/*
9545 * ENCRYPTED SWAP:
9546 * vm_paging_map_object:
9547 * Maps part of a VM object's pages in the kernel
9548 * virtual address space, using the pre-allocated
9549 * kernel virtual addresses, if possible.
9550 * Context:
9551 * The VM object is locked. This lock will get
2d21ac55
A
9552 * dropped and re-acquired though, so the caller
9553 * must make sure the VM object is kept alive
9554 * (by holding a VM map that has a reference
9555 * on it, for example, or taking an extra reference).
9556 * The page should also be kept busy to prevent
9557 * it from being reclaimed.
91447636
A
9558 */
9559kern_return_t
9560vm_paging_map_object(
91447636
A
9561 vm_page_t page,
9562 vm_object_t object,
9563 vm_object_offset_t offset,
593a1d5f 9564 vm_prot_t protection,
39236c6e
A
9565 boolean_t can_unlock_object,
9566 vm_map_size_t *size, /* IN/OUT */
9567 vm_map_offset_t *address, /* OUT */
9568 boolean_t *need_unmap) /* OUT */
91447636
A
9569{
9570 kern_return_t kr;
9571 vm_map_offset_t page_map_offset;
9572 vm_map_size_t map_size;
9573 vm_object_offset_t object_offset;
91447636 9574 int i;
91447636 9575
91447636 9576 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
39236c6e
A
9577 /* use permanent 1-to-1 kernel mapping of physical memory ? */
9578#if __x86_64__
9579 *address = (vm_map_offset_t)
39037602 9580 PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) <<
39236c6e
A
9581 PAGE_SHIFT);
9582 *need_unmap = FALSE;
9583 return KERN_SUCCESS;
9584#else
 9585#warning "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
9586#endif
9587
2d21ac55 9588 assert(page->busy);
91447636 9589 /*
91447636
A
9590 * Use one of the pre-allocated kernel virtual addresses
9591 * and just enter the VM page in the kernel address space
9592 * at that virtual address.
9593 */
91447636
A
9594 simple_lock(&vm_paging_lock);
9595
91447636
A
9596 /*
9597 * Try and find an available kernel virtual address
9598 * from our pre-allocated pool.
9599 */
9600 page_map_offset = 0;
2d21ac55
A
9601 for (;;) {
9602 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
9603 if (vm_paging_page_inuse[i] == FALSE) {
9604 page_map_offset =
9605 vm_paging_base_address +
9606 (i * PAGE_SIZE);
9607 break;
9608 }
9609 }
9610 if (page_map_offset != 0) {
9611 /* found a space to map our page ! */
9612 break;
9613 }
9614
9615 if (can_unlock_object) {
9616 /*
9617 * If we can afford to unlock the VM object,
9618 * let's take the slow path now...
9619 */
91447636
A
9620 break;
9621 }
2d21ac55
A
9622 /*
9623 * We can't afford to unlock the VM object, so
9624 * let's wait for a space to become available...
9625 */
9626 vm_paging_page_waiter_total++;
9627 vm_paging_page_waiter++;
fe8ab488
A
9628 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
9629 if (kr == THREAD_WAITING) {
9630 simple_unlock(&vm_paging_lock);
9631 kr = thread_block(THREAD_CONTINUE_NULL);
9632 simple_lock(&vm_paging_lock);
9633 }
2d21ac55
A
9634 vm_paging_page_waiter--;
9635 /* ... and try again */
91447636
A
9636 }
9637
9638 if (page_map_offset != 0) {
9639 /*
9640 * We found a kernel virtual address;
9641 * map the physical page to that virtual address.
9642 */
9643 if (i > vm_paging_max_index) {
9644 vm_paging_max_index = i;
9645 }
9646 vm_paging_page_inuse[i] = TRUE;
9647 simple_unlock(&vm_paging_lock);
2d21ac55 9648
2d21ac55
A
9649 page->pmapped = TRUE;
9650
9651 /*
9652 * Keep the VM object locked over the PMAP_ENTER
9653 * and the actual use of the page by the kernel,
9654 * or this pmap mapping might get undone by a
9655 * vm_object_pmap_protect() call...
9656 */
0c530ab8
A
9657 PMAP_ENTER(kernel_pmap,
9658 page_map_offset,
9659 page,
593a1d5f 9660 protection,
316670eb 9661 VM_PROT_NONE,
6d2010ae 9662 0,
0c530ab8 9663 TRUE);
91447636
A
9664 vm_paging_objects_mapped++;
9665 vm_paging_pages_mapped++;
9666 *address = page_map_offset;
39236c6e 9667 *need_unmap = TRUE;
91447636
A
9668
9669 /* all done and mapped, ready to use ! */
9670 return KERN_SUCCESS;
9671 }
9672
9673 /*
9674 * We ran out of pre-allocated kernel virtual
9675 * addresses. Just map the page in the kernel
9676 * the slow and regular way.
9677 */
9678 vm_paging_no_kernel_page++;
9679 simple_unlock(&vm_paging_lock);
2d21ac55
A
9680 }
9681
9682 if (! can_unlock_object) {
39236c6e
A
9683 *address = 0;
9684 *size = 0;
9685 *need_unmap = FALSE;
2d21ac55 9686 return KERN_NOT_SUPPORTED;
91447636 9687 }
91447636
A
9688
9689 object_offset = vm_object_trunc_page(offset);
39236c6e
A
9690 map_size = vm_map_round_page(*size,
9691 VM_MAP_PAGE_MASK(kernel_map));
91447636
A
9692
9693 /*
9694 * Try and map the required range of the object
9695 * in the kernel_map
9696 */
9697
91447636
A
9698 vm_object_reference_locked(object); /* for the map entry */
9699 vm_object_unlock(object);
9700
9701 kr = vm_map_enter(kernel_map,
9702 address,
9703 map_size,
9704 0,
9705 VM_FLAGS_ANYWHERE,
9706 object,
9707 object_offset,
9708 FALSE,
593a1d5f 9709 protection,
91447636
A
9710 VM_PROT_ALL,
9711 VM_INHERIT_NONE);
9712 if (kr != KERN_SUCCESS) {
9713 *address = 0;
9714 *size = 0;
39236c6e 9715 *need_unmap = FALSE;
91447636 9716 vm_object_deallocate(object); /* for the map entry */
2d21ac55 9717 vm_object_lock(object);
91447636
A
9718 return kr;
9719 }
9720
9721 *size = map_size;
9722
9723 /*
9724 * Enter the mapped pages in the page table now.
9725 */
9726 vm_object_lock(object);
2d21ac55
A
9727 /*
9728 * VM object must be kept locked from before PMAP_ENTER()
9729 * until after the kernel is done accessing the page(s).
9730 * Otherwise, the pmap mappings in the kernel could be
9731 * undone by a call to vm_object_pmap_protect().
9732 */
9733
91447636
A
9734 for (page_map_offset = 0;
9735 map_size != 0;
9736 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
91447636
A
9737
9738 page = vm_page_lookup(object, offset + page_map_offset);
9739 if (page == VM_PAGE_NULL) {
2d21ac55
A
9740 printf("vm_paging_map_object: no page !?");
9741 vm_object_unlock(object);
9742 kr = vm_map_remove(kernel_map, *address, *size,
9743 VM_MAP_NO_FLAGS);
9744 assert(kr == KERN_SUCCESS);
9745 *address = 0;
9746 *size = 0;
39236c6e 9747 *need_unmap = FALSE;
2d21ac55
A
9748 vm_object_lock(object);
9749 return KERN_MEMORY_ERROR;
91447636 9750 }
2d21ac55 9751 page->pmapped = TRUE;
91447636 9752
39037602 9753 //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
91447636
A
9754 PMAP_ENTER(kernel_pmap,
9755 *address + page_map_offset,
9756 page,
593a1d5f 9757 protection,
316670eb 9758 VM_PROT_NONE,
6d2010ae 9759 0,
0c530ab8 9760 TRUE);
91447636
A
9761 }
9762
9763 vm_paging_objects_mapped_slow++;
b0d623f7 9764 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
91447636 9765
39236c6e
A
9766 *need_unmap = TRUE;
9767
91447636
A
9768 return KERN_SUCCESS;
9769}
9770
9771/*
9772 * ENCRYPTED SWAP:
9773 * vm_paging_unmap_object:
9774 * Unmaps part of a VM object's pages from the kernel
9775 * virtual address space.
9776 * Context:
9777 * The VM object is locked. This lock will get
9778 * dropped and re-acquired though.
9779 */
9780void
9781vm_paging_unmap_object(
9782 vm_object_t object,
9783 vm_map_offset_t start,
9784 vm_map_offset_t end)
9785{
9786 kern_return_t kr;
91447636 9787 int i;
91447636 9788
0c530ab8 9789 if ((vm_paging_base_address == 0) ||
8f6c56a5
A
9790 (start < vm_paging_base_address) ||
9791 (end > (vm_paging_base_address
2d21ac55 9792 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
91447636
A
9793 /*
9794 * We didn't use our pre-allocated pool of
 9795 * kernel virtual addresses. Deallocate the
9796 * virtual memory.
9797 */
9798 if (object != VM_OBJECT_NULL) {
9799 vm_object_unlock(object);
9800 }
9801 kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
9802 if (object != VM_OBJECT_NULL) {
9803 vm_object_lock(object);
9804 }
9805 assert(kr == KERN_SUCCESS);
9806 } else {
9807 /*
9808 * We used a kernel virtual address from our
9809 * pre-allocated pool. Put it back in the pool
9810 * for next time.
9811 */
91447636 9812 assert(end - start == PAGE_SIZE);
b0d623f7
A
9813 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
9814 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
91447636
A
9815
9816 /* undo the pmap mapping */
0c530ab8 9817 pmap_remove(kernel_pmap, start, end);
91447636
A
9818
9819 simple_lock(&vm_paging_lock);
9820 vm_paging_page_inuse[i] = FALSE;
2d21ac55
A
9821 if (vm_paging_page_waiter) {
9822 thread_wakeup(&vm_paging_page_waiter);
9823 }
91447636 9824 simple_unlock(&vm_paging_lock);
91447636
A
9825 }
9826}
9827
fe8ab488 9828#if ENCRYPTED_SWAP
91447636
A
9829/*
9830 * Encryption data.
9831 * "iv" is the "initial vector". Ideally, we want to
9832 * have a different one for each page we encrypt, so that
9833 * crackers can't find encryption patterns too easily.
9834 */
9835#define SWAP_CRYPT_AES_KEY_SIZE 128 /* XXX 192 and 256 don't work ! */
9836boolean_t swap_crypt_ctx_initialized = FALSE;
316670eb 9837uint32_t swap_crypt_key[8]; /* big enough for a 256 key */
91447636
A
9838aes_ctx swap_crypt_ctx;
9839const unsigned char swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
9840
9841#if DEBUG
9842boolean_t swap_crypt_ctx_tested = FALSE;
9843unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
9844unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
9845unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
9846#endif /* DEBUG */
9847
91447636
A
9848/*
9849 * Initialize the encryption context: key and key size.
9850 */
9851void swap_crypt_ctx_initialize(void); /* forward */
9852void
9853swap_crypt_ctx_initialize(void)
9854{
9855 unsigned int i;
9856
9857 /*
9858 * No need for locking to protect swap_crypt_ctx_initialized
9859 * because the first use of encryption will come from the
9860 * pageout thread (we won't pagein before there's been a pageout)
9861 * and there's only one pageout thread.
9862 */
9863 if (swap_crypt_ctx_initialized == FALSE) {
9864 for (i = 0;
9865 i < (sizeof (swap_crypt_key) /
9866 sizeof (swap_crypt_key[0]));
9867 i++) {
9868 swap_crypt_key[i] = random();
9869 }
9870 aes_encrypt_key((const unsigned char *) swap_crypt_key,
9871 SWAP_CRYPT_AES_KEY_SIZE,
9872 &swap_crypt_ctx.encrypt);
9873 aes_decrypt_key((const unsigned char *) swap_crypt_key,
9874 SWAP_CRYPT_AES_KEY_SIZE,
9875 &swap_crypt_ctx.decrypt);
9876 swap_crypt_ctx_initialized = TRUE;
9877 }
9878
9879#if DEBUG
9880 /*
9881 * Validate the encryption algorithms.
9882 */
9883 if (swap_crypt_ctx_tested == FALSE) {
9884 /* initialize */
9885 for (i = 0; i < 4096; i++) {
9886 swap_crypt_test_page_ref[i] = (char) i;
9887 }
9888 /* encrypt */
9889 aes_encrypt_cbc(swap_crypt_test_page_ref,
9890 swap_crypt_null_iv,
9891 PAGE_SIZE / AES_BLOCK_SIZE,
9892 swap_crypt_test_page_encrypt,
9893 &swap_crypt_ctx.encrypt);
9894 /* decrypt */
9895 aes_decrypt_cbc(swap_crypt_test_page_encrypt,
9896 swap_crypt_null_iv,
9897 PAGE_SIZE / AES_BLOCK_SIZE,
9898 swap_crypt_test_page_decrypt,
9899 &swap_crypt_ctx.decrypt);
9900 /* compare result with original */
9901 for (i = 0; i < 4096; i ++) {
9902 if (swap_crypt_test_page_decrypt[i] !=
9903 swap_crypt_test_page_ref[i]) {
9904 panic("encryption test failed");
9905 }
9906 }
9907
9908 /* encrypt again */
9909 aes_encrypt_cbc(swap_crypt_test_page_decrypt,
9910 swap_crypt_null_iv,
9911 PAGE_SIZE / AES_BLOCK_SIZE,
9912 swap_crypt_test_page_decrypt,
9913 &swap_crypt_ctx.encrypt);
9914 /* decrypt in place */
9915 aes_decrypt_cbc(swap_crypt_test_page_decrypt,
9916 swap_crypt_null_iv,
9917 PAGE_SIZE / AES_BLOCK_SIZE,
9918 swap_crypt_test_page_decrypt,
9919 &swap_crypt_ctx.decrypt);
9920 for (i = 0; i < 4096; i ++) {
9921 if (swap_crypt_test_page_decrypt[i] !=
9922 swap_crypt_test_page_ref[i]) {
9923 panic("in place encryption test failed");
9924 }
9925 }
9926
9927 swap_crypt_ctx_tested = TRUE;
9928 }
9929#endif /* DEBUG */
9930}
9931
9932/*
9933 * ENCRYPTED SWAP:
9934 * vm_page_encrypt:
9935 * Encrypt the given page, for secure paging.
9936 * The page might already be mapped at kernel virtual
9937 * address "kernel_mapping_offset". Otherwise, we need
9938 * to map it.
9939 *
9940 * Context:
9941 * The page's object is locked, but this lock will be released
9942 * and re-acquired.
9943 * The page is busy and not accessible by users (not entered in any pmap).
9944 */
9945void
9946vm_page_encrypt(
9947 vm_page_t page,
9948 vm_map_offset_t kernel_mapping_offset)
9949{
91447636 9950 kern_return_t kr;
91447636 9951 vm_map_size_t kernel_mapping_size;
39236c6e 9952 boolean_t kernel_mapping_needs_unmap;
91447636 9953 vm_offset_t kernel_vaddr;
39037602 9954 vm_object_t page_object;
91447636
A
9955 union {
9956 unsigned char aes_iv[AES_BLOCK_SIZE];
9957 struct {
9958 memory_object_t pager_object;
9959 vm_object_offset_t paging_offset;
9960 } vm;
9961 } encrypt_iv;
9962
9963 if (! vm_pages_encrypted) {
9964 vm_pages_encrypted = TRUE;
9965 }
9966
9967 assert(page->busy);
91447636
A
9968
9969 if (page->encrypted) {
9970 /*
9971 * Already encrypted: no need to do it again.
9972 */
9973 vm_page_encrypt_already_encrypted_counter++;
9974 return;
9975 }
316670eb
A
9976 assert(page->dirty || page->precious);
9977
91447636
A
9978 ASSERT_PAGE_DECRYPTED(page);
9979
39037602
A
9980 page_object = VM_PAGE_OBJECT(page);
9981
91447636 9982 /*
2d21ac55
A
9983 * Take a paging-in-progress reference to keep the object
9984 * alive even if we have to unlock it (in vm_paging_map_object()
9985 * for example)...
91447636 9986 */
39037602 9987 vm_object_paging_begin(page_object);
91447636
A
9988
9989 if (kernel_mapping_offset == 0) {
9990 /*
9991 * The page hasn't already been mapped in kernel space
9992 * by the caller. Map it now, so that we can access
9993 * its contents and encrypt them.
9994 */
9995 kernel_mapping_size = PAGE_SIZE;
39236c6e
A
9996 kernel_mapping_needs_unmap = FALSE;
9997 kr = vm_paging_map_object(page,
39037602 9998 page_object,
91447636 9999 page->offset,
593a1d5f 10000 VM_PROT_READ | VM_PROT_WRITE,
39236c6e
A
10001 FALSE,
10002 &kernel_mapping_size,
10003 &kernel_mapping_offset,
10004 &kernel_mapping_needs_unmap);
91447636
A
10005 if (kr != KERN_SUCCESS) {
10006 panic("vm_page_encrypt: "
10007 "could not map page in kernel: 0x%x\n",
10008 kr);
10009 }
10010 } else {
10011 kernel_mapping_size = 0;
39236c6e 10012 kernel_mapping_needs_unmap = FALSE;
91447636
A
10013 }
10014 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10015
10016 if (swap_crypt_ctx_initialized == FALSE) {
10017 swap_crypt_ctx_initialize();
10018 }
10019 assert(swap_crypt_ctx_initialized);
10020
10021 /*
10022 * Prepare an "initial vector" for the encryption.
10023 * We use the "pager" and the "paging_offset" for that
10024 * page to obfuscate the encrypted data a bit more and
10025 * prevent crackers from finding patterns that they could
10026 * use to break the key.
10027 */
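	/*
	 * Concretely: the (pager, paging_offset) pair is placed in the IV
	 * buffer and run through AES-CBC under the constant null IV, so
	 * each (object, offset) gets its own IV and identical plaintext
	 * pages don't encrypt to identical ciphertext.
	 */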
10028 bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
39037602 10029 encrypt_iv.vm.pager_object = page_object->pager;
91447636 10030 encrypt_iv.vm.paging_offset =
39037602 10031 page_object->paging_offset + page->offset;
91447636 10032
91447636
A
10033 /* encrypt the "initial vector" */
10034 aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
10035 swap_crypt_null_iv,
10036 1,
10037 &encrypt_iv.aes_iv[0],
10038 &swap_crypt_ctx.encrypt);
10039
10040 /*
10041 * Encrypt the page.
10042 */
10043 aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
10044 &encrypt_iv.aes_iv[0],
10045 PAGE_SIZE / AES_BLOCK_SIZE,
10046 (unsigned char *) kernel_vaddr,
10047 &swap_crypt_ctx.encrypt);
10048
10049 vm_page_encrypt_counter++;
10050
91447636
A
10051 /*
10052 * Unmap the page from the kernel's address space,
10053 * if we had to map it ourselves. Otherwise, let
10054 * the caller undo the mapping if needed.
10055 */
39236c6e 10056 if (kernel_mapping_needs_unmap) {
39037602 10057 vm_paging_unmap_object(page_object,
91447636
A
10058 kernel_mapping_offset,
10059 kernel_mapping_offset + kernel_mapping_size);
10060 }
10061
10062 /*
2d21ac55 10063 * Clear the "reference" and "modified" bits.
91447636
A
10064 * This should clean up any impact the encryption had
10065 * on them.
2d21ac55
A
10066 * The page was kept busy and disconnected from all pmaps,
10067 * so it can't have been referenced or modified from user
10068 * space.
10069 * The software bits will be reset later after the I/O
10070 * has completed (in upl_commit_range()).
91447636 10071 */
39037602 10072 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_REFERENCED | VM_MEM_MODIFIED);
91447636
A
10073
10074 page->encrypted = TRUE;
2d21ac55 10075
39037602 10076 vm_object_paging_end(page_object);
91447636
A
10077}
10078
10079/*
10080 * ENCRYPTED SWAP:
10081 * vm_page_decrypt:
10082 * Decrypt the given page.
10083 * The page might already be mapped at kernel virtual
10084 * address "kernel_mapping_offset". Otherwise, we need
10085 * to map it.
10086 *
10087 * Context:
10088 * The page's VM object is locked but will be unlocked and relocked.
10089 * The page is busy and not accessible by users (not entered in any pmap).
10090 */
10091void
10092vm_page_decrypt(
10093 vm_page_t page,
10094 vm_map_offset_t kernel_mapping_offset)
10095{
91447636
A
10096 kern_return_t kr;
10097 vm_map_size_t kernel_mapping_size;
10098 vm_offset_t kernel_vaddr;
39236c6e 10099 boolean_t kernel_mapping_needs_unmap;
39037602 10100 vm_object_t page_object;
91447636
A
10101 union {
10102 unsigned char aes_iv[AES_BLOCK_SIZE];
10103 struct {
10104 memory_object_t pager_object;
10105 vm_object_offset_t paging_offset;
10106 } vm;
10107 } decrypt_iv;
6d2010ae 10108 boolean_t was_dirty;
91447636
A
10109
10110 assert(page->busy);
10111 assert(page->encrypted);
10112
39037602 10113 page_object = VM_PAGE_OBJECT(page);
6d2010ae
A
10114 was_dirty = page->dirty;
10115
91447636 10116 /*
2d21ac55
A
10117 * Take a paging-in-progress reference to keep the object
10118 * alive even if we have to unlock it (in vm_paging_map_object()
10119 * for example)...
91447636 10120 */
39037602 10121 vm_object_paging_begin(page_object);
91447636
A
10122
10123 if (kernel_mapping_offset == 0) {
10124 /*
10125 * The page hasn't already been mapped in kernel space
10126 * by the caller. Map it now, so that we can access
10127 * its contents and decrypt them.
10128 */
10129 kernel_mapping_size = PAGE_SIZE;
39236c6e
A
10130 kernel_mapping_needs_unmap = FALSE;
10131 kr = vm_paging_map_object(page,
39037602 10132 page_object,
91447636 10133 page->offset,
593a1d5f 10134 VM_PROT_READ | VM_PROT_WRITE,
39236c6e
A
10135 FALSE,
10136 &kernel_mapping_size,
10137 &kernel_mapping_offset,
10138 &kernel_mapping_needs_unmap);
91447636
A
10139 if (kr != KERN_SUCCESS) {
10140 panic("vm_page_decrypt: "
2d21ac55
A
10141 "could not map page in kernel: 0x%x\n",
10142 kr);
91447636
A
10143 }
10144 } else {
10145 kernel_mapping_size = 0;
39236c6e 10146 kernel_mapping_needs_unmap = FALSE;
91447636
A
10147 }
10148 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10149
10150 assert(swap_crypt_ctx_initialized);
10151
10152 /*
10153 * Prepare an "initial vector" for the decryption.
10154 * It has to be the same as the "initial vector" we
10155 * used to encrypt that page.
10156 */
10157 bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
39037602 10158 decrypt_iv.vm.pager_object = page_object->pager;
91447636 10159 decrypt_iv.vm.paging_offset =
39037602 10160 page_object->paging_offset + page->offset;
91447636 10161
91447636
A
10162 /* encrypt the "initial vector" */
10163 aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
10164 swap_crypt_null_iv,
10165 1,
10166 &decrypt_iv.aes_iv[0],
10167 &swap_crypt_ctx.encrypt);
10168
10169 /*
10170 * Decrypt the page.
10171 */
10172 aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
10173 &decrypt_iv.aes_iv[0],
10174 PAGE_SIZE / AES_BLOCK_SIZE,
10175 (unsigned char *) kernel_vaddr,
10176 &swap_crypt_ctx.decrypt);
10177 vm_page_decrypt_counter++;
10178
91447636
A
10179 /*
10180 * Unmap the page from the kernel's address space,
10181 * if we had to map it ourselves. Otherwise, let
10182 * the caller undo the mapping if needed.
10183 */
39236c6e 10184 if (kernel_mapping_needs_unmap) {
39037602 10185 vm_paging_unmap_object(page_object,
91447636
A
10186 kernel_vaddr,
10187 kernel_vaddr + PAGE_SIZE);
10188 }
10189
6d2010ae
A
10190 if (was_dirty) {
10191 /*
10192 * The pager did not specify that the page would be
10193 * clean when it got paged in, so let's not clean it here
10194 * either.
10195 */
10196 } else {
10197 /*
10198 * After decryption, the page is actually still clean.
10199 * It was encrypted as part of paging, which "cleans"
10200 * the "dirty" pages.
10201 * No one could access it after it was encrypted
10202 * and the decryption doesn't count.
10203 */
10204 page->dirty = FALSE;
10205 assert (page->cs_validated == FALSE);
39037602 10206 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
6d2010ae 10207 }
91447636
A
10208 page->encrypted = FALSE;
10209
10210 /*
10211 * We've just modified the page's contents via the data cache and part
10212 * of the new contents might still be in the cache and not yet in RAM.
10213 * Since the page is now available and might get gathered in a UPL to
10214 * be part of a DMA transfer from a driver that expects the memory to
10215 * be coherent at this point, we have to flush the data cache.
10216 */
39037602 10217 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(page));
91447636
A
10218 /*
10219 * Since the page is not mapped yet, some code might assume that it
10220 * doesn't need to invalidate the instruction cache when writing to
2d21ac55
A
10221 * that page. That code relies on "pmapped" being FALSE, so that the
10222 * caches get synchronized when the page is first mapped.
91447636 10223 */
39037602 10224 assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
2d21ac55 10225 page->pmapped = FALSE;
4a3eedf9 10226 page->wpmapped = FALSE;
2d21ac55 10227
39037602 10228 vm_object_paging_end(page_object);
91447636
A
10229}
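/*
 * Illustrative sketch (not part of the original source): how the per-page
 * "initial vector" used by vm_page_encrypt() and vm_page_decrypt() above is
 * derived.  The IV is never stored; both paths recompute it from the page's
 * paging identity (pager object, paging offset) by AES-encrypting that pair
 * under the swap key with a null IV, so encryption and decryption always
 * agree.  The helper name swap_crypt_derive_iv() is hypothetical; it assumes
 * the same swap_crypt_ctx and swap_crypt_null_iv globals used above.
 */
static void
swap_crypt_derive_iv(
	memory_object_t		pager_object,
	vm_object_offset_t	paging_offset,
	unsigned char		iv_out[AES_BLOCK_SIZE])
{
	union {
		unsigned char	aes_iv[AES_BLOCK_SIZE];
		struct {
			memory_object_t		pager_object;
			vm_object_offset_t	paging_offset;
		} vm;
	} iv;

	/* seed the IV with the page's paging identity */
	bzero(&iv.aes_iv[0], sizeof (iv.aes_iv));
	iv.vm.pager_object = pager_object;
	iv.vm.paging_offset = paging_offset;

	/* encrypt one AES block with a null IV: same transform as above */
	aes_encrypt_cbc((const unsigned char *) &iv.aes_iv[0],
			swap_crypt_null_iv,
			1,
			&iv.aes_iv[0],
			&swap_crypt_ctx.encrypt);

	memcpy(iv_out, &iv.aes_iv[0], AES_BLOCK_SIZE);
}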
10230
b0d623f7 10231#if DEVELOPMENT || DEBUG
91447636
A
10232unsigned long upl_encrypt_upls = 0;
10233unsigned long upl_encrypt_pages = 0;
b0d623f7 10234#endif
91447636
A
10235
10236/*
10237 * ENCRYPTED SWAP:
10238 *
10239 * upl_encrypt:
10240 * Encrypts all the pages in the UPL, within the specified range.
10241 *
10242 */
10243void
10244upl_encrypt(
10245 upl_t upl,
10246 upl_offset_t crypt_offset,
10247 upl_size_t crypt_size)
10248{
b0d623f7
A
10249 upl_size_t upl_size, subupl_size=crypt_size;
10250 upl_offset_t offset_in_upl, subupl_offset=crypt_offset;
91447636 10251 vm_object_t upl_object;
b0d623f7 10252 vm_object_offset_t upl_offset;
91447636
A
10253 vm_page_t page;
10254 vm_object_t shadow_object;
10255 vm_object_offset_t shadow_offset;
10256 vm_object_offset_t paging_offset;
10257 vm_object_offset_t base_offset;
b0d623f7
A
10258 int isVectorUPL = 0;
10259 upl_t vector_upl = NULL;
10260
10261 if((isVectorUPL = vector_upl_is_valid(upl)))
10262 vector_upl = upl;
10263
10264process_upl_to_encrypt:
10265 if(isVectorUPL) {
10266 crypt_size = subupl_size;
10267 crypt_offset = subupl_offset;
10268 upl = vector_upl_subupl_byoffset(vector_upl, &crypt_offset, &crypt_size);
10269 if(upl == NULL)
10270 panic("upl_encrypt: Accessing a sub-upl that doesn't exist\n");
10271 subupl_size -= crypt_size;
10272 subupl_offset += crypt_size;
10273 }
91447636 10274
b0d623f7 10275#if DEVELOPMENT || DEBUG
91447636
A
10276 upl_encrypt_upls++;
10277 upl_encrypt_pages += crypt_size / PAGE_SIZE;
b0d623f7 10278#endif
91447636
A
10279 upl_object = upl->map_object;
10280 upl_offset = upl->offset;
10281 upl_size = upl->size;
10282
91447636
A
10283 vm_object_lock(upl_object);
10284
10285 /*
10286 * Find the VM object that contains the actual pages.
10287 */
10288 if (upl_object->pageout) {
10289 shadow_object = upl_object->shadow;
10290 /*
10291 * The offset in the shadow object is actually also
10292 * accounted for in upl->offset. It possibly shouldn't be
10293 * this way, but for now don't account for it twice.
10294 */
10295 shadow_offset = 0;
10296 assert(upl_object->paging_offset == 0); /* XXX ? */
10297 vm_object_lock(shadow_object);
10298 } else {
10299 shadow_object = upl_object;
10300 shadow_offset = 0;
10301 }
10302
10303 paging_offset = shadow_object->paging_offset;
10304 vm_object_paging_begin(shadow_object);
10305
2d21ac55
A
10306 if (shadow_object != upl_object)
10307 vm_object_unlock(upl_object);
10308
91447636
A
10309
10310 base_offset = shadow_offset;
10311 base_offset += upl_offset;
10312 base_offset += crypt_offset;
10313 base_offset -= paging_offset;
91447636 10314
2d21ac55 10315 assert(crypt_offset + crypt_size <= upl_size);
91447636 10316
b0d623f7
A
10317 for (offset_in_upl = 0;
10318 offset_in_upl < crypt_size;
10319 offset_in_upl += PAGE_SIZE) {
91447636 10320 page = vm_page_lookup(shadow_object,
b0d623f7 10321 base_offset + offset_in_upl);
91447636
A
10322 if (page == VM_PAGE_NULL) {
10323 panic("upl_encrypt: "
6d2010ae 10324 "no page for (obj=%p,off=0x%llx+0x%x)!\n",
91447636
A
10325 shadow_object,
10326 base_offset,
b0d623f7 10327 offset_in_upl);
91447636 10328 }
2d21ac55
A
10329 /*
10330 * Disconnect the page from all pmaps, so that nobody can
10331 * access it while it's encrypted. After that point, all
10332 * accesses to this page will cause a page fault and block
10333 * while the page is busy being encrypted. After the
10334 * encryption completes, any access will cause a
10335 * page fault and the page gets decrypted at that time.
10336 */
39037602 10337 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(page));
91447636 10338 vm_page_encrypt(page, 0);
2d21ac55 10339
b0d623f7 10340 if (vm_object_lock_avoid(shadow_object)) {
2d21ac55
A
10341 /*
10342 * Give vm_pageout_scan() a chance to convert more
10343 * pages from "clean-in-place" to "clean-and-free",
10344 * if it's interested in the same pages we selected
10345 * in this cluster.
10346 */
10347 vm_object_unlock(shadow_object);
b0d623f7 10348 mutex_pause(2);
2d21ac55
A
10349 vm_object_lock(shadow_object);
10350 }
91447636
A
10351 }
10352
10353 vm_object_paging_end(shadow_object);
10354 vm_object_unlock(shadow_object);
b0d623f7
A
10355
10356 if(isVectorUPL && subupl_size)
10357 goto process_upl_to_encrypt;
91447636
A
10358}
10359
fe8ab488 10360#else /* ENCRYPTED_SWAP */
2d21ac55
A
10361void
10362upl_encrypt(
10363 __unused upl_t upl,
10364 __unused upl_offset_t crypt_offset,
10365 __unused upl_size_t crypt_size)
10366{
10367}
10368
10369void
10370vm_page_encrypt(
10371 __unused vm_page_t page,
10372 __unused vm_map_offset_t kernel_mapping_offset)
10373{
10374}
10375
10376void
10377vm_page_decrypt(
10378 __unused vm_page_t page,
10379 __unused vm_map_offset_t kernel_mapping_offset)
10380{
10381}
10382
fe8ab488 10383#endif /* ENCRYPTED_SWAP */
2d21ac55 10384
316670eb
A
10385/*
10386 * page->object must be locked
10387 */
b0d623f7 10388void
316670eb 10389vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
b0d623f7 10390{
b0d623f7
A
10391 if (!queues_locked) {
10392 vm_page_lockspin_queues();
10393 }
10394
39037602 10395 page->free_when_done = FALSE;
b0d623f7
A
10396 /*
10397 * need to drop the laundry count...
10398 * we may also need to remove it
10399 * from the I/O paging queue...
10400 * vm_pageout_throttle_up handles both cases
10401 *
10402 * the laundry and pageout_queue flags are cleared...
10403 */
10404 vm_pageout_throttle_up(page);
b0d623f7
A
10405
10406 vm_page_steal_pageout_page++;
10407
10408 if (!queues_locked) {
10409 vm_page_unlock_queues();
10410 }
10411}
10412
10413upl_t
10414vector_upl_create(vm_offset_t upl_offset)
10415{
10416 int vector_upl_size = sizeof(struct _vector_upl);
10417 int i=0;
10418 upl_t upl;
10419 vector_upl_t vector_upl = (vector_upl_t)kalloc(vector_upl_size);
10420
10421 upl = upl_create(0,UPL_VECTOR,0);
10422 upl->vector_upl = vector_upl;
10423 upl->offset = upl_offset;
10424 vector_upl->size = 0;
10425 vector_upl->offset = upl_offset;
10426 vector_upl->invalid_upls=0;
10427 vector_upl->num_upls=0;
10428 vector_upl->pagelist = NULL;
10429
10430 for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
10431 vector_upl->upl_iostates[i].size = 0;
10432 vector_upl->upl_iostates[i].offset = 0;
10433
10434 }
10435 return upl;
10436}
10437
10438void
10439vector_upl_deallocate(upl_t upl)
10440{
10441 if(upl) {
10442 vector_upl_t vector_upl = upl->vector_upl;
10443 if(vector_upl) {
10444 if(vector_upl->invalid_upls != vector_upl->num_upls)
10445 panic("Deallocating non-empty Vectored UPL\n");
10446 kfree(vector_upl->pagelist,(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)));
10447 vector_upl->invalid_upls=0;
10448 vector_upl->num_upls = 0;
10449 vector_upl->pagelist = NULL;
10450 vector_upl->size = 0;
10451 vector_upl->offset = 0;
10452 kfree(vector_upl, sizeof(struct _vector_upl));
316670eb 10453 vector_upl = (vector_upl_t)0xfeedfeed;
b0d623f7
A
10454 }
10455 else
10456 panic("vector_upl_deallocate was passed a non-vectored upl\n");
10457 }
10458 else
10459 panic("vector_upl_deallocate was passed a NULL upl\n");
10460}
10461
10462boolean_t
10463vector_upl_is_valid(upl_t upl)
10464{
10465 if(upl && ((upl->flags & UPL_VECTOR)==UPL_VECTOR)) {
10466 vector_upl_t vector_upl = upl->vector_upl;
316670eb 10467 if(vector_upl == NULL || vector_upl == (vector_upl_t)0xfeedfeed || vector_upl == (vector_upl_t)0xfeedbeef)
b0d623f7
A
10468 return FALSE;
10469 else
10470 return TRUE;
10471 }
10472 return FALSE;
10473}
10474
10475boolean_t
10476vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
10477{
10478 if(vector_upl_is_valid(upl)) {
10479 vector_upl_t vector_upl = upl->vector_upl;
10480
10481 if(vector_upl) {
10482 if(subupl) {
10483 if(io_size) {
10484 if(io_size < PAGE_SIZE)
10485 io_size = PAGE_SIZE;
10486 subupl->vector_upl = (void*)vector_upl;
10487 vector_upl->upl_elems[vector_upl->num_upls++] = subupl;
10488 vector_upl->size += io_size;
10489 upl->size += io_size;
10490 }
10491 else {
10492 uint32_t i=0,invalid_upls=0;
10493 for(i = 0; i < vector_upl->num_upls; i++) {
10494 if(vector_upl->upl_elems[i] == subupl)
10495 break;
10496 }
10497 if(i == vector_upl->num_upls)
10498 panic("Trying to remove sub-upl when none exists");
10499
10500 vector_upl->upl_elems[i] = NULL;
10501 invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
10502 if(invalid_upls == vector_upl->num_upls)
10503 return TRUE;
10504 else
10505 return FALSE;
10506 }
10507 }
10508 else
10509 panic("vector_upl_set_subupl was passed a NULL upl element\n");
10510 }
10511 else
10512 panic("vector_upl_set_subupl was passed a non-vectored upl\n");
10513 }
10514 else
10515 panic("vector_upl_set_subupl was passed a NULL upl\n");
10516
10517 return FALSE;
10518}
10519
10520void
10521vector_upl_set_pagelist(upl_t upl)
10522{
10523 if(vector_upl_is_valid(upl)) {
10524 uint32_t i=0;
10525 vector_upl_t vector_upl = upl->vector_upl;
10526
10527 if(vector_upl) {
10528 vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
10529
10530 vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
10531
10532 for(i=0; i < vector_upl->num_upls; i++) {
10533 cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
10534 bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10535 pagelist_size += cur_upl_pagelist_size;
10536 if(vector_upl->upl_elems[i]->highest_page > upl->highest_page)
10537 upl->highest_page = vector_upl->upl_elems[i]->highest_page;
10538 }
10539 assert( pagelist_size == (sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE)) );
10540 }
10541 else
10542 panic("vector_upl_set_pagelist was passed a non-vectored upl\n");
10543 }
10544 else
10545 panic("vector_upl_set_pagelist was passed a NULL upl\n");
10546
10547}
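/*
 * Illustrative sketch (not part of the original source): how a caller such as
 * the cluster I/O layer might assemble a vector UPL from sub-UPLs with the
 * primitives above.  The sub-UPL array, its count and the per-UPL I/O sizes
 * are hypothetical stand-ins for state the caller already has.
 */
static upl_t
vector_upl_assemble_example(
	vm_offset_t	upl_offset,
	upl_t		*subupls,	/* hypothetical: previously created UPLs */
	uint32_t	nsubupls,
	uint32_t	*io_sizes)	/* hypothetical: per-UPL I/O sizes */
{
	upl_t		vupl;
	uint32_t	i;

	assert(nsubupls <= MAX_VECTOR_UPL_ELEMENTS);

	/* allocate the container UPL (UPL_VECTOR flag, no pages of its own) */
	vupl = vector_upl_create(upl_offset);

	/* attach each sub-UPL; a non-zero io_size grows the vector's size */
	for (i = 0; i < nsubupls; i++)
		vector_upl_set_subupl(vupl, subupls[i], io_sizes[i]);

	/* build the aggregate page list from each sub-UPL's upl_page_info array */
	vector_upl_set_pagelist(vupl);

	return vupl;
}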
10548
10549upl_t
10550vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10551{
10552 if(vector_upl_is_valid(upl)) {
10553 vector_upl_t vector_upl = upl->vector_upl;
10554 if(vector_upl) {
10555 if(index < vector_upl->num_upls)
10556 return vector_upl->upl_elems[index];
10557 }
10558 else
10559 panic("vector_upl_subupl_byindex was passed a non-vectored upl\n");
10560 }
10561 return NULL;
10562}
10563
10564upl_t
10565vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10566{
10567 if(vector_upl_is_valid(upl)) {
10568 uint32_t i=0;
10569 vector_upl_t vector_upl = upl->vector_upl;
10570
10571 if(vector_upl) {
10572 upl_t subupl = NULL;
10573 vector_upl_iostates_t subupl_state;
10574
10575 for(i=0; i < vector_upl->num_upls; i++) {
10576 subupl = vector_upl->upl_elems[i];
10577 subupl_state = vector_upl->upl_iostates[i];
10578 if( *upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10579 /* We could have been passed an offset/size pair that belongs
10580 * to a UPL element that has already been committed/aborted.
10581 * If so, return NULL.
10582 */
10583 if(subupl == NULL)
10584 return NULL;
10585 if((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10586 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10587 if(*upl_size > subupl_state.size)
10588 *upl_size = subupl_state.size;
10589 }
10590 if(*upl_offset >= subupl_state.offset)
10591 *upl_offset -= subupl_state.offset;
10592 else if(i)
10593 panic("Vector UPL offset miscalculation\n");
10594 return subupl;
10595 }
10596 }
10597 }
10598 else
10599 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL\n");
10600 }
10601 return NULL;
10602}
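/*
 * Illustrative sketch (not part of the original source): the standard pattern
 * for walking a vector UPL by offset, as used by upl_encrypt()'s
 * process_upl_to_encrypt loop above.  Each call to
 * vector_upl_subupl_byoffset() clips the offset/size pair to a single sub-UPL
 * and rewrites them to be relative to that sub-UPL.
 */
static void
vector_upl_walk_example(
	upl_t		vupl,
	upl_offset_t	offset,
	upl_size_t	size)
{
	upl_t		subupl;
	upl_offset_t	cur_offset;
	upl_size_t	cur_size;

	while (size) {
		cur_offset = offset;
		cur_size = size;

		/* NULL means the range maps to an already committed/aborted sub-UPL */
		subupl = vector_upl_subupl_byoffset(vupl, &cur_offset, &cur_size);
		if (subupl == NULL)
			break;

		/* a real caller would operate on the clipped range here */
		printf("sub-UPL %p covers offset 0x%x size 0x%x\n",
		       subupl, cur_offset, cur_size);

		offset += cur_size;
		size -= cur_size;
	}
}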
10603
10604void
10605vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10606{
10607 *v_upl_submap = NULL;
10608
10609 if(vector_upl_is_valid(upl)) {
10610 vector_upl_t vector_upl = upl->vector_upl;
10611 if(vector_upl) {
10612 *v_upl_submap = vector_upl->submap;
10613 *submap_dst_addr = vector_upl->submap_dst_addr;
10614 }
10615 else
10616 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10617 }
10618 else
10619 panic("vector_upl_get_submap was passed a null UPL\n");
10620}
10621
10622void
10623vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10624{
10625 if(vector_upl_is_valid(upl)) {
10626 vector_upl_t vector_upl = upl->vector_upl;
10627 if(vector_upl) {
10628 vector_upl->submap = submap;
10629 vector_upl->submap_dst_addr = submap_dst_addr;
10630 }
10631 else
10632 panic("vector_upl_get_submap was passed a non-vectored UPL\n");
10633 }
10634 else
10635 panic("vector_upl_get_submap was passed a NULL UPL\n");
10636}
10637
10638void
10639vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10640{
10641 if(vector_upl_is_valid(upl)) {
10642 uint32_t i = 0;
10643 vector_upl_t vector_upl = upl->vector_upl;
10644
10645 if(vector_upl) {
10646 for(i = 0; i < vector_upl->num_upls; i++) {
10647 if(vector_upl->upl_elems[i] == subupl)
10648 break;
10649 }
10650
10651 if(i == vector_upl->num_upls)
10652 panic("setting sub-upl iostate when none exists");
10653
10654 vector_upl->upl_iostates[i].offset = offset;
10655 if(size < PAGE_SIZE)
10656 size = PAGE_SIZE;
10657 vector_upl->upl_iostates[i].size = size;
10658 }
10659 else
10660 panic("vector_upl_set_iostate was passed a non-vectored UPL\n");
10661 }
10662 else
10663 panic("vector_upl_set_iostate was passed a NULL UPL\n");
10664}
10665
10666void
10667vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10668{
10669 if(vector_upl_is_valid(upl)) {
10670 uint32_t i = 0;
10671 vector_upl_t vector_upl = upl->vector_upl;
10672
10673 if(vector_upl) {
10674 for(i = 0; i < vector_upl->num_upls; i++) {
10675 if(vector_upl->upl_elems[i] == subupl)
10676 break;
10677 }
10678
10679 if(i == vector_upl->num_upls)
10680 panic("getting sub-upl iostate when none exists");
10681
10682 *offset = vector_upl->upl_iostates[i].offset;
10683 *size = vector_upl->upl_iostates[i].size;
10684 }
10685 else
10686 panic("vector_upl_get_iostate was passed a non-vectored UPL\n");
10687 }
10688 else
10689 panic("vector_upl_get_iostate was passed a NULL UPL\n");
10690}
10691
10692void
10693vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10694{
10695 if(vector_upl_is_valid(upl)) {
10696 vector_upl_t vector_upl = upl->vector_upl;
10697 if(vector_upl) {
10698 if(index < vector_upl->num_upls) {
10699 *offset = vector_upl->upl_iostates[index].offset;
10700 *size = vector_upl->upl_iostates[index].size;
10701 }
10702 else
10703 *offset = *size = 0;
10704 }
10705 else
10706 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL\n");
10707 }
10708 else
10709 panic("vector_upl_get_iostate_byindex was passed a NULL UPL\n");
10710}
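/*
 * Illustrative sketch (not part of the original source): enumerating a vector
 * UPL's sub-UPLs by index and reading back the I/O state recorded with
 * vector_upl_set_iostate().  The sketch simply stops at the first empty slot;
 * a NULL entry can also mean that particular sub-UPL was already removed.
 */
static void
vector_upl_enumerate_example(upl_t vupl)
{
	uint32_t	i;
	upl_t		subupl;
	upl_offset_t	offset;
	upl_size_t	size;

	for (i = 0; ; i++) {
		subupl = vector_upl_subupl_byindex(vupl, i);
		if (subupl == NULL)
			break;

		/* offset/size are zero if no iostate was ever recorded */
		vector_upl_get_iostate_byindex(vupl, i, &offset, &size);

		printf("sub-UPL[%u] = %p iostate offset 0x%x size 0x%x\n",
		       i, subupl, offset, size);
	}
}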
10711
10712upl_page_info_t *
10713upl_get_internal_vectorupl_pagelist(upl_t upl)
10714{
10715 return ((vector_upl_t)(upl->vector_upl))->pagelist;
10716}
10717
10718void *
10719upl_get_internal_vectorupl(upl_t upl)
10720{
10721 return upl->vector_upl;
10722}
10723
91447636
A
10724vm_size_t
10725upl_get_internal_pagelist_offset(void)
10726{
10727 return sizeof(struct upl);
10728}
10729
91447636
A
10730void
10731upl_clear_dirty(
0c530ab8
A
10732 upl_t upl,
10733 boolean_t value)
91447636 10734{
0c530ab8
A
10735 if (value) {
10736 upl->flags |= UPL_CLEAR_DIRTY;
10737 } else {
10738 upl->flags &= ~UPL_CLEAR_DIRTY;
10739 }
91447636
A
10740}
10741
6d2010ae
A
10742void
10743upl_set_referenced(
10744 upl_t upl,
10745 boolean_t value)
10746{
10747 upl_lock(upl);
10748 if (value) {
10749 upl->ext_ref_count++;
10750 } else {
10751 if (!upl->ext_ref_count) {
10752 panic("upl_set_referenced not %p\n", upl);
10753 }
10754 upl->ext_ref_count--;
10755 }
10756 upl_unlock(upl);
10757}
10758
fe8ab488
A
10759#if CONFIG_IOSCHED
10760void
10761upl_set_blkno(
10762 upl_t upl,
10763 vm_offset_t upl_offset,
10764 int io_size,
10765 int64_t blkno)
10766{
10767 int i,j;
10768 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
10769 return;
10770
10771 assert(upl->upl_reprio_info != 0);
10772 for(i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10773 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10774 }
10775}
10776#endif
10777
6d2010ae
A
10778boolean_t
10779vm_page_is_slideable(vm_page_t m)
10780{
10781 boolean_t result = FALSE;
39236c6e 10782 vm_shared_region_slide_info_t si;
39037602
A
10783 vm_object_t m_object;
10784
10785 m_object = VM_PAGE_OBJECT(m);
39236c6e 10786
39037602 10787 vm_object_lock_assert_held(m_object);
6d2010ae
A
10788
10789 /* make sure our page belongs to the one object allowed to do this */
39037602 10790 if (!m_object->object_slid) {
39236c6e 10791 goto done;
6d2010ae
A
10792 }
10793
39037602 10794 si = m_object->vo_slide_info;
39236c6e
A
10795 if (si == NULL) {
10796 goto done;
6d2010ae
A
10797 }
10798
39236c6e 10799 if(!m->slid && (si->start <= m->offset && si->end > m->offset)) {
6d2010ae
A
10800 result = TRUE;
10801 }
39236c6e
A
10802
10803done:
6d2010ae
A
10804 return result;
10805}
10806
10807int vm_page_slide_counter = 0;
10808int vm_page_slide_errors = 0;
10809kern_return_t
10810vm_page_slide(
10811 vm_page_t page,
10812 vm_map_offset_t kernel_mapping_offset)
10813{
10814 kern_return_t kr;
10815 vm_map_size_t kernel_mapping_size;
39236c6e 10816 boolean_t kernel_mapping_needs_unmap;
6d2010ae 10817 vm_offset_t kernel_vaddr;
3e170ce0
A
10818 uint32_t pageIndex;
10819 uint32_t slide_chunk;
39037602
A
10820 vm_object_t page_object;
10821
10822 page_object = VM_PAGE_OBJECT(page);
6d2010ae
A
10823
10824 assert(!page->slid);
39037602
A
10825 assert(page_object->object_slid);
10826 vm_object_lock_assert_exclusive(page_object);
316670eb
A
10827
10828 if (page->error)
10829 return KERN_FAILURE;
6d2010ae
A
10830
10831 /*
10832 * Take a paging-in-progress reference to keep the object
10833 * alive even if we have to unlock it (in vm_paging_map_object()
10834 * for example)...
10835 */
39037602 10836 vm_object_paging_begin(page_object);
6d2010ae
A
10837
10838 if (kernel_mapping_offset == 0) {
10839 /*
10840 * The page hasn't already been mapped in kernel space
10841 * by the caller. Map it now, so that we can access
10842 * its contents and slide them.
10843 */
10844 kernel_mapping_size = PAGE_SIZE;
39236c6e
A
10845 kernel_mapping_needs_unmap = FALSE;
10846 kr = vm_paging_map_object(page,
39037602 10847 page_object,
6d2010ae 10848 page->offset,
6d2010ae 10849 VM_PROT_READ | VM_PROT_WRITE,
39236c6e
A
10850 FALSE,
10851 &kernel_mapping_size,
10852 &kernel_mapping_offset,
10853 &kernel_mapping_needs_unmap);
6d2010ae
A
10854 if (kr != KERN_SUCCESS) {
10855 panic("vm_page_slide: "
10856 "could not map page in kernel: 0x%x\n",
10857 kr);
10858 }
10859 } else {
10860 kernel_mapping_size = 0;
39236c6e 10861 kernel_mapping_needs_unmap = FALSE;
6d2010ae
A
10862 }
10863 kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
10864
10865 /*
10866 * Slide the pointers on the page.
10867 */
10868
10869 /*assert that slide_file_info.start/end are page-aligned?*/
10870
39236c6e 10871 assert(!page->slid);
39037602 10872 assert(page_object->object_slid);
39236c6e 10873
3e170ce0 10874 pageIndex = (uint32_t)((page->offset -
39037602 10875 page_object->vo_slide_info->start) /
3e170ce0
A
10876 PAGE_SIZE_FOR_SR_SLIDE);
10877 for (slide_chunk = 0;
10878 slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE;
10879 slide_chunk++) {
39037602 10880 kr = vm_shared_region_slide_page(page_object->vo_slide_info,
3e170ce0
A
10881 (kernel_vaddr +
10882 (slide_chunk *
10883 PAGE_SIZE_FOR_SR_SLIDE)),
10884 (pageIndex + slide_chunk));
10885 if (kr != KERN_SUCCESS) {
10886 break;
fe8ab488 10887 }
fe8ab488 10888 }
fe8ab488 10889
6d2010ae
A
10890 vm_page_slide_counter++;
10891
10892 /*
10893 * Unmap the page from the kernel's address space, if we had to map it ourselves.
10894 */
39236c6e 10895 if (kernel_mapping_needs_unmap) {
39037602 10896 vm_paging_unmap_object(page_object,
6d2010ae
A
10897 kernel_vaddr,
10898 kernel_vaddr + PAGE_SIZE);
10899 }
10900
10901 page->dirty = FALSE;
39037602 10902 pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
316670eb
A
10903
10904 if (kr != KERN_SUCCESS || cs_debug > 1) {
10905 printf("vm_page_slide(%p): "
10906 "obj %p off 0x%llx mobj %p moff 0x%llx\n",
10907 page,
39037602
A
10908 page_object, page->offset,
10909 page_object->pager,
10910 page->offset + page_object->paging_offset);
316670eb 10911 }
6d2010ae
A
10912
10913 if (kr == KERN_SUCCESS) {
10914 page->slid = TRUE;
10915 } else {
10916 page->error = TRUE;
10917 vm_page_slide_errors++;
10918 }
10919
39037602 10920 vm_object_paging_end(page_object);
6d2010ae
A
10921
10922 return kr;
10923}
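/*
 * Illustrative sketch (not part of the original source): the check-then-slide
 * pattern a page-in path would use with the two routines above.  It assumes
 * the caller already holds the page's VM object lock exclusively and has the
 * page busy, as vm_page_slide() requires; the function name is hypothetical.
 */
static kern_return_t
vm_page_slide_if_needed_example(vm_page_t m)
{
	kern_return_t	kr;

	if (!vm_page_is_slideable(m))
		return KERN_SUCCESS;	/* nothing to do */

	/*
	 * Passing 0 as the kernel mapping offset makes vm_page_slide()
	 * map the page into the kernel itself (and unmap it when done).
	 */
	kr = vm_page_slide(m, 0);

	/* on failure, vm_page_slide() has already set page->error */
	return kr;
}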
10924
39236c6e
A
10925void inline memoryshot(unsigned int event, unsigned int control)
10926{
10927 if (vm_debug_events) {
10928 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10929 vm_page_active_count, vm_page_inactive_count,
10930 vm_page_free_count, vm_page_speculative_count,
10931 vm_page_throttled_count);
10932 } else {
10933 (void) event;
10934 (void) control;
10935 }
10936
10937}
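/*
 * Illustrative sketch (not part of the original source): memoryshot() is meant
 * to bracket interesting pageout events with kdebug tracepoints when
 * vm_debug_events is enabled.  DBG_FUNC_START/DBG_FUNC_END are the standard
 * kdebug control codes; the event code below is a hypothetical stand-in for
 * one of the DBG_MACH_VM_PRESSURE subcodes.
 */
static void
memoryshot_usage_example(void)
{
	unsigned int example_event = 0x10;	/* hypothetical event code */

	memoryshot(example_event, DBG_FUNC_START);
	/* ... the work being traced ... */
	memoryshot(example_event, DBG_FUNC_END);
}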
91447636
A
10938
10939#ifdef MACH_BSD
1c79356b 10940
2d21ac55
A
10941boolean_t upl_device_page(upl_page_info_t *upl)
10942{
10943 return(UPL_DEVICE_PAGE(upl));
10944}
1c79356b
A
10945boolean_t upl_page_present(upl_page_info_t *upl, int index)
10946{
10947 return(UPL_PAGE_PRESENT(upl, index));
10948}
2d21ac55
A
10949boolean_t upl_speculative_page(upl_page_info_t *upl, int index)
10950{
10951 return(UPL_SPECULATIVE_PAGE(upl, index));
10952}
1c79356b
A
10953boolean_t upl_dirty_page(upl_page_info_t *upl, int index)
10954{
10955 return(UPL_DIRTY_PAGE(upl, index));
10956}
10957boolean_t upl_valid_page(upl_page_info_t *upl, int index)
10958{
10959 return(UPL_VALID_PAGE(upl, index));
10960}
91447636 10961ppnum_t upl_phys_page(upl_page_info_t *upl, int index)
1c79356b 10962{
91447636 10963 return(UPL_PHYS_PAGE(upl, index));
1c79356b
A
10964}
10965
3e170ce0
A
10966void upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10967{
10968 upl[index].mark = v;
10969}
10970
10971boolean_t upl_page_get_mark(upl_page_info_t *upl, int index)
10972{
10973 return upl[index].mark;
10974}
10975
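/*
 * Illustrative sketch (not part of the original source): walking a UPL's
 * page-info array with the accessors above, e.g. from a driver after UPL
 * creation.  The caller is assumed to have obtained the upl_page_info_t
 * array (for instance via UPL_GET_INTERNAL_PAGE_LIST_SIMPLE()) and to know
 * the page count; both parameters are hypothetical inputs.
 */
static void
upl_pageinfo_walk_example(
	upl_page_info_t	*pl,
	int		page_count)
{
	int	i;
	int	present = 0, valid = 0, dirty = 0;

	for (i = 0; i < page_count; i++) {
		if (!upl_page_present(pl, i))
			continue;
		present++;

		if (upl_valid_page(pl, i))
			valid++;
		if (upl_dirty_page(pl, i))
			dirty++;

		/* physical page number backing this UPL slot */
		(void) upl_phys_page(pl, i);
	}

	printf("UPL pages: present %d valid %d dirty %d\n",
	       present, valid, dirty);
}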
0b4e3aa0
A
10976void
10977vm_countdirtypages(void)
1c79356b
A
10978{
10979 vm_page_t m;
10980 int dpages;
10981 int pgopages;
10982 int precpages;
10983
10984
10985 dpages=0;
10986 pgopages=0;
10987 precpages=0;
10988
10989 vm_page_lock_queues();
39037602 10990 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
1c79356b
A
10991 do {
10992 if (m ==(vm_page_t )0) break;
10993
10994 if(m->dirty) dpages++;
39037602 10995 if(m->free_when_done) pgopages++;
1c79356b
A
10996 if(m->precious) precpages++;
10997
39037602
A
10998 assert(VM_PAGE_OBJECT(m) != kernel_object);
10999 m = (vm_page_t) vm_page_queue_next(&m->pageq);
1c79356b
A
11000 if (m ==(vm_page_t )0) break;
11001
39037602 11002 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
1c79356b 11003 vm_page_unlock_queues();
9bccf70c 11004
2d21ac55 11005 vm_page_lock_queues();
39037602 11006 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
2d21ac55
A
11007 do {
11008 if (m ==(vm_page_t )0) break;
11009
11010 dpages++;
11011 assert(m->dirty);
39037602
A
11012 assert(!m->free_when_done);
11013 assert(VM_PAGE_OBJECT(m) != kernel_object);
11014 m = (vm_page_t) vm_page_queue_next(&m->pageq);
2d21ac55
A
11015 if (m ==(vm_page_t )0) break;
11016
39037602 11017 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
2d21ac55
A
11018 vm_page_unlock_queues();
11019
9bccf70c 11020 vm_page_lock_queues();
39037602 11021 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
9bccf70c
A
11022 do {
11023 if (m ==(vm_page_t )0) break;
11024
11025 if(m->dirty) dpages++;
39037602 11026 if(m->free_when_done) pgopages++;
9bccf70c
A
11027 if(m->precious) precpages++;
11028
39037602
A
11029 assert(VM_PAGE_OBJECT(m) != kernel_object);
11030 m = (vm_page_t) vm_page_queue_next(&m->pageq);
9bccf70c
A
11031 if (m ==(vm_page_t )0) break;
11032
39037602 11033 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
9bccf70c 11034 vm_page_unlock_queues();
1c79356b
A
11035
11036 printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
11037
11038 dpages=0;
11039 pgopages=0;
11040 precpages=0;
11041
11042 vm_page_lock_queues();
39037602 11043 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
1c79356b
A
11044
11045 do {
11046 if(m == (vm_page_t )0) break;
11047 if(m->dirty) dpages++;
39037602 11048 if(m->free_when_done) pgopages++;
1c79356b
A
11049 if(m->precious) precpages++;
11050
39037602
A
11051 assert(VM_PAGE_OBJECT(m) != kernel_object);
11052 m = (vm_page_t) vm_page_queue_next(&m->pageq);
1c79356b
A
11053 if(m == (vm_page_t )0) break;
11054
39037602 11055 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
1c79356b
A
11056 vm_page_unlock_queues();
11057
11058 printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
11059
11060}
11061#endif /* MACH_BSD */
11062
0c530ab8 11063ppnum_t upl_get_highest_page(
2d21ac55 11064 upl_t upl)
0c530ab8 11065{
2d21ac55 11066 return upl->highest_page;
0c530ab8
A
11067}
11068
b0d623f7
A
11069upl_size_t upl_get_size(
11070 upl_t upl)
11071{
11072 return upl->size;
11073}
11074
3e170ce0
A
11075upl_t upl_associated_upl(upl_t upl)
11076{
11077 return upl->associated_upl;
11078}
11079
11080void upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11081{
11082 upl->associated_upl = associated_upl;
11083}
11084
39037602
A
11085struct vnode * upl_lookup_vnode(upl_t upl)
11086{
11087 if (!upl->map_object->internal)
11088 return vnode_pager_lookup_vnode(upl->map_object->pager);
11089 else
11090 return NULL;
11091}
11092
b0d623f7
A
11093#if UPL_DEBUG
11094kern_return_t upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
1c79356b
A
11095{
11096 upl->ubc_alias1 = alias1;
11097 upl->ubc_alias2 = alias2;
11098 return KERN_SUCCESS;
11099}
b0d623f7 11100int upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
1c79356b
A
11101{
11102 if(al)
11103 *al = upl->ubc_alias1;
11104 if(al2)
11105 *al2 = upl->ubc_alias2;
11106 return KERN_SUCCESS;
11107}
91447636 11108#endif /* UPL_DEBUG */
fe8ab488
A
11109
11110#if VM_PRESSURE_EVENTS
11111/*
11112 * Upward trajectory.
11113 */
11114extern boolean_t vm_compressor_low_on_space(void);
11115
11116boolean_t
11117VM_PRESSURE_NORMAL_TO_WARNING(void) {
11118
39037602
A
11119 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11120
fe8ab488
A
11121 /* Available pages below our threshold */
11122 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11123 /* No frozen processes to kill */
11124 if (memorystatus_frozen_count == 0) {
11125 /* Not enough suspended processes available. */
11126 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11127 return TRUE;
11128 }
11129 }
11130 }
11131 return FALSE;
11132
11133 } else {
11134 return ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0);
11135 }
11136}
11137
11138boolean_t
11139VM_PRESSURE_WARNING_TO_CRITICAL(void) {
11140
39037602
A
11141 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11142
fe8ab488
A
11143 /* Available pages below our threshold */
11144 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11145 return TRUE;
11146 }
11147 return FALSE;
11148 } else {
11149 return (vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
11150 }
11151}
11152
11153/*
11154 * Downward trajectory.
11155 */
11156boolean_t
11157VM_PRESSURE_WARNING_TO_NORMAL(void) {
11158
39037602
A
11159 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11160
fe8ab488
A
11161 /* Available pages above our threshold */
11162 unsigned int target_threshold = memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100);
11163 if (memorystatus_available_pages > target_threshold) {
11164 return TRUE;
11165 }
11166 return FALSE;
11167 } else {
11168 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0);
11169 }
11170}
11171
11172boolean_t
11173VM_PRESSURE_CRITICAL_TO_WARNING(void) {
11174
39037602
A
11175 if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11176
fe8ab488
A
11177 /* Available pages above our threshold */
11178 unsigned int target_threshold = memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100);
11179 if (memorystatus_available_pages > target_threshold) {
11180 return TRUE;
11181 }
11182 return FALSE;
11183 } else {
11184 return ((AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0);
11185 }
11186}
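/*
 * Worked example (not part of the original source) of the hysteresis in the
 * two downward transitions above, assuming a hypothetical
 * memorystatus_available_pages_pressure of 10000 pages and
 * memorystatus_available_pages_critical of 5000 pages:
 *
 *   WARNING  -> NORMAL : needs > 10000 + (15 * 10000) / 100 = 11500 pages
 *   CRITICAL -> WARNING: needs >  5000 + (15 *  5000) / 100 =  5750 pages
 *
 * The system must climb 15% above the threshold that triggered a pressure
 * level before that level is allowed to drop again, which keeps the
 * notifications from flapping around the threshold.
 */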
11187#endif /* VM_PRESSURE_EVENTS */
11188